Browse Source

First commit

Amine GHOZLANE 2 years ago
commit
069e99e46b
9 changed files with 3059 additions and 0 deletions
  1. 2 0
      .gitignore
  2. 151 0
      data/eva71.fna
  3. 2820 0
      data/eva71_plus_perfect.fq
  4. 0 0
      tests/__init__.py
  5. 9 0
      tests/context.py
  6. 14 0
      tests/kmer.pck
  7. 4 0
      tests/test_build.fq
  8. 51 0
      tests/test_construction_debruijn.py
  9. 8 0
      tests/test_two_reads.fq

+ 2 - 0
.gitignore View File

@@ -0,0 +1,2 @@
1
+__pycache__
2
+*.pyc

+ 151 - 0
data/eva71.fna View File

@@ -0,0 +1,151 @@
1
+>EVA71_BrCr_U22521 EVA71
2
+TTAAAACAGCTGTGGGTTGTCACCCACCCACAGGGTCCACTGGGCGCT
3
+AGTACACTGGTATCTCGGTACCTTTGTACGCCTGTTTTATACCCCCTCCC
4
+TGATTTGCAACTTAGAAGCAACGCAAACCAGATCAATAGTAGGT
5
+GTGACATACCAGTCGCATCTTGATCAAGCACTTCTGTATCCCCGGACCGA
6
+GTATCAATAGACTGTGCACACGGTTGAAGGAGAAAACGTCCGTTACCCGG
7
+CTAACTACTTCGAGAAGCCTAGTAACGCCATTGAAGTTGCAGAGTGTTTC
8
+GCTCAGCACTCCCCCCGTGTAGATCAGGTCGATGAGTCACCGCATTC
9
+CCCACGGGCGACCGTGGCGGTGGCTGCGTTGGCGGCCTGCCTATGGGGTA
10
+ACCCATAGGACGCTCTAATACGGACATGGCGTGAAGAGTCTATTGAGCTA
11
+GTTAGTAGTCCTCCGGCCCCTGAATGCGGCTAATCCTAACTGCGGAGCAC
12
+ATACCCTTAATCCAAAGGGCAGTGTGTCGTAACGGGCAACTCTGCAGCGG
13
+AACCGACTACTTTGGGTGTCCGTGTTTCTTTTTATTCTTGTATTGGCTGC
14
+TTATGGTGACAATTAAAGAATTGTTACCATATAGCTATTGGATTGGCCAT
15
+CCAGTGTCAAACAGAGCTATTGTATATCTCTTTGTTGGATTCACACCTCT
16
+CACTCTTGAAACGTTACACACCCTCAATTACATTATACTGCTGAACAC
17
+GAAGCGATGGGCTCCCAGGTCTCCACACAGCGATCCGGCTCGCAT
18
+GAGAATTCCAACTCAGCCACGGAAGGCTCCACTATAAATTACACAACCAT
19
+TAATTACTACAAAGACTCGTATGCTGCCACTGCTGGAAAGCAAAGTCTCA
20
+AACAAGATCCTGACAAGTTTGCGAACCCTGTGAAGGACATCTTTACTGAA
21
+ATGGCAGCGCCCTTAAAGTCTCCCTCTGCTGAAGCATGTGGCTATAGCGA
22
+CCGAGTGGCACAGCTTACCATTGGAAATTCCACCATTACTACACAAGAAG
23
+CAGCAAACATAATAGTTGGGTATGGTGAGTGGCCTTCATACTGCTCTGAT
24
+AATGATGCAACAGCGGTAGACAAACCTACACGGCCTGATGTCTCAGTAAA
25
+TAGATTTTACACGCTAGACACTAAGCTATGGGAGAAATCATCCAAGGGGT
26
+GGTACTGGAAGTTCCCAGATGTACTGACTGAAACCGGAGTTTTTGGTCCA
27
+AATGCACAATTTCACTACTTATACCGTTCAGGGTTCTGCATCCACGTTCA
28
+ATGTAACGCTAGCAAATTTCACCAAGGGGCGCTACTCGTTGCGGTATTGC
29
+CCGAGTATGTCATTGGAACAGTGGCAGGCGGCACAGGCACAGA
30
+GAACAGTCACCCTCCTTATAAACAAACCCAACCCGGCGCTGATGG
31
+ATTTGAATTACAACATCCATATGTTCTTGATGCTGGAATTCCAATATCTC
32
+AGTTGACAGTGTGCCCTCACCAGTGGATCAATTTACGAACCAACAATTGT
33
+GCCACCATAATAGTGCCATACATGAACACACTACCTTTTGATTCCGCATT
34
+GAACCACTGTAATTTCGGACTATTGGTGGTGCCTATCAGCCCGCTGGAT
35
+TTCGACCAAGGGGCGACACCGGTAATTCCTATCACTATCACGTTGGCTCC
36
+GATGTGTTCTGAGTTTGCGGGTCTCAGGCAGGCAGTTACGCAGGGTTTTC
37
+CCACTGAATTGAAACCTGGCACTAATCAGTTCTTAACCACGGATGATGGT
38
+GTGTCAGCACCTATATTGCCAAATTTCCACCCCACCCCGTGCATTCACAT
39
+ACCTGGCGAGGTTAGAAACTTACTAGAACTGTGCCAGGTAGAAACCATTT
40
+TAGAAGTCAACAATGTGCCCACCAACGCAACCAGTTTGATGGAAAGGCTA
41
+CGGTTTCCAGTGTCAGCCCAAGCAGGGAAAGGTGAGTTGTGTGCAGTGTT
42
+CAGGGCCGACCCTGGGAGGGATGGTCCTTGGCAATCCACCATGCTAGGCC
43
+AGTTGTGTGGATATTACACCCAATGGTCAGGGTCTTTGGAAGTCACTTTT
44
+ATGTTCACCGGATCCTTTATGGCAACTGGTAAAATGCTTATAGCTTACAC
45
+ACCCCCAGGGGGCCCTTTGCCTAAAGATAGAGCCACAGCTATGCTGGGGA
46
+CGCACGTCATCTGGGACTTTGGCTTGCAATCGTCCGTCACCCTCGTCATA
47
+CCATGGATCAGTAACACTCACTATAGGGCGCATGCTCGAGATGGGGTGTT
48
+TGATTACTACACCACAGGTTTGGTTAGTATATGGTACCAAACAAATTATG
49
+TAGTCCCTATTGGAGCACCTAATACTGCCTATATAATAGCGTTGGCAGCA
50
+GCCCAAAAGAATTTCACTATGAAATTGTGCAAGGACACCAGTGACATTTT
51
+GGAAACGGCCACTATTCAAGGGGACAGAGTGGCAGATGTGATTGAGAGCT
52
+CTATAGGAGATAGTGTGAGTAAGGCCCTCACCCCAGCTTTACCTGCACC
53
+CACAGGCCCAGACACCCAAGTGAGCAGTCATCGCTTAGACACTGGAAA
54
+AGTACCAGCACTTCAAGCCGCCGAAATCGGAGCTTCGTCGAATGCTAGTG
55
+ATGAGAGTATGATTGAGACTCGGTGTGTTCTTAACTCACATAGCACAGCT
56
+GAAACCACCCTTGATAGTTTCTTCAGCAGAGCAGGCTTAGTTGGGGAGAT
57
+AGATCTTCCTCTAAAGGGCACCACCAATCCGAACGGGTATGCCAACTGGG
58
+ACATAGACATAACCGGTTATGCGCAGATGCGCAGAAAAGTGGAACTATTC
59
+ACCTATATGCGCTTTGACGCAGAGTTCACTTTTGTCGCGTGCACACCTAC
60
+CGGAAGGGTCGTTCCACAGCTGCTTCAATACATGTTTGTTCCACCCGGGG
61
+CCCCCAAACCAGACTCCAGAGACTCTTTGGCTTGGCCAACGGCCACGAAC
62
+CCCTCAGTTTTTGTCAAATCATCCGACCCACCAGCACAAGTCTCAGTGCC
63
+ATTTATGTCACCTGCAAGCGCATACCAATGGTTTTATGACGGATACCCTA
64
+CATTTGGAGAGCACAAGCAAGAGAAGGATCTCGAGTATGGGGCATGCCCG
65
+AATAACATGATGGGCACATTCTCAGTGCGGACTGTGGGATCGTCAAAGTC
66
+AGAATATTCCTTAGTCATCAGAATATACATGAGAATGAAGCACGTCAGAG
67
+CGTGGATACCTCGGCCGATGCGCAATCAGAACTATTTGTTCAAATCCAAC
68
+CCAAACTATGCTGGTGATTCCATTAAACCAACTGGTACCAGCCGAA
69
+CGGCAATCACTA
70
+CGCTCGGGAAATTCGGTCAGCAGTCTGGGGCTATTTATGTGGGCAACTTT
71
+AGGGTAGTAAACAGACACCTAGCCACCCATACTGACTGGGCCAACTTGGT
72
+GTGGGAAGACAGCTCTAGAGACCTCCTAGTTTCTTCAACTACCGCTCAAG
73
+GGTGTGACACCATTGCTCGATGTAACTGCCAAACCGGAGTGTATTACTGT
74
+AACTCTCGCAGAAAACACTATCCAGTCAGTTTTTCGAAACCTAGTTTGGT
75
+GTTTGTAGAAGCTAGTGAGTATTATCCAGCTAGATATCAGTCCCATCTTA
76
+TGCTTGCTGAGGGCCATTCAGAACCTGGTGATTGTGGCGGTATTCTTAGA
77
+TGCCAACACGGTGTGGTGGGAATTGTCTCCACTGGCGGAAGTGGCCTTGT
78
+GGGATTTGCTGACGTTAGAGATCTTCTGTGGCTAGATGAGGAAGCGATGG
79
+AGCAGGGGGTATCTGATTACATCAAAGGTCTCGGTCGAGCCTTCGGCACA
80
+GGTTTCACTGACGCAGTGTCTAGGGAAGTGGAAGCGTTGAAGAACCACTT
81
+AATCGGCTCCGAAGGGGCTGTTGAGAAGATCTTGAAGAACTTGGTGAAGC
82
+TAATTTCAGCCTTAGTTATAGTCATCAGAAGTGATTATGATATGGTCACC
83
+CTCACAGCCACACTAGCTCTGATCGGGTGCCACGGGAGTCCTTGGGCGTG
84
+GATCAAATCAAAGACAGCTTCCATACTGGGCATTCCCATGGCACAAAAAC
85
+AGAGTGCCTCATGGCTAAAGAAGTTCAATGACATGGCAAATGCTGCAAAA
86
+GGGCTTGAGTGGATTTTCAACAAGATCAGTAAGTTCATTGACTGGCTTAA
87
+AGAGAAGATCATTCCAGCTGCCAAAGAGAAAGTTGAGTTTTTGAACAACC
88
+TAAAACAGCTCCCCTTGTTGGAGAACCAAGTCTCCAATCTTGAACAGTCT
89
+GCTGCCTCACAAGAAGACTTAGAAGCTATGTTTGGTAATGTGATATATCT
90
+GGCTCACTTTTGCCGCAAATTCCAACCACTCTACGCAACTGAGGCCAAGA
91
+GAGTCTACGCTTTAGAGAAAAGGATGAATAACTACATGCAGTTCAAGAGC
92
+AAACACCGTATTGAACCTGTATGCTTGATCATCAGAGGTTCCCCCGGAAC
93
+GGGCAAATCGCTCGCCACAGGCATTATAGCTAGAGCCATTGCTGACAAGT
94
+ATCGCTCTAGTGTATACTCACTCCCCCCAGACCCAGATCACTTTGATGGG
95
+TATAAGCAACAGGTGGTCGCGGTCATGGATGATCTCTGCCAGAACCCGGA
96
+CGGAAAAGACATGTCCCTATTTTGTCAAATGGTTTCTACAGTAGATTTTG
97
+TACCACCCATGGCATCACTAGAGGAGAAAGGAGTGTCCTTCACCTCTAAG
98
+TTTGTCATTGCATCGACCAATGCTAGTAACATCATAGTCCCCACAGTTTC
99
+AGATTCAGATGCAATTCGCAGGCGATTCTATATGGACTGCGATATAGAAG
100
+TGACAGATTCTTACAAGACAGACCTCGGTCGGCTGGACGCAGGTAGAGCT
101
+GCCAAGCTTTGTACAGAAAATAACACTGCTAATTTTAAGAGATGCAGCCC
102
+ACTGGTGTGTGGTAAGGCTATTCAGCTGAGAGACAGGAAGTCCAAAGTGA
103
+GATATAGCGTCGACACCGTGGTATCGGAACTGATCAGAGAGTACAACAAT
104
+AGATCTGCTATTGGGAATACTATAGAAGCACTCTTTCAAGGACCCCCTAA
105
+ATTCAGGCCTATAAGAATTAGTCTCGAAGAAAAGCCAGCCCCAGATGCCA
106
+TTAGCGATCTCCTCGCTAGTGTAGATAGCGAGGAGGTGCGTCAGTACTGC
107
+AGGGAACAAGGCTGGATCATCCCTGAAACTCCCACCAATGTTGAGCGTCA
108
+CCTCAATAGAGCAGTATTGGTAATGCAGTCCATCGCCACTGTGGTTGCAG
109
+TTGTGTCTCTTGTTTATGTCATTTATAAGCTGTTTGCCGGGTTCCAGGGT
110
+GCTTACTCTGGAGCGCCCAAGCCCATTCTCAAGAAGCCCGTGTTAAGAAC
111
+AGCCACGGTCCAAGGGCCCAGCTTAGACTTCGCCTTGTCTCTTTTGAGGC
112
+GCAACATTAGACAAGCGCAAACTGACCAAGGACACTTCACCATGCTAGGA
113
+GTGCGAGATCGCCTAGCCATCCTGCCGCGCCACTCGCAACCAGGGAAGAC
114
+CATCTGGGTAGAGCATAAATTAATCAATGTACTAGATGCAGTTGAGTTGG
115
+TGGATGAGCAAGGTGTAAACTTGGAACTCACACTGGTAACTTTGGACACC
116
+AATGAAAAATTTAGGGATATCACCAAGTGTATCCCAGAAGTGATCACCGG
117
+GGCGAGTGACGCAACTCTAGTCATCAACACTGAGCACATTCCCTCAATGT
118
+TTGTGCCGGTGGGTGACGTTGTGCAGTACGGTTTCTTGAACCTTAGTGGT
119
+AAACCCACACACAGAACCATGATGTATAACTTCCCCACGAAGCCAGGACA
120
+GTGTGGGGGGGTGGTTACCTCAGTTGGTAAGATCATTGGAATCCACATTG
121
+GCGGGAATGGACGCCAGGCCTTTTGCGCTGGCCTAAAGAGGAGTTATTTT
122
+GCCAGCGAGCAAGGAGAGATCCAGTGGATGAAGCCTAACAGAGAAACCGG
123
+GAGGTTGAATATTAATGGTCCAACCCGAACTAAGCTGGAACCCAGTGTAT
124
+TCCATGATGTGTTCGAGGGCAACAAGGAACCAGCGGTCCTGACTAGTAAG
125
+GACCCCAGACTTGAGGTTGATTTTGAGCAAGCTTTGTTCTCCAAGTATGT
126
+GGGTAACACCCTGCATGAACCTGATGAGTACGTGACACAGGCTGCTCTCC
127
+ACTACGCAAATCAGCTGAAGCAACTGGACATCAACACCAGCAAGATGAGC
128
+ATGGAAGAAGCGTGCTATGGCACAGAATATTTAGAAGCTATAGACTTGCA
129
+CACCAGTGCTGGATACCCTTATAGTGCTTTGGGCATCAAGAAAAGAGACA
130
+TCCTCGACCCAGTTACCAGAGACACCTCCAGGATGAAGTTATATATGGAT
131
+AAGTATGGGTTGGACTTGCCTAATTCCACTTATGTAAAGGATGAGCTTAG
132
+TTCTCTAGATAAGATCAGAAAAGGGGAGTCTCGCCTGATTGAGGCTAGCA
133
+GCTTAAATGATCCTGTCTACCCTAGATTGACTTTTGGACACCTTTATGAA
134
+GTGTTTCACGCCAACCCAGGGACTGTAACAGGATCTGCAGTTGGGTGCAA
135
+CCCTGATGTATTTTGGAGCAAGTTACCAATTTTGTTACCGGGTTCACTCT
136
+TTGCATTTGACTACTCAGGATATGATGCAAGCCTTAGTCCTGTGTGGTTC
137
+AGAGCTCTAGAGTTGGTTCTGAGAGAGATCGGTTACTCGGAGGAGGCTGT
138
+GTCACTCATAGAAGGGATCAATCACACCCACCACGTGTACCGAAACAAGA
139
+CATATTGTGTACTTGGTGGAATGCCCTCAGGCTGCTCCGGTACTTCCATT
140
+TTCAATTCCATGATTAACAACATAATCATCAGAACCCTCCTGATTAAAAC
141
+ATTCAAAGGTATAGACTTAGATGAGCTGAAAATGGTAGCTTATGGAGATG
142
+ACGTGTTGGCCAGCTACCCGTTTCCTATTGATTGCTTGGAATGGGGTAAA
143
+ACAGGCAAAGAATATGGGCTGACTATGACTCCTGCTGATAAATCACCTTG
144
+TTTCAATGAGGTTACCTGGGAGAATGCAACCTTCTTAAAACGGGGTTTTC
145
+TACCGGACCATCAGTTCCCTTTTCTGATCCATCCCACTATGCCCATGAGG
146
+GAAATCCATGAGTCCATCCGCTGGACCAAGGACGCGCGCAATACTCAAGA
147
+TCATGTGCGCTCCCTTTGTCTCCTGGCATGGCATAATGGAAAAGAGGAGT
148
+ATGAGAAATTTGTGAGTACAATTAGATCAGTCCCCATTGGAAGGGCTTTA
149
+GCAATTCCAAATTTGGAGAACTTGAGAAGAAATTGGCTCGAGTTATTTTA
150
+AACTTACAGCTCAATGCTGAACCCCACCAGAAATCTGGTCGTGTCAAT
151
+GACTGGTGGGGGTAAATTTGTTATAACCAGAATAGC

File diff suppressed because it is too large
+ 2820 - 0
data/eva71_plus_perfect.fq


+ 0 - 0
tests/__init__.py View File


+ 9 - 0
tests/context.py View File

@@ -0,0 +1,9 @@
"""Import shim that makes the ``debruijn`` module importable from the tests."""
import os
import sys
# Path of debruijn package
# Disabled alternative: put the parent directory of ``tests`` on sys.path.
#sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__),
#                                                '..')))
# Path for core
# Active setup: the ``debruijn`` directory located beside ``tests`` is placed
# at the front of sys.path so that it is searched first by the import below.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                '../debruijn')))
# Must stay after the sys.path change above; the test modules re-import this
# name through ``from .context import debruijn``.
import debruijn

+ 14 - 0
tests/kmer.pck View File

@@ -0,0 +1,14 @@
1
+(dp0
2
+S'GAG'
3
+p1
4
+I1
5
+sS'CAG'
6
+p2
7
+I1
8
+sS'AGA'
9
+p3
10
+I2
11
+sS'TCA'
12
+p4
13
+I1
14
+s.

+ 4 - 0
tests/test_build.fq View File

@@ -0,0 +1,4 @@
1
+@EVA71_BrCr_U22521-1480
2
+TCAGAGA
3
++
4
+JJJJJJJ

+ 51 - 0
tests/test_construction_debruijn.py View File

@@ -0,0 +1,51 @@
1
+"""Tests for graph build"""
2
+import pytest
3
+import os
4
+import networkx as nx
5
+import pickle
6
+from .context import debruijn
7
+from debruijn import read_fastq
8
+from debruijn import cut_kmer
9
+from debruijn import build_kmer_dict
10
+from debruijn import build_graph
11
+
12
+
def test_read_fastq():
    """Check that read_fastq yields the sequence line of each FASTQ record.

    Reads ``test_two_reads.fq`` (stored next to this test module) and
    compares the first two yielded values with the two expected reads.
    """
    fastq_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "test_two_reads.fq"))
    expected_reads = (
        "TCAGAGCTCTAGAGTTGGTTCTGAGAGAGATCGGTTACTCGGAGGAGGCTGTGTCACTCATAGAAGGGATCAATCACACCCACCACGTGTACCGAAACAA",
        "TTTGAATTACAACATCCATATGTTCTTGATGCTGGAATTCCAATATCTCAGTTGACAGTGTGCCCTCACCAGTGGATCAATTTACGAACCAACAATTGTG",
    )
    reads = read_fastq(fastq_path)
    # Pull the reads one at a time, in file order.
    for expected in expected_reads:
        assert next(reads) == expected
18
+
19
+
def test_cut_kmer():
    """Check that cut_kmer yields the successive 3-mers of "TCAGA"."""
    kmers = cut_kmer("TCAGA", 3)
    # Only the first three yielded values are inspected, in order.
    for expected in ("TCA", "CAG", "AGA"):
        assert next(kmers) == expected
26
+
27
+
def test_build_kmer_dict():
    """Check the k-mer counts built from ``test_build.fq`` with k = 3.

    The single read in that fixture is TCAGAGA, so four distinct 3-mers are
    expected and "AGA" must be counted twice.
    """
    here = os.path.dirname(__file__)
    fastq_path = os.path.abspath(os.path.join(here, "test_build.fq"))
    kmer_dict = build_kmer_dict(fastq_path, 3)
    assert len(kmer_dict.keys()) == 4
    for kmer in ("TCA", "CAG", "AGA", "GAG"):
        assert kmer in kmer_dict
    assert kmer_dict["AGA"] == 2
36
+
def test_build_graph():
    """Check the graph that build_graph derives from the pickled k-mer counts.

    ``kmer.pck`` holds the 3-mer counts of the read TCAGAGA:
    ``{"GAG": 1, "CAG": 1, "AGA": 2, "TCA": 1}``. The resulting graph is
    expected to have four nodes, four edges, and a weight of 2 on the
    AG -> GA edge.
    """
    pickle_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "kmer.pck"))
    # The context manager closes the fixture even if unpickling raises.
    # NOTE: kmer.pck is a fixture shipped with the tests; pickle.load must
    # never be pointed at untrusted files.
    with open(pickle_path, 'rb') as handle:
        kmer_dict = pickle.load(handle)
    graph = build_graph(kmer_dict)
    # Read TCAGAGA, each 3-mer split into its 2-letter prefix and suffix:
    #   TCA: TC -> CA
    #   CAG: CA -> AG
    #   AGA: AG -> GA
    #   GAG: GA -> AG
    #   AGA: AG -> GA  (second occurrence)
    assert graph.number_of_nodes() == 4
    assert graph.number_of_edges() == 4
    assert "AG" in graph
    assert "GA" in graph
    assert graph.edges["AG", "GA"]['weight'] == 2

+ 8 - 0
tests/test_two_reads.fq View File

@@ -0,0 +1,8 @@
1
+@EVA71_BrCr_U22521-1480
2
+TCAGAGCTCTAGAGTTGGTTCTGAGAGAGATCGGTTACTCGGAGGAGGCTGTGTCACTCATAGAAGGGATCAATCACACCCACCACGTGTACCGAAACAA
3
++
4
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ
5
+@EVA71_BrCr_U22521-1478
6
+TTTGAATTACAACATCCATATGTTCTTGATGCTGGAATTCCAATATCTCAGTTGACAGTGTGCCCTCACCAGTGGATCAATTTACGAACCAACAATTGTG
7
++
8
+JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ