|
@@ -8,6 +8,8 @@ from networkx import algorithms
|
8
|
8
|
|
9
|
9
|
def read_fastq(fichier):
|
10
|
10
|
"""
|
|
11
|
+ Returns an iterator object that retrieves only the nucleic sequences of a
|
|
12
|
+ fastq file.
|
11
|
13
|
Arguments:
|
12
|
14
|
fichier, str: path to fastq file
|
13
|
15
|
|
|
@@ -22,6 +24,7 @@ def read_fastq(fichier):
|
22
|
24
|
|
23
|
25
|
def cut_kmer(seq, k):
|
24
|
26
|
"""
|
|
27
|
+ Returns an iterator that returns k-mers of k-size of a sequence
|
25
|
28
|
Arguments:
|
26
|
29
|
seq, str: a sequence
|
27
|
30
|
k, int: k-mer size, must be shorter than len(seq)
|
|
@@ -42,10 +45,10 @@ def build_kmer_dict(fichier, k):
|
42
|
45
|
hash_table, dict: dictionnary with key = k-mer as str
|
43
|
46
|
and value count of k-mer occurence
|
44
|
47
|
"""
|
45
|
|
- hash_table = {}
|
|
48
|
+ hash_table = {} # initialise empty hash table
|
46
|
49
|
it_fastq = read_fastq(fichier)
|
47
|
|
- for seq in it_fastq:
|
48
|
|
- it_kmer = cut_kmer(seq, k)
|
|
50
|
+ for seq in it_fastq: # for each sequence
|
|
51
|
+ it_kmer = cut_kmer(seq, k) # count each occurence of k-mer
|
49
|
52
|
for kmer in it_kmer:
|
50
|
53
|
try:
|
51
|
54
|
hash_table[kmer]
|
|
@@ -58,6 +61,8 @@ def build_kmer_dict(fichier, k):
|
58
|
61
|
|
59
|
62
|
def build_graph(hash_table):
|
60
|
63
|
"""
|
|
64
|
+ Returns a graph from a hash table
|
|
65
|
+
|
61
|
66
|
Arguments:
|
62
|
67
|
hash_table, dict: dictionnary obtained with build_kmer_dict() function
|
63
|
68
|
Return:
|
|
@@ -71,6 +76,8 @@ def build_graph(hash_table):
|
71
|
76
|
|
72
|
77
|
def get_starting_nodes(graph):
|
73
|
78
|
"""
|
|
79
|
+ Returns the list of starting nodes of a graph
|
|
80
|
+
|
74
|
81
|
Arguments:
|
75
|
82
|
graph, nx.DiGraph: de Bruijn tree
|
76
|
83
|
|
|
@@ -79,24 +86,27 @@ def get_starting_nodes(graph):
|
79
|
86
|
"""
|
80
|
87
|
starting_nodes = []
|
81
|
88
|
for node in graph:
|
82
|
|
- if graph.in_degree(node) == 0:
|
|
89
|
+ if graph.in_degree(node) == 0: # if count of input edge == 0
|
83
|
90
|
starting_nodes.append(node)
|
84
|
91
|
|
85
|
92
|
return starting_nodes
|
86
|
93
|
|
87
|
94
|
def std(values):
|
88
|
95
|
"""
|
|
96
|
+ Computes standard deviation from a list of value
|
89
|
97
|
Arguments:
|
90
|
98
|
values, list: list of values
|
91
|
99
|
|
92
|
100
|
Returns :
|
93
|
101
|
standard deviation of the 'values' data list
|
94
|
102
|
"""
|
95
|
|
- return stdev(float(values))
|
|
103
|
+ return statistics.stdev(float(values))
|
|
104
|
+
|
96
|
105
|
|
97
|
106
|
|
98
|
107
|
def get_sink_nodes(graph):
|
99
|
108
|
"""
|
|
109
|
+
|
100
|
110
|
Arguments:
|
101
|
111
|
graph, nx.DiGraph: de Bruijn tree
|
102
|
112
|
|
|
@@ -105,7 +115,7 @@ def get_sink_nodes(graph):
|
105
|
115
|
"""
|
106
|
116
|
sink_nodes = []
|
107
|
117
|
for node in graph:
|
108
|
|
- if graph.out_degree(node) == 0:
|
|
118
|
+ if graph.out_degree(node) == 0: # if count of output edge == 0
|
109
|
119
|
sink_nodes.append(node)
|
110
|
120
|
|
111
|
121
|
return sink_nodes
|
|
@@ -154,8 +164,27 @@ def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
|
154
|
164
|
return graph
|
155
|
165
|
|
156
|
166
|
|
157
|
|
-def select_best_path():
|
158
|
|
- pass
|
|
167
|
+def select_best_path(graph, paths, path_len, mean_weights,
|
|
168
|
+ delete_entry_node=False, delete_sink_node=False):
|
|
169
|
+ """
|
|
170
|
+
|
|
171
|
+ """
|
|
172
|
+ max_weight = max(mean_weights)
|
|
173
|
+ heaviest = [i for i, j in enumerate(mean_weights) if j == mean_weights]
|
|
174
|
+ if len(heaviest) > 1:
|
|
175
|
+ max_len = max(path_lengths)
|
|
176
|
+ longest = [i for i in heaviest if path_len[i] == max_len]
|
|
177
|
+ if len(longest) > 1:
|
|
178
|
+ Random.seed(9001)
|
|
179
|
+ best = random.choice[longest]
|
|
180
|
+ else:
|
|
181
|
+ best = longest[0]
|
|
182
|
+ else:
|
|
183
|
+ best = heaviest[0]
|
|
184
|
+ paths.pop(best)
|
|
185
|
+
|
|
186
|
+ return remove_paths(graph, paths, delete_entry_node, delete_sink_node)
|
|
187
|
+
|
159
|
188
|
|
160
|
189
|
def fill(text, width=80):
|
161
|
190
|
"""Split text with a line return to respect fasta format"""
|
|
@@ -173,7 +202,6 @@ def save_contigs(tuples, outname):
|
173
|
202
|
i += 1
|
174
|
203
|
outfile.write(">contig_{} len={}\n".format(i, duo[1]))
|
175
|
204
|
outfile.write("{}\n".format(fill(duo[0])))
|
176
|
|
-
|
177
|
205
|
return
|
178
|
206
|
|
179
|
207
|
|
|
@@ -192,9 +220,9 @@ def get_contigs(graph, starting_nodes, sink_nodes):
|
192
|
220
|
for sink_node in sink_nodes:
|
193
|
221
|
if algorithms.has_path(graph, starting_node, sink_node) == True:
|
194
|
222
|
path = algorithms.shortest_path(graph, starting_node, sink_node)
|
195
|
|
- contig = path[0]
|
|
223
|
+ contig = path[0] # base of the contig is seq of the first node
|
196
|
224
|
for i in range(len(path)-1):
|
197
|
|
- contig += path[i+1][-1]
|
|
225
|
+ contig += path[i+1][-1] # adds last char of node
|
198
|
226
|
contigs.append((contig, len(contig)))
|
199
|
227
|
|
200
|
228
|
return contigs
|