|
@@ -45,7 +45,7 @@ def build_kmer_dict(fichier, k):
|
45
|
45
|
hash_table, dict: dictionary with key = k-mer as str
|
46
|
46
|
and value = count of k-mer occurrences
|
47
|
47
|
"""
|
48
|
|
- hash_table = {} # initialise empty hash table
|
|
48
|
+ hash_table = {}# initialise empty hash table
|
49
|
49
|
it_fastq = read_fastq(fichier)
|
50
|
50
|
for seq in it_fastq: # for each sequence
|
51
|
51
|
it_kmer = cut_kmer(seq, k) # count each occurence of k-mer
|
|
@@ -132,11 +132,11 @@ def path_average_weight(graph, path):
|
132
|
132
|
"""
|
133
|
133
|
weight = 0
|
134
|
134
|
for i in range(len(path)-1):
|
135
|
|
- weight += graph[path[i][i+i][weight]
|
136
|
|
-
|
137
|
|
- return weight/(len(path)-1)
|
|
135
|
+ weight += graph.edges[path[i], path[i+1]]["weight"]
|
138
|
136
|
|
|
137
|
+ weight = weight/(len(path) - 1)
|
139
|
138
|
|
|
139
|
+ return weight
|
140
|
140
|
|
141
|
141
|
def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
|
142
|
142
|
"""
|
|
@@ -164,16 +164,30 @@ def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
|
164
|
164
|
return graph
|
165
|
165
|
|
166
|
166
|
|
167
|
|
-def select_best_path(graph, paths, path_len, mean_weights,
|
|
167
|
+def select_best_path(graph, paths, path_lens, mean_weights,
|
168
|
168
|
delete_entry_node=False, delete_sink_node=False):
|
169
|
169
|
"""
|
170
|
|
-
|
|
170
|
+ Given path list, their length and weight, keeps only the best path among
|
|
171
|
+ them considering the following priority : weight, length and randomly
|
|
172
|
+
|
|
173
|
+ Arguments:
|
|
174
|
+ graph, nx.DiGraph: a de bruijn graph
|
|
175
|
+ paths, list of str: list of paths
|
|
176
|
+ path_lens: lengths of the paths
|
|
177
|
+ mean_weights: mean weights of the paths
|
|
178
|
+ delete_entry_node, boolean: whether the entry node
|
|
179
|
+ should be deleted
|
|
180
|
+ delete_sink_node, boolean: whether the sink node
|
|
181
|
+ should be deleted
|
|
182
|
+
|
|
183
|
+ Returns:
|
|
184
|
+ graph, nx.DiGraph: graph with deleted paths
|
171
|
185
|
"""
|
172
|
186
|
max_weight = max(mean_weights)
|
173
|
|
- heaviest = [i for i, j in enumerate(mean_weights) if j == mean_weights]
|
|
187
|
+ heaviest = [i for i, j in enumerate(mean_weights) if j == max_weight]
|
174
|
188
|
if len(heaviest) > 1:
|
175
|
|
- max_len = max(path_lengths)
|
176
|
|
- longest = [i for i in heaviest if path_len[i] == max_len]
|
|
189
|
+ max_len = max(path_lens)
|
|
190
|
+ longest = [i for i in heaviest if path_lens[i] == max_len]
|
177
|
191
|
if len(longest) > 1:
|
178
|
192
|
Random.seed(9001)
|
179
|
193
|
best = random.choice[longest]
|
|
@@ -181,9 +195,14 @@ def select_best_path(graph, paths, path_len, mean_weights,
|
181
|
195
|
best = longest[0]
|
182
|
196
|
else:
|
183
|
197
|
best = heaviest[0]
|
184
|
|
- paths.pop(best)
|
|
198
|
+
|
|
199
|
+ for p in paths:
|
|
200
|
+ print(p)
|
185
|
201
|
|
186
|
|
- return remove_paths(graph, paths, delete_entry_node, delete_sink_node)
|
|
202
|
+ paths2 = [p for p in paths]
|
|
203
|
+ paths2.pop(best)
|
|
204
|
+
|
|
205
|
+ return remove_paths(graph, paths2, delete_entry_node, delete_sink_node)
|
187
|
206
|
|
188
|
207
|
|
189
|
208
|
def fill(text, width=80):
|
|
@@ -227,13 +246,47 @@ def get_contigs(graph, starting_nodes, sink_nodes):
|
227
|
246
|
|
228
|
247
|
return contigs
|
229
|
248
|
|
230
|
|
-def solve_bubble():
|
231
|
|
- pass
|
|
249
|
def solve_bubble(graph, ancestor_node, descendent_node):
    """
    Solve the bubble delimited by two nodes, keeping a single path.

    Every simple path going from ancestor_node to descendent_node is scored
    with its average edge weight (path_average_weight) and its length in
    nodes; choosing the path to keep and removing the others is delegated
    to select_best_path.

    Arguments:
        graph, nx.DiGraph: a de bruijn graph
        ancestor_node, str: node where the bubble opens
        descendent_node, str: node where the bubble closes

    Returns:
        graph, nx.DiGraph: the graph returned by select_best_path, or the
                           input graph untouched when fewer than two paths
                           link the two nodes (nothing to choose from)
    """
    # all_simple_paths returns a generator: materialise it once, otherwise
    # the scoring below would consume it and select_best_path would be
    # handed an exhausted iterator.
    # NOTE(review): `algorithms` is presumably networkx.algorithms - confirm
    # against the imports of the file.
    paths = list(
        algorithms.all_simple_paths(graph, ancestor_node, descendent_node))

    # With zero or one path there is no alternative to remove.
    if len(paths) < 2:
        return graph

    weights = [path_average_weight(graph, path) for path in paths]
    path_lens = [len(path) for path in paths]

    # select_best_path takes the lengths before the mean weights.
    return select_best_path(graph, paths, path_lens, weights)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
def simplify_bubbles(graph):
    """
    Return the graph once its bubbles have been solved.

    Every node having two or more predecessors is treated as the closing
    node of a bubble: the lowest common ancestor of its first two
    predecessors is taken as the opening node and the pair is handed to
    solve_bubble. This is repeated on the node until it has fewer than two
    predecessors or until no further progress can be made.

    Arguments:
        graph, nx.DiGraph: a de bruijn graph

    Returns:
        graph, nx.DiGraph: the graph after the detected bubbles were solved
    """
    # Iterate over a snapshot of the nodes: if solve_bubble deletes nodes in
    # place, iterating over the graph itself would fail with
    # "dictionary changed size during iteration".
    for node in list(graph):
        # `node in graph` skips nodes deleted while solving a previous bubble.
        while node in graph and graph.in_degree(node) >= 2:
            pred = list(graph.predecessors(node))
            # NOTE(review): lowest_common_ancestor is documented for directed
            # acyclic graphs only - confirm the graph has no cycle here.
            ancestor = algorithms.lowest_common_ancestor(
                graph, pred[0], pred[1])
            if ancestor is None:
                # No common ancestor: this is not a bubble we can solve.
                break

            size_before = (graph.number_of_nodes(), graph.number_of_edges())
            graph = solve_bubble(graph, ancestor, node)
            size_after = (graph.number_of_nodes(), graph.number_of_edges())
            if size_after == size_before:
                # Nothing was removed, so the next iteration would be
                # identical: stop instead of looping forever.
                break

    return graph
|
237
|
290
|
|
238
|
291
|
def solve_entry_tips():
|
239
|
292
|
pass
|