Browse Source

last version still buggy

nicolas-zimmermann 4 years ago
parent
commit
e6337ad43e
1 changed files with 68 additions and 15 deletions
  1. 68 15
      debruijn/debruijn.py

+ 68 - 15
debruijn/debruijn.py View File

@@ -45,7 +45,7 @@ def build_kmer_dict(fichier, k):
45 45
         hash_table, dict: dictionary with key = k-mer as str
46 46
                           and value = count of k-mer occurrences
47 47
     """
48
-    hash_table = {} # initialise empty hash table
48
+    hash_table = {}# initialise empty hash table
49 49
     it_fastq = read_fastq(fichier)
50 50
     for seq in it_fastq: # for each sequence
51 51
         it_kmer = cut_kmer(seq, k) # count each occurrence of k-mer
@@ -132,11 +132,11 @@ def path_average_weight(graph, path):
132 132
     """
133 133
     weight = 0
134 134
     for i in range(len(path)-1):
135
-        weight += graph[path[i][i+i][weight]
136
-    
137
-    return weight/(len(path)-1)
135
+        weight += graph.edges[path[i], path[i+1]]["weight"]
138 136
 
137
+    weight = weight/(len(path) - 1)
139 138
 
139
+    return weight
140 140
 
141 141
 def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
142 142
     """
@@ -164,16 +164,30 @@ def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
164 164
     return graph
165 165
 
166 166
 
167
-def select_best_path(graph, paths, path_len, mean_weights,
167
+def select_best_path(graph, paths, path_lens, mean_weights,
168 168
                      delete_entry_node=False, delete_sink_node=False):
169 169
     """
170
-    
170
+    Given a list of paths with their lengths and mean weights, keep only the
171
+    best one, choosing by priority: mean weight, then length, then at random
172
+
173
+    Arguments:
174
+        graph, nx.DiGraph: a de bruijn graph
175
+        paths, list of str: list of paths
176
+        path_lens: lengths of the paths
177
+        mean_weights: mean weights of the paths
178
+        delete_entry_node, boolean: whether the entry node
179
+                                    should be deleted
180
+        delete_sink_node, boolean: whether the sink node
181
+                                   should be deleted
182
+
183
+    Returns:
184
+        graph, nx.DiGraph: graph with deleted paths
171 185
     """
172 186
     max_weight = max(mean_weights)
173
-    heaviest = [i for i, j in enumerate(mean_weights) if j == mean_weights]
187
+    heaviest = [i for i, j in enumerate(mean_weights) if j == max_weight]
174 188
     if len(heaviest) > 1:
175
-        max_len = max(path_lengths)
176
-        longest = [i for i in heaviest if path_len[i] == max_len]
189
+        max_len = max(path_lens)
190
+        longest = [i for i in heaviest if path_lens[i] == max_len]
177 191
         if len(longest) > 1:
178 192
             Random.seed(9001)
179 193
             best = random.choice[longest]
@@ -181,9 +195,14 @@ def select_best_path(graph, paths, path_len, mean_weights,
181 195
             best = longest[0]
182 196
     else:
183 197
         best = heaviest[0]
184
-    paths.pop(best)
198
+    
199
+    for p in paths:
200
+        print(p)
185 201
 
186
-    return remove_paths(graph, paths, delete_entry_node, delete_sink_node)
202
+    paths2 = [p for p in paths]
203
+    paths2.pop(best)
204
+
205
+    return remove_paths(graph, paths2, delete_entry_node, delete_sink_node)
187 206
 
188 207
 
189 208
 def fill(text, width=80):
@@ -227,13 +246,47 @@ def get_contigs(graph, starting_nodes, sink_nodes):
227 246
     
228 247
     return contigs
229 248
 
230
-def solve_bubble():
231
-    pass
249
+def solve_bubble(graph, ancestor_node, descendent_node):
250
+    """
251
+    Solve a bubble by keeping only the best path between two nodes
232 252
 
253
+    Arguments:
254
+        graph, nx.DiGraph: a de bruijn graph
255
+        ancestor_node, str: node where the bubble's paths start
255
+        descendent_node, str: node where the bubble's paths end
233 257
 
234
-def simplify_bubbles():
235
-    pass
258
+    Returns:
259
+        graph, nx.DiGraph: the same graph without the bubble
260
+    """
261
+    paths = algorithms.all_simple_paths(graph, ancestor_node, descendent_node)
262
+
263
+    weights = []
264
+    path_lens = []
265
+    for path in paths:# constituting weights and length lists
266
+        weights.append(path_average_weight(graph, path))
267
+        path_lens.append(len(path))
268
+
269
+    return select_best_path(graph, paths, weights, path_lens) # keep best path
270
+
271
+
272
+def simplify_bubbles(graph):
273
+    """
274
+    Returns a bubble-less graph 
275
+
276
+    Arguments:
277
+        graph, nx.DiGraph: a de bruijn graph
236 278
 
279
+    Returns:
280
+        graph, nx.DiGraph: a bubble-less de bruijn graph
281
+    """
282
+    fork_nodes = []# empty list containing nodes with 2 or more ancestors
283
+    for node in graph:
284
+        while graph.in_degree(node) >= 2: # if 2 or more ancestor add node
285
+            pred = [n for n in graph.predecessors(node)] 
286
+            ancestor = algorithms.lowest_common_ancestor(graph,pred[0], pred[1])
287
+            graph = solve_bubble(graph, ancestor, node)
288
+
289
+    return graph
237 290
 
238 291
 def solve_entry_tips():
239 292
     pass