Explorar el Código

added select_best_path function

nicolas-zimmermann hace 5 años
padre
commit
c978b20705
Se han modificado 1 ficheros con 39 adiciones y 11 borrados
  1. 39 11
      debruijn/debruijn.py

+ 39 - 11
debruijn/debruijn.py Ver fichero

@@ -8,6 +8,8 @@ from networkx import algorithms
8 8
 
9 9
 def read_fastq(fichier):
10 10
     """
11
+    Returns an iterator object that retrieves only the nucleic sequences of a
12
+    fastq file.
11 13
     Arguments:
12 14
         fichier, str: path to fastq file
13 15
 
@@ -22,6 +24,7 @@ def read_fastq(fichier):
22 24
 
23 25
 def cut_kmer(seq, k):
24 26
     """
27
+    Returns an iterator that returns k-mers of k-size of a sequence
25 28
     Arguments:
26 29
         seq, str: a sequence
27 30
         k, int: k-mer size, must be shorter than len(seq)
@@ -42,10 +45,10 @@ def build_kmer_dict(fichier, k):
42 45
         hash_table, dict: dictionnary with key = k-mer as str
43 46
                           and value count of k-mer occurence
44 47
     """
45
-    hash_table = {}
48
+    hash_table = {} # initialise empty hash table
46 49
     it_fastq = read_fastq(fichier)
47
-    for seq in it_fastq:
48
-        it_kmer = cut_kmer(seq, k)
50
+    for seq in it_fastq: # for each sequence
51
+        it_kmer = cut_kmer(seq, k) # count each occurence of k-mer
49 52
         for kmer in it_kmer:
50 53
             try:
51 54
                 hash_table[kmer]
@@ -58,6 +61,8 @@ def build_kmer_dict(fichier, k):
58 61
 
59 62
 def build_graph(hash_table):
60 63
     """
64
+    Returns a graph from a hash table
65
+
61 66
     Arguments:
62 67
         hash_table, dict: dictionnary obtained with build_kmer_dict() function
63 68
     Return:
@@ -71,6 +76,8 @@ def build_graph(hash_table):
71 76
 
72 77
 def get_starting_nodes(graph):
73 78
     """
79
+    Returns the list of starting nodes of a graph
80
+
74 81
     Arguments:
75 82
         graph, nx.DiGraph: de Bruijn tree
76 83
 
@@ -79,24 +86,27 @@ def get_starting_nodes(graph):
79 86
     """
80 87
     starting_nodes = []
81 88
     for node in graph:
82
-        if graph.in_degree(node) == 0:
89
+        if graph.in_degree(node) == 0: # if count of input edge == 0
83 90
             starting_nodes.append(node)
84 91
 
85 92
     return starting_nodes
86 93
 
87 94
 def std(values):
88 95
     """
96
+    Computes standard deviation from a list of value
89 97
     Arguments:
90 98
         values, list: list of values
91 99
 
92 100
     Returns :
93 101
         standard deviation of the 'values' data list
94 102
     """
95
-    return stdev(float(values))
103
+    return statistics.stdev(float(values))
104
+
96 105
 
97 106
 
98 107
 def get_sink_nodes(graph):
99 108
     """
109
+
100 110
     Arguments:
101 111
         graph, nx.DiGraph: de Bruijn tree
102 112
 
@@ -105,7 +115,7 @@ def get_sink_nodes(graph):
105 115
     """
106 116
     sink_nodes = []
107 117
     for node in graph:
108
-        if graph.out_degree(node) == 0:
118
+        if graph.out_degree(node) == 0: # if count of output edge == 0
109 119
             sink_nodes.append(node)
110 120
 
111 121
     return sink_nodes
@@ -154,8 +164,27 @@ def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
154 164
     return graph
155 165
 
156 166
 
157
-def select_best_path():
158
-    pass
167
+def select_best_path(graph, paths, path_len, mean_weights,
168
+                     delete_entry_node=False, delete_sink_node=False):
169
+    """
170
+    
171
+    """
172
+    max_weight = max(mean_weights)
173
+    heaviest = [i for i, j in enumerate(mean_weights) if j == mean_weights]
174
+    if len(heaviest) > 1:
175
+        max_len = max(path_lengths)
176
+        longest = [i for i in heaviest if path_len[i] == max_len]
177
+        if len(longest) > 1:
178
+            Random.seed(9001)
179
+            best = random.choice[longest]
180
+        else:
181
+            best = longest[0]
182
+    else:
183
+        best = heaviest[0]
184
+    paths.pop(best)
185
+
186
+    return remove_paths(graph, paths, delete_entry_node, delete_sink_node)
187
+
159 188
 
160 189
 def fill(text, width=80):
161 190
     """Split text with a line return to respect fasta format"""
@@ -173,7 +202,6 @@ def save_contigs(tuples, outname):
173 202
             i += 1
174 203
             outfile.write(">contig_{} len={}\n".format(i, duo[1]))
175 204
             outfile.write("{}\n".format(fill(duo[0])))
176
-
177 205
     return
178 206
 
179 207
 
@@ -192,9 +220,9 @@ def get_contigs(graph, starting_nodes, sink_nodes):
192 220
         for sink_node in sink_nodes:
193 221
             if algorithms.has_path(graph, starting_node, sink_node) == True:
194 222
                 path = algorithms.shortest_path(graph, starting_node, sink_node)
195
-                contig = path[0]
223
+                contig = path[0] # base of the contig is seq of the first node
196 224
                 for i in range(len(path)-1):
197
-                    contig += path[i+1][-1]
225
+                    contig += path[i+1][-1] # adds last char of node
198 226
                 contigs.append((contig, len(contig)))
199 227
     
200 228
     return contigs