Browse Source

added select_best_path function

nicolas-zimmermann 5 years ago
parent
commit
c978b20705
1 changed files with 39 additions and 11 deletions
  1. 39 11
      debruijn/debruijn.py

+ 39 - 11
debruijn/debruijn.py View File

8
 
8
 
9
 def read_fastq(fichier):
9
 def read_fastq(fichier):
10
     """
10
     """
11
+    Returns an iterator object that retrieves only the nucleic sequences of a
12
+    fastq file.
11
     Arguments:
13
     Arguments:
12
         fichier, str: path to fastq file
14
         fichier, str: path to fastq file
13
 
15
 
22
 
24
 
23
 def cut_kmer(seq, k):
25
 def cut_kmer(seq, k):
24
     """
26
     """
27
+    Returns an iterator that returns k-mers of k-size of a sequence
25
     Arguments:
28
     Arguments:
26
         seq, str: a sequence
29
         seq, str: a sequence
27
         k, int: k-mer size, must be shorter than len(seq)
30
         k, int: k-mer size, must be shorter than len(seq)
42
         hash_table, dict: dictionnary with key = k-mer as str
45
         hash_table, dict: dictionnary with key = k-mer as str
43
                           and value count of k-mer occurence
46
                           and value count of k-mer occurence
44
     """
47
     """
45
-    hash_table = {}
48
+    hash_table = {} # initialise empty hash table
46
     it_fastq = read_fastq(fichier)
49
     it_fastq = read_fastq(fichier)
47
-    for seq in it_fastq:
48
-        it_kmer = cut_kmer(seq, k)
50
+    for seq in it_fastq: # for each sequence
51
+        it_kmer = cut_kmer(seq, k) # count each occurence of k-mer
49
         for kmer in it_kmer:
52
         for kmer in it_kmer:
50
             try:
53
             try:
51
                 hash_table[kmer]
54
                 hash_table[kmer]
58
 
61
 
59
 def build_graph(hash_table):
62
 def build_graph(hash_table):
60
     """
63
     """
64
+    Returns a graph from a hash table
65
+
61
     Arguments:
66
     Arguments:
62
         hash_table, dict: dictionnary obtained with build_kmer_dict() function
67
         hash_table, dict: dictionnary obtained with build_kmer_dict() function
63
     Return:
68
     Return:
71
 
76
 
72
 def get_starting_nodes(graph):
77
 def get_starting_nodes(graph):
73
     """
78
     """
79
+    Returns the list of starting nodes of a graph
80
+
74
     Arguments:
81
     Arguments:
75
         graph, nx.DiGraph: de Bruijn tree
82
         graph, nx.DiGraph: de Bruijn tree
76
 
83
 
79
     """
86
     """
80
     starting_nodes = []
87
     starting_nodes = []
81
     for node in graph:
88
     for node in graph:
82
-        if graph.in_degree(node) == 0:
89
+        if graph.in_degree(node) == 0: # if count of input edge == 0
83
             starting_nodes.append(node)
90
             starting_nodes.append(node)
84
 
91
 
85
     return starting_nodes
92
     return starting_nodes
86
 
93
 
87
 def std(values):
94
 def std(values):
88
     """
95
     """
96
+    Computes standard deviation from a list of value
89
     Arguments:
97
     Arguments:
90
         values, list: list of values
98
         values, list: list of values
91
 
99
 
92
     Returns :
100
     Returns :
93
         standard deviation of the 'values' data list
101
         standard deviation of the 'values' data list
94
     """
102
     """
95
-    return stdev(float(values))
103
+    return statistics.stdev(float(values))
104
+
96
 
105
 
97
 
106
 
98
 def get_sink_nodes(graph):
107
 def get_sink_nodes(graph):
99
     """
108
     """
109
+
100
     Arguments:
110
     Arguments:
101
         graph, nx.DiGraph: de Bruijn tree
111
         graph, nx.DiGraph: de Bruijn tree
102
 
112
 
105
     """
115
     """
106
     sink_nodes = []
116
     sink_nodes = []
107
     for node in graph:
117
     for node in graph:
108
-        if graph.out_degree(node) == 0:
118
+        if graph.out_degree(node) == 0: # if count of output edge == 0
109
             sink_nodes.append(node)
119
             sink_nodes.append(node)
110
 
120
 
111
     return sink_nodes
121
     return sink_nodes
154
     return graph
164
     return graph
155
 
165
 
156
 
166
 
157
-def select_best_path():
158
-    pass
167
+def select_best_path(graph, paths, path_len, mean_weights,
168
+                     delete_entry_node=False, delete_sink_node=False):
169
+    """
170
+    
171
+    """
172
+    max_weight = max(mean_weights)
173
+    heaviest = [i for i, j in enumerate(mean_weights) if j == mean_weights]
174
+    if len(heaviest) > 1:
175
+        max_len = max(path_lengths)
176
+        longest = [i for i in heaviest if path_len[i] == max_len]
177
+        if len(longest) > 1:
178
+            Random.seed(9001)
179
+            best = random.choice[longest]
180
+        else:
181
+            best = longest[0]
182
+    else:
183
+        best = heaviest[0]
184
+    paths.pop(best)
185
+
186
+    return remove_paths(graph, paths, delete_entry_node, delete_sink_node)
187
+
159
 
188
 
160
 def fill(text, width=80):
189
 def fill(text, width=80):
161
     """Split text with a line return to respect fasta format"""
190
     """Split text with a line return to respect fasta format"""
173
             i += 1
202
             i += 1
174
             outfile.write(">contig_{} len={}\n".format(i, duo[1]))
203
             outfile.write(">contig_{} len={}\n".format(i, duo[1]))
175
             outfile.write("{}\n".format(fill(duo[0])))
204
             outfile.write("{}\n".format(fill(duo[0])))
176
-
177
     return
205
     return
178
 
206
 
179
 
207
 
192
         for sink_node in sink_nodes:
220
         for sink_node in sink_nodes:
193
             if algorithms.has_path(graph, starting_node, sink_node) == True:
221
             if algorithms.has_path(graph, starting_node, sink_node) == True:
194
                 path = algorithms.shortest_path(graph, starting_node, sink_node)
222
                 path = algorithms.shortest_path(graph, starting_node, sink_node)
195
-                contig = path[0]
223
+                contig = path[0] # base of the contig is seq of the first node
196
                 for i in range(len(path)-1):
224
                 for i in range(len(path)-1):
197
-                    contig += path[i+1][-1]
225
+                    contig += path[i+1][-1] # adds last char of node
198
                 contigs.append((contig, len(contig)))
226
                 contigs.append((contig, len(contig)))
199
     
227
     
200
     return contigs
228
     return contigs