5 anos atrás · c978b20705
--- a/debruijn/debruijn.py
+++ b/debruijn/debruijn.py
@@ -8,6 +8,8 @@ from networkx import algorithms
 
				 
			
 
				 def read_fastq(fichier):
			
 
				     """
			
 
				+    Returns an iterator object that retrieves only the nucleic sequences of a
			
 
				+    fastq file.
			
 
				     Arguments:
			
 
				         fichier, str: path to fastq file
			
 
				 
			
@@ -22,6 +24,7 @@ def read_fastq(fichier):
 
				 
			
 
				 def cut_kmer(seq, k):
			
 
				     """
			
 
				+    Returns an iterator that returns k-mers of k-size of a sequence
			
 
				     Arguments:
			
 
				         seq, str: a sequence
			
 
				         k, int: k-mer size, must be shorter than len(seq)
			
@@ -42,10 +45,10 @@ def build_kmer_dict(fichier, k):
 
				         hash_table, dict: dictionary with key = k-mer as str
			
 
				                           and value count of k-mer occurrence
			
 
				     """
			
 
				-    hash_table = {}
			
 
				+    hash_table = {} # initialise empty hash table
			
 
				     it_fastq = read_fastq(fichier)
			
 
				-    for seq in it_fastq:
			
 
				-        it_kmer = cut_kmer(seq, k)
			
 
				+    for seq in it_fastq: # for each sequence
			
 
				+        it_kmer = cut_kmer(seq, k) # count each occurence of k-mer
			
 
				         for kmer in it_kmer:
			
 
				             try:
			
 
				                 hash_table[kmer]
			
@@ -58,6 +61,8 @@ def build_kmer_dict(fichier, k):
 
				 
			
 
				 def build_graph(hash_table):
			
 
				     """
			
 
				+    Returns a graph from a hash table
			
 
				+
			
 
				     Arguments:
			
 
				         hash_table, dict: dictionary obtained with build_kmer_dict() function
			
 
				     Return:
			
@@ -71,6 +76,8 @@ def build_graph(hash_table):
 
				 
			
 
				 def get_starting_nodes(graph):
			
 
				     """
			
 
				+    Returns the list of starting nodes of a graph
			
 
				+
			
 
				     Arguments:
			
 
				         graph, nx.DiGraph: de Bruijn tree
			
 
				 
			
@@ -79,24 +86,27 @@ def get_starting_nodes(graph):
 
				     """
			
 
				     starting_nodes = []
			
 
				     for node in graph:
			
 
				-        if graph.in_degree(node) == 0:
			
 
				+        if graph.in_degree(node) == 0: # if count of input edge == 0
			
 
				             starting_nodes.append(node)
			
 
				 
			
 
				     return starting_nodes
			
 
				 
			
 
				 def std(values):
				     """
				     Computes the sample standard deviation of a list of values.

				     Arguments:
				         values, list: list of numeric values (or values convertible to
				                       float); at least two values are required

				     Returns :
				         standard deviation of the 'values' data list, as a float

				     Raises:
				         statistics.StatisticsError: if fewer than two values are given
				     """
				     # Local import: the file's import block is not visible from here.
				     import statistics

				     # float() must be applied to each element; calling it on the list
				     # itself raises TypeError.
				     return statistics.stdev([float(value) for value in values])
			
 
				+
			
 
				 
			
 
				 
			
 
				 def get_sink_nodes(graph):
			
 
				     """
			
 
				+
			
 
				     Arguments:
			
 
				         graph, nx.DiGraph: de Bruijn tree
			
 
				 
			
@@ -105,7 +115,7 @@ def get_sink_nodes(graph):
 
				     """
			
 
				     sink_nodes = []
			
 
				     for node in graph:
			
 
				-        if graph.out_degree(node) == 0:
			
 
				+        if graph.out_degree(node) == 0: # if count of output edge == 0
			
 
				             sink_nodes.append(node)
			
 
				 
			
 
				     return sink_nodes
			
@@ -154,8 +164,27 @@ def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
 
				     return graph
			
 
				 
			
 
				 
			
 
				-def select_best_path():
			
 
				-    pass
			
 
				def select_best_path(graph, paths, path_len, mean_weights,
				                     delete_entry_node=False, delete_sink_node=False):
				    """
				    Keeps the best path among candidate paths and removes the others.

				    The best path is the one with the highest mean weight. Ties are broken
				    by the greatest length, and remaining ties by a seeded random choice.

				    Arguments:
				        graph, nx.DiGraph: graph the paths belong to
				        paths, list: candidate paths
				        path_len, list: length of each path (same order as 'paths')
				        mean_weights, list: mean weight of each path (same order as 'paths')
				        delete_entry_node, bool: forwarded to remove_paths()
				        delete_sink_node, bool: forwarded to remove_paths()

				    Returns:
				        the value returned by remove_paths() called on every path except
				        the selected one; 'graph' unchanged if 'paths' is empty
				    """
				    # Local import: the file's import block is not visible from here.
				    import random

				    if not paths:
				        return graph

				    # Indices of the paths sharing the highest mean weight.
				    max_weight = max(mean_weights)
				    heaviest = [i for i, weight in enumerate(mean_weights)
				                if weight == max_weight]

				    if len(heaviest) > 1:
				        # Compare lengths only among the heaviest paths, otherwise a longer
				        # but lighter path could leave no candidate at all.
				        max_len = max(path_len[i] for i in heaviest)
				        longest = [i for i in heaviest if path_len[i] == max_len]
				        if len(longest) > 1:
				            # Fixed seed keeps the tie-break reproducible between runs.
				            random.seed(9001)
				            best = random.choice(longest)
				        else:
				            best = longest[0]
				    else:
				        best = heaviest[0]

				    # Build the list of discarded paths without mutating the caller's list.
				    discarded = [path for i, path in enumerate(paths) if i != best]

				    return remove_paths(graph, discarded, delete_entry_node, delete_sink_node)
			
 
				+
			
 
				 
			
 
				 def fill(text, width=80):
			
 
				     """Split text with a line return to respect fasta format"""
			
@@ -173,7 +202,6 @@ def save_contigs(tuples, outname):
 
				             i += 1
			
 
				             outfile.write(">contig_{} len={}\n".format(i, duo[1]))
			
 
				             outfile.write("{}\n".format(fill(duo[0])))
			
 
				-
			
 
				     return
			
 
				 
			
 
				 
			
@@ -192,9 +220,9 @@ def get_contigs(graph, starting_nodes, sink_nodes):
 
				         for sink_node in sink_nodes:
			
 
				             if algorithms.has_path(graph, starting_node, sink_node) == True:
			
 
				                 path = algorithms.shortest_path(graph, starting_node, sink_node)
			
 
				-                contig = path[0]
			
 
				+                contig = path[0] # base of the contig is seq of the first node
			
 
				                 for i in range(len(path)-1):
			
 
				-                    contig += path[i+1][-1]
			
 
				+                    contig += path[i+1][-1] # adds last char of node
			
 
				                 contigs.append((contig, len(contig)))
			
 
				     
			
 
				     return contigs