tp assemblage cours amine ghozlane

debruijn.py 8.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. """
  2. Small assembly module based on de bruijn graphs
  3. """
  4. import os
  5. import statistics
  6. import networkx as nx
  7. from networkx import algorithms
  8. def read_fastq(fichier):
  9. """
  10. Returns an iterator object that retrieves only the nucleic sequences of a
  11. fastq file.
  12. Arguments:
  13. fichier, str: path to fastq file
  14. Returns:
  15. a str generator, generator of sequences
  16. """
  17. with open(fichier, 'r') as filin:
  18. for line in filin:
  19. yield filin.readline().strip()
  20. filin.readline()
  21. filin.readline()
  22. def cut_kmer(seq, k):
  23. """
  24. Returns an iterator that returns k-mers of k-size of a sequence
  25. Arguments:
  26. seq, str: a sequence
  27. k, int: k-mer size, must be shorter than len(seq)
  28. Returns:
  29. an iterator returning str
  30. """
  31. for i in range(len(seq)-(k-1)):
  32. yield seq[i:i+k]
  33. def build_kmer_dict(fichier, k):
  34. """
  35. Arguments:
  36. fichier, str: path to fastq file
  37. k, int: k-mer size, must be shorter than len(seq)
  38. Return:
  39. hash_table, dict: dictionnary with key = k-mer as str
  40. and value count of k-mer occurence
  41. """
  42. hash_table = {}# initialise empty hash table
  43. it_fastq = read_fastq(fichier)
  44. for seq in it_fastq: # for each sequence
  45. it_kmer = cut_kmer(seq, k) # count each occurence of k-mer
  46. for kmer in it_kmer:
  47. try:
  48. hash_table[kmer]
  49. except KeyError:
  50. hash_table[kmer] = 1
  51. else:
  52. hash_table[kmer] += 1
  53. return hash_table
  54. def build_graph(hash_table):
  55. """
  56. Returns a graph from a hash table
  57. Arguments:
  58. hash_table, dict: dictionnary obtained with build_kmer_dict() function
  59. Return:
  60. graph, nx.DiGraph: the de Bruijn tree corresponding to hash_table
  61. """
  62. graph = nx.DiGraph()
  63. for key in hash_table:
  64. graph.add_edge(key[:-1], key[1:], weight=hash_table[key])
  65. return graph
  66. def get_starting_nodes(graph):
  67. """
  68. Returns the list of starting nodes of a graph
  69. Arguments:
  70. graph, nx.DiGraph: de Bruijn tree
  71. Return:
  72. starting_nodes, list of strings: list of starting nodes
  73. """
  74. starting_nodes = []
  75. for node in graph:
  76. if graph.in_degree(node) == 0: # if count of input edge == 0
  77. starting_nodes.append(node)
  78. return starting_nodes
  79. def std(values):
  80. """
  81. Computes standard deviation from a list of value
  82. Arguments:
  83. values, list: list of values
  84. Returns :
  85. standard deviation of the 'values' data list
  86. """
  87. return statistics.stdev(float(values))
  88. def get_sink_nodes(graph):
  89. """
  90. Arguments:
  91. graph, nx.DiGraph: de Bruijn tree
  92. Return:
  93. sink_nodes, list of strings: list of terminal nodes
  94. """
  95. sink_nodes = []
  96. for node in graph:
  97. if graph.out_degree(node) == 0: # if count of output edge == 0
  98. sink_nodes.append(node)
  99. return sink_nodes
  100. def path_average_weight(graph, path):
  101. """
  102. Arguments:
  103. graph, nx.DiGraph: a de bruijn graph
  104. path, list of str(nodes): list of nodes constituing a path
  105. Return:
  106. mean weight, float: the mean weight of the path
  107. """
  108. weight = 0
  109. for i in range(len(path)-1):
  110. weight += graph.edges[path[i], path[i+1]]["weight"]
  111. weight = weight/(len(path) - 1)
  112. return weight
  113. def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
  114. """
  115. Removes paths in 'paths' list from 'graph' object. Don't delete entry and
  116. sink nodes unless specified.
  117. Arguments:
  118. graph, nd.DiGraph(): de bruijn graph
  119. paths, list of string lists: list of paths composed of nodes
  120. delete_entry_node, boolean: Delete entry if True
  121. delete_sink_node, boolean: Delete sink node if True
  122. Return:
  123. graph, nd.DiGraph(): de bruijn graph with deletes paths.
  124. """
  125. entry = 1
  126. sink = -1
  127. if delete_entry_node == True:
  128. entry = 0
  129. if delete_sink_node == True:
  130. sink = None
  131. for path in paths:
  132. graph.remove_nodes_from(path[entry:sink])
  133. return graph
  134. def select_best_path(graph, paths, path_lens, mean_weights,
  135. delete_entry_node=False, delete_sink_node=False):
  136. """
  137. Given path list, their length and weight, keeps only the best path among
  138. them considering the following priority : weight, length and randomly
  139. Arguments:
  140. graph, nx.DiGraph: a de bruijn graph
  141. paths, list of str: list of paths
  142. path_lens: lengths of the paths
  143. mean_weights: mean weights of the paths
  144. delete_entry_node, boolean: either or not if the entry node
  145. should be deleted
  146. delete_sink_node, boolean: either or not if the sink node
  147. should be deleted
  148. Returns:
  149. graph, nx.DiGraph: graph with deleted paths
  150. """
  151. max_weight = max(mean_weights)
  152. heaviest = [i for i, j in enumerate(mean_weights) if j == max_weight]
  153. if len(heaviest) > 1:
  154. max_len = max(path_lens)
  155. longest = [i for i in heaviest if path_lens[i] == max_len]
  156. if len(longest) > 1:
  157. Random.seed(9001)
  158. best = random.choice[longest]
  159. else:
  160. best = longest[0]
  161. else:
  162. best = heaviest[0]
  163. for p in paths:
  164. print(p)
  165. paths2 = [p for p in paths]
  166. paths2.pop(best)
  167. return remove_paths(graph, paths2, delete_entry_node, delete_sink_node)
  168. def fill(text, width=80):
  169. """Split text with a line return to respect fasta format"""
  170. return os.linesep.join(text[i:i+width] for i in range(0, len(text), width))
  171. def save_contigs(tuples, outname):
  172. """
  173. Arguments:
  174. tuples, tuple: Obtained from get_contigs()
  175. outname, str: name of the file to be written
  176. """
  177. i = 0
  178. with open(outname, "w") as outfile:
  179. for duo in tuples:
  180. i += 1
  181. outfile.write(">contig_{} len={}\n".format(i, duo[1]))
  182. outfile.write("{}\n".format(fill(duo[0])))
  183. return
  184. def get_contigs(graph, starting_nodes, sink_nodes):
  185. """
  186. Arguments:
  187. graph, nx.DiGraph: de Bruijn tree
  188. starting_nodes, list of strings: list of starting nodes
  189. sink_nodes, list of strings: list of terminal nodes
  190. Return:
  191. contigs, list of tupple: list of tupple (contigs, len(contigs))
  192. """
  193. contigs = []
  194. for starting_node in starting_nodes:
  195. for sink_node in sink_nodes:
  196. if algorithms.has_path(graph, starting_node, sink_node) == True:
  197. path = algorithms.shortest_path(graph, starting_node, sink_node)
  198. contig = path[0] # base of the contig is seq of the first node
  199. for i in range(len(path)-1):
  200. contig += path[i+1][-1] # adds last char of node
  201. contigs.append((contig, len(contig)))
  202. return contigs
  203. def solve_bubble(graph, ancestor_node, descendent_node):
  204. """
  205. solve a bubble
  206. Arguments:
  207. graph, nx.DiGraph: a de bruijn graph
  208. ancestor_node, str: a node
  209. descendent_node, str: a node
  210. Returns:
  211. graph, nx.DiGraph: the same graph without the bubble
  212. """
  213. paths = algorithms.all_simple_paths(graph, ancestor_node, descendent_node)
  214. weights = []
  215. path_lens = []
  216. for path in paths:# constituting weights and length lists
  217. weights.append(path_average_weight(graph, path))
  218. path_lens.append(len(path))
  219. return select_best_path(graph, paths, weights, path_lens) # keep best path
  220. def simplify_bubbles(graph):
  221. """
  222. Returns a bubble-less graph
  223. Arguments:
  224. graph, nx.DiGraph: a de bruijn graph
  225. Returns:
  226. graph, nx.DiGraph: a bubble-less de bruijn graph
  227. """
  228. fork_nodes = []# empty list containing nodes with 2 or more ancestors
  229. for node in graph:
  230. while graph.in_degree(node) >= 2: # if 2 or more ancestor add node
  231. pred = [n for n in graph.predecessors(node)]
  232. ancestor = algorithms.lowest_common_ancestor(graph,pred[0], pred[1])
  233. graph = solve_bubble(graph, ancestor, node)
  234. return graph
  235. def solve_entry_tips():
  236. pass
  237. def solve_out_tips():
  238. pass