tp assemblage cours amine ghozlane

debruijn.py 8.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. """
  2. Small assembly module based on de bruijn graphs
  3. """
  4. import os
  5. import sys
  6. import statistics
  7. import networkx as nx
  8. from networkx import algorithms
  9. def read_fastq(fichier):
  10. """
  11. Returns an iterator object that retrieves only the nucleic sequences of a
  12. fastq file.
  13. Arguments:
  14. fichier, str: path to fastq file
  15. Returns:
  16. a str generator, generator of sequences
  17. """
  18. with open(fichier, 'r') as filin:
  19. for line in filin:
  20. yield filin.readline().strip()
  21. filin.readline()
  22. filin.readline()
  23. def cut_kmer(seq, k):
  24. """
  25. Returns an iterator that returns k-mers of k-size of a sequence
  26. Arguments:
  27. seq, str: a sequence
  28. k, int: k-mer size, must be shorter than len(seq)
  29. Returns:
  30. an iterator returning str
  31. """
  32. for i in range(len(seq)-(k-1)):
  33. yield seq[i:i+k]
  34. def build_kmer_dict(fichier, k):
  35. """
  36. Arguments:
  37. fichier, str: path to fastq file
  38. k, int: k-mer size, must be shorter than len(seq)
  39. Return:
  40. hash_table, dict: dictionnary with key = k-mer as str
  41. and value count of k-mer occurence
  42. """
  43. hash_table = {}# initialise empty hash table
  44. it_fastq = read_fastq(fichier)
  45. for seq in it_fastq: # for each sequence
  46. it_kmer = cut_kmer(seq, k) # count each occurence of k-mer
  47. for kmer in it_kmer:
  48. try:
  49. hash_table[kmer]
  50. except KeyError:
  51. hash_table[kmer] = 1
  52. else:
  53. hash_table[kmer] += 1
  54. return hash_table
  55. def build_graph(hash_table):
  56. """
  57. Returns a graph from a hash table
  58. Arguments:
  59. hash_table, dict: dictionnary obtained with build_kmer_dict() function
  60. Return:
  61. graph, nx.DiGraph: the de Bruijn tree corresponding to hash_table
  62. """
  63. graph = nx.DiGraph()
  64. for key in hash_table:
  65. graph.add_edge(key[:-1], key[1:], weight=hash_table[key])
  66. return graph
  67. def get_starting_nodes(graph):
  68. """
  69. Returns the list of starting nodes of a graph
  70. Arguments:
  71. graph, nx.DiGraph: de Bruijn tree
  72. Return:
  73. starting_nodes, list of strings: list of starting nodes
  74. """
  75. starting_nodes = []
  76. for node in graph:
  77. if graph.in_degree(node) == 0: # if count of input edge == 0
  78. starting_nodes.append(node)
  79. return starting_nodes
  80. def std(values):
  81. """
  82. Computes standard deviation from a list of value
  83. Arguments:
  84. values, list: list of values
  85. Returns :
  86. standard deviation of the 'values' data list
  87. """
  88. return statistics.stdev(float(values))
  89. def get_sink_nodes(graph):
  90. """
  91. Arguments:
  92. graph, nx.DiGraph: de Bruijn tree
  93. Return:
  94. sink_nodes, list of strings: list of terminal nodes
  95. """
  96. sink_nodes = []
  97. for node in graph:
  98. if graph.out_degree(node) == 0: # if count of output edge == 0
  99. sink_nodes.append(node)
  100. return sink_nodes
  101. def path_average_weight(graph, path):
  102. """
  103. Arguments:
  104. graph, nx.DiGraph: a de bruijn graph
  105. path, list of str(nodes): list of nodes constituing a path
  106. Return:
  107. mean weight, float: the mean weight of the path
  108. """
  109. weight = 0
  110. for i in range(len(path)-1):
  111. weight += graph.edges[path[i], path[i+1]]["weight"]
  112. weight = weight/(len(path) - 1)
  113. return weight
  114. def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
  115. """
  116. Removes paths in 'paths' list from 'graph' object. Don't delete entry and
  117. sink nodes unless specified.
  118. Arguments:
  119. graph, nd.DiGraph(): de bruijn graph
  120. paths, list of string lists: list of paths composed of nodes
  121. delete_entry_node, boolean: Delete entry if True
  122. delete_sink_node, boolean: Delete sink node if True
  123. Return:
  124. graph, nd.DiGraph(): de bruijn graph with deletes paths.
  125. """
  126. entry = 1
  127. sink = -1
  128. if delete_entry_node == True:
  129. entry = 0
  130. if delete_sink_node == True:
  131. sink = None
  132. for path in paths:
  133. graph.remove_nodes_from(path[entry:sink])
  134. return graph
  135. def select_best_path(graph, paths, path_lens, mean_weights,
  136. delete_entry_node=False, delete_sink_node=False):
  137. """
  138. Given path list, their length and weight, keeps only the best path among
  139. them considering the following priority : weight, length and randomly
  140. Arguments:
  141. graph, nx.DiGraph: a de bruijn graph
  142. paths, list of str: list of paths
  143. path_lens: lengths of the paths
  144. mean_weights: mean weights of the paths
  145. delete_entry_node, boolean: either or not if the entry node
  146. should be deleted
  147. delete_sink_node, boolean: either or not if the sink node
  148. should be deleted
  149. Returns:
  150. graph, nx.DiGraph: graph with deleted paths
  151. """
  152. max_weight = max(mean_weights)
  153. heaviest = [i for i, j in enumerate(mean_weights) if j == max_weight]
  154. if len(heaviest) > 1:
  155. max_len = max(path_lens)
  156. longest = [i for i in heaviest if path_lens[i] == max_len]
  157. if len(longest) > 1:
  158. Random.seed(9001)
  159. best = random.choice[longest]
  160. else:
  161. best = longest[0]
  162. else:
  163. best = heaviest[0]
  164. for p in paths:
  165. print(p)
  166. paths2 = [p for p in paths]
  167. paths2.pop(best)
  168. return remove_paths(graph, paths2, delete_entry_node, delete_sink_node)
  169. def fill(text, width=80):
  170. """Split text with a line return to respect fasta format"""
  171. return os.linesep.join(text[i:i+width] for i in range(0, len(text), width))
  172. def save_contigs(tuples, outname):
  173. """
  174. Arguments:
  175. tuples, tuple: Obtained from get_contigs()
  176. outname, str: name of the file to be written
  177. """
  178. i = 0
  179. with open(outname, "w") as outfile:
  180. for duo in tuples:
  181. i += 1
  182. outfile.write(">contig_{} len={}\n".format(i, duo[1]))
  183. outfile.write("{}\n".format(fill(duo[0])))
  184. return
  185. def get_contigs(graph, starting_nodes, sink_nodes):
  186. """
  187. Arguments:
  188. graph, nx.DiGraph: de Bruijn tree
  189. starting_nodes, list of strings: list of starting nodes
  190. sink_nodes, list of strings: list of terminal nodes
  191. Return:
  192. contigs, list of tupple: list of tupple (contigs, len(contigs))
  193. """
  194. contigs = []
  195. for starting_node in starting_nodes:
  196. for sink_node in sink_nodes:
  197. if algorithms.has_path(graph, starting_node, sink_node) == True:
  198. path = algorithms.shortest_path(graph, starting_node, sink_node)
  199. contig = path[0] # base of the contig is seq of the first node
  200. for i in range(len(path)-1):
  201. contig += path[i+1][-1] # adds last char of node
  202. contigs.append((contig, len(contig)))
  203. return contigs
  204. def solve_bubble(graph, ancestor_node, descendent_node):
  205. """
  206. solve a bubble
  207. Arguments:
  208. graph, nx.DiGraph: a de bruijn graph
  209. ancestor_node, str: a node
  210. descendent_node, str: a node
  211. Returns:
  212. graph, nx.DiGraph: the same graph without the bubble
  213. """
  214. paths = algorithms.all_simple_paths(graph, ancestor_node, descendent_node)
  215. paths2= []
  216. for p in paths:
  217. paths2.append(p)
  218. paths = paths2
  219. weights = []
  220. path_lens = []
  221. for path in paths:# constituting weights and length lists
  222. weights.append(path_average_weight(graph, path))
  223. path_lens.append(len(path))
  224. return select_best_path(graph, paths, weights, path_lens) # keep best path
  225. def simplify_bubbles(graph):
  226. """
  227. Returns a bubble-less graph
  228. Arguments:
  229. graph, nx.DiGraph: a de bruijn graph
  230. Returns:
  231. graph, nx.DiGraph: a bubble-less de bruijn graph
  232. """
  233. fork_nodes = []# empty list containing nodes with 2 or more ancestors
  234. f=1
  235. while f == 1:
  236. for node in graph:
  237. if graph.in_degree(node) >= 2: # if 2 or more ancestor add node
  238. pred = [n for n in graph.predecessors(node)]
  239. if len(pred) > 2:
  240. f = 1
  241. else:
  242. f = 0
  243. ancestor = algorithms.lowest_common_ancestor(graph,pred[0], pred[1])
  244. graph = solve_bubble(graph, ancestor, node)
  245. return graph
  246. def solve_entry_tips():
  247. pass
  248. def solve_out_tips():
  249. pass
  250. def main():
  251. fichier = str(sys.argv[1])
  252. k = int(sys.argv[2])
  253. hash_table = build_kmer_dict(fichier, k)
  254. G = build_graph(hash_table)
  255. nb = 0
  256. for n in G:
  257. if G.in_degree(n) > 1:
  258. nb += 1
  259. print("il y a {} bulles dans le graphe\n".format(nb))
  260. simplify_bubbles(G)
  261. nb = 0
  262. for n in G:
  263. if G.in_degree(n) > 1:
  264. nb += 1
  265. print("il y a {} bulles dans le graphe\n".format(nb))
  266. if __name__ == "__main__":
  267. main()