tp assemblage cours amine ghozlane

debruijn.py 5.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. """
  2. Small assembly module based on de bruijn graphs
  3. """
  4. import os
  5. import statistics
  6. import networkx as nx
  7. from networkx import algorithms
  8. def read_fastq(fichier):
  9. """
  10. Arguments:
  11. fichier, str: path to fastq file
  12. Returns:
  13. a str generator, generator of sequences
  14. """
  15. with open(fichier, 'r') as filin:
  16. for line in filin:
  17. yield filin.readline().strip()
  18. filin.readline()
  19. filin.readline()
  20. def cut_kmer(seq, k):
  21. """
  22. Arguments:
  23. seq, str: a sequence
  24. k, int: k-mer size, must be shorter than len(seq)
  25. Returns:
  26. an iterator returning str
  27. """
  28. for i in range(len(seq)-(k-1)):
  29. yield seq[i:i+k]
  30. def build_kmer_dict(fichier, k):
  31. """
  32. Arguments:
  33. fichier, str: path to fastq file
  34. k, int: k-mer size, must be shorter than len(seq)
  35. Return:
  36. hash_table, dict: dictionnary with key = k-mer as str
  37. and value count of k-mer occurence
  38. """
  39. hash_table = {}
  40. it_fastq = read_fastq(fichier)
  41. for seq in it_fastq:
  42. it_kmer = cut_kmer(seq, k)
  43. for kmer in it_kmer:
  44. try:
  45. hash_table[kmer]
  46. except KeyError:
  47. hash_table[kmer] = 1
  48. else:
  49. hash_table[kmer] += 1
  50. return hash_table
  51. def build_graph(hash_table):
  52. """
  53. Arguments:
  54. hash_table, dict: dictionnary obtained with build_kmer_dict() function
  55. Return:
  56. graph, nx.DiGraph: the de Bruijn tree corresponding to hash_table
  57. """
  58. graph = nx.DiGraph()
  59. for key in hash_table:
  60. graph.add_edge(key[:-1], key[1:], weight=hash_table[key])
  61. return graph
  62. def get_starting_nodes(graph):
  63. """
  64. Arguments:
  65. graph, nx.DiGraph: de Bruijn tree
  66. Return:
  67. starting_nodes, list of strings: list of starting nodes
  68. """
  69. starting_nodes = []
  70. for node in graph:
  71. if graph.in_degree(node) == 0:
  72. starting_nodes.append(node)
  73. return starting_nodes
  74. def std(values):
  75. """
  76. Arguments:
  77. values, list: list of values
  78. Returns :
  79. standard deviation of the 'values' data list
  80. """
  81. return stdev(float(values))
  82. def get_sink_nodes(graph):
  83. """
  84. Arguments:
  85. graph, nx.DiGraph: de Bruijn tree
  86. Return:
  87. sink_nodes, list of strings: list of terminal nodes
  88. """
  89. sink_nodes = []
  90. for node in graph:
  91. if graph.out_degree(node) == 0:
  92. sink_nodes.append(node)
  93. return sink_nodes
  94. def path_average_weight(graph, path):
  95. """
  96. Arguments:
  97. graph, nx.DiGraph: a de bruijn graph
  98. path, list of str(nodes): list of nodes constituing a path
  99. Return:
  100. mean weight, float: the mean weight of the path
  101. """
  102. weight = 0
  103. for i in range(len(path)-1):
  104. weight += graph[path[i][i+i][weight]
  105. return weight/(len(path)-1)
  106. def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
  107. """
  108. Removes paths in 'paths' list from 'graph' object. Don't delete entry and
  109. sink nodes unless specified.
  110. Arguments:
  111. graph, nd.DiGraph(): de bruijn graph
  112. paths, list of string lists: list of paths composed of nodes
  113. delete_entry_node, boolean: Delete entry if True
  114. delete_sink_node, boolean: Delete sink node if True
  115. Return:
  116. graph, nd.DiGraph(): de bruijn graph with deletes paths.
  117. """
  118. entry = 1
  119. sink = -1
  120. if delete_entry_node == True:
  121. entry = 0
  122. if delete_sink_node == True:
  123. sink = None
  124. for path in paths:
  125. graph.remove_nodes_from(path[entry:sink])
  126. return graph
  127. def select_best_path():
  128. pass
  129. def fill(text, width=80):
  130. """Split text with a line return to respect fasta format"""
  131. return os.linesep.join(text[i:i+width] for i in range(0, len(text), width))
  132. def save_contigs(tuples, outname):
  133. """
  134. Arguments:
  135. tuples, tuple: Obtained from get_contigs()
  136. outname, str: name of the file to be written
  137. """
  138. i = 0
  139. with open(outname, "w") as outfile:
  140. for duo in tuples:
  141. i += 1
  142. outfile.write(">contig_{} len={}\n".format(i, duo[1]))
  143. outfile.write("{}\n".format(fill(duo[0])))
  144. return
  145. def get_contigs(graph, starting_nodes, sink_nodes):
  146. """
  147. Arguments:
  148. graph, nx.DiGraph: de Bruijn tree
  149. starting_nodes, list of strings: list of starting nodes
  150. sink_nodes, list of strings: list of terminal nodes
  151. Return:
  152. contigs, list of tupple: list of tupple (contigs, len(contigs))
  153. """
  154. contigs = []
  155. for starting_node in starting_nodes:
  156. for sink_node in sink_nodes:
  157. if algorithms.has_path(graph, starting_node, sink_node) == True:
  158. path = algorithms.shortest_path(graph, starting_node, sink_node)
  159. contig = path[0]
  160. for i in range(len(path)-1):
  161. contig += path[i+1][-1]
  162. contigs.append((contig, len(contig)))
  163. return contigs
  164. def solve_bubble():
  165. pass
  166. def simplify_bubbles():
  167. pass
  168. def solve_entry_tips():
  169. pass
  170. def solve_out_tips():
  171. pass