tp assemblage cours amine ghozlane

debruijn.py 6.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. """
  2. Small assembly module based on de bruijn graphs
  3. """
  4. import os
  5. import statistics
  6. import networkx as nx
  7. from networkx import algorithms
  8. def read_fastq(fichier):
  9. """
  10. Returns an iterator object that retrieves only the nucleic sequences of a
  11. fastq file.
  12. Arguments:
  13. fichier, str: path to fastq file
  14. Returns:
  15. a str generator, generator of sequences
  16. """
  17. with open(fichier, 'r') as filin:
  18. for line in filin:
  19. yield filin.readline().strip()
  20. filin.readline()
  21. filin.readline()
  22. def cut_kmer(seq, k):
  23. """
  24. Returns an iterator that returns k-mers of k-size of a sequence
  25. Arguments:
  26. seq, str: a sequence
  27. k, int: k-mer size, must be shorter than len(seq)
  28. Returns:
  29. an iterator returning str
  30. """
  31. for i in range(len(seq)-(k-1)):
  32. yield seq[i:i+k]
  33. def build_kmer_dict(fichier, k):
  34. """
  35. Arguments:
  36. fichier, str: path to fastq file
  37. k, int: k-mer size, must be shorter than len(seq)
  38. Return:
  39. hash_table, dict: dictionnary with key = k-mer as str
  40. and value count of k-mer occurence
  41. """
  42. hash_table = {} # initialise empty hash table
  43. it_fastq = read_fastq(fichier)
  44. for seq in it_fastq: # for each sequence
  45. it_kmer = cut_kmer(seq, k) # count each occurence of k-mer
  46. for kmer in it_kmer:
  47. try:
  48. hash_table[kmer]
  49. except KeyError:
  50. hash_table[kmer] = 1
  51. else:
  52. hash_table[kmer] += 1
  53. return hash_table
  54. def build_graph(hash_table):
  55. """
  56. Returns a graph from a hash table
  57. Arguments:
  58. hash_table, dict: dictionnary obtained with build_kmer_dict() function
  59. Return:
  60. graph, nx.DiGraph: the de Bruijn tree corresponding to hash_table
  61. """
  62. graph = nx.DiGraph()
  63. for key in hash_table:
  64. graph.add_edge(key[:-1], key[1:], weight=hash_table[key])
  65. return graph
  66. def get_starting_nodes(graph):
  67. """
  68. Returns the list of starting nodes of a graph
  69. Arguments:
  70. graph, nx.DiGraph: de Bruijn tree
  71. Return:
  72. starting_nodes, list of strings: list of starting nodes
  73. """
  74. starting_nodes = []
  75. for node in graph:
  76. if graph.in_degree(node) == 0: # if count of input edge == 0
  77. starting_nodes.append(node)
  78. return starting_nodes
  79. def std(values):
  80. """
  81. Computes standard deviation from a list of value
  82. Arguments:
  83. values, list: list of values
  84. Returns :
  85. standard deviation of the 'values' data list
  86. """
  87. return statistics.stdev(float(values))
  88. def get_sink_nodes(graph):
  89. """
  90. Arguments:
  91. graph, nx.DiGraph: de Bruijn tree
  92. Return:
  93. sink_nodes, list of strings: list of terminal nodes
  94. """
  95. sink_nodes = []
  96. for node in graph:
  97. if graph.out_degree(node) == 0: # if count of output edge == 0
  98. sink_nodes.append(node)
  99. return sink_nodes
  100. def path_average_weight(graph, path):
  101. """
  102. Arguments:
  103. graph, nx.DiGraph: a de bruijn graph
  104. path, list of str(nodes): list of nodes constituing a path
  105. Return:
  106. mean weight, float: the mean weight of the path
  107. """
  108. weight = 0
  109. for i in range(len(path)-1):
  110. weight += graph[path[i][i+i][weight]
  111. return weight/(len(path)-1)
  112. def remove_paths(graph, paths, delete_entry_node=False, delete_sink_node=False):
  113. """
  114. Removes paths in 'paths' list from 'graph' object. Don't delete entry and
  115. sink nodes unless specified.
  116. Arguments:
  117. graph, nd.DiGraph(): de bruijn graph
  118. paths, list of string lists: list of paths composed of nodes
  119. delete_entry_node, boolean: Delete entry if True
  120. delete_sink_node, boolean: Delete sink node if True
  121. Return:
  122. graph, nd.DiGraph(): de bruijn graph with deletes paths.
  123. """
  124. entry = 1
  125. sink = -1
  126. if delete_entry_node == True:
  127. entry = 0
  128. if delete_sink_node == True:
  129. sink = None
  130. for path in paths:
  131. graph.remove_nodes_from(path[entry:sink])
  132. return graph
  133. def select_best_path(graph, paths, path_len, mean_weights,
  134. delete_entry_node=False, delete_sink_node=False):
  135. """
  136. """
  137. max_weight = max(mean_weights)
  138. heaviest = [i for i, j in enumerate(mean_weights) if j == mean_weights]
  139. if len(heaviest) > 1:
  140. max_len = max(path_lengths)
  141. longest = [i for i in heaviest if path_len[i] == max_len]
  142. if len(longest) > 1:
  143. Random.seed(9001)
  144. best = random.choice[longest]
  145. else:
  146. best = longest[0]
  147. else:
  148. best = heaviest[0]
  149. paths.pop(best)
  150. return remove_paths(graph, paths, delete_entry_node, delete_sink_node)
  151. def fill(text, width=80):
  152. """Split text with a line return to respect fasta format"""
  153. return os.linesep.join(text[i:i+width] for i in range(0, len(text), width))
  154. def save_contigs(tuples, outname):
  155. """
  156. Arguments:
  157. tuples, tuple: Obtained from get_contigs()
  158. outname, str: name of the file to be written
  159. """
  160. i = 0
  161. with open(outname, "w") as outfile:
  162. for duo in tuples:
  163. i += 1
  164. outfile.write(">contig_{} len={}\n".format(i, duo[1]))
  165. outfile.write("{}\n".format(fill(duo[0])))
  166. return
  167. def get_contigs(graph, starting_nodes, sink_nodes):
  168. """
  169. Arguments:
  170. graph, nx.DiGraph: de Bruijn tree
  171. starting_nodes, list of strings: list of starting nodes
  172. sink_nodes, list of strings: list of terminal nodes
  173. Return:
  174. contigs, list of tupple: list of tupple (contigs, len(contigs))
  175. """
  176. contigs = []
  177. for starting_node in starting_nodes:
  178. for sink_node in sink_nodes:
  179. if algorithms.has_path(graph, starting_node, sink_node) == True:
  180. path = algorithms.shortest_path(graph, starting_node, sink_node)
  181. contig = path[0] # base of the contig is seq of the first node
  182. for i in range(len(path)-1):
  183. contig += path[i+1][-1] # adds last char of node
  184. contigs.append((contig, len(contig)))
  185. return contigs
  186. def solve_bubble():
  187. pass
  188. def simplify_bubbles():
  189. pass
  190. def solve_entry_tips():
  191. pass
  192. def solve_out_tips():
  193. pass