tp assemblage cours amine ghozlane

debruijn.py 3.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. """
  2. Small assembly module based on de bruijn graphs
  3. """
  4. import os
  5. import networkx as nx
  6. from networkx import algorithms
  7. def read_fastq(fichier):
  8. """
  9. Arguments:
  10. fichier, str: path to fastq file
  11. Returns:
  12. a str generator, generator of sequences
  13. """
  14. with open(fichier, 'r') as filin:
  15. for line in filin:
  16. yield filin.readline().strip()
  17. filin.readline()
  18. filin.readline()
  19. def cut_kmer(seq, k):
  20. """
  21. Arguments:
  22. seq, str: a sequence
  23. k, int: k-mer size, must be shorter than len(seq)
  24. Returns:
  25. an iterator returning str
  26. """
  27. for i in range(len(seq)-(k-1)):
  28. yield seq[i:i+k]
  29. def build_kmer_dict(fichier, k):
  30. """
  31. Arguments:
  32. fichier, str: path to fastq file
  33. k, int: k-mer size, must be shorter than len(seq)
  34. Return:
  35. hash_table, dict: dictionnary with key = k-mer as str
  36. and value count of k-mer occurence
  37. """
  38. hash_table = {}
  39. it_fastq = read_fastq(fichier)
  40. for seq in it_fastq:
  41. it_kmer = cut_kmer(seq, k)
  42. for kmer in it_kmer:
  43. try:
  44. hash_table[kmer]
  45. except KeyError:
  46. hash_table[kmer] = 1
  47. else:
  48. hash_table[kmer] += 1
  49. return hash_table
  50. def build_graph(hash_table):
  51. """
  52. Arguments:
  53. hash_table, dict: dictionnary obtained with build_kmer_dict() function
  54. Return:
  55. graph, nx.DiGraph: the de Bruijn tree corresponding to hash_table
  56. """
  57. graph = nx.DiGraph()
  58. for key in hash_table:
  59. graph.add_edge(key[:-1], key[1:], weight=hash_table[key])
  60. return graph
  61. def get_starting_nodes(graph):
  62. """
  63. Arguments:
  64. graph, nx.DiGraph: de Bruijn tree
  65. Return:
  66. starting_nodes, list of strings: list of starting nodes
  67. """
  68. starting_nodes = []
  69. for node in graph:
  70. if graph.in_degree(node) == 0:
  71. starting_nodes.append(node)
  72. return starting_nodes
  73. def std():
  74. pass
  75. def get_sink_nodes(graph):
  76. """
  77. Arguments:
  78. graph, nx.DiGraph: de Bruijn tree
  79. Return:
  80. sink_nodes, list of strings: list of terminal nodes
  81. """
  82. sink_nodes = []
  83. for node in graph:
  84. if graph.out_degree(node) == 0:
  85. sink_nodes.append(node)
  86. return sink_nodes
  87. def path_average_weight():
  88. pass
  89. def remove_paths():
  90. pass
  91. def select_best_path():
  92. pass
  93. def fill(text, width=80):
  94. """Split text with a line return to respect fasta format"""
  95. return os.linesep.join(text[i:i+width] for i in range(0, len(text), width))
  96. def save_contigs(tuples, outname):
  97. """
  98. Arguments:
  99. tuples, tuple: Obtained from get_contigs()
  100. outname, str: name of the file to be written
  101. """
  102. i = 0
  103. with open(outname, "w") as outfile:
  104. for duo in tuples:
  105. i += 1
  106. outfile.write(">contig_{} len={}\n".format(i, duo[1]))
  107. outfile.write("{}\n".format(fill(duo[0])))
  108. return
  109. def get_contigs(graph, starting_nodes, sink_nodes):
  110. """
  111. Arguments:
  112. graph, nx.DiGraph: de Bruijn tree
  113. starting_nodes, list of strings: list of starting nodes
  114. sink_nodes, list of strings: list of terminal nodes
  115. Return:
  116. contigs, list of tupple: list of tupple (contigs, len(contigs))
  117. """
  118. contigs = []
  119. for starting_node in starting_nodes:
  120. for sink_node in sink_nodes:
  121. if algorithms.has_path(graph, starting_node, sink_node) == True:
  122. path = algorithms.shortest_path(graph, starting_node, sink_node)
  123. contig = path[0]
  124. for i in range(len(path)-1):
  125. contig += path[i+1][-1]
  126. contigs.append((contig, len(contig)))
  127. return contigs
  128. def solve_bubble():
  129. pass
  130. def simplify_bubbles():
  131. pass
  132. def solve_entry_tips():
  133. pass
  134. def solve_out_tips():
  135. pass