tp assemblage cours amine ghozlane

debruijn.py 3.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. """
  2. Small assembly module based on de bruijn graphs
  3. """
  4. import networkx as nx
  5. from networkx import algorithms
  6. def read_fastq(fichier):
  7. """
  8. Arguments:
  9. fichier, str: path to fastq file
  10. Returns:
  11. a str generator, generator of sequences
  12. """
  13. with open(fichier, 'r') as filin:
  14. for line in filin:
  15. yield filin.readline().strip()
  16. filin.readline()
  17. filin.readline()
  18. def cut_kmer(seq, k):
  19. """
  20. Arguments:
  21. seq, str: a sequence
  22. k, int: k-mer size, must be shorter than len(seq)
  23. Returns:
  24. an iterator returning str
  25. """
  26. for i in range(len(seq)-(k-1)):
  27. yield seq[i:i+k]
  28. def build_kmer_dict(fichier, k):
  29. """
  30. Arguments:
  31. fichier, str: path to fastq file
  32. k, int: k-mer size, must be shorter than len(seq)
  33. Return:
  34. hash_table, dict: dictionnary with key = k-mer as str
  35. and value count of k-mer occurence
  36. """
  37. hash_table = {}
  38. it_fastq = read_fastq(fichier)
  39. for seq in it_fastq:
  40. it_kmer = cut_kmer(seq, k)
  41. for kmer in it_kmer:
  42. try:
  43. hash_table[kmer]
  44. except KeyError:
  45. hash_table[kmer] = 1
  46. else:
  47. hash_table[kmer] += 1
  48. return hash_table
  49. def build_graph(hash_table):
  50. """
  51. Arguments:
  52. hash_table, dict: dictionnary obtained with build_kmer_dict() function
  53. Return:
  54. graph, nx.DiGraph: the de Bruijn tree corresponding to hash_table
  55. """
  56. graph = nx.DiGraph()
  57. for key in hash_table:
  58. graph.add_edge(key[:-1], key[1:], weight=hash_table[key])
  59. return graph
  60. def get_starting_nodes(graph):
  61. """
  62. Arguments:
  63. graph, nx.DiGraph: de Bruijn tree
  64. Return:
  65. starting_nodes, list of strings: list of starting nodes
  66. """
  67. starting_nodes = []
  68. for node in graph:
  69. if graph.in_degree(node) == 0:
  70. starting_nodes.append(node)
  71. return starting_nodes
  72. def std():
  73. pass
  74. def get_sink_nodes(graph):
  75. """
  76. Arguments:
  77. graph, nx.DiGraph: de Bruijn tree
  78. Return:
  79. sink_nodes, list of strings: list of terminal nodes
  80. """
  81. sink_nodes = []
  82. for node in graph:
  83. if graph.out_degree(node) == 0:
  84. sink_nodes.append(node)
  85. return sink_nodes
  86. def path_average_weight():
  87. pass
  88. def remove_paths():
  89. pass
  90. def select_best_path():
  91. pass
  92. def save_contigs(tuples, outname):
  93. """
  94. Arguments:
  95. tuples, tuple: Obtained from get_contigs()
  96. outname, str: name of the file to be written
  97. """
  98. with open(outname, "w") as outfile:
  99. for duo in tuples:
  100. outfile.write("{} {}".format(duo[0], duo[1]))
  101. return
  102. def get_contigs(graph, starting_nodes, sink_nodes):
  103. """
  104. Arguments:
  105. graph, nx.DiGraph: de Bruijn tree
  106. starting_nodes, list of strings: list of starting nodes
  107. sink_nodes, list of strings: list of terminal nodes
  108. Return:
  109. contigs, list of tupple: list of tupple (contigs, len(contigs))
  110. """
  111. contigs = []
  112. for starting_node in starting_nodes:
  113. for sink_node in sink_nodes:
  114. if algorithms.has_path(graph, starting_node, sink_node) == True:
  115. path = algorithms.shortest_path(graph, starting_node, sink_node)
  116. contig = path[0]
  117. for i in range(len(path)-1):
  118. contig += path[i+1][-1]
  119. contigs.append((contig, len(contig)))
  120. return contigs
  121. def solve_bubble():
  122. pass
  123. def simplify_bubbles():
  124. pass
  125. def solve_entry_tips():
  126. pass
  127. def solve_out_tips():
  128. pass