tp assemblage cours amine ghozlane

debruijn.py 1.6KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. """
  2. Small assembly module based on de bruijn graphs
  3. """
  4. import networkx as nx
  5. def read_fastq(fichier):
  6. """
  7. Arguments:
  8. fichier, str: path to fastq file
  9. Returns:
  10. a str generator, generator of sequences
  11. """
  12. with open(fichier, 'r') as filin:
  13. for line in filin:
  14. yield filin.readline().strip()
  15. filin.readline()
  16. filin.readline()
  17. def cut_kmer(seq, k):
  18. """
  19. Arguments:
  20. seq, str: a sequence
  21. k, int: k-mer size, must be shorter than len(seq)
  22. Returns:
  23. an iterator returning str
  24. """
  25. for i in range(len(seq)-(k-1)):
  26. yield seq[i:i+k]
  27. def build_kmer_dict(fichier, k):
  28. """
  29. Arguments:
  30. fichier, str: path to fastq file
  31. k, int: k-mer size, must be shorter than len(seq)
  32. Return:
  33. hash_table, dict: dictionnary with key = k-mer as str
  34. and value count of k-mer occurence
  35. """
  36. hash_table = {}
  37. it_fastq = read_fastq(fichier)
  38. for seq in it_fastq:
  39. it_kmer = cut_kmer(seq, k)
  40. for kmer in it_kmer:
  41. try:
  42. hash_table[kmer]
  43. except KeyError:
  44. hash_table[kmer] = 1
  45. else:
  46. hash_table[kmer] += 1
  47. return hash_table
  48. def build_graph(hash_table):
  49. """
  50. Arguments:
  51. hash_table, dict: dictionnary obtained with build_kmer_dict() function
  52. Return:
  53. graph, nx.Graph: the de Bruijn tree corresponding to hash_table
  54. """
  55. graph = nx.DiGraph()
  56. for key in hash_table:
  57. graph.add_edge(key[:-1], key[1:], weight=hash_table[key])
  58. return graph