"""Small assembly module based on de Bruijn graphs."""

import networkx as nx


def read_fastq(fichier):
    """Yield the sequence line of every record of a FASTQ file.

    The file is assumed to use the plain 4-lines-per-record layout
    (header, sequence, '+' separator, quality string).

    Arguments:
        fichier, str: path to the FASTQ file

    Returns:
        a str generator, one stripped sequence per complete-enough record
    """
    with open(fichier, 'r') as filin:
        for index, line in enumerate(filin):
            # The sequence is the second line of each 4-line record.
            # Selecting by position (rather than mixing iteration with
            # readline()) avoids yielding an empty string for a trailing
            # blank line or a record truncated after its header.
            if index % 4 == 1:
                yield line.strip()


def cut_kmer(seq, k):
    """Yield every overlapping k-mer of a sequence, from left to right.

    Arguments:
        seq, str: a sequence
        k, int: k-mer size (>= 1); nothing is yielded when k > len(seq)

    Returns:
        an iterator returning str

    Raises:
        ValueError: if k is smaller than 1 (raised on first iteration)
    """
    if k < 1:
        raise ValueError("k-mer size must be a positive integer")
    for start in range(len(seq) - k + 1):
        yield seq[start:start + k]


def build_kmer_dict(fichier, k):
    """Count the occurrences of every k-mer found in a FASTQ file.

    Arguments:
        fichier, str: path to the FASTQ file
        k, int: k-mer size (>= 1)

    Returns:
        hash_table, dict: dictionary with key = k-mer as str and
        value = number of occurrences of that k-mer over all reads
    """
    hash_table = {}
    for seq in read_fastq(fichier):
        for kmer in cut_kmer(seq, k):
            hash_table[kmer] = hash_table.get(kmer, 0) + 1
    return hash_table


def build_graph(hash_table):
    """Build the de Bruijn graph described by a k-mer count table.

    Each k-mer becomes a directed edge from its (k-1)-length prefix to its
    (k-1)-length suffix, with the k-mer count stored as the edge weight.

    Arguments:
        hash_table, dict: dictionary obtained with build_kmer_dict()

    Returns:
        graph, nx.DiGraph: the weighted de Bruijn graph of hash_table
    """
    graph = nx.DiGraph()
    for kmer, count in hash_table.items():
        graph.add_edge(kmer[:-1], kmer[1:], weight=count)
    return graph