1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768 |
- """
- Small assembly module based on de bruijn graphs
- """
- import networkx as nx
-
def read_fastq(fichier):
    """
    Yield the sequence line of every record in a FASTQ file.

    The file is read as consecutive 4-line records (header, sequence,
    separator, quality); only the second line of each record is yielded,
    stripped of surrounding whitespace.

    Arguments:
        fichier, str: path to fastq file

    Returns:
        a str generator, generator of sequences
    """
    with open(fichier, 'r') as filin:
        for index, line in enumerate(filin):
            # Position 1 within each group of four lines is the sequence.
            # Selecting by position (instead of calling readline() inside the
            # loop) means a trailing blank line or a record cut off after its
            # header no longer yields a spurious empty sequence.
            if index % 4 == 1:
                yield line.strip()
-
def cut_kmer(seq, k):
    """
    Yield every k-mer of a sequence, sliding one position at a time.

    Arguments:
        seq, str: a sequence
        k, int: k-mer size, must be shorter than len(seq)

    Returns:
        an iterator returning str
    """
    # Last index at which a full window of size k still fits in seq.
    last_start = len(seq) - k
    start = 0
    while start <= last_start:
        yield seq[start:start + k]
        start += 1
-
def build_kmer_dict(fichier, k):
    """
    Count the occurrences of every k-mer found in the reads of a FASTQ file.

    Arguments:
        fichier, str: path to fastq file
        k, int: k-mer size, must be shorter than len(seq)

    Return:
        hash_table, dict: dictionary with key = k-mer as str
        and value = number of occurrences of that k-mer
    """
    kmer_counts = {}
    for read in read_fastq(fichier):
        for kmer in cut_kmer(read, k):
            # First sighting starts from 0, so the stored count becomes 1.
            kmer_counts[kmer] = kmer_counts.get(kmer, 0) + 1
    return kmer_counts
-
def build_graph(hash_table):
    """
    Build a directed graph whose edges are the k-mers of hash_table.

    Each k-mer contributes one edge going from its prefix (all characters
    but the last) to its suffix (all characters but the first); the edge
    "weight" attribute holds the k-mer count.

    Arguments:
        hash_table, dict: dictionary obtained with build_kmer_dict() function
    Return:
        graph, nx.DiGraph: the de Bruijn graph corresponding to hash_table
    """
    de_bruijn = nx.DiGraph()
    for kmer, occurrences in hash_table.items():
        prefix = kmer[:-1]
        suffix = kmer[1:]
        de_bruijn.add_edge(prefix, suffix, weight=occurrences)
    return de_bruijn
|