|
@@ -0,0 +1,67 @@
|
|
1
|
+"""
|
|
2
|
+Small assembly module based on de bruijn graphs
|
|
3
|
+"""
|
|
4
|
+import networkx as nx
|
|
5
|
+
|
|
6
|
+def read_fastq(fichier):
|
|
7
|
+ """
|
|
8
|
+ Arguments:
|
|
9
|
+ fichier, str: path to fastq file
|
|
10
|
+
|
|
11
|
+ Returns:
|
|
12
|
+ a str generator, generator of sequences
|
|
13
|
+ """
|
|
14
|
+ with open(fichier, 'r') as filin:
|
|
15
|
+ for line in filin:
|
|
16
|
+ yield filin.readline().strip()
|
|
17
|
+ filin.readline()
|
|
18
|
+ filin.readline()
|
|
19
|
+
|
|
20
|
+def cut_kmer(seq, k):
|
|
21
|
+ """
|
|
22
|
+ Arguments:
|
|
23
|
+ seq, str: a sequence
|
|
24
|
+ k, int: k-mer size, must be shorter than len(seq)
|
|
25
|
+
|
|
26
|
+ Returns:
|
|
27
|
+ an iterator returning str
|
|
28
|
+ """
|
|
29
|
+ for i in range(len(seq)-(k-1)):
|
|
30
|
+ yield seq[i:i+k]
|
|
31
|
+
|
|
32
|
+def build_kmer_dict(fichier, k):
|
|
33
|
+ """
|
|
34
|
+ Arguments:
|
|
35
|
+ fichier, str: path to fastq file
|
|
36
|
+ k, int: k-mer size, must be shorter than len(seq)
|
|
37
|
+
|
|
38
|
+ Return:
|
|
39
|
+ hash_table, dict: dictionnary with key = k-mer as str
|
|
40
|
+ and value count of k-mer occurence
|
|
41
|
+ """
|
|
42
|
+ hash_table = {}
|
|
43
|
+ it_fastq = read_fastq(fichier)
|
|
44
|
+ for seq in it_fastq:
|
|
45
|
+ it_kmer = cut_kmer(seq, k)
|
|
46
|
+ for kmer in it_kmer:
|
|
47
|
+ try:
|
|
48
|
+ hash_table[kmer]
|
|
49
|
+ except KeyError:
|
|
50
|
+ hash_table[kmer] = 1
|
|
51
|
+ else:
|
|
52
|
+ hash_table[kmer] += 1
|
|
53
|
+
|
|
54
|
+ return hash_table
|
|
55
|
+
|
|
56
|
+def build_graph(hash_table):
|
|
57
|
+ """
|
|
58
|
+ Arguments:
|
|
59
|
+ hash_table, dict: dictionnary obtained with build_kmer_dict() function
|
|
60
|
+ Return:
|
|
61
|
+ graph, nx.Graph: the de Bruijn tree corresponding to hash_table
|
|
62
|
+ """
|
|
63
|
+ graph = nx.DiGraph()
|
|
64
|
+ for key in hash_table:
|
|
65
|
+ graph.add_edge(key[:-1], key[1:], weight=hash_table[key])
|
|
66
|
+
|
|
67
|
+ return graph
|