Explorar el Código

added debruijn.py with read functions and build_graph

nicolas-zimmermann hace 5 años
padre
commit
6d9c404565
Se han modificado 1 ficheros con 67 adiciones y 0 borrados
  1. 67 0
      debruijn/debruijn.py

+ 67 - 0
debruijn/debruijn.py Ver fichero

@@ -0,0 +1,67 @@
1
+"""
2
+Small assembly module based on de bruijn graphs
3
+"""
4
+import networkx as nx
5
+
6
+def read_fastq(fichier):
7
+    """
8
+    Arguments:
9
+        fichier, str: path to fastq file
10
+
11
+    Returns:
12
+        a str generator, generator of sequences
13
+    """
14
+    with open(fichier, 'r') as filin:
15
+        for line in filin:
16
+            yield filin.readline().strip()
17
+            filin.readline()
18
+            filin.readline()
19
+
20
+def cut_kmer(seq, k):
21
+    """
22
+    Arguments:
23
+        seq, str: a sequence
24
+        k, int: k-mer size, must be shorter than len(seq)
25
+
26
+    Returns:
27
+        an iterator returning str
28
+    """
29
+    for i in range(len(seq)-(k-1)):
30
+        yield seq[i:i+k]
31
+
32
+def build_kmer_dict(fichier, k):
33
+    """
34
+    Arguments:
35
+        fichier, str: path to fastq file
36
+        k, int: k-mer size, must be shorter than len(seq)
37
+
38
+    Return:
39
+        hash_table, dict: dictionnary with key = k-mer as str
40
+                          and value count of k-mer occurence
41
+    """
42
+    hash_table = {}
43
+    it_fastq = read_fastq(fichier)
44
+    for seq in it_fastq:
45
+        it_kmer = cut_kmer(seq, k)
46
+        for kmer in it_kmer:
47
+            try:
48
+                hash_table[kmer]
49
+            except KeyError:
50
+                hash_table[kmer] = 1
51
+            else:
52
+                hash_table[kmer] += 1
53
+
54
+    return hash_table
55
+
56
+def build_graph(hash_table):
57
+    """
58
+    Arguments:
59
+        hash_table, dict: dictionnary obtained with build_kmer_dict() function
60
+    Return:
61
+        graph, nx.Graph: the de Bruijn tree corresponding to hash_table
62
+    """
63
+    graph = nx.DiGraph()
64
+    for key in hash_table:
65
+        graph.add_edge(key[:-1], key[1:], weight=hash_table[key])
66
+
67
+    return graph