ソースを参照

update sfs plot and vcf parsing

tforest 2 年 前
コミット
ec972a7cdf

+ 2 - 1
__init__.py ファイルの表示

@@ -1 +1,2 @@
1
-from frst import sfs_tools, customgraphics, vcf_utils, sfs_tools, stats_sfs
1
+from frst import sfs_tools, customgraphics, vcf_utils, sfs_tools, stats_sfs, dependences
2
+

+ 17 - 0
bam_utils.py ファイルの表示

@@ -0,0 +1,17 @@
1
+from frst.dependences import pybam
2
+
3
+def parse_bam(bam_file):
4
+    cov = {}
5
+    cov_val = 0
6
+    for alignment in pybam.read(bam_file):
7
+        #pos 1 based (see Pybam doc)
8
+        start_pos = alignment.sam_pos1
9
+        end_pos = alignment.sam_pos1 + alignment.sam_block_size
10
+        if start_pos not in cov:
11
+            cov[start_pos] = 0
12
+        if end_pos not in cov:
13
+            cov[end_pos] = 0
14
+        # add a weight on the start and lower on the end
15
+        cov[start_pos] = cov_val +1
16
+        cov[end_pos] = cov_val -1
17
+    return cov

+ 6 - 2
customgraphics.py ファイルの表示

@@ -200,8 +200,12 @@ def scatter(x, y, ylab=None, xlab=None, title=None):
200 200
         plt.title(title)
201 201
     plt.show()
202 202
 
203
-def barplot(x, y, ylab=None, xlab=None, title=None):
204
-    plt.bar(x, y)
203
+def barplot(x=None, y=None, ylab=None, xlab=None, title=None):
204
+    if x:
205
+        plt.bar(x, y)
206
+    else:
207
+        x = list(range(len(y)))
208
+        plt.bar(x, y)
205 209
     if ylab:
206 210
         plt.ylabel(ylab)
207 211
     if xlab:

+ 1 - 0
dependences/__init__.py ファイルの表示

@@ -0,0 +1 @@
1
+from frst.dependences import pybam

BIN
dependences/__pycache__/__init__.cpython-310.pyc ファイルの表示


BIN
dependences/__pycache__/__init__.cpython-38.pyc ファイルの表示


BIN
dependences/__pycache__/__init__.cpython-39.pyc ファイルの表示


BIN
dependences/__pycache__/pybam.cpython-310.pyc ファイルの表示


BIN
dependences/__pycache__/pybam.cpython-38.pyc ファイルの表示


BIN
dependences/__pycache__/pybam.cpython-39.pyc ファイルの表示


File diff suppressed because it is too large
+ 755 - 0
dependences/pybam.py


File diff suppressed because it is too large
+ 743 - 0
dependences/pybam.py~


File diff suppressed because it is too large
+ 743 - 0
dependences/pybam_old.py


+ 60 - 13
sfs_tools.py ファイルの表示

@@ -10,13 +10,19 @@ ARGS
10 10
 
11 11
 standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
12 12
 
13
+TODO
14
+_____
15
+Externalize sfs transforms in a function
16
+Rectify SFS comp in parsed funct.
17
+
13 18
 """
14 19
 
15 20
 import gzip
16 21
 import sys
17 22
 import matplotlib.pyplot as plt
18 23
 
19
-def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
24
+def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False,
25
+                 strip = False, count_ext = False):
20 26
 
21 27
     """
22 28
     Generates a Site Frequency Spectrum from a gzipped VCF file format.
@@ -41,7 +47,13 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
41 47
     if diploid and not folded:
42 48
         n *= 2
43 49
     # initiate SFS_values with a zeros dict
44
-    SFS_values = dict.fromkeys(range(n),0)
50
+    # if strip:
51
+    #     # "[1" removes the 0 bin
52
+    #     # "n-1]" crop the last bin (n or n/2 for folded)
53
+    #     SFS_dim = [1, n-1]
54
+    # else:
55
+    SFS_dim = [0, n+1]
56
+    SFS_values = dict.fromkeys(range(SFS_dim[1]),0)
45 57
     count_pluriall = 0
46 58
     with gzip.open(vcf_file, "rb") as inputgz:
47 59
         line = inputgz.readline()
@@ -81,13 +93,20 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
81 93
                     nb_alleles = set(smpl_genotype)
82 94
                     snp_genotypes += smpl_genotype
83 95
                 # skip if all individuals have the same genotype
84
-                if len(set(snp_genotypes)) == 1:
85
-                    line = inputgz.readline()
86
-                    continue
96
+                # if len(set(snp_genotypes)) == 1:
97
+                #     if folded or (folded == False and snp_genotypes.count(1) == 0) :
98
+                #         line = inputgz.readline()
99
+                #         continue         
87 100
                 for k in set(snp_genotypes):
88 101
                     allele_counts[snp_genotypes.count(k)] = k
89 102
                     allele_counts_list.append(snp_genotypes.count(k))
90
-                if folded and len(ALT) >= 2:
103
+                #print(allele_counts_list)
104
+                if  len(set(snp_genotypes)) == 1 or allele_counts_list[0] == allele_counts_list[1]:
105
+                    # If only heterozygous sites 0/1; skip the site (equivalent to n bin or n/2 bin for folded)
106
+                    # skip if all individuals have the same genotype
107
+                    line = inputgz.readline()
108
+                    continue     
109
+                if len(ALT) >= 2:
91 110
                     #pass
92 111
                     count_pluriall +=1
93 112
                     # TODO - work in progress
@@ -95,10 +114,19 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
95 114
                     #     SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
96 115
                     #     allele_counts_list.remove(min(allele_counts_list))
97 116
                 else:
98
-                    SFS_values[min(allele_counts_list)-1] += 1
117
+                    if folded:
118
+                        SFS_values[min(allele_counts_list)-SFS_dim[0]] += 1
119
+                    else :
120
+                        # if unfolded, count the Ones (ALT allele)
121
+                        #print(snp_genotypes, snp_genotypes.count(1))
122
+                        SFS_values[snp_genotypes.count(1)-SFS_dim[0]] += 1
123
+            # all the parsing is done, change line
99 124
             line = inputgz.readline()
100 125
             if verbose:
101 126
                 print("SFS=", SFS_values)
127
+        if strip:
128
+            del SFS_values[0]
129
+            del SFS_values[n]
102 130
         print("Pluriallelic sites =", count_pluriall)
103 131
     return SFS_values, count_pluriall
104 132
 
@@ -163,20 +191,39 @@ def sfs_from_parsed_vcf(n, vcf_dict, folded = True, diploid = True, phased = Fal
163 191
     return SFS_values, count_pluriall
164 192
 
165 193
 
166
-def barplot_sfs(sfs, folded=True, title = "Barplot"):
194
+def barplot_sfs(sfs,  xlab, ylab, folded=True, title = "Barplot", transformed = False):
167 195
     sfs_val = []
168 196
     n = len(sfs.values())
169 197
     for k in range(1, n):
170 198
         ksi = list(sfs.values())[k-1]
171 199
         # k+1 because k starts from 0
172
-        if folded:
173
-            sfs_val.append(ksi * k * (n - k))
200
+        # if folded:
201
+        #     # ?check if 2*n or not?
202
+        #     sfs_val.append(ksi * k * (2*n - k))
203
+        # else:
204
+        #     if transformed:
205
+        #         sfs_val.append(ksi * k)
206
+        #     else:
207
+        #         sfs_val.append(ksi)
208
+        if transformed:
209
+            if folded:
210
+                sfs_val.append(ksi * k * (2*n - k))
211
+            else:
212
+                sfs_val.append(ksi * k)
174 213
         else:
175
-            sfs_val.append(ksi * k)
214
+             sfs_val.append(ksi)
215
+            
176 216
     #terminal case, same for folded or unfolded
177
-    sfs_val.append(list(sfs.values())[n-1] * n)
217
+    if transformed:
218
+        sfs_val.append(list(sfs.values())[n-1] * n)
219
+    else:
220
+         sfs_val.append(list(sfs.values())[n-1])
178 221
     #build the plot
179 222
     title = title+" [folded="+str(folded)+"]"
223
+    if ylab:
224
+        plt.ylabel(ylab)
225
+    if xlab:
226
+        plt.xlabel(xlab)
180 227
     plt.title(title)
181 228
     plt.bar([i+1 for i in sfs.keys()], sfs_val)
182 229
     plt.show()
@@ -189,5 +236,5 @@ if __name__ == "__main__":
189 236
 
190 237
     # PARAM : Nb of indiv
191 238
     n = int(sys.argv[2])
192
-    sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False)
239
+    sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False, strip = True)
193 240
     print(sfs)

+ 0 - 1
stats_sfs.py ファイルの表示

@@ -1,4 +1,3 @@
1
-from frst import vcf_to_sfs
2 1
 
3 2
 import math
4 3
 

+ 19 - 0
vcf_utils.py ファイルの表示

@@ -214,6 +214,25 @@ def free(obj):
214 214
     del obj
215 215
     gc.collect()
216 216
 
217
+def random_indiv_from_vcfs(vcf_files_list):
218
+    """
219
+    Either for two VCFs of two indiv or two samplenames from the same VCF.
220
+    """
221
+    if len(vcf_files_list)>1:
222
+        # two invid from two VCFs
223
+        for vcf in vcf_files_list:
224
+            with open(vcf) as vcf_stream:
225
+                for line in vcf_stream:
226
+                    if line.startswith('#'):
227
+                        continue
228
+                    genotype  = line.split("\t")[-1].strip()
229
+                    if genotype.split(":")[1] != '0':
230
+                        genotype_list = genotype.split(':')[0].split("/")
231
+                        print(genotype_list)
232
+    else:
233
+        # two samplename to randomly pick
234
+        pass
235
+    
217 236
 if __name__ == "__main__":
218 237
     # check args
219 238
     if len(sys.argv) !=2: