|
@@ -10,13 +10,19 @@ ARGS
|
10
|
10
|
|
11
|
11
|
standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
|
12
|
12
|
|
|
13
|
+TODO
|
|
14
|
+_____
|
|
15
|
+Externalize sfs transforms in a function
|
|
16
|
+Rectify SFS comp in parsed funct.
|
|
17
|
+
|
13
|
18
|
"""
|
14
|
19
|
|
15
|
20
|
import gzip
|
16
|
21
|
import sys
|
17
|
22
|
import matplotlib.pyplot as plt
|
18
|
23
|
|
19
|
|
-def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
|
|
24
|
+def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False,
|
|
25
|
+ strip = False, count_ext = False):
|
20
|
26
|
|
21
|
27
|
"""
|
22
|
28
|
Generates a Site Frequency Spectrum from a gzipped VCF file format.
|
|
@@ -41,7 +47,13 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
|
41
|
47
|
if diploid and not folded:
|
42
|
48
|
n *= 2
|
43
|
49
|
# initiate SFS_values with a zeros dict
|
44
|
|
- SFS_values = dict.fromkeys(range(n),0)
|
|
50
|
+ # if strip:
|
|
51
|
+ # # "[1" removes the 0 bin
|
|
52
|
+ # # "n-1]" crop the last bin (n or n/2 for folded)
|
|
53
|
+ # SFS_dim = [1, n-1]
|
|
54
|
+ # else:
|
|
55
|
+ SFS_dim = [0, n+1]
|
|
56
|
+ SFS_values = dict.fromkeys(range(SFS_dim[1]),0)
|
45
|
57
|
count_pluriall = 0
|
46
|
58
|
with gzip.open(vcf_file, "rb") as inputgz:
|
47
|
59
|
line = inputgz.readline()
|
|
@@ -81,13 +93,20 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
|
81
|
93
|
nb_alleles = set(smpl_genotype)
|
82
|
94
|
snp_genotypes += smpl_genotype
|
83
|
95
|
# skip if all individuals have the same genotype
|
84
|
|
- if len(set(snp_genotypes)) == 1:
|
85
|
|
- line = inputgz.readline()
|
86
|
|
- continue
|
|
96
|
+ # if len(set(snp_genotypes)) == 1:
|
|
97
|
+ # if folded or (folded == False and snp_genotypes.count(1) == 0) :
|
|
98
|
+ # line = inputgz.readline()
|
|
99
|
+ # continue
|
87
|
100
|
for k in set(snp_genotypes):
|
88
|
101
|
allele_counts[snp_genotypes.count(k)] = k
|
89
|
102
|
allele_counts_list.append(snp_genotypes.count(k))
|
90
|
|
- if folded and len(ALT) >= 2:
|
|
103
|
+ #print(allele_counts_list)
|
|
104
|
+ if len(set(snp_genotypes)) == 1 or allele_counts_list[0] == allele_counts_list[1]:
|
|
105
|
+ # If only heterozygous sites 0/1; skip the site (equivalent to n bin or n/2 bin for folded)
|
|
106
|
+ # skip if all individuals have the same genotype
|
|
107
|
+ line = inputgz.readline()
|
|
108
|
+ continue
|
|
109
|
+ if len(ALT) >= 2:
|
91
|
110
|
#pass
|
92
|
111
|
count_pluriall +=1
|
93
|
112
|
# TODO - work in progress
|
|
@@ -95,10 +114,19 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
|
95
|
114
|
# SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
|
96
|
115
|
# allele_counts_list.remove(min(allele_counts_list))
|
97
|
116
|
else:
|
98
|
|
- SFS_values[min(allele_counts_list)-1] += 1
|
|
117
|
+ if folded:
|
|
118
|
+ SFS_values[min(allele_counts_list)-SFS_dim[0]] += 1
|
|
119
|
+ else :
|
|
120
|
+ # if unfolded, count the Ones (ALT allele)
|
|
121
|
+ #print(snp_genotypes, snp_genotypes.count(1))
|
|
122
|
+ SFS_values[snp_genotypes.count(1)-SFS_dim[0]] += 1
|
|
123
|
+ # all the parsing is done, change line
|
99
|
124
|
line = inputgz.readline()
|
100
|
125
|
if verbose:
|
101
|
126
|
print("SFS=", SFS_values)
|
|
127
|
+ if strip:
|
|
128
|
+ del SFS_values[0]
|
|
129
|
+ del SFS_values[n]
|
102
|
130
|
print("Pluriallelic sites =", count_pluriall)
|
103
|
131
|
return SFS_values, count_pluriall
|
104
|
132
|
|
|
@@ -163,20 +191,39 @@ def sfs_from_parsed_vcf(n, vcf_dict, folded = True, diploid = True, phased = Fal
|
163
|
191
|
return SFS_values, count_pluriall
|
164
|
192
|
|
165
|
193
|
|
166
|
|
-def barplot_sfs(sfs, folded=True, title = "Barplot"):
|
|
194
|
+def barplot_sfs(sfs, xlab, ylab, folded=True, title = "Barplot", transformed = False):
|
167
|
195
|
sfs_val = []
|
168
|
196
|
n = len(sfs.values())
|
169
|
197
|
for k in range(1, n):
|
170
|
198
|
ksi = list(sfs.values())[k-1]
|
171
|
199
|
# k+1 because k starts from 0
|
172
|
|
- if folded:
|
173
|
|
- sfs_val.append(ksi * k * (n - k))
|
|
200
|
+ # if folded:
|
|
201
|
+ # # ?check if 2*n or not?
|
|
202
|
+ # sfs_val.append(ksi * k * (2*n - k))
|
|
203
|
+ # else:
|
|
204
|
+ # if transformed:
|
|
205
|
+ # sfs_val.append(ksi * k)
|
|
206
|
+ # else:
|
|
207
|
+ # sfs_val.append(ksi)
|
|
208
|
+ if transformed:
|
|
209
|
+ if folded:
|
|
210
|
+ sfs_val.append(ksi * k * (2*n - k))
|
|
211
|
+ else:
|
|
212
|
+ sfs_val.append(ksi * k)
|
174
|
213
|
else:
|
175
|
|
- sfs_val.append(ksi * k)
|
|
214
|
+ sfs_val.append(ksi)
|
|
215
|
+
|
176
|
216
|
#terminal case, same for folded or unfolded
|
177
|
|
- sfs_val.append(list(sfs.values())[n-1] * n)
|
|
217
|
+ if transformed:
|
|
218
|
+ sfs_val.append(list(sfs.values())[n-1] * n)
|
|
219
|
+ else:
|
|
220
|
+ sfs_val.append(list(sfs.values())[n-1])
|
178
|
221
|
#build the plot
|
179
|
222
|
title = title+" [folded="+str(folded)+"]"
|
|
223
|
+ if ylab:
|
|
224
|
+ plt.ylabel(ylab)
|
|
225
|
+ if xlab:
|
|
226
|
+ plt.xlabel(xlab)
|
180
|
227
|
plt.title(title)
|
181
|
228
|
plt.bar([i+1 for i in sfs.keys()], sfs_val)
|
182
|
229
|
plt.show()
|
|
@@ -189,5 +236,5 @@ if __name__ == "__main__":
|
189
|
236
|
|
190
|
237
|
# PARAM : Nb of indiv
|
191
|
238
|
n = int(sys.argv[2])
|
192
|
|
- sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False)
|
|
239
|
+ sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False, strip = True)
|
193
|
240
|
print(sfs)
|