|
@@ -3,6 +3,10 @@
|
3
|
3
|
"""
|
4
|
4
|
Caution: at the moment this works for gzipped files only.
|
5
|
5
|
|
|
6
|
+ARGS
|
|
7
|
+--------
|
|
8
|
+
|
|
9
|
+Usage: vcf_to_sfs.py VCF.gz nb_indiv   (nb_indiv = number of individuals)
|
6
|
10
|
|
7
|
11
|
"""
|
8
|
12
|
|
|
@@ -12,22 +16,23 @@ import sys
|
12
|
16
|
# default folded SFS
|
13
|
17
|
folded = True
|
14
|
18
|
diploid = True
|
|
19
|
+phased = False
|
15
|
20
|
|
16
|
21
|
# PARAM: number of individuals (second command-line argument)
|
17
|
22
|
n = int(sys.argv[2])
|
18
|
23
|
|
19
|
24
|
if diploid and not folded:
|
20
|
25
|
n *= 2
|
21
|
|
-
|
|
26
|
+# initialise SFS_values as a dict mapping each index in range(n) to a zero count
|
22
|
27
|
SFS_values = dict.fromkeys(range(n),0)
|
23
|
28
|
|
24
|
|
-
|
25
|
29
|
with gzip.open(sys.argv[1], "rb") as inputgz:
|
26
|
30
|
line = inputgz.readline()
|
27
|
31
|
genotypes = []
|
28
|
|
- #SFS_values = {}
|
29
|
32
|
while line:
|
|
33
|
+ # decode gzipped binary lines
|
30
|
34
|
line = line.decode('utf-8').strip()
|
|
35
|
+ # every snp line, not comment or header
|
31
|
36
|
if not line.startswith("##") and not line.startswith("#"):
|
32
|
37
|
FORMAT = line.split("\t")[8:9]
|
33
|
38
|
SAMPLES = line.split("\t")[9:]
|
|
@@ -35,28 +40,17 @@ with gzip.open(sys.argv[1], "rb") as inputgz:
|
35
|
40
|
allele_counts = {}
|
36
|
41
|
for sample in SAMPLES:
|
37
|
42
|
# for UNPHASED data: alleles are separated by '/' (phased '|' genotypes are not split here)
|
38
|
|
- smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
|
|
43
|
+ smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
|
|
44
|
+
|
39
|
45
|
nb_alleles = set(smpl_genotype)
|
40
|
46
|
snp_genotypes += smpl_genotype
|
|
47
|
+ # skip monomorphic sites: only one distinct allele value across all samples
|
41
|
48
|
if len(set(snp_genotypes)) == 1:
|
42
|
49
|
line = inputgz.readline()
|
43
|
50
|
continue
|
44
|
|
- #print(snp_genotypes)
|
45
|
51
|
for k in set(snp_genotypes):
|
46
|
52
|
allele_counts[snp_genotypes.count(k)] = k
|
47
|
|
- if 7 in allele_counts.keys():
|
48
|
|
- print(allele_counts)
|
49
|
|
- #print(allele_counts)
|
50
|
53
|
if folded :
|
51
|
|
- #for count in allele_counts.keys():
|
52
|
|
- # for count in allele_counts.keys():
|
53
|
|
- # if count <= len(snp_genotypes)/2 :
|
54
|
|
- # SFS_values[count-1] += 1
|
55
|
|
- # else:
|
56
|
|
- # SFS_values[len(snp_genotypes)-count-1] += 1
|
57
|
54
|
SFS_values[min(allele_counts.keys())-1] += 1
|
58
|
55
|
line = inputgz.readline()
|
59
|
|
- #print(SFS_values)
|
60
|
|
-
|
61
|
|
-
|
62
|
|
-# Note: everything is doubled here
|
|
56
|
+ print(SFS_values)
|