|
@@ -1,6 +1,8 @@
|
1
|
1
|
#!/usr/bin/env python3
|
2
|
2
|
|
3
|
3
|
"""
|
|
4
|
+FOREST Thomas (thomas.forest@college-de-france.fr)
|
|
5
|
+
|
4
|
6
|
Caution : At the moment for gzipped files only.
|
5
|
7
|
|
6
|
8
|
ARGS
|
|
@@ -26,6 +28,8 @@ if diploid and not folded:
|
26
|
28
|
# initiate SFS_values with a zeros dict
|
27
|
29
|
SFS_values = dict.fromkeys(range(n),0)
|
28
|
30
|
|
|
31
|
+polycount = 0
|
|
32
|
+
|
29
|
33
|
with gzip.open(sys.argv[1], "rb") as inputgz:
|
30
|
34
|
line = inputgz.readline()
|
31
|
35
|
genotypes = []
|
|
@@ -34,23 +38,60 @@ with gzip.open(sys.argv[1], "rb") as inputgz:
|
34
|
38
|
line = line.decode('utf-8').strip()
|
35
|
39
|
# every snp line, not comment or header
|
36
|
40
|
if not line.startswith("##") and not line.startswith("#"):
|
|
41
|
+ FIELDS = line.split("\t")
|
|
42
|
+ # REF is col 4 of VCF
|
|
43
|
+ REF = FIELDS[3].split(",")
|
|
44
|
+ # ALT is col 5 of VCF
|
|
45
|
+ ALT = FIELDS[4].split(",")
|
37
|
46
|
FORMAT = line.split("\t")[8:9]
|
38
|
47
|
SAMPLES = line.split("\t")[9:]
|
39
|
48
|
snp_genotypes = []
|
40
|
49
|
allele_counts = {}
|
|
50
|
+ allele_counts_list = []
|
|
51
|
+ # SKIP the SNP if :
|
|
52
|
+ # 1 : missing
|
|
53
|
+ # 2 : deletion among REF
|
|
54
|
+ # 3 : deletion among ALT
|
|
55
|
+ if "./.:." in line \
|
|
56
|
+ or len(ALT[0]) > 1 \
|
|
57
|
+ or len(REF[0]) > 1:
|
|
58
|
+ line = inputgz.readline()
|
|
59
|
+ continue
|
41
|
60
|
for sample in SAMPLES:
|
42
|
|
- # for UNPHASED data
|
43
|
|
- smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
|
44
|
|
-
|
|
61
|
+ if not phased:
|
|
62
|
+ # for UNPHASED data
|
|
63
|
+ smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
|
|
64
|
+ else:
|
|
65
|
+ # for PHASED
|
|
66
|
+ smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
|
45
|
67
|
nb_alleles = set(smpl_genotype)
|
46
|
68
|
snp_genotypes += smpl_genotype
|
|
69
|
+ # if set(snp_genotypes) > 2:
|
|
70
|
+ # polyallelic = set(snp_genotypes)
|
|
71
|
+ # else:
|
|
72
|
+ # polyallelic = False
|
|
73
|
+ polyallelic = len(ALT)
|
|
74
|
+ ##print(REF, ALT, snp_genotypes)
|
47
|
75
|
# skip if all individuals have the same genotype
|
48
|
76
|
if len(set(snp_genotypes)) == 1:
|
49
|
77
|
line = inputgz.readline()
|
50
|
78
|
continue
|
51
|
79
|
for k in set(snp_genotypes):
|
52
|
80
|
allele_counts[snp_genotypes.count(k)] = k
|
|
81
|
+ allele_counts_list.append(snp_genotypes.count(k))
|
53
|
82
|
if folded :
|
54
|
|
- SFS_values[min(allele_counts.keys())-1] += 1
|
|
83
|
+ #allele_counts_list = list(allele_counts.keys())
|
|
84
|
+ ##print("ALC", allele_counts_list, "POLY", polyallelic, ALT)
|
|
85
|
+ # for al in range(polyallelic-1):
|
|
86
|
+ # SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
|
|
87
|
+ # allele_counts_list.remove(min(allele_counts_list))
|
|
88
|
+ if len(ALT) == 1:
|
|
89
|
+ SFS_values[min(allele_counts_list)-1] += 1
|
|
90
|
+ else:
|
|
91
|
+ for al in range(polyallelic-1):
|
|
92
|
+ SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
|
|
93
|
+ allele_counts_list.remove(min(allele_counts_list))
|
|
94
|
+ polycount += 1
|
55
|
95
|
line = inputgz.readline()
|
56
|
96
|
print(SFS_values)
|
|
97
|
+print(polycount)
|