|
@@ -11,31 +11,40 @@ import sys
|
11
|
11
|
|
12
|
12
|
# default folded SFS
|
13
|
13
|
folded = True
|
|
14
|
+diploid = True
|
|
15
|
+
|
|
16
|
+# PARAM : Nb of indiv
|
|
17
|
+n = int(sys.argv[2])
|
|
18
|
+
|
|
19
|
+if diploid and not folded:
|
|
20
|
+ n *= 2
|
|
21
|
+
|
|
22
|
+SFS_values = dict.fromkeys(range(n),0)
|
|
23
|
+
|
14
|
24
|
|
15
|
25
|
with gzip.open(sys.argv[1], "rb") as inputgz:
|
16
|
26
|
line = inputgz.readline()
|
17
|
27
|
genotypes = []
|
18
|
|
- SFS_values = {}
|
|
28
|
+ #SFS_values = {}
|
19
|
29
|
while line:
|
20
|
30
|
line = line.decode('utf-8').strip()
|
21
|
31
|
if not line.startswith("##") and not line.startswith("#"):
|
22
|
32
|
FORMAT = line.split("\t")[8:9]
|
23
|
33
|
SAMPLES = line.split("\t")[9:]
|
24
|
34
|
snp_genotypes = []
|
|
35
|
+ allele_counts = {}
|
25
|
36
|
for sample in SAMPLES:
|
26
|
37
|
# for UNPHASED data
|
27
|
|
- smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
|
28
|
|
- #if not folded:
|
29
|
|
-
|
30
|
|
- print(smpl_genotype)
|
31
|
|
- nb_alleles = len(set(smpl_genotype))
|
32
|
|
- snp_genotypes.append(nb_alleles)
|
33
|
|
- print(snp_genotypes)
|
34
|
|
- nb_derived_allele = len([val for val in snp_genotypes if val != 0])
|
35
|
|
- print("nb derived allele", nb_derived_allele)
|
36
|
|
- if nb_derived_allele not in SFS_values.keys():
|
37
|
|
- SFS_values[nb_derived_allele] = 1
|
38
|
|
- else:
|
39
|
|
- SFS_values[nb_derived_allele] += 1
|
|
38
|
+ smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
|
|
39
|
+ nb_alleles = set(smpl_genotype)
|
|
40
|
+ snp_genotypes += smpl_genotype
|
|
41
|
+ for k in set(snp_genotypes):
|
|
42
|
+ allele_counts[snp_genotypes.count(k)] = k
|
|
43
|
+ if folded :
|
|
44
|
+ for count in allele_counts.keys():
|
|
45
|
+ if count <= len(snp_genotypes)/2 :
|
|
46
|
+ SFS_values[count-1] += 1
|
|
47
|
+ else:
|
|
48
|
+ SFS_values[len(snp_genotypes)-count] += 1
|
40
|
49
|
line = inputgz.readline()
|
41
|
50
|
print(SFS_values)
|