|
@@ -0,0 +1,41 @@
|
|
1
|
+#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+"""
|
|
4
|
+Caution : At the moment for gzipped files only.
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+"""
|
|
8
|
+
|
|
9
|
+import gzip
|
|
10
|
+import sys
|
|
11
|
+
|
|
12
|
+# default folded SFS
|
|
13
|
+folded = True
|
|
14
|
+
|
|
15
|
+with gzip.open(sys.argv[1], "rb") as inputgz:
|
|
16
|
+ line = inputgz.readline()
|
|
17
|
+ genotypes = []
|
|
18
|
+ SFS_values = {}
|
|
19
|
+ while line:
|
|
20
|
+ line = line.decode('utf-8').strip()
|
|
21
|
+ if not line.startswith("##") and not line.startswith("#"):
|
|
22
|
+ FORMAT = line.split("\t")[8:9]
|
|
23
|
+ SAMPLES = line.split("\t")[9:]
|
|
24
|
+ snp_genotypes = []
|
|
25
|
+ for sample in SAMPLES:
|
|
26
|
+ # for UNPHASED data
|
|
27
|
+ smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
|
|
28
|
+ #if not folded:
|
|
29
|
+
|
|
30
|
+ print(smpl_genotype)
|
|
31
|
+ nb_alleles = len(set(smpl_genotype))
|
|
32
|
+ snp_genotypes.append(nb_alleles)
|
|
33
|
+ print(snp_genotypes)
|
|
34
|
+ nb_derived_allele = len([val for val in snp_genotypes if val != 0])
|
|
35
|
+ print("nb derived allele", nb_derived_allele)
|
|
36
|
+ if nb_derived_allele not in SFS_values.keys():
|
|
37
|
+ SFS_values[nb_derived_allele] = 1
|
|
38
|
+ else:
|
|
39
|
+ SFS_values[nb_derived_allele] += 1
|
|
40
|
+ line = inputgz.readline()
|
|
41
|
+ print(SFS_values)
|