tforest преди 3 години
родител
ревизия
0e25049a58
променени са 1 файла, в които са добавени 23 реда и са изтрити 14 реда
  1. 23 14
      vcf_to_sfs.py

+ 23 - 14
vcf_to_sfs.py Целия файл

@@ -11,31 +11,40 @@ import sys
11 11
 
12 12
 # default folded SFS
13 13
 folded = True
14
+diploid = True
15
+
16
+# PARAM : Nb of indiv
17
+n = int(sys.argv[2])
18
+
19
+if diploid and not folded:
20
+    n *= 2
21
+
22
+SFS_values = dict.fromkeys(range(n),0)
23
+
14 24
 
15 25
 with gzip.open(sys.argv[1], "rb") as inputgz:
16 26
     line = inputgz.readline()
17 27
     genotypes = []
18
-    SFS_values = {}
28
+    #SFS_values = {}
19 29
     while line:
20 30
         line = line.decode('utf-8').strip()
21 31
         if not line.startswith("##") and not line.startswith("#"):
22 32
             FORMAT = line.split("\t")[8:9]
23 33
             SAMPLES = line.split("\t")[9:]
24 34
             snp_genotypes = []
35
+            allele_counts = {}
25 36
             for sample in SAMPLES:
26 37
                 # for UNPHASED data
27
-                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
28
-                #if not folded:
29
-                    
30
-                print(smpl_genotype)
31
-                nb_alleles = len(set(smpl_genotype))
32
-                snp_genotypes.append(nb_alleles)
33
-            print(snp_genotypes)
34
-            nb_derived_allele = len([val for val in snp_genotypes if val != 0])
35
-            print("nb derived allele", nb_derived_allele)
36
-            if nb_derived_allele not in SFS_values.keys():
37
-                SFS_values[nb_derived_allele] = 1
38
-            else:
39
-                SFS_values[nb_derived_allele] += 1
38
+                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']                    
39
+                nb_alleles = set(smpl_genotype)
40
+                snp_genotypes += smpl_genotype
41
+            for k in set(snp_genotypes):
42
+                allele_counts[snp_genotypes.count(k)] = k
43
+            if folded :
44
+                for count in allele_counts.keys():
45
+                    if count <= len(snp_genotypes)/2 :
46
+                        SFS_values[count-1] += 1                        
47
+                    else:
48
+                        SFS_values[len(snp_genotypes)-count] += 1
40 49
         line = inputgz.readline()
41 50
         print(SFS_values)