Browse Source

add polyallelic snps

tforest 2 years ago
parent
commit
c1e2cc9192
1 changed files with 45 additions and 4 deletions
  1. 45 4
      vcf_to_sfs.py

+ 45 - 4
vcf_to_sfs.py View File

@@ -1,6 +1,8 @@
1 1
 #!/usr/bin/env python3
2 2
 
3 3
 """
4
+FOREST Thomas (thomas.forest@college-de-france.fr)
5
+
4 6
 Caution : At the moment for gzipped files only.
5 7
 
6 8
 ARGS
@@ -26,6 +28,8 @@ if diploid and not folded:
26 28
 # initiate SFS_values with a zeros dict
27 29
 SFS_values = dict.fromkeys(range(n),0)
28 30
 
31
+polycount = 0
32
+
29 33
 with gzip.open(sys.argv[1], "rb") as inputgz:
30 34
     line = inputgz.readline()
31 35
     genotypes = []
@@ -34,23 +38,60 @@ with gzip.open(sys.argv[1], "rb") as inputgz:
34 38
         line = line.decode('utf-8').strip()
35 39
         # every snp line, not comment or header
36 40
         if not line.startswith("##") and not line.startswith("#"):
41
+            FIELDS = line.split("\t")
42
+            # REF is col 4 of VCF
43
+            REF = FIELDS[3].split(",")
44
+            # ALT is col 5 of VCF
45
+            ALT = FIELDS[4].split(",")            
37 46
             FORMAT = line.split("\t")[8:9]
38 47
             SAMPLES = line.split("\t")[9:]
39 48
             snp_genotypes = []
40 49
             allele_counts = {}
50
+            allele_counts_list = []
51
+            # SKIP the SNP if :
52
+            # 1 : missing
53
+            # 2 : deletion among REF
54
+            # 3 : deletion among ALT
55
+            if "./.:." in line \
56
+               or len(ALT[0]) > 1 \
57
+               or len(REF[0]) > 1:
58
+                line = inputgz.readline()
59
+                continue
41 60
             for sample in SAMPLES:
42
-                # for UNPHASED data
43
-                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
44
-                
61
+                if not phased:
62
+                    # for UNPHASED data
63
+                    smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
64
+                else:
65
+                    # for PHASED
66
+                    smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
45 67
                 nb_alleles = set(smpl_genotype)
46 68
                 snp_genotypes += smpl_genotype
69
+            # if set(snp_genotypes) > 2:
70
+            #     polyallelic = set(snp_genotypes)
71
+            # else:
72
+            #     polyallelic = False
73
+            polyallelic = len(ALT)
74
+            ##print(REF, ALT, snp_genotypes)
47 75
             # skip if all individuals have the same genotype
48 76
             if len(set(snp_genotypes)) == 1:
49 77
                 line = inputgz.readline()
50 78
                 continue
51 79
             for k in set(snp_genotypes):
52 80
                 allele_counts[snp_genotypes.count(k)] = k
81
+                allele_counts_list.append(snp_genotypes.count(k))
53 82
             if folded :
54
-                SFS_values[min(allele_counts.keys())-1] += 1
83
+                #allele_counts_list = list(allele_counts.keys())
84
+                ##print("ALC", allele_counts_list, "POLY", polyallelic, ALT)
85
+                # for al in range(polyallelic-1):
86
+                #     SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
87
+                #     allele_counts_list.remove(min(allele_counts_list))
88
+                if len(ALT) == 1:
89
+                    SFS_values[min(allele_counts_list)-1] += 1
90
+                else:
91
+                    for al in range(polyallelic-1):
92
+                        SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
93
+                        allele_counts_list.remove(min(allele_counts_list))
94
+                        polycount += 1
55 95
         line = inputgz.readline()
56 96
         print(SFS_values)
97
+print(polycount)