Browse Source

add comments

tforest 2 years ago
parent
commit
110ba26f7e
1 changed file with 12 additions and 18 deletions
  1. 12 18
      vcf_to_sfs.py

+ 12 - 18
vcf_to_sfs.py View File

@@ -3,6 +3,10 @@
3 3
 """
4 4
 Caution: at the moment this works for gzipped files only.
5 5
 
6
+ARGS
7
+--------
8
+
9
+usage : vcf_to_sfs.py VCF.gz nb_indiv
6 10
 
7 11
 """
8 12
 
@@ -12,22 +16,23 @@ import sys
12 16
 # default folded SFS
13 17
 folded = True
14 18
 diploid = True
19
+phased = False
15 20
 
16 21
 # PARAM : Nb of indiv
17 22
 n = int(sys.argv[2])
18 23
 
19 24
 if diploid and not folded:
20 25
     n *= 2
21
-
26
+# initialise SFS_values as a dict mapping every key in range(n) to 0
22 27
 SFS_values = dict.fromkeys(range(n),0)
23 28
 
24
-
25 29
 with gzip.open(sys.argv[1], "rb") as inputgz:
26 30
     line = inputgz.readline()
27 31
     genotypes = []
28
-    #SFS_values = {}
29 32
     while line:
33
+        # decode the bytes line read from the gzip stream into text and strip whitespace
30 34
         line = line.decode('utf-8').strip()
35
+        # only process SNP records: skip meta-information ("##") and header ("#") lines
31 36
         if not line.startswith("##") and not line.startswith("#"):
32 37
             FORMAT = line.split("\t")[8:9]
33 38
             SAMPLES = line.split("\t")[9:]
@@ -35,28 +40,17 @@ with gzip.open(sys.argv[1], "rb") as inputgz:
35 40
             allele_counts = {}
36 41
             for sample in SAMPLES:
37 42
                 # for UNPHASED data
38
-                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']                    
43
+                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
44
+                
39 45
                 nb_alleles = set(smpl_genotype)
40 46
                 snp_genotypes += smpl_genotype
47
+            # skip if all individuals have the same genotype
41 48
             if len(set(snp_genotypes)) == 1:
42 49
                 line = inputgz.readline()
43 50
                 continue
44
-            #print(snp_genotypes)
45 51
             for k in set(snp_genotypes):
46 52
                 allele_counts[snp_genotypes.count(k)] = k
47
-            if 7 in allele_counts.keys():
48
-                print(allele_counts)
49
-                #print(allele_counts)
50 53
             if folded :
51
-                #for count in allele_counts.keys():
52
-                # for count in allele_counts.keys():
53
-                #     if count <= len(snp_genotypes)/2 :
54
-                #         SFS_values[count-1] += 1
55
-                #     else:
56
-                #         SFS_values[len(snp_genotypes)-count-1] += 1
57 54
                 SFS_values[min(allele_counts.keys())-1] += 1
58 55
         line = inputgz.readline()
59
-        #print(SFS_values)
60
-
61
-
62
-# Note : tout est doublé là
56
+        print(SFS_values)