tforest před 2 roky
rodič
revize
110ba26f7e
1 změnil soubory, kde provedl 12 přidání a 18 odebrání
  1. 12 18
      vcf_to_sfs.py

+ 12 - 18
vcf_to_sfs.py Zobrazit soubor

3
 """
3
 """
4
 Caution : At the moment for gzipped files only.
4
 Caution : At the moment for gzipped files only.
5
 
5
 
6
+ARGS
7
+--------
8
+
9
+usage : vcf_to_sfs.py VCF.gz nb_indiv
6
 
10
 
7
 """
11
 """
8
 
12
 
12
 # default folded SFS
16
 # default folded SFS
13
 folded = True
17
 folded = True
14
 diploid = True
18
 diploid = True
19
+phased = False
15
 
20
 
16
 # PARAM : Nb of indiv
21
 # PARAM : Nb of indiv
17
 n = int(sys.argv[2])
22
 n = int(sys.argv[2])
18
 
23
 
19
 if diploid and not folded:
24
 if diploid and not folded:
20
     n *= 2
25
     n *= 2
21
-
26
+# initiate SFS_values with a zeros dict
22
 SFS_values = dict.fromkeys(range(n),0)
27
 SFS_values = dict.fromkeys(range(n),0)
23
 
28
 
24
-
25
 with gzip.open(sys.argv[1], "rb") as inputgz:
29
 with gzip.open(sys.argv[1], "rb") as inputgz:
26
     line = inputgz.readline()
30
     line = inputgz.readline()
27
     genotypes = []
31
     genotypes = []
28
-    #SFS_values = {}
29
     while line:
32
     while line:
33
+        # decode gzipped binary lines
30
         line = line.decode('utf-8').strip()
34
         line = line.decode('utf-8').strip()
35
+        # every snp line, not comment or header
31
         if not line.startswith("##") and not line.startswith("#"):
36
         if not line.startswith("##") and not line.startswith("#"):
32
             FORMAT = line.split("\t")[8:9]
37
             FORMAT = line.split("\t")[8:9]
33
             SAMPLES = line.split("\t")[9:]
38
             SAMPLES = line.split("\t")[9:]
35
             allele_counts = {}
40
             allele_counts = {}
36
             for sample in SAMPLES:
41
             for sample in SAMPLES:
37
                 # for UNPHASED data
42
                 # for UNPHASED data
38
-                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']                    
43
+                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
44
+                
39
                 nb_alleles = set(smpl_genotype)
45
                 nb_alleles = set(smpl_genotype)
40
                 snp_genotypes += smpl_genotype
46
                 snp_genotypes += smpl_genotype
47
+            # skip if all individuals have the same genotype
41
             if len(set(snp_genotypes)) == 1:
48
             if len(set(snp_genotypes)) == 1:
42
                 line = inputgz.readline()
49
                 line = inputgz.readline()
43
                 continue
50
                 continue
44
-            #print(snp_genotypes)
45
             for k in set(snp_genotypes):
51
             for k in set(snp_genotypes):
46
                 allele_counts[snp_genotypes.count(k)] = k
52
                 allele_counts[snp_genotypes.count(k)] = k
47
-            if 7 in allele_counts.keys():
48
-                print(allele_counts)
49
-                #print(allele_counts)
50
             if folded :
53
             if folded :
51
-                #for count in allele_counts.keys():
52
-                # for count in allele_counts.keys():
53
-                #     if count <= len(snp_genotypes)/2 :
54
-                #         SFS_values[count-1] += 1
55
-                #     else:
56
-                #         SFS_values[len(snp_genotypes)-count-1] += 1
57
                 SFS_values[min(allele_counts.keys())-1] += 1
54
                 SFS_values[min(allele_counts.keys())-1] += 1
58
         line = inputgz.readline()
55
         line = inputgz.readline()
59
-        #print(SFS_values)
60
-
61
-
62
-# Note : tout est doublé là
56
+        print(SFS_values)