Browse Source

corr. vcf_to_sfs

tforest 2 years ago
parent
commit
017bf2c4ed
1 changed files with 8 additions and 11 deletions
  1. 8 11
      vcf_to_sfs.py

+ 8 - 11
vcf_to_sfs.py View File

17
 
17
 
18
 def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
18
 def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
19
 
19
 
20
-    """
21
-    Multiplication de deux nombres entiers.
22
-    Cette fonction ne sert pas à grand chose.
20
+    """ Returns an SFS from a VCF file.
23
 
21
 
24
     Parameters
22
     Parameters
25
     ----------
23
     ----------
42
         n *= 2
40
         n *= 2
43
     # initiate SFS_values with a zeros dict
41
     # initiate SFS_values with a zeros dict
44
     SFS_values = dict.fromkeys(range(n),0)
42
     SFS_values = dict.fromkeys(range(n),0)
45
-
43
+    # store nb polyallellic sites
44
+    polyall = 0
46
     with gzip.open(vcf_file, "rb") as inputgz:
45
     with gzip.open(vcf_file, "rb") as inputgz:
47
         line = inputgz.readline()
46
         line = inputgz.readline()
48
         genotypes = []
47
         genotypes = []
88
                     allele_counts[snp_genotypes.count(k)] = k
87
                     allele_counts[snp_genotypes.count(k)] = k
89
                     allele_counts_list.append(snp_genotypes.count(k))
88
                     allele_counts_list.append(snp_genotypes.count(k))
90
                 if folded and len(ALT) >= 2:
89
                 if folded and len(ALT) >= 2:
91
-                    pass
92
-                    # TODO - work in progress
93
-                    # for al in range(len(ALT)-1):
94
-                    #     SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
95
-                    #     allele_counts_list.remove(min(allele_counts_list))
90
+                    polyall += 1
96
                 else:
91
                 else:
97
                     SFS_values[min(allele_counts_list)-1] += 1
92
                     SFS_values[min(allele_counts_list)-1] += 1
98
             line = inputgz.readline()
93
             line = inputgz.readline()
99
             if verbose:
94
             if verbose:
100
                 print(SFS_values)
95
                 print(SFS_values)
101
-    return SFS_values
96
+    return SFS_values, polyall
102
 
97
 
103
 if __name__ == "__main__":
98
 if __name__ == "__main__":
104
             
99
             
106
         print("Need 2 args")
101
         print("Need 2 args")
107
         exit(0)
102
         exit(0)
108
 
103
 
104
+    # PARAM : vcf_file
105
+    vcf_file = sys.argv[1]
109
     # PARAM : Nb of indiv
106
     # PARAM : Nb of indiv
110
     n = int(sys.argv[2])
107
     n = int(sys.argv[2])
111
 
108
 
112
-    sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False)
109
+    sfs, nb_polyall = sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False)
113
     print(sfs)
110
     print(sfs)