Ver código fonte

corr. vcf_to_sfs

tforest 2 anos atrás
pai
commit
017bf2c4ed
1 arquivos alterados com 8 adições e 11 exclusões
  1. 8 11
      vcf_to_sfs.py

+ 8 - 11
vcf_to_sfs.py Ver arquivo

@@ -17,9 +17,7 @@ import sys
17 17
 
18 18
 def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
19 19
 
20
-    """
21
-    Multiplication de deux nombres entiers.
22
-    Cette fonction ne sert pas à grand chose.
20
+    """ Returns an SFS from a VCF file, together with the number of polyallelic sites that were skipped.
23 21
 
24 22
     Parameters
25 23
     ----------
@@ -42,7 +40,8 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
42 40
         n *= 2
43 41
     # initiate SFS_values with a zeros dict
44 42
     SFS_values = dict.fromkeys(range(n),0)
45
-
43
+    # store the number of polyallelic sites
44
+    polyall = 0
46 45
     with gzip.open(vcf_file, "rb") as inputgz:
47 46
         line = inputgz.readline()
48 47
         genotypes = []
@@ -88,17 +87,13 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
88 87
                     allele_counts[snp_genotypes.count(k)] = k
89 88
                     allele_counts_list.append(snp_genotypes.count(k))
90 89
                 if folded and len(ALT) >= 2:
91
-                    pass
92
-                    # TODO - work in progress
93
-                    # for al in range(len(ALT)-1):
94
-                    #     SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
95
-                    #     allele_counts_list.remove(min(allele_counts_list))
90
+                    polyall += 1
96 91
                 else:
97 92
                     SFS_values[min(allele_counts_list)-1] += 1
98 93
             line = inputgz.readline()
99 94
             if verbose:
100 95
                 print(SFS_values)
101
-    return SFS_values
96
+    return SFS_values, polyall
102 97
 
103 98
 if __name__ == "__main__":
104 99
             
@@ -106,8 +101,10 @@ if __name__ == "__main__":
106 101
         print("Need 2 args")
107 102
         exit(0)
108 103
 
104
+    # PARAM : vcf_file
105
+    vcf_file = sys.argv[1]
109 106
     # PARAM : Nb of indiv
110 107
     n = int(sys.argv[2])
111 108
 
112
-    sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False)
109
+    sfs, nb_polyall = sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False)
113 110
     print(sfs)