4 years ago · 110ba26f7e
--- a/vcf_to_sfs.py
+++ b/vcf_to_sfs.py
 
															 """
														
 
															 Caution : At the moment for gzipped files only.
														
 
															+ARGS
														
 
															+--------
														
 
															+
														
 
															+usage : vcf_to_sfs.py VCF.gz nb_indiv
														
 
															 """
														
 
															 # default folded SFS
														
 
															 folded = True
														
 
															 diploid = True
														
 
															+phased = False
														
 
															 # PARAM : Nb of indiv
														
 
															 n = int(sys.argv[2])
														
 
															 if diploid and not folded:
														
 
															     n *= 2
														
 
															-
														
 
															+# initiate SFS_values with a zeros dict
														
 
															 SFS_values = dict.fromkeys(range(n),0)
														
 
															-
														
 
															 with gzip.open(sys.argv[1], "rb") as inputgz:
														
 
															     line = inputgz.readline()
														
 
															     genotypes = []
														
 
															-    #SFS_values = {}
														
 
															     while line:
														
 
															+        # decode gzipped binary lines
														
 
															         line = line.decode('utf-8').strip()
														
 
															+        # every snp line, not comment or header
														
 
															         if not line.startswith("##") and not line.startswith("#"):
														
 
															             FORMAT = line.split("\t")[8:9]
														
 
															             SAMPLES = line.split("\t")[9:]
														
 
															             allele_counts = {}
														
 
															             for sample in SAMPLES:
														
 
															                 # for UNPHASED data
														
 
															-                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']                    
														
 
															+                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
														
 
															+                
														
 
															                 nb_alleles = set(smpl_genotype)
														
 
															                 snp_genotypes += smpl_genotype
														
 
															+            # skip if all individuals have the same genotype
														
 
															             if len(set(snp_genotypes)) == 1:
														
 
															                 line = inputgz.readline()
														
 
															                 continue
														
 
															-            #print(snp_genotypes)
														
 
															             for k in set(snp_genotypes):
														
 
															                 allele_counts[snp_genotypes.count(k)] = k
														
 
															-            if 7 in allele_counts.keys():
														
 
															-                print(allele_counts)
														
 
															-                #print(allele_counts)
														
 
															             if folded :
														
 
															-                #for count in allele_counts.keys():
														
 
															-                # for count in allele_counts.keys():
														
 
															-                #     if count <= len(snp_genotypes)/2 :
														
 
															-                #         SFS_values[count-1] += 1
														
 
															-                #     else:
														
 
															-                #         SFS_values[len(snp_genotypes)-count-1] += 1
														
 
															                 SFS_values[min(allele_counts.keys())-1] += 1
														
 
															         line = inputgz.readline()
														
 
															-        #print(SFS_values)
														
 
															-
														
 
															-
														
 
															-# Note : tout est doublé là
														
 
															+        print(SFS_values)