vcf_to_sfs.py 1.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. #!/usr/bin/env python3
  2. """
  3. Caution : At the moment for gzipped files only.
  4. ARGS
  5. --------
  6. usage : vcf_to_sfs.py VCF.gz nb_indiv
  7. """
  8. import gzip
  9. import sys
  10. # default folded SFS
  11. folded = True
  12. diploid = True
  13. phased = False
  14. # PARAM : Nb of indiv
  15. n = int(sys.argv[2])
  16. if diploid and not folded:
  17. n *= 2
  18. # initiate SFS_values with a zeros dict
  19. SFS_values = dict.fromkeys(range(n),0)
  20. with gzip.open(sys.argv[1], "rb") as inputgz:
  21. line = inputgz.readline()
  22. genotypes = []
  23. while line:
  24. # decode gzipped binary lines
  25. line = line.decode('utf-8').strip()
  26. # every snp line, not comment or header
  27. if not line.startswith("##") and not line.startswith("#"):
  28. FORMAT = line.split("\t")[8:9]
  29. SAMPLES = line.split("\t")[9:]
  30. snp_genotypes = []
  31. allele_counts = {}
  32. for sample in SAMPLES:
  33. # for UNPHASED data
  34. smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
  35. nb_alleles = set(smpl_genotype)
  36. snp_genotypes += smpl_genotype
  37. # skip if all individuals have the same genotype
  38. if len(set(snp_genotypes)) == 1:
  39. line = inputgz.readline()
  40. continue
  41. for k in set(snp_genotypes):
  42. allele_counts[snp_genotypes.count(k)] = k
  43. if folded :
  44. SFS_values[min(allele_counts.keys())-1] += 1
  45. line = inputgz.readline()
  46. print(SFS_values)