vcf_to_sfs.py 1.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. #!/usr/bin/env python3
  2. """
  3. Caution : At the moment for gzipped files only.
  4. """
  5. import gzip
  6. import sys
  7. # default folded SFS
  8. folded = True
  9. diploid = True
  10. # PARAM : Nb of indiv
  11. n = int(sys.argv[2])
  12. if diploid and not folded:
  13. n *= 2
  14. SFS_values = dict.fromkeys(range(n),0)
  15. with gzip.open(sys.argv[1], "rb") as inputgz:
  16. line = inputgz.readline()
  17. genotypes = []
  18. #SFS_values = {}
  19. while line:
  20. line = line.decode('utf-8').strip()
  21. if not line.startswith("##") and not line.startswith("#"):
  22. FORMAT = line.split("\t")[8:9]
  23. SAMPLES = line.split("\t")[9:]
  24. snp_genotypes = []
  25. allele_counts = {}
  26. for sample in SAMPLES:
  27. # for UNPHASED data
  28. smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
  29. nb_alleles = set(smpl_genotype)
  30. snp_genotypes += smpl_genotype
  31. for k in set(snp_genotypes):
  32. allele_counts[snp_genotypes.count(k)] = k
  33. if folded :
  34. for count in allele_counts.keys():
  35. if count <= len(snp_genotypes)/2 :
  36. SFS_values[count-1] += 1
  37. else:
  38. SFS_values[len(snp_genotypes)-count] += 1
  39. line = inputgz.readline()
  40. print(SFS_values)