vcf_to_sfs.py 1.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. #!/usr/bin/env python3
  2. """
  3. Caution : At the moment for gzipped files only.
  4. """
  5. import gzip
  6. import sys
  7. # default folded SFS
  8. folded = True
  9. with gzip.open(sys.argv[1], "rb") as inputgz:
  10. line = inputgz.readline()
  11. genotypes = []
  12. SFS_values = {}
  13. while line:
  14. line = line.decode('utf-8').strip()
  15. if not line.startswith("##") and not line.startswith("#"):
  16. FORMAT = line.split("\t")[8:9]
  17. SAMPLES = line.split("\t")[9:]
  18. snp_genotypes = []
  19. for sample in SAMPLES:
  20. # for UNPHASED data
  21. smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
  22. #if not folded:
  23. print(smpl_genotype)
  24. nb_alleles = len(set(smpl_genotype))
  25. snp_genotypes.append(nb_alleles)
  26. print(snp_genotypes)
  27. nb_derived_allele = len([val for val in snp_genotypes if val != 0])
  28. print("nb derived allele", nb_derived_allele)
  29. if nb_derived_allele not in SFS_values.keys():
  30. SFS_values[nb_derived_allele] = 1
  31. else:
  32. SFS_values[nb_derived_allele] += 1
  33. line = inputgz.readline()
  34. print(SFS_values)