vcf_to_sfs.py 3.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #!/usr/bin/env python3
  2. """
  3. FOREST Thomas (thomas.forest@college-de-france.fr)
  4. Caution : At the moment for gzipped files only.
  5. ARGS
  6. --------
  7. standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
  8. """
  9. import gzip
  10. import sys
  11. def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
  12. """ Returns an SFS from a VCF file.
  13. Parameters
  14. ----------
  15. n : int
  16. Nb of individuals in sample.
  17. vcf_file : str
  18. SNPs in VCF file format.
  19. Used to generate a Site Frequency Spectrum (SFS) from a VCF.
  20. Returns
  21. -------
  22. dict
  23. Site Frequency Spectrum (SFS)
  24. """
  25. if diploid and not folded:
  26. n *= 2
  27. # initiate SFS_values with a zeros dict
  28. SFS_values = dict.fromkeys(range(n),0)
  29. # store nb polyallellic sites
  30. polyall = 0
  31. with gzip.open(vcf_file, "rb") as inputgz:
  32. line = inputgz.readline()
  33. genotypes = []
  34. print("Parsing VCF", vcf_file, "... Please wait...")
  35. while line:
  36. # decode gzipped binary lines
  37. line = line.decode('utf-8').strip()
  38. # every snp line, not comment or header
  39. if not line.startswith("##") and not line.startswith("#"):
  40. FIELDS = line.split("\t")
  41. # REF is col 4 of VCF
  42. REF = FIELDS[3].split(",")
  43. # ALT is col 5 of VCF
  44. ALT = FIELDS[4].split(",")
  45. FORMAT = line.split("\t")[8:9]
  46. SAMPLES = line.split("\t")[9:]
  47. snp_genotypes = []
  48. allele_counts = {}
  49. allele_counts_list = []
  50. # SKIP the SNP if :
  51. # 1 : missing
  52. # 2 : deletion among REF
  53. # 3 : deletion among ALT
  54. if "./.:." in line \
  55. or len(ALT[0]) > 1 \
  56. or len(REF[0]) > 1:
  57. line = inputgz.readline()
  58. continue
  59. for sample in SAMPLES:
  60. if not phased:
  61. # for UNPHASED data
  62. smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
  63. else:
  64. # for PHASED
  65. smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
  66. nb_alleles = set(smpl_genotype)
  67. snp_genotypes += smpl_genotype
  68. # skip if all individuals have the same genotype
  69. if len(set(snp_genotypes)) == 1:
  70. line = inputgz.readline()
  71. continue
  72. for k in set(snp_genotypes):
  73. allele_counts[snp_genotypes.count(k)] = k
  74. allele_counts_list.append(snp_genotypes.count(k))
  75. if folded and len(ALT) >= 2:
  76. polyall += 1
  77. else:
  78. SFS_values[min(allele_counts_list)-1] += 1
  79. line = inputgz.readline()
  80. if verbose:
  81. print(SFS_values)
  82. return SFS_values, polyall
  83. if __name__ == "__main__":
  84. if len(sys.argv) != 3:
  85. print("Need 2 args")
  86. exit(0)
  87. # PARAM : vcf_file
  88. vcf_file = sys.argv[1]
  89. # PARAM : Nb of indiv
  90. n = int(sys.argv[2])
  91. sfs, nb_polyall = sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False)
  92. print(sfs)