vcf_to_sfs.py 2.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. #!/usr/bin/env python3
  2. """
  3. FOREST Thomas (thomas.forest@college-de-france.fr)
  4. Caution : At the moment for gzipped files only.
  5. ARGS
  6. --------
  7. usage : vcf_to_sfs.py VCF.gz nb_indiv
  8. """
  9. import gzip
  10. import sys
  11. # default folded SFS
  12. folded = True
  13. diploid = True
  14. phased = False
  15. # PARAM : Nb of indiv
  16. n = int(sys.argv[2])
  17. if diploid and not folded:
  18. n *= 2
  19. # initiate SFS_values with a zeros dict
  20. SFS_values = dict.fromkeys(range(n),0)
  21. with gzip.open(sys.argv[1], "rb") as inputgz:
  22. line = inputgz.readline()
  23. genotypes = []
  24. while line:
  25. # decode gzipped binary lines
  26. line = line.decode('utf-8').strip()
  27. # every snp line, not comment or header
  28. if not line.startswith("##") and not line.startswith("#"):
  29. FIELDS = line.split("\t")
  30. # REF is col 4 of VCF
  31. REF = FIELDS[3].split(",")
  32. # ALT is col 5 of VCF
  33. ALT = FIELDS[4].split(",")
  34. FORMAT = line.split("\t")[8:9]
  35. SAMPLES = line.split("\t")[9:]
  36. snp_genotypes = []
  37. allele_counts = {}
  38. allele_counts_list = []
  39. # SKIP the SNP if :
  40. # 1 : missing
  41. # 2 : deletion among REF
  42. # 3 : deletion among ALT
  43. if "./.:." in line \
  44. or len(ALT[0]) > 1 \
  45. or len(REF[0]) > 1:
  46. line = inputgz.readline()
  47. continue
  48. for sample in SAMPLES:
  49. if not phased:
  50. # for UNPHASED data
  51. smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
  52. else:
  53. # for PHASED
  54. smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
  55. nb_alleles = set(smpl_genotype)
  56. snp_genotypes += smpl_genotype
  57. # skip if all individuals have the same genotype
  58. if len(set(snp_genotypes)) == 1:
  59. line = inputgz.readline()
  60. continue
  61. for k in set(snp_genotypes):
  62. allele_counts[snp_genotypes.count(k)] = k
  63. allele_counts_list.append(snp_genotypes.count(k))
  64. if folded :
  65. for al in range(polyallelic-1):
  66. SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
  67. allele_counts_list.remove(min(allele_counts_list))
  68. line = inputgz.readline()
  69. print(SFS_values)
  70. print(polycount)