vcf_to_sfs.py 3.2KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. #!/usr/bin/env python3
  2. """
  3. FOREST Thomas (thomas.forest@college-de-france.fr)
  4. Caution : At the moment for gzipped files only.
  5. ARGS
  6. --------
  7. usage : vcf_to_sfs.py VCF.gz nb_indiv
  8. """
  9. import gzip
  10. import sys
  11. # default folded SFS
  12. folded = True
  13. diploid = True
  14. phased = False
  15. # PARAM : Nb of indiv
  16. n = int(sys.argv[2])
  17. if diploid and not folded:
  18. n *= 2
  19. # initiate SFS_values with a zeros dict
  20. SFS_values = dict.fromkeys(range(n),0)
  21. polycount = 0
  22. with gzip.open(sys.argv[1], "rb") as inputgz:
  23. line = inputgz.readline()
  24. genotypes = []
  25. while line:
  26. # decode gzipped binary lines
  27. line = line.decode('utf-8').strip()
  28. # every snp line, not comment or header
  29. if not line.startswith("##") and not line.startswith("#"):
  30. FIELDS = line.split("\t")
  31. # REF is col 4 of VCF
  32. REF = FIELDS[3].split(",")
  33. # ALT is col 5 of VCF
  34. ALT = FIELDS[4].split(",")
  35. FORMAT = line.split("\t")[8:9]
  36. SAMPLES = line.split("\t")[9:]
  37. snp_genotypes = []
  38. allele_counts = {}
  39. allele_counts_list = []
  40. # SKIP the SNP if :
  41. # 1 : missing
  42. # 2 : deletion among REF
  43. # 3 : deletion among ALT
  44. if "./.:." in line \
  45. or len(ALT[0]) > 1 \
  46. or len(REF[0]) > 1:
  47. line = inputgz.readline()
  48. continue
  49. for sample in SAMPLES:
  50. if not phased:
  51. # for UNPHASED data
  52. smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
  53. else:
  54. # for PHASED
  55. smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
  56. nb_alleles = set(smpl_genotype)
  57. snp_genotypes += smpl_genotype
  58. # if set(snp_genotypes) > 2:
  59. # polyallelic = set(snp_genotypes)
  60. # else:
  61. # polyallelic = False
  62. polyallelic = len(ALT)
  63. ##print(REF, ALT, snp_genotypes)
  64. # skip if all individuals have the same genotype
  65. if len(set(snp_genotypes)) == 1:
  66. line = inputgz.readline()
  67. continue
  68. for k in set(snp_genotypes):
  69. allele_counts[snp_genotypes.count(k)] = k
  70. allele_counts_list.append(snp_genotypes.count(k))
  71. if folded :
  72. #allele_counts_list = list(allele_counts.keys())
  73. ##print("ALC", allele_counts_list, "POLY", polyallelic, ALT)
  74. # for al in range(polyallelic-1):
  75. # SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
  76. # allele_counts_list.remove(min(allele_counts_list))
  77. if len(ALT) == 1:
  78. SFS_values[min(allele_counts_list)-1] += 1
  79. else:
  80. for al in range(polyallelic-1):
  81. SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
  82. allele_counts_list.remove(min(allele_counts_list))
  83. polycount += 1
  84. line = inputgz.readline()
  85. print(SFS_values)
  86. print(polycount)