vcf_utils.py 8.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. #!/usr/bin/env python3
  2. """
  3. FOREST Thomas (thomas.forest@college-de-france.fr)
  4. Caution : At the moment for gzipped files only.
  5. ARGS
  6. --------
  7. standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
  8. """
  9. import gzip
  10. import sys
  11. import numpy as np
  12. from frst import customgraphics
  13. import json
  14. import time
  15. import datetime
  16. def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
  17. start = time.time()
  18. with gzip.open(vcf_file, "rb") as inputgz:
  19. byte_line = inputgz.readline()
  20. genotypes = []
  21. noGenotype = []
  22. pos = 0
  23. pluriall_counts = 0
  24. entries = {}
  25. chrom = {}
  26. nb_site = 0
  27. print("Parsing VCF {} ... Please wait...".format(vcf_file))
  28. #print("Parsing VCF", vcf_file, "... Please wait...")
  29. while byte_line:
  30. #print(line)
  31. # decode gzipped binary lines
  32. line = byte_line.decode('utf-8').strip()
  33. nb_site += 1
  34. # # every snp line, not comment or header
  35. if not line.startswith("##") and not line.startswith("#"):
  36. FIELDS = line.split("\t")
  37. CHROM = FIELDS[0]
  38. POS = int(FIELDS[1])
  39. if stop_at:
  40. if POS > stop_at:
  41. break
  42. # REF is col 4 of VCF
  43. REF = FIELDS[3].split(",")
  44. # ALT is col 5 of VCF
  45. ALT = FIELDS[4].split(",")
  46. FORMAT = line.split("\t")[8:9]
  47. SAMPLES = line.split("\t")[9:]
  48. QUALITY = float(FIELDS[5])
  49. INFO = FIELDS[7]
  50. INFOS = {}
  51. for info in INFO.split(";"):
  52. try :
  53. INFOS[info.split('=')[0]] = info.split('=')[1]
  54. except:
  55. INFOS[info] = info
  56. GENOTYPE = []
  57. LIKELIHOOD = []
  58. # SKIP the SNP if :
  59. # 1 : missing
  60. # 2 : deletion among REF
  61. # 3 : deletion among ALT
  62. if "./.:." in line \
  63. or len(ALT[0]) > 1 \
  64. or len(REF[0]) > 1:
  65. # sites that are not kept
  66. # if at least one missing data, remember it
  67. noGenotype.append(POS)
  68. byte_line = inputgz.readline()
  69. continue
  70. elif len(ALT) > 1:
  71. # pluriall sites
  72. # remember that the gestion of PL is very diff with pluriall
  73. pluriall_counts += 1
  74. noGenotype.append(POS)
  75. byte_line = inputgz.readline()
  76. continue
  77. else:
  78. # for unphased and with only two fields : GT:PL
  79. for sample in SAMPLES:
  80. if not phased:
  81. # for UNPHASED data
  82. sample_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
  83. else:
  84. # for PHASED
  85. sample_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
  86. # list of PL : [prob of 0/0, prob of 0/1, prob of 1/1]
  87. sample_likelihood = sample.split(':')[1].split(',')
  88. sample_likelihood = [pow(10, -int(sample_likelihood[0])/10),
  89. pow(10, -int(sample_likelihood[1])/10), pow(10, -int(sample_likelihood[2])/10)]
  90. GENOTYPE += sample_genotype
  91. LIKELIHOOD.append(sample_likelihood)
  92. # from log phred score to probability of error, E
  93. #LIKELIHOOD = pow(10, -int(LIKELIHOOD[0])/10)
  94. #print(LIKELIHOOD)
  95. entries = {
  96. 'POS':POS,
  97. 'CHR':CHROM,
  98. 'FIELDS':FIELDS,
  99. 'REF':REF,
  100. 'ALT':ALT,
  101. 'FORMAT':FORMAT,
  102. 'INFOS':INFOS,
  103. 'SAMPLES':SAMPLES,
  104. 'QUALITY':QUALITY,
  105. 'GENOTYPE':GENOTYPE,
  106. 'LIKELIHOOD':LIKELIHOOD
  107. }
  108. if CHROM.startswith(chr_starts_with):
  109. # keep if chr name starts with filter
  110. # default : *, every chr is kept
  111. if CHROM not in chrom:
  112. chrom[CHROM] = {}
  113. chrom[CHROM][POS] = entries
  114. byte_line = inputgz.readline()
  115. end = time.time()
  116. print("Parsed", nb_site, "sites in", str(datetime.timedelta(seconds=end - start)))
  117. return chrom
  118. def build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit = False, normalize = False, xlim=None):
  119. """ Take infos out of parsing to build a coverage matrix of probability
  120. of error, E, at each position
  121. """
  122. # last pos without any missing data ./.:.
  123. last_pos = list(entries.keys())[-1]
  124. # last genotyped SNP with missing data, usually bigger than last_pos
  125. last_genotyped = noGenotype[-1]
  126. mat = []
  127. if diploid:
  128. # k=N if diploids, k = 2N if haploids
  129. k = int(len(entries[last_pos]['GENOTYPE'])/2)
  130. else:
  131. k = len(entries[last_pos]['GENOTYPE'])
  132. for _ in range(k):
  133. mat.append([])
  134. #display_max = last_pos
  135. if xlim:
  136. display_max = xlim
  137. else:
  138. if na_omit:
  139. display_max = last_pos
  140. else:
  141. display_max = last_genotyped
  142. for pos in range(display_max):
  143. if pos in noGenotype or pos not in entries.keys():
  144. if na_omit:
  145. continue
  146. else:
  147. for k in range(len(mat)):
  148. # if missing, prob=1, worst case
  149. mat[k].append(1)
  150. else:
  151. for k in range(len(mat)):
  152. best_prob = min(entries[pos]['LIKELIHOOD'][k])
  153. #print(ind, best_prob)
  154. mat[k].append(best_prob)
  155. mat = np.array(mat)
  156. if normalize:
  157. row_sums = mat.sum(axis=1)
  158. mat = mat / row_sums[:, np.newaxis]
  159. return mat
  160. def genotyping_continuity_plot(vcf_entries, verbose=False):
  161. last_pos = int(sorted(list(vcf_entries.keys()))[-1])
  162. x = 0
  163. y = 1
  164. coords = [[], []]
  165. print(last_pos, "sites to scan")
  166. for k, pos in enumerate(range(last_pos)):
  167. if verbose:
  168. progress = round(k/int(last_pos))*100
  169. if progress % 10 == 0:
  170. print(progress, "%")
  171. # if pos is genotyped
  172. if k in vcf_entries:
  173. y+=1
  174. x+=1
  175. coords[0].append(x)
  176. coords[1].append(y)
  177. return coords
  178. def compute_coverage(vcf_entries, verbose=False):
  179. last_pos = list(vcf_entries.keys())[-1]
  180. x = 0
  181. y = 1
  182. coords = [[], []]
  183. print(last_pos, "sites to scan")
  184. for entry in vcf_entries.values():
  185. y = int(entry['INFOS']['DP'])
  186. x = entry['POS']
  187. coords[0].append(x)
  188. coords[1].append(y)
  189. return coords
  190. if __name__ == "__main__":
  191. # check args
  192. if len(sys.argv) !=2:
  193. print("Need 1 arg")
  194. exit(0)
  195. # main
  196. vcf_file = sys.argv[1]
  197. # # without missing data
  198. # entries, noGenotype = parse_vcf(vcf_file, stop_at = 20000)
  199. # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=True, normalize=False, xlim = None)
  200. # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")
  201. # # with missing data
  202. # entries, noGenotype = parse_vcf(vcf_file, stop_at = 500)
  203. # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=False, normalize=False, xlim = None)
  204. # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")