#!/usr/bin/env python3
"""
FOREST Thomas (thomas.forest@college-de-france.fr)

Caution: at the moment this handles gzipped VCF files only.

ARGS
--------
standalone usage : vcf_utils.py VCF.gz
"""
import gzip
import sys
import numpy as np
from frst import customgraphics
import json
import time
import datetime
import gc
import pandas as pd

def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
    start = time.time()
    with gzip.open(vcf_file, "rb") as inputgz:
        byte_line = inputgz.readline()
        genotypes = []
        noGenotype = []
        pos = 0
        pluriall_counts = 0
        entries = {}
        chrom = {}
        nb_site = 0
        print("Parsing VCF {} ... Please wait...".format(vcf_file))
        while byte_line:
            # decode gzipped binary lines
            line = byte_line.decode('utf-8').strip()
            nb_site += 1
            # every SNP line, not comment or header
            if not line.startswith("##") and not line.startswith("#"):
                FIELDS = line.split("\t")
                CHROM = FIELDS[0]
                POS = int(FIELDS[1])
                if stop_at:
                    if POS > stop_at:
                        break
                # REF is col 4 of VCF
                REF = FIELDS[3].split(",")
                # ALT is col 5 of VCF
                ALT = FIELDS[4].split(",")
                FORMAT = FIELDS[8:9]
                SAMPLES = FIELDS[9:]
                QUALITY = float(FIELDS[5])
                INFO = FIELDS[7]
                INFOS = {}
                for info in INFO.split(";"):
                    try:
                        INFOS[info.split('=')[0]] = info.split('=')[1]
                    except IndexError:
                        # flag-type INFO fields have no "=VALUE" part
                        INFOS[info] = info
                GENOTYPE = []
                LIKELIHOOD = []
                # SKIP the SNP if :
                # 1 : missing
                # 2 : deletion among REF
                # 3 : deletion among ALT
                if "./.:." in SAMPLES \
                   or len(ALT[0]) > 1 \
                   or len(REF[0]) > 1:
                    # sites that are not kept
                    # if at least one missing data, remember it
                    noGenotype.append(POS)
                    byte_line = inputgz.readline()
                    continue
                elif len(ALT) > 1:
                    # multiallelic sites
                    # note that the handling of PL is very different for multiallelic sites
                    pluriall_counts += 1
                    noGenotype.append(POS)
                    byte_line = inputgz.readline()
                    continue
                else:
                    # for unphased data with only two FORMAT fields : GT:PL
                    for sample in SAMPLES:
                        if not phased:
                            # for UNPHASED data
                            sample_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
                        else:
                            # for PHASED data
                            sample_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
                        # list of PL : [prob of 0/0, prob of 0/1, prob of 1/1]
                        sample_likelihood = sample.split(':')[1].split(',')
                        # from log-scaled phred score PL to probability of error,
                        # E = 10**(-PL/10) (e.g. PL 10 -> 0.1, PL 20 -> 0.01)
                        sample_likelihood = [pow(10, -int(sample_likelihood[0]) / 10),
                                             pow(10, -int(sample_likelihood[1]) / 10),
                                             pow(10, -int(sample_likelihood[2]) / 10)]
                        GENOTYPE += sample_genotype
                        LIKELIHOOD.append(sample_likelihood)
                entries = {
                    'POS': POS,
                    'CHR': CHROM,
                    'REF': REF,
                    'ALT': ALT,
                    'FORMAT': FORMAT,
                    'INFOS': INFOS,
                    'SAMPLES': SAMPLES,
                    'QUALITY': QUALITY,
                    'GENOTYPE': GENOTYPE,
                    'LIKELIHOOD': LIKELIHOOD
                }
                # keep the site only if the chromosome name starts with the filter
                # (default "*" : every chromosome is kept)
                if chr_starts_with == "*" or CHROM.startswith(chr_starts_with):
                    if CHROM not in chrom:
                        chrom[CHROM] = {}
                    chrom[CHROM][POS] = entries
            byte_line = inputgz.readline()
    end = time.time()
    print("Parsed", nb_site, "sites in", str(datetime.timedelta(seconds=end - start)))
    return chrom
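
# Minimal usage sketch for parse_vcf() ("variants.vcf.gz" and the "1" chromosome
# prefix below are placeholders, not files or names shipped with this module):
#
#   chrom = parse_vcf("variants.vcf.gz", stop_at=20000, chr_starts_with="1")
#   for chr_name, sites in chrom.items():
#       first_pos = next(iter(sites))
#       print(chr_name, first_pos, sites[first_pos]['GENOTYPE'])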

def build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=False, normalize=False, xlim=None):
    """ Take infos out of parsing to build a coverage matrix of the probability
    of error, E, at each position
    """
    # last pos without any missing data ./.:.
    last_pos = list(entries.keys())[-1]
    # last genotyped SNP with missing data, usually bigger than last_pos
    last_genotyped = noGenotype[-1]
    mat = []
    if diploid:
        # k = N if diploid, k = 2N if haploid
        k = int(len(entries[last_pos]['GENOTYPE']) / 2)
    else:
        k = len(entries[last_pos]['GENOTYPE'])
    for _ in range(k):
        mat.append([])
    if xlim:
        display_max = xlim
    else:
        if na_omit:
            display_max = last_pos
        else:
            display_max = last_genotyped
    for pos in range(display_max):
        if pos in noGenotype or pos not in entries.keys():
            if na_omit:
                continue
            else:
                for k in range(len(mat)):
                    # if missing, prob = 1, worst case
                    mat[k].append(1)
        else:
            for k in range(len(mat)):
                best_prob = min(entries[pos]['LIKELIHOOD'][k])
                mat[k].append(best_prob)
    mat = np.array(mat)
    if normalize:
        row_sums = mat.sum(axis=1)
        mat = mat / row_sums[:, np.newaxis]
    return mat
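
# The returned matrix has one row per individual and one column per scanned
# position, holding the smallest PL-derived error probability for that call
# (1 marks positions with missing data). Sketch of how it could be displayed,
# mirroring the commented example in __main__ below (it assumes `entries` is a
# per-position dict like the inner dicts returned by parse_vcf, and `noGenotype`
# the list of skipped positions):
#
#   mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True)
#   customgraphics.plot_matrix(mat, color_scale_type="autumn",
#                              cbarlabel="prob. of error, E",
#                              title="Prob. of error (E) of genotyping, at each position")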

def genotyping_continuity_plot(vcf_entries,
                               verbose=False,
                               step=1):
    genotyped_pos = sorted(list(vcf_entries.keys()))
    last_pos = genotyped_pos[-1]
    x = 0
    y = 1
    coords = [[], []]
    positions = genotyped_pos[::step]
    print("Chr. len. =", last_pos, "bp \t ; nb. SNPs =", len(positions))
    for k, pos in enumerate(positions):
        if verbose:
            progress = round(k / len(positions) * 100)
            if progress % 10 == 0:
                print(progress, "%")
        # cumulative number of genotyped SNPs reached at this position
        y += 1 * step
        x = pos
        coords[0].append(x)
        coords[1].append(y)
    return coords
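
# Sketch of how the returned coords could be drawn (matplotlib is assumed to be
# available; it is not imported by this module, and "1" is a placeholder
# chromosome name):
#
#   coords = genotyping_continuity_plot(chrom["1"], step=10)
#   import matplotlib.pyplot as plt
#   plt.plot(coords[0], coords[1])
#   plt.xlabel("position (bp)")
#   plt.ylabel("cumulative nb. of genotyped SNPs")
#   plt.show()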

def compute_coverage(vcf_entries, verbose=False):
    last_pos = list(vcf_entries.keys())[-1]
    x = 0
    y = 1
    coords = [[], []]
    print(len(vcf_entries), "sites to scan, last position =", last_pos, "bp")
    for entry in vcf_entries.values():
        # per-site depth of coverage taken from the DP field of INFO
        y = int(entry['INFOS']['DP'])
        x = entry['POS']
        coords[0].append(x)
        coords[1].append(y)
    return coords
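
# compute_coverage() pairs each position (coords[0]) with its DP depth
# (coords[1]); e.g. a mean depth over the parsed sites of one chromosome
# ("1" is again a placeholder name):
#
#   coords = compute_coverage(chrom["1"])
#   mean_depth = sum(coords[1]) / len(coords[1])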

def free(obj):
    """ Free the object and call the garbage collector explicitly
    """
    # del only removes this local reference; the object is reclaimed once no
    # other reference to it remains
    del obj
    gc.collect()

if __name__ == "__main__":
    # check args
    if len(sys.argv) != 2:
        print("Usage: vcf_utils.py VCF.gz")
        sys.exit(1)
    # main
    vcf_file = sys.argv[1]
    # # without missing data
    # entries, noGenotype = parse_vcf(vcf_file, stop_at = 20000)
    # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=True, normalize=False, xlim = None)
    # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")
    # # with missing data
    # entries, noGenotype = parse_vcf(vcf_file, stop_at = 500)
    # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=False, normalize=False, xlim = None)
    # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")