# vcf_utils.py
  1. #!/usr/bin/env python3
  2. """
  3. FOREST Thomas (thomas.forest@college-de-france.fr)
  4. Caution : At the moment for gzipped files only.
  5. ARGS
  6. --------
  7. standalone usage : vcf_utils.py VCF.gz
  8. """
  9. import gzip
  10. import sys
  11. import numpy as np
  12. from frst import customgraphics
  13. import json
  14. import time
  15. import datetime
  16. import gc
  17. import pandas as pd
  18. def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
  19. start = time.time()
  20. with gzip.open(vcf_file, "rb") as inputgz:
  21. byte_line = inputgz.readline()
  22. genotypes = []
  23. noGenotype = []
  24. pos = 0
  25. pluriall_counts = 0
  26. entries = {}
  27. chrom = {}
  28. nb_site = 0
  29. print("Parsing VCF {} ... Please wait...".format(vcf_file))
  30. #print("Parsing VCF", vcf_file, "... Please wait...")
  31. while byte_line:
  32. #print(line)
  33. # decode gzipped binary lines
  34. line = byte_line.decode('utf-8').strip()
  35. nb_site += 1
  36. # # every snp line, not comment or header
  37. if not line.startswith("##") and not line.startswith("#"):
  38. FIELDS = line.split("\t")
  39. CHROM = FIELDS[0]
  40. POS = int(FIELDS[1])
  41. if stop_at:
  42. if POS > stop_at:
  43. break
  44. # REF is col 4 of VCF
  45. REF = FIELDS[3].split(",")
  46. # ALT is col 5 of VCF
  47. ALT = FIELDS[4].split(",")
  48. FORMAT = line.split("\t")[8:9]
  49. SAMPLES = line.split("\t")[9:]
  50. QUALITY = float(FIELDS[5])
  51. INFO = FIELDS[7]
  52. INFOS = {}
  53. for info in INFO.split(";"):
  54. try :
  55. INFOS[info.split('=')[0]] = info.split('=')[1]
  56. except:
  57. INFOS[info] = info
  58. GENOTYPE = []
  59. LIKELIHOOD = []
  60. # SKIP the SNP if :
  61. # 1 : missing
  62. # 2 : deletion among REF
  63. # 3 : deletion among ALT
  64. if "./.:." in line \
  65. or len(ALT[0]) > 1 \
  66. or len(REF[0]) > 1:
  67. # sites that are not kept
  68. # if at least one missing data, remember it
  69. noGenotype.append(POS)
  70. byte_line = inputgz.readline()
  71. continue
  72. elif len(ALT) > 1:
  73. # pluriall sites
  74. # remember that the gestion of PL is very diff with pluriall
  75. pluriall_counts += 1
  76. noGenotype.append(POS)
  77. byte_line = inputgz.readline()
  78. continue
  79. else:
  80. # for unphased and with only two fields : GT:PL
  81. for sample in SAMPLES:
  82. if not phased:
  83. # for UNPHASED data
  84. sample_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
  85. else:
  86. # for PHASED
  87. sample_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
  88. # list of PL : [prob of 0/0, prob of 0/1, prob of 1/1]
  89. sample_likelihood = sample.split(':')[1].split(',')
  90. sample_likelihood = [pow(10, -int(sample_likelihood[0])/10),
  91. pow(10, -int(sample_likelihood[1])/10), pow(10, -int(sample_likelihood[2])/10)]
  92. GENOTYPE += sample_genotype
  93. LIKELIHOOD.append(sample_likelihood)
  94. # from log phred score to probability of error, E
  95. #LIKELIHOOD = pow(10, -int(LIKELIHOOD[0])/10)
  96. #print(LIKELIHOOD)
  97. entries = {
  98. 'POS':POS,
  99. 'CHR':CHROM,
  100. 'FIELDS':FIELDS,
  101. 'REF':REF,
  102. 'ALT':ALT,
  103. 'FORMAT':FORMAT,
  104. 'INFOS':INFOS,
  105. 'SAMPLES':SAMPLES,
  106. 'QUALITY':QUALITY,
  107. 'GENOTYPE':GENOTYPE,
  108. 'LIKELIHOOD':LIKELIHOOD
  109. }
  110. if CHROM.startswith(chr_starts_with):
  111. # keep if chr name starts with filter
  112. # default : *, every chr is kept
  113. if CHROM not in chrom:
  114. chrom[CHROM] = {}
  115. chrom[CHROM][POS] = entries
  116. byte_line = inputgz.readline()
  117. end = time.time()
  118. print("Parsed", nb_site, "sites in", str(datetime.timedelta(seconds=end - start)))
  119. return chrom
  120. def build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit = False, normalize = False, xlim=None):
  121. """ Take infos out of parsing to build a coverage matrix of probability
  122. of error, E, at each position
  123. """
  124. # last pos without any missing data ./.:.
  125. last_pos = list(entries.keys())[-1]
  126. # last genotyped SNP with missing data, usually bigger than last_pos
  127. last_genotyped = noGenotype[-1]
  128. mat = []
  129. if diploid:
  130. # k=N if diploids, k = 2N if haploids
  131. k = int(len(entries[last_pos]['GENOTYPE'])/2)
  132. else:
  133. k = len(entries[last_pos]['GENOTYPE'])
  134. for _ in range(k):
  135. mat.append([])
  136. #display_max = last_pos
  137. if xlim:
  138. display_max = xlim
  139. else:
  140. if na_omit:
  141. display_max = last_pos
  142. else:
  143. display_max = last_genotyped
  144. for pos in range(display_max):
  145. if pos in noGenotype or pos not in entries.keys():
  146. if na_omit:
  147. continue
  148. else:
  149. for k in range(len(mat)):
  150. # if missing, prob=1, worst case
  151. mat[k].append(1)
  152. else:
  153. for k in range(len(mat)):
  154. best_prob = min(entries[pos]['LIKELIHOOD'][k])
  155. #print(ind, best_prob)
  156. mat[k].append(best_prob)
  157. mat = np.array(mat)
  158. if normalize:
  159. row_sums = mat.sum(axis=1)
  160. mat = mat / row_sums[:, np.newaxis]
  161. return mat
  162. def genotyping_continuity_plot(vcf_entries,
  163. verbose=False,
  164. step = 1):
  165. last_pos = int(sorted(list(vcf_entries.keys()))[-1])
  166. x = 0
  167. y = 1
  168. coords = [[], []]
  169. print(last_pos, "sites to scan")
  170. for k, pos in enumerate(range(0, last_pos, step)):
  171. if verbose:
  172. progress = round(k/int(last_pos))*100
  173. if progress % 10 == 0:
  174. print(progress, "%")
  175. # if pos is genotyped
  176. if k in vcf_entries:
  177. y+=1*step
  178. x+=1*step
  179. coords[0].append(x)
  180. coords[1].append(y)
  181. return coords
  182. def compute_coverage(vcf_entries, verbose=False):
  183. last_pos = list(vcf_entries.keys())[-1]
  184. x = 0
  185. y = 1
  186. coords = [[], []]
  187. print(last_pos, "sites to scan")
  188. for entry in vcf_entries.values():
  189. y = int(entry['INFOS']['DP'])
  190. x = entry['POS']
  191. coords[0].append(x)
  192. coords[1].append(y)
  193. return coords
def free(obj):
    """Drop the given reference and explicitly run the garbage collector.

    Note: ``del obj`` only unbinds this function's *local* name; any
    reference still held by the caller keeps the object alive.  The
    gc.collect() call is the main effect, forcing a collection pass now.
    """
    del obj
    gc.collect()
  199. if __name__ == "__main__":
  200. # check args
  201. if len(sys.argv) !=2:
  202. print("Need 1 arg")
  203. exit(0)
  204. # main
  205. vcf_file = sys.argv[1]
  206. # # without missing data
  207. # entries, noGenotype = parse_vcf(vcf_file, stop_at = 20000)
  208. # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=True, normalize=False, xlim = None)
  209. # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")
  210. # # with missing data
  211. # entries, noGenotype = parse_vcf(vcf_file, stop_at = 500)
  212. # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=False, normalize=False, xlim = None)
  213. # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")