vcf_utils.py 9.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. #!/usr/bin/env python3
  2. """
  3. FOREST Thomas (thomas.forest@college-de-france.fr)
  4. Caution : At the moment for gzipped files only.
  5. ARGS
  6. --------
  7. standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
  8. """
  9. import gzip
  10. import sys
  11. import numpy as np
  12. from frst import customgraphics
  13. import json
  14. import time
  15. import datetime
  16. import gc
  17. import pandas as pd
  18. def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
  19. start = time.time()
  20. with gzip.open(vcf_file, "rb") as inputgz:
  21. byte_line = inputgz.readline()
  22. genotypes = []
  23. noGenotype = []
  24. pos = 0
  25. pluriall_counts = 0
  26. entries = {}
  27. chrom = {}
  28. nb_site = 0
  29. print("Parsing VCF {} ... Please wait...".format(vcf_file))
  30. #print("Parsing VCF", vcf_file, "... Please wait...")
  31. while byte_line:
  32. #print(line)
  33. # decode gzipped binary lines
  34. line = byte_line.decode('utf-8').strip()
  35. nb_site += 1
  36. # # every snp line, not comment or header
  37. if not line.startswith("##") and not line.startswith("#"):
  38. FIELDS = line.split("\t")
  39. # when line is parsed, delete it to save some memory
  40. CHROM = FIELDS[0]
  41. POS = int(FIELDS[1])
  42. if stop_at:
  43. if POS > stop_at:
  44. break
  45. # REF is col 4 of VCF
  46. REF = FIELDS[3].split(",")
  47. # ALT is col 5 of VCF
  48. ALT = FIELDS[4].split(",")
  49. FORMAT = FIELDS[8:9]
  50. SAMPLES = FIELDS[9:]
  51. QUALITY = float(FIELDS[5])
  52. INFO = FIELDS[7]
  53. INFOS = {}
  54. for info in INFO.split(";"):
  55. try :
  56. INFOS[info.split('=')[0]] = info.split('=')[1]
  57. except:
  58. INFOS[info] = info
  59. GENOTYPE = []
  60. LIKELIHOOD = []
  61. # SKIP the SNP if :
  62. # 1 : missing
  63. # 2 : deletion among REF
  64. # 3 : deletion among ALT
  65. if "./.:." in SAMPLES \
  66. or len(ALT[0]) > 1 \
  67. or len(REF[0]) > 1:
  68. # sites that are not kept
  69. # if at least one missing data, remember it
  70. noGenotype.append(POS)
  71. byte_line = inputgz.readline()
  72. continue
  73. elif len(ALT) > 1:
  74. # pluriall sites
  75. # remember that the gestion of PL is very diff with pluriall
  76. pluriall_counts += 1
  77. noGenotype.append(POS)
  78. byte_line = inputgz.readline()
  79. continue
  80. else:
  81. # for unphased and with only two fields : GT:PL
  82. for sample in SAMPLES:
  83. if not phased:
  84. # for UNPHASED data
  85. sample_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
  86. else:
  87. # for PHASED
  88. sample_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
  89. # list of PL : [prob of 0/0, prob of 0/1, prob of 1/1]
  90. sample_likelihood = sample.split(':')[1].split(',')
  91. sample_likelihood = [pow(10, -int(sample_likelihood[0])/10),
  92. pow(10, -int(sample_likelihood[1])/10), pow(10, -int(sample_likelihood[2])/10)]
  93. GENOTYPE += sample_genotype
  94. LIKELIHOOD.append(sample_likelihood)
  95. # from log phred score to probability of error, E
  96. #LIKELIHOOD = pow(10, -int(LIKELIHOOD[0])/10)
  97. #print(LIKELIHOOD)
  98. entries = {
  99. 'POS':POS,
  100. 'CHR':CHROM,
  101. # removed to save some space
  102. #'FIELDS':FIELDS,
  103. 'REF':REF,
  104. 'ALT':ALT,
  105. 'FORMAT':FORMAT,
  106. 'INFOS':INFOS,
  107. 'SAMPLES':SAMPLES,
  108. 'QUALITY':QUALITY,
  109. 'GENOTYPE':GENOTYPE,
  110. 'LIKELIHOOD':LIKELIHOOD,
  111. 'NB_PLURIALL':pluriall_counts
  112. }
  113. if CHROM.startswith(chr_starts_with):
  114. # keep if chr name starts with filter
  115. # default : *, every chr is kept
  116. if CHROM not in chrom:
  117. # resent when changing chrom
  118. pluriall_counts = 0
  119. chrom[CHROM] = {}
  120. chrom[CHROM][POS] = entries
  121. byte_line = inputgz.readline()
  122. end = time.time()
  123. print("Parsed", nb_site, "sites in", str(datetime.timedelta(seconds=end - start)))
  124. return chrom
  125. def build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit = False, normalize = False, xlim=None):
  126. """ Take infos out of parsing to build a coverage matrix of probability
  127. of error, E, at each position
  128. """
  129. # last pos without any missing data ./.:.
  130. last_pos = list(entries.keys())[-1]
  131. # last genotyped SNP with missing data, usually bigger than last_pos
  132. last_genotyped = noGenotype[-1]
  133. mat = []
  134. if diploid:
  135. # k=N if diploids, k = 2N if haploids
  136. k = int(len(entries[last_pos]['GENOTYPE'])/2)
  137. else:
  138. k = len(entries[last_pos]['GENOTYPE'])
  139. for _ in range(k):
  140. mat.append([])
  141. #display_max = last_pos
  142. if xlim:
  143. display_max = xlim
  144. else:
  145. if na_omit:
  146. display_max = last_pos
  147. else:
  148. display_max = last_genotyped
  149. for pos in range(display_max):
  150. if pos in noGenotype or pos not in entries.keys():
  151. if na_omit:
  152. continue
  153. else:
  154. for k in range(len(mat)):
  155. # if missing, prob=1, worst case
  156. mat[k].append(1)
  157. else:
  158. for k in range(len(mat)):
  159. best_prob = min(entries[pos]['LIKELIHOOD'][k])
  160. #print(ind, best_prob)
  161. mat[k].append(best_prob)
  162. mat = np.array(mat)
  163. if normalize:
  164. row_sums = mat.sum(axis=1)
  165. mat = mat / row_sums[:, np.newaxis]
  166. return mat
  167. def genotyping_continuity_plot(vcf_entries,
  168. verbose=False,
  169. step = 1):
  170. genotyped_pos = sorted(list(vcf_entries.keys()))
  171. last_pos = genotyped_pos[-1]
  172. x = 0
  173. y = 1
  174. coords = [[], []]
  175. print("Chr. len. =", last_pos, "bp \t ; nb. SNPs =", len(genotyped_pos[::step]))
  176. for k, pos in enumerate(genotyped_pos[::step]):
  177. if verbose:
  178. progress = round(k/int(last_pos))*100
  179. if progress % 10 == 0:
  180. print(progress, "%")
  181. y+=1*step
  182. x=pos
  183. coords[0].append(x)
  184. coords[1].append(y)
  185. return coords
  186. def compute_coverage(vcf_entries, verbose=False):
  187. last_pos = list(vcf_entries.keys())[-1]
  188. x = 0
  189. y = 1
  190. coords = [[], []]
  191. print(last_pos, "sites to scan")
  192. for entry in vcf_entries.values():
  193. y = int(entry['INFOS']['DP'])
  194. x = entry['POS']
  195. coords[0].append(x)
  196. coords[1].append(y)
  197. return coords
  198. def free(obj):
  199. """ Free the object and call the garbage collector explicitely
  200. """
  201. del obj
  202. gc.collect()
  203. def random_indiv_from_vcfs(vcf_files_list):
  204. """
  205. Either for two VCFs of two indiv or two samplenames from the same VCF.
  206. """
  207. if len(vcf_files_list)>1:
  208. # two invid from two VCFs
  209. for vcf in vcf_files_list:
  210. with open(vcf) as vcf_stream:
  211. for line in vcf_stream:
  212. if line.startswith('#'):
  213. continue
  214. genotype = line.split("\t")[-1].strip()
  215. if genotype.split(":")[1] != '0':
  216. genotype_list = genotype.split(':')[0].split("/")
  217. print(genotype_list)
  218. else:
  219. # two samplename to randomly pick
  220. pass
  221. if __name__ == "__main__":
  222. # check args
  223. if len(sys.argv) !=2:
  224. print("Need 1 arg")
  225. exit(0)
  226. # main
  227. vcf_file = sys.argv[1]
  228. # # without missing data
  229. # entries, noGenotype = parse_vcf(vcf_file, stop_at = 20000)
  230. # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=True, normalize=False, xlim = None)
  231. # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")
  232. # # with missing data
  233. # entries, noGenotype = parse_vcf(vcf_file, stop_at = 500)
  234. # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=False, normalize=False, xlim = None)
  235. # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")