|
@@ -43,6 +43,7 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
|
43
|
43
|
# # every snp line, not comment or header
|
44
|
44
|
if not line.startswith("##") and not line.startswith("#"):
|
45
|
45
|
FIELDS = line.split("\t")
|
|
46
|
+ # when line is parsed, delete it to save some memory
|
46
|
47
|
CHROM = FIELDS[0]
|
47
|
48
|
POS = int(FIELDS[1])
|
48
|
49
|
if stop_at:
|
|
@@ -52,8 +53,8 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
|
52
|
53
|
REF = FIELDS[3].split(",")
|
53
|
54
|
# ALT is col 5 of VCF
|
54
|
55
|
ALT = FIELDS[4].split(",")
|
55
|
|
- FORMAT = line.split("\t")[8:9]
|
56
|
|
- SAMPLES = line.split("\t")[9:]
|
|
56
|
+ FORMAT = FIELDS[8:9]
|
|
57
|
+ SAMPLES = FIELDS[9:]
|
57
|
58
|
QUALITY = float(FIELDS[5])
|
58
|
59
|
INFO = FIELDS[7]
|
59
|
60
|
INFOS = {}
|
|
@@ -68,7 +69,7 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
|
68
|
69
|
# 1 : missing
|
69
|
70
|
# 2 : deletion among REF
|
70
|
71
|
# 3 : deletion among ALT
|
71
|
|
- if "./.:." in line \
|
|
72
|
+ if "./.:." in SAMPLES \
|
72
|
73
|
or len(ALT[0]) > 1 \
|
73
|
74
|
or len(REF[0]) > 1:
|
74
|
75
|
# sites that are not kept
|
|
@@ -104,7 +105,7 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
|
104
|
105
|
entries = {
|
105
|
106
|
'POS':POS,
|
106
|
107
|
'CHR':CHROM,
|
107
|
|
- 'FIELDS':FIELDS,
|
|
108
|
+ #'FIELDS':FIELDS,
|
108
|
109
|
'REF':REF,
|
109
|
110
|
'ALT':ALT,
|
110
|
111
|
'FORMAT':FORMAT,
|
|
@@ -173,20 +174,22 @@ def build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit =
|
173
|
174
|
def genotyping_continuity_plot(vcf_entries,
|
174
|
175
|
verbose=False,
|
175
|
176
|
step = 1):
|
176
|
|
- last_pos = int(sorted(list(vcf_entries.keys()))[-1])
|
|
177
|
+ genotyped_pos = sorted(list(vcf_entries.keys()))
|
|
178
|
+ last_pos = genotyped_pos[-1]
|
177
|
179
|
x = 0
|
178
|
180
|
y = 1
|
179
|
181
|
coords = [[], []]
|
180
|
|
- print(last_pos, "sites to scan")
|
181
|
|
- for k, pos in enumerate(range(0, last_pos, step)):
|
|
182
|
+ print("Chr. len. =", last_pos, "bp \t ; nb. SNPs =", len(genotyped_pos[::step]))
|
|
183
|
+ for k, pos in enumerate(genotyped_pos[::step]):
|
182
|
184
|
if verbose:
|
183
|
185
|
progress = round(k/int(last_pos))*100
|
184
|
186
|
if progress % 10 == 0:
|
185
|
187
|
print(progress, "%")
|
186
|
188
|
# if pos is genotyped
|
187
|
|
- if k in vcf_entries:
|
188
|
|
- y+=1*step
|
189
|
|
- x+=1*step
|
|
189
|
+ # if k in vcf_entries:
|
|
190
|
+ # y=k*step
|
|
191
|
+ y+=1*step
|
|
192
|
+ x=pos
|
190
|
193
|
coords[0].append(x)
|
191
|
194
|
coords[1].append(y)
|
192
|
195
|
return coords
|