Browse Source

better memory management while parsing VCF

tforest 2 years ago
parent
commit
8d4d36a1f9
2 changed files with 17 additions and 13 deletions
  1. 4 3
      customgraphics.py
  2. 13 10
      vcf_utils.py

+ 4 - 3
customgraphics.py View File

237
         nb_iter =  len(recent_variants) -1
237
         nb_iter =  len(recent_variants) -1
238
     if show :
238
     if show :
239
         iter_start = min_chr_id + 1
239
         iter_start = min_chr_id + 1
240
-        if not step :
241
-            step = round(len(recent_variants[list(recent_variants.keys())[min_chr_id]]) / step)
240
+        if step == "auto" :
241
+            step = round(len(recent_variants[list(recent_variants.keys())[min_chr_id]]) / 1000)
242
         if stacked:
242
         if stacked:
243
             nb_subplots = nb_iter - min_chr_id
243
             nb_subplots = nb_iter - min_chr_id
244
             subplot_init = True
244
             subplot_init = True
262
                 print("Cleaned mem. in", str(datetime.timedelta(seconds=end - start)))
262
                 print("Cleaned mem. in", str(datetime.timedelta(seconds=end - start)))
263
         else:
263
         else:
264
             # if show is enable, use a step
264
             # if show is enable, use a step
265
-            step = round(len(recent_variants[list(recent_variants.keys())[chr]]) / 1000)
265
+            if step == "auto":
266
+                step = round(len(recent_variants[list(recent_variants.keys())[chr]]) / 1000)
266
             vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = chr, show = False, returned = False, step = step, subplot_id = chr)
267
             vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = chr, show = False, returned = False, step = step, subplot_id = chr)
267
         # last case
268
         # last case
268
     if show == True:
269
     if show == True:

+ 13 - 10
vcf_utils.py View File

43
                 #  # every snp line, not comment or header
43
                 #  # every snp line, not comment or header
44
                 if not line.startswith("##") and not line.startswith("#"):
44
                 if not line.startswith("##") and not line.startswith("#"):
45
                     FIELDS = line.split("\t")
45
                     FIELDS = line.split("\t")
46
+                    # when line is parsed, delete it to save some memory
46
                     CHROM = FIELDS[0]
47
                     CHROM = FIELDS[0]
47
                     POS = int(FIELDS[1])
48
                     POS = int(FIELDS[1])
48
                     if stop_at:
49
                     if stop_at:
52
                     REF = FIELDS[3].split(",")
53
                     REF = FIELDS[3].split(",")
53
                     # ALT is col 5 of VCF
54
                     # ALT is col 5 of VCF
54
                     ALT = FIELDS[4].split(",")
55
                     ALT = FIELDS[4].split(",")
55
-                    FORMAT = line.split("\t")[8:9]
56
-                    SAMPLES = line.split("\t")[9:]
56
+                    FORMAT = FIELDS[8:9]
57
+                    SAMPLES = FIELDS[9:]
57
                     QUALITY = float(FIELDS[5])
58
                     QUALITY = float(FIELDS[5])
58
                     INFO = FIELDS[7]
59
                     INFO = FIELDS[7]
59
                     INFOS = {}
60
                     INFOS = {}
68
                     # 1 : missing
69
                     # 1 : missing
69
                     # 2 : deletion among REF
70
                     # 2 : deletion among REF
70
                     # 3 : deletion among ALT
71
                     # 3 : deletion among ALT
71
-                    if "./.:." in line \
72
+                    if "./.:." in SAMPLES \
72
                        or len(ALT[0]) > 1 \
73
                        or len(ALT[0]) > 1 \
73
                        or len(REF[0]) > 1:
74
                        or len(REF[0]) > 1:
74
                         # sites that are not kept
75
                         # sites that are not kept
104
                         entries = {
105
                         entries = {
105
                             'POS':POS,
106
                             'POS':POS,
106
                             'CHR':CHROM,
107
                             'CHR':CHROM,
107
-                            'FIELDS':FIELDS,
108
+                            #'FIELDS':FIELDS,
108
                             'REF':REF,
109
                             'REF':REF,
109
                             'ALT':ALT,
110
                             'ALT':ALT,
110
                             'FORMAT':FORMAT,
111
                             'FORMAT':FORMAT,
173
 def genotyping_continuity_plot(vcf_entries,
174
 def genotyping_continuity_plot(vcf_entries,
174
                                                             verbose=False,
175
                                                             verbose=False,
175
                                                             step = 1):
176
                                                             step = 1):
176
-    last_pos = int(sorted(list(vcf_entries.keys()))[-1])
177
+    genotyped_pos = sorted(list(vcf_entries.keys()))
178
+    last_pos = genotyped_pos[-1]
177
     x = 0
179
     x = 0
178
     y = 1
180
     y = 1
179
     coords = [[], []]
181
     coords = [[], []]
180
-    print(last_pos, "sites to scan")
181
-    for k, pos in enumerate(range(0, last_pos, step)):
182
+    print("Chr. len. =", last_pos, "bp \t ; nb. SNPs =", len(genotyped_pos[::step]))
183
+    for k, pos in enumerate(genotyped_pos[::step]):
182
         if verbose:
184
         if verbose:
183
             progress = round(k/int(last_pos))*100
185
             progress = round(k/int(last_pos))*100
184
             if progress % 10 == 0:
186
             if progress % 10 == 0:
185
                 print(progress, "%")
187
                 print(progress, "%")
186
         # if pos is genotyped
188
         # if pos is genotyped
187
-        if k in vcf_entries:
188
-            y+=1*step
189
-        x+=1*step
189
+        # if k in vcf_entries:
190
+        #     y=k*step
191
+        y+=1*step
192
+        x=pos
190
         coords[0].append(x)
193
         coords[0].append(x)
191
         coords[1].append(y)
194
         coords[1].append(y)
192
     return coords
195
     return coords