Kaynağa Gözat

better gestion of memory while parsing vcf

tforest 2 yıl önce
ebeveyn
işleme
8d4d36a1f9
2 değiştirilmiş dosya ile 17 ekleme ve 13 silme
  1. 4 3
      customgraphics.py
  2. 13 10
      vcf_utils.py

+ 4 - 3
customgraphics.py Dosyayı Görüntüle

@@ -237,8 +237,8 @@ def plot_whole_karyotype(recent_variants, mem_clean = False, step = 1, show = Tr
237 237
         nb_iter =  len(recent_variants) -1
238 238
     if show :
239 239
         iter_start = min_chr_id + 1
240
-        if not step :
241
-            step = round(len(recent_variants[list(recent_variants.keys())[min_chr_id]]) / step)
240
+        if step == "auto" :
241
+            step = round(len(recent_variants[list(recent_variants.keys())[min_chr_id]]) / 1000)
242 242
         if stacked:
243 243
             nb_subplots = nb_iter - min_chr_id
244 244
             subplot_init = True
@@ -262,7 +262,8 @@ def plot_whole_karyotype(recent_variants, mem_clean = False, step = 1, show = Tr
262 262
                 print("Cleaned mem. in", str(datetime.timedelta(seconds=end - start)))
263 263
         else:
264 264
             # if show is enable, use a step
265
-            step = round(len(recent_variants[list(recent_variants.keys())[chr]]) / 1000)
265
+            if step == "auto":
266
+                step = round(len(recent_variants[list(recent_variants.keys())[chr]]) / 1000)
266 267
             vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = chr, show = False, returned = False, step = step, subplot_id = chr)
267 268
         # last case
268 269
     if show == True:

+ 13 - 10
vcf_utils.py Dosyayı Görüntüle

@@ -43,6 +43,7 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
43 43
                 #  # every snp line, not comment or header
44 44
                 if not line.startswith("##") and not line.startswith("#"):
45 45
                     FIELDS = line.split("\t")
46
+                    # when line is parsed, delete it to save some memory
46 47
                     CHROM = FIELDS[0]
47 48
                     POS = int(FIELDS[1])
48 49
                     if stop_at:
@@ -52,8 +53,8 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
52 53
                     REF = FIELDS[3].split(",")
53 54
                     # ALT is col 5 of VCF
54 55
                     ALT = FIELDS[4].split(",")
55
-                    FORMAT = line.split("\t")[8:9]
56
-                    SAMPLES = line.split("\t")[9:]
56
+                    FORMAT = FIELDS[8:9]
57
+                    SAMPLES = FIELDS[9:]
57 58
                     QUALITY = float(FIELDS[5])
58 59
                     INFO = FIELDS[7]
59 60
                     INFOS = {}
@@ -68,7 +69,7 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
68 69
                     # 1 : missing
69 70
                     # 2 : deletion among REF
70 71
                     # 3 : deletion among ALT
71
-                    if "./.:." in line \
72
+                    if "./.:." in SAMPLES \
72 73
                        or len(ALT[0]) > 1 \
73 74
                        or len(REF[0]) > 1:
74 75
                         # sites that are not kept
@@ -104,7 +105,7 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
104 105
                         entries = {
105 106
                             'POS':POS,
106 107
                             'CHR':CHROM,
107
-                            'FIELDS':FIELDS,
108
+                            #'FIELDS':FIELDS,
108 109
                             'REF':REF,
109 110
                             'ALT':ALT,
110 111
                             'FORMAT':FORMAT,
@@ -173,20 +174,22 @@ def build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit =
173 174
 def genotyping_continuity_plot(vcf_entries,
174 175
                                                             verbose=False,
175 176
                                                             step = 1):
176
-    last_pos = int(sorted(list(vcf_entries.keys()))[-1])
177
+    genotyped_pos = sorted(list(vcf_entries.keys()))
178
+    last_pos = genotyped_pos[-1]
177 179
     x = 0
178 180
     y = 1
179 181
     coords = [[], []]
180
-    print(last_pos, "sites to scan")
181
-    for k, pos in enumerate(range(0, last_pos, step)):
182
+    print("Chr. len. =", last_pos, "bp \t ; nb. SNPs =", len(genotyped_pos[::step]))
183
+    for k, pos in enumerate(genotyped_pos[::step]):
182 184
         if verbose:
183 185
             progress = round(k/int(last_pos))*100
184 186
             if progress % 10 == 0:
185 187
                 print(progress, "%")
186 188
         # if pos is genotyped
187
-        if k in vcf_entries:
188
-            y+=1*step
189
-        x+=1*step
189
+        # if k in vcf_entries:
190
+        #     y=k*step
191
+        y+=1*step
192
+        x=pos
190 193
         coords[0].append(x)
191 194
         coords[1].append(y)
192 195
     return coords