Преглед изворни кода

update output plotting of chrom continuity

tforest пре 2 година
родитељ
комит
a7e78958b2
4 измењених фајлова са 18 додато и 125 уклоњено
  1. 16 10
      customgraphics.py
  2. 2 2
      sfs_tools.py
  3. 0 110
      vcf_to_sfs.py
  4. 0 3
      vcf_utils.py

+ 16 - 10
customgraphics.py Прегледај датотеку

153
 
153
 
154
 def plot(x, y, outfile = None, outfolder = None, ylab=None, xlab=None,
154
 def plot(x, y, outfile = None, outfolder = None, ylab=None, xlab=None,
155
          title=None, label = None, show=True, nb_subplots = None, subplot_init = False,
155
          title=None, label = None, show=True, nb_subplots = None, subplot_init = False,
156
-         subplot_id = None):
156
+         subplot_id = None, output = None, dpi = 300, width = 15, height = 15, plot_init = True):
157
+
158
+    # before fig is generated, set its dimensions
159
+    if plot_init:
160
+        plt.figure(figsize=(width, height))
157
     if subplot_init:
161
     if subplot_init:
158
         # define a certain amount of subplots
162
         # define a certain amount of subplots
159
         fig, axs = plt.subplots(nb_subplots)
163
         fig, axs = plt.subplots(nb_subplots)
181
     if title:
185
     if title:
182
         plt.title(title)
186
         plt.title(title)
183
     if outfile:
187
     if outfile:
184
-        plt.savefig(outfile)
185
-    else:
186
-        if show == True:
187
-            plt.show()
188
+        plt.savefig(outfile, dpi = dpi)
189
+    if show == True:
190
+        plt.show()
191
+
188
 
192
 
189
 def scatter(x, y, ylab=None, xlab=None, title=None):
193
 def scatter(x, y, ylab=None, xlab=None, title=None):
190
     plt.scatter(x, y)
194
     plt.scatter(x, y)
208
 
212
 
209
 def plot_chrom_continuity(vcf_entries, chr_id, x=None, y=None, outfile = None,
213
 def plot_chrom_continuity(vcf_entries, chr_id, x=None, y=None, outfile = None,
210
                           outfolder = None, returned=False, show=True, label=True, step=1, nb_subplots = None,
214
                           outfolder = None, returned=False, show=True, label=True, step=1, nb_subplots = None,
211
-                          subplot_init = False, subplot_id = None, title = None):
215
+                          subplot_init = False, subplot_id = None, title = None, plot_init = False):
212
     chr_name = list(vcf_entries.keys())[chr_id]
216
     chr_name = list(vcf_entries.keys())[chr_id]
213
     if label:
217
     if label:
214
         label = chr_name
218
         label = chr_name
226
              xlab = "pos. in ref.",
230
              xlab = "pos. in ref.",
227
              title = title,
231
              title = title,
228
              outfile = outfile, outfolder = outfolder, show=show, label=label,
232
              outfile = outfile, outfolder = outfolder, show=show, label=label,
229
-             nb_subplots = nb_subplots, subplot_init = subplot_init, subplot_id = subplot_id)
233
+             nb_subplots = nb_subplots, subplot_init = subplot_init, subplot_id = subplot_id, plot_init = plot_init)
230
 
234
 
231
 def plot_whole_karyotype(recent_variants, mem_clean = False, step = 1, show = True, min_chr_id = 0,
235
 def plot_whole_karyotype(recent_variants, mem_clean = False, step = 1, show = True, min_chr_id = 0,
232
-                         max_chr_id = None, stacked = False, title = None):
236
+                         max_chr_id = None, stacked = False, title = None, outfile = None):
233
     coords = []
237
     coords = []
234
     if max_chr_id :
238
     if max_chr_id :
235
         nb_iter = max_chr_id
239
         nb_iter = max_chr_id
246
             nb_subplots = None
250
             nb_subplots = None
247
             subplot_init = False
251
             subplot_init = False
248
         vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = min_chr_id, show = False, returned = False, step = step,
252
         vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = min_chr_id, show = False, returned = False, step = step,
249
-                                                       nb_subplots = nb_subplots, subplot_init = subplot_init, subplot_id = min_chr_id)
253
+                                                       nb_subplots = nb_subplots, subplot_init = subplot_init, subplot_id = min_chr_id, plot_init = True)
250
     else :
254
     else :
251
         iter_start = 0
255
         iter_start = 0
252
     for chr in range(iter_start, nb_iter):
256
     for chr in range(iter_start, nb_iter):
267
             vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = chr, show = False, returned = False, step = step, subplot_id = chr)
271
             vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = chr, show = False, returned = False, step = step, subplot_id = chr)
268
         # last case
272
         # last case
269
     if show == True:
273
     if show == True:
270
-        vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = nb_iter, show = True, returned = False, step = step, subplot_id = nb_iter, title = title)
274
+        vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = nb_iter, show = True, returned = False, step = step, subplot_id = nb_iter,
275
+                                                       title = title,
276
+                                                       outfile = outfile, plot_init = False)
271
     # maybe add a clean of recent_variants in extreme cases, before building the plots
277
     # maybe add a clean of recent_variants in extreme cases, before building the plots
272
     if show == False:
278
     if show == False:
273
         return coords
279
         return coords

+ 2 - 2
sfs_tools.py Прегледај датотеку

100
             if verbose:
100
             if verbose:
101
                 print("SFS=", SFS_values)
101
                 print("SFS=", SFS_values)
102
         print("Pluriallelic sites =", count_pluriall)
102
         print("Pluriallelic sites =", count_pluriall)
103
-    return SFS_values
103
+    return SFS_values, count_pluriall
104
 
104
 
105
 def barplot_sfs(sfs, folded=True, title = "Barplot"):
105
 def barplot_sfs(sfs, folded=True, title = "Barplot"):
106
     sfs_val = []
106
     sfs_val = []
117
     #build the plot
117
     #build the plot
118
     title = title+" [folded="+str(folded)+"]"
118
     title = title+" [folded="+str(folded)+"]"
119
     plt.title(title)
119
     plt.title(title)
120
-    plt.bar(sfs.keys(), sfs_val)
120
+    plt.bar([i+1 for i in sfs.keys()], sfs_val)
121
     plt.show()
121
     plt.show()
122
 
122
 
123
 if __name__ == "__main__":
123
 if __name__ == "__main__":

+ 0 - 110
vcf_to_sfs.py Прегледај датотеку

1
-#!/usr/bin/env python3
2
-
3
-"""
4
-FOREST Thomas (thomas.forest@college-de-france.fr)
5
-
6
-Caution : At the moment for gzipped files only.
7
-
8
-ARGS
9
---------
10
-
11
-standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
12
-
13
-"""
14
-
15
-import gzip
16
-import sys
17
-
18
-def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
19
-
20
-    """ Returns an SFS from a VCF file.
21
-
22
-    Parameters
23
-    ----------
24
-    n : int
25
-        Nb of individuals in sample.
26
-    vcf_file : str
27
-        SNPs in VCF file format.
28
-
29
-        Used to generate a Site Frequency Spectrum (SFS) from a VCF.
30
-
31
-    Returns
32
-    -------
33
-    dict
34
-        Site Frequency Spectrum (SFS)
35
-
36
-
37
-    """
38
-    
39
-    if diploid and not folded:
40
-        n *= 2
41
-    # initiate SFS_values with a zeros dict
42
-    SFS_values = dict.fromkeys(range(n),0)
43
-    # store nb polyallellic sites
44
-    polyall = 0
45
-    with gzip.open(vcf_file, "rb") as inputgz:
46
-        line = inputgz.readline()
47
-        genotypes = []
48
-        print("Parsing VCF", vcf_file, "... Please wait...")
49
-        while line:
50
-            # decode gzipped binary lines
51
-            line = line.decode('utf-8').strip()
52
-            # every snp line, not comment or header
53
-            if not line.startswith("##") and not line.startswith("#"):
54
-                FIELDS = line.split("\t")
55
-                # REF is col 4 of VCF
56
-                REF = FIELDS[3].split(",")
57
-                # ALT is col 5 of VCF
58
-                ALT = FIELDS[4].split(",")            
59
-                FORMAT = line.split("\t")[8:9]
60
-                SAMPLES = line.split("\t")[9:]
61
-                snp_genotypes = []
62
-                allele_counts = {}
63
-                allele_counts_list = []
64
-                # SKIP the SNP if :
65
-                # 1 : missing
66
-                # 2 : deletion among REF
67
-                # 3 : deletion among ALT
68
-                if "./.:." in line \
69
-                   or len(ALT[0]) > 1 \
70
-                   or len(REF[0]) > 1:
71
-                    line = inputgz.readline()
72
-                    continue
73
-                for sample in SAMPLES:
74
-                    if not phased:
75
-                        # for UNPHASED data
76
-                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
77
-                    else:
78
-                        # for PHASED
79
-                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
80
-                    nb_alleles = set(smpl_genotype)
81
-                    snp_genotypes += smpl_genotype
82
-                # skip if all individuals have the same genotype
83
-                if len(set(snp_genotypes)) == 1:
84
-                    line = inputgz.readline()
85
-                    continue
86
-                for k in set(snp_genotypes):
87
-                    allele_counts[snp_genotypes.count(k)] = k
88
-                    allele_counts_list.append(snp_genotypes.count(k))
89
-                if folded and len(ALT) >= 2:
90
-                    polyall += 1
91
-                else:
92
-                    SFS_values[min(allele_counts_list)-1] += 1
93
-            line = inputgz.readline()
94
-            if verbose:
95
-                print(SFS_values)
96
-    return SFS_values, polyall
97
-
98
-if __name__ == "__main__":
99
-            
100
-    if len(sys.argv) != 3:
101
-        print("Need 2 args")
102
-        exit(0)
103
-
104
-    # PARAM : vcf_file
105
-    vcf_file = sys.argv[1]
106
-    # PARAM : Nb of indiv
107
-    n = int(sys.argv[2])
108
-
109
-    sfs, nb_polyall = sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False)
110
-    print(sfs)

+ 0 - 3
vcf_utils.py Прегледај датотеку

185
             progress = round(k/int(last_pos))*100
185
             progress = round(k/int(last_pos))*100
186
             if progress % 10 == 0:
186
             if progress % 10 == 0:
187
                 print(progress, "%")
187
                 print(progress, "%")
188
-        # if pos is genotyped
189
-        # if k in vcf_entries:
190
-        #     y=k*step
191
         y+=1*step
188
         y+=1*step
192
         x=pos
189
         x=pos
193
         coords[0].append(x)
190
         coords[0].append(x)