Browse Source

update output plotting of chrom continuity

tforest 2 years ago
parent
commit
a7e78958b2
4 changed files with 18 additions and 125 deletions
  1. 16 10
      customgraphics.py
  2. 2 2
      sfs_tools.py
  3. 0 110
      vcf_to_sfs.py
  4. 0 3
      vcf_utils.py

+ 16 - 10
customgraphics.py View File

@@ -153,7 +153,11 @@ def plot_matrix(mat, legend=None, color_scale_type="YlGn", cbarlabel = "qt", tit
153 153
 
154 154
 def plot(x, y, outfile = None, outfolder = None, ylab=None, xlab=None,
155 155
          title=None, label = None, show=True, nb_subplots = None, subplot_init = False,
156
-         subplot_id = None):
156
+         subplot_id = None, output = None, dpi = 300, width = 15, height = 15, plot_init = True):
157
+
158
+    # before fig is generated, set its dimensions
159
+    if plot_init:
160
+        plt.figure(figsize=(width, height))
157 161
     if subplot_init:
158 162
         # define a certain amount of subplots
159 163
         fig, axs = plt.subplots(nb_subplots)
@@ -181,10 +185,10 @@ def plot(x, y, outfile = None, outfolder = None, ylab=None, xlab=None,
181 185
     if title:
182 186
         plt.title(title)
183 187
     if outfile:
184
-        plt.savefig(outfile)
185
-    else:
186
-        if show == True:
187
-            plt.show()
188
+        plt.savefig(outfile, dpi = dpi)
189
+    if show == True:
190
+        plt.show()
191
+
188 192
 
189 193
 def scatter(x, y, ylab=None, xlab=None, title=None):
190 194
     plt.scatter(x, y)
@@ -208,7 +212,7 @@ def barplot(x, y, ylab=None, xlab=None, title=None):
208 212
 
209 213
 def plot_chrom_continuity(vcf_entries, chr_id, x=None, y=None, outfile = None,
210 214
                           outfolder = None, returned=False, show=True, label=True, step=1, nb_subplots = None,
211
-                          subplot_init = False, subplot_id = None, title = None):
215
+                          subplot_init = False, subplot_id = None, title = None, plot_init = False):
212 216
     chr_name = list(vcf_entries.keys())[chr_id]
213 217
     if label:
214 218
         label = chr_name
@@ -226,10 +230,10 @@ def plot_chrom_continuity(vcf_entries, chr_id, x=None, y=None, outfile = None,
226 230
              xlab = "pos. in ref.",
227 231
              title = title,
228 232
              outfile = outfile, outfolder = outfolder, show=show, label=label,
229
-             nb_subplots = nb_subplots, subplot_init = subplot_init, subplot_id = subplot_id)
233
+             nb_subplots = nb_subplots, subplot_init = subplot_init, subplot_id = subplot_id, plot_init = plot_init)
230 234
 
231 235
 def plot_whole_karyotype(recent_variants, mem_clean = False, step = 1, show = True, min_chr_id = 0,
232
-                         max_chr_id = None, stacked = False, title = None):
236
+                         max_chr_id = None, stacked = False, title = None, outfile = None):
233 237
     coords = []
234 238
     if max_chr_id :
235 239
         nb_iter = max_chr_id
@@ -246,7 +250,7 @@ def plot_whole_karyotype(recent_variants, mem_clean = False, step = 1, show = Tr
246 250
             nb_subplots = None
247 251
             subplot_init = False
248 252
         vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = min_chr_id, show = False, returned = False, step = step,
249
-                                                       nb_subplots = nb_subplots, subplot_init = subplot_init, subplot_id = min_chr_id)
253
+                                                       nb_subplots = nb_subplots, subplot_init = subplot_init, subplot_id = min_chr_id, plot_init = True)
250 254
     else :
251 255
         iter_start = 0
252 256
     for chr in range(iter_start, nb_iter):
@@ -267,7 +271,9 @@ def plot_whole_karyotype(recent_variants, mem_clean = False, step = 1, show = Tr
267 271
             vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = chr, show = False, returned = False, step = step, subplot_id = chr)
268 272
         # last case
269 273
     if show == True:
270
-        vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = nb_iter, show = True, returned = False, step = step, subplot_id = nb_iter, title = title)
274
+        vcf_utils.customgraphics.plot_chrom_continuity(recent_variants, chr_id = nb_iter, show = True, returned = False, step = step, subplot_id = nb_iter,
275
+                                                       title = title,
276
+                                                       outfile = outfile, plot_init = False)
271 277
     # maybe add a clean of recent_variants in extreme cases, before building the plots
272 278
     if show == False:
273 279
         return coords

+ 2 - 2
sfs_tools.py View File

@@ -100,7 +100,7 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
100 100
             if verbose:
101 101
                 print("SFS=", SFS_values)
102 102
         print("Pluriallelic sites =", count_pluriall)
103
-    return SFS_values
103
+    return SFS_values, count_pluriall
104 104
 
105 105
 def barplot_sfs(sfs, folded=True, title = "Barplot"):
106 106
     sfs_val = []
@@ -117,7 +117,7 @@ def barplot_sfs(sfs, folded=True, title = "Barplot"):
117 117
     #build the plot
118 118
     title = title+" [folded="+str(folded)+"]"
119 119
     plt.title(title)
120
-    plt.bar(sfs.keys(), sfs_val)
120
+    plt.bar([i+1 for i in sfs.keys()], sfs_val)
121 121
     plt.show()
122 122
 
123 123
 if __name__ == "__main__":

+ 0 - 110
vcf_to_sfs.py View File

@@ -1,110 +0,0 @@
1
-#!/usr/bin/env python3
2
-
3
-"""
4
-FOREST Thomas (thomas.forest@college-de-france.fr)
5
-
6
-Caution : At the moment for gzipped files only.
7
-
8
-ARGS
9
---------
10
-
11
-standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
12
-
13
-"""
14
-
15
-import gzip
16
-import sys
17
-
18
-def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
19
-
20
-    """ Returns an SFS from a VCF file.
21
-
22
-    Parameters
23
-    ----------
24
-    n : int
25
-        Nb of individuals in sample.
26
-    vcf_file : str
27
-        SNPs in VCF file format.
28
-
29
-        Used to generate a Site Frequency Spectrum (SFS) from a VCF.
30
-
31
-    Returns
32
-    -------
33
-    dict
34
-        Site Frequency Spectrum (SFS)
35
-
36
-
37
-    """
38
-    
39
-    if diploid and not folded:
40
-        n *= 2
41
-    # initiate SFS_values with a zeros dict
42
-    SFS_values = dict.fromkeys(range(n),0)
43
-    # store nb polyallellic sites
44
-    polyall = 0
45
-    with gzip.open(vcf_file, "rb") as inputgz:
46
-        line = inputgz.readline()
47
-        genotypes = []
48
-        print("Parsing VCF", vcf_file, "... Please wait...")
49
-        while line:
50
-            # decode gzipped binary lines
51
-            line = line.decode('utf-8').strip()
52
-            # every snp line, not comment or header
53
-            if not line.startswith("##") and not line.startswith("#"):
54
-                FIELDS = line.split("\t")
55
-                # REF is col 4 of VCF
56
-                REF = FIELDS[3].split(",")
57
-                # ALT is col 5 of VCF
58
-                ALT = FIELDS[4].split(",")            
59
-                FORMAT = line.split("\t")[8:9]
60
-                SAMPLES = line.split("\t")[9:]
61
-                snp_genotypes = []
62
-                allele_counts = {}
63
-                allele_counts_list = []
64
-                # SKIP the SNP if :
65
-                # 1 : missing
66
-                # 2 : deletion among REF
67
-                # 3 : deletion among ALT
68
-                if "./.:." in line \
69
-                   or len(ALT[0]) > 1 \
70
-                   or len(REF[0]) > 1:
71
-                    line = inputgz.readline()
72
-                    continue
73
-                for sample in SAMPLES:
74
-                    if not phased:
75
-                        # for UNPHASED data
76
-                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
77
-                    else:
78
-                        # for PHASED
79
-                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
80
-                    nb_alleles = set(smpl_genotype)
81
-                    snp_genotypes += smpl_genotype
82
-                # skip if all individuals have the same genotype
83
-                if len(set(snp_genotypes)) == 1:
84
-                    line = inputgz.readline()
85
-                    continue
86
-                for k in set(snp_genotypes):
87
-                    allele_counts[snp_genotypes.count(k)] = k
88
-                    allele_counts_list.append(snp_genotypes.count(k))
89
-                if folded and len(ALT) >= 2:
90
-                    polyall += 1
91
-                else:
92
-                    SFS_values[min(allele_counts_list)-1] += 1
93
-            line = inputgz.readline()
94
-            if verbose:
95
-                print(SFS_values)
96
-    return SFS_values, polyall
97
-
98
-if __name__ == "__main__":
99
-            
100
-    if len(sys.argv) != 3:
101
-        print("Need 2 args")
102
-        exit(0)
103
-
104
-    # PARAM : vcf_file
105
-    vcf_file = sys.argv[1]
106
-    # PARAM : Nb of indiv
107
-    n = int(sys.argv[2])
108
-
109
-    sfs, nb_polyall = sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False)
110
-    print(sfs)

+ 0 - 3
vcf_utils.py View File

@@ -185,9 +185,6 @@ def genotyping_continuity_plot(vcf_entries,
185 185
             progress = round(k/int(last_pos))*100
186 186
             if progress % 10 == 0:
187 187
                 print(progress, "%")
188
-        # if pos is genotyped
189
-        # if k in vcf_entries:
190
-        #     y=k*step
191 188
         y+=1*step
192 189
         x=pos
193 190
         coords[0].append(x)