Browse Source

add scripts for vcf tools

tforest 2 years ago
parent
commit
7a08bb7b41
7 changed files with 646 additions and 189 deletions
  1. 3 0
      __init__.py
  2. 0 2
      compile.sh
  3. 196 0
      customgraphics.py
  4. 133 0
      sfs_tools.py
  5. 0 123
      vcf_to_sfs.c
  6. 96 64
      vcf_to_sfs.py
  7. 218 0
      vcf_utils.py

+ 3 - 0
__init__.py View File

@@ -0,0 +1,3 @@
1
+from frst import sfs_tools
2
+from frst import customgraphics
3
+from frst import vcf_utils

+ 0 - 2
compile.sh View File

@@ -1,2 +0,0 @@
1
-#!/bin/sh
2
-gcc -Wall -pthread vcf_to_sfs.c -lm -lz -std=c99 -Wextra -o vcf_to_sfs

+ 196 - 0
customgraphics.py View File

@@ -0,0 +1,196 @@
1
+""" Custom graphics lib for pop gen or genomics
2
+
3
+FOREST Thomas (thomas.forest@college-de-france.fr)
4
+
5
+
6
+
7
+"""
8
+
9
+import matplotlib.pyplot as plt
10
+import matplotlib.ticker as ticker
11
+import numpy as np
12
+from frst import vcf_utils
13
+
14
+def heatmap(data, row_labels=None, col_labels=None, ax=None,
15
+            cbar_kw={}, cbarlabel="", **kwargs):
16
+    """
17
+    Create a heatmap from a numpy array and two lists of labels.
18
+    (from the matplotlib doc)
19
+
20
+    Parameters
21
+    ----------
22
+    data
23
+        A 2D numpy array of shape (M, N).
24
+    row_labels
25
+        A list or array of length M with the labels for the rows.
26
+    col_labels
27
+        A list or array of length N with the labels for the columns.
28
+    ax
29
+        A `matplotlib.axes.Axes` instance to which the heatmap is plotted.  If
30
+        not provided, use current axes or create a new one.  Optional.
31
+    cbar_kw
32
+        A dictionary with arguments to `matplotlib.Figure.colorbar`.  Optional.
33
+    cbarlabel
34
+        The label for the colorbar.  Optional.
35
+    **kwargs
36
+        All other arguments are forwarded to `imshow`.
37
+    """
38
+
39
+    if not ax:
40
+        ax = plt.gca()
41
+
42
+    # Plot the heatmap
43
+    im = ax.imshow(data, **kwargs)
44
+
45
+    # Create colorbar
46
+    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
47
+    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
48
+
49
+    # Show all ticks and label them with the respective list entries.
50
+    if col_labels:
51
+        ax.set_xticks(col_labels)
52
+    if row_labels:
53
+        ax.set_yticks(row_labels)
54
+    
55
+    # Let the horizontal axes labeling appear on top.
56
+    ax.tick_params(top=True, bottom=False,
57
+                   labeltop=True, labelbottom=False)
58
+
59
+    # Rotate the tick labels and set their alignment.
60
+    plt.setp(ax.get_xticklabels(), rotation=-30, ha="right",
61
+             rotation_mode="anchor")
62
+
63
+    # Turn spines off and create white grid.
64
+    ax.spines[:].set_visible(False)
65
+
66
+    ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True)
67
+    ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True)
68
+    ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
69
+    ax.tick_params(which="minor", bottom=False, left=False)
70
+
71
+    return im, cbar
72
+
73
+def annotate_heatmap(im, data=None, valfmt="{x:.2f}",
74
+                     textcolors=("black", "white"),
75
+                     threshold=None, **textkw):
76
+    """
77
+    A function to annotate a heatmap.
78
+     (from the matplotlib doc)
79
+    Parameters
80
+    ----------
81
+    im
82
+        The AxesImage to be labeled.
83
+    data
84
+        Data used to annotate.  If None, the image's data is used.  Optional.
85
+    valfmt
86
+        The format of the annotations inside the heatmap.  This should either
87
+        use the string format method, e.g. "$ {x:.2f}", or be a
88
+        `matplotlib.ticker.Formatter`.  Optional.
89
+    textcolors
90
+        A pair of colors.  The first is used for values below a threshold,
91
+        the second for those above.  Optional.
92
+    threshold
93
+        Value in data units according to which the colors from textcolors are
94
+        applied.  If None (the default) uses the middle of the colormap as
95
+        separation.  Optional.
96
+    **kwargs
97
+        All other arguments are forwarded to each call to `text` used to create
98
+        the text labels.
99
+    """
100
+
101
+    if not isinstance(data, (list, np.ndarray)):
102
+        data = im.get_array()
103
+
104
+    # Normalize the threshold to the images color range.
105
+    if threshold is not None:
106
+        threshold = im.norm(threshold)
107
+    else:
108
+        threshold = im.norm(data.max())/2.
109
+
110
+    # Set default alignment to center, but allow it to be
111
+    # overwritten by textkw.
112
+    kw = dict(horizontalalignment="center",
113
+              verticalalignment="center")
114
+    kw.update(textkw)
115
+
116
+    # Get the formatter in case a string is supplied
117
+    if isinstance(valfmt, str):
118
+        valfmt = ticker.StrMethodFormatter(valfmt)
119
+
120
+    # Loop over the data and create a `Text` for each "pixel".
121
+    # Change the text's color depending on the data.
122
+    texts = []
123
+    for i in range(data.shape[0]):
124
+        for j in range(data.shape[1]):
125
+            kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
126
+            text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
127
+            texts.append(text)
128
+
129
+    return texts
130
+
131
+def plot_matrix(mat, legend=None, color_scale_type="YlGn", cbarlabel = "qt", title=None):
132
+         
133
+    fig, ax = plt.subplots(figsize=(10,8))
134
+    if legend:
135
+        row_labels = [k for k in range(len(mat))]
136
+        col_labels = [k for k in range(len(mat[0]))]
137
+        im, cbar = heatmap(mat, row_labels, col_labels, ax=ax,
138
+                           cmap=color_scale_type, cbarlabel=cbarlabel)
139
+    else:
140
+        im, cbar = heatmap(mat, ax=ax,
141
+                           cmap=color_scale_type, cbarlabel=cbarlabel)
142
+    #texts = annotate_heatmap(im, valfmt="{x:.5f}")
143
+    if title:
144
+        ax.set_title(title)
145
+    fig.tight_layout()
146
+    plt.show()
147
+
148
+def plot(x, y, outfile = None, outfolder = None, ylab=None, xlab=None, title=None):
149
+    plt.plot(x, y)
150
+    if ylab:
151
+        plt.ylabel(ylab)
152
+    if xlab:
153
+        plt.xlabel(xlab)
154
+    if title:
155
+        plt.title(title)
156
+    if outfile:
157
+        plt.savefig(outfile)
158
+    else:
159
+        plt.show()
160
+
161
+def scatter(x, y, ylab=None, xlab=None, title=None):
162
+    plt.scatter(x, y)
163
+    if ylab:
164
+        plt.ylabel(ylab)
165
+    if xlab:
166
+        plt.xlabel(xlab)
167
+    if title:
168
+        plt.title(title)
169
+    plt.show()
170
+
171
+def barplot(x, y, ylab=None, xlab=None, title=None):
172
+    plt.bar(x, y)
173
+    if ylab:
174
+        plt.ylabel(ylab)
175
+    if xlab:
176
+        plt.xlabel(xlab)
177
+    if title:
178
+        plt.title(title)
179
+    plt.show()
180
+
181
+def plot_chrom_continuity(vcf_entries, chr_id, outfile = None, outfolder = None):
182
+    chr_name = list(vcf_entries.keys())[chr_id]
183
+    chr_entries = vcf_entries[chr_name]
184
+    genotyped_pos = vcf_utils.genotyping_continuity_plot(chr_entries)
185
+    plot(genotyped_pos[0], genotyped_pos[1], ylab = "genotyped pos.",
186
+         xlab = "pos. in ref.",
187
+         title = "Genotyped pos in chr "+str(chr_id+1)+":'"+chr_name+"'",
188
+         outfile = outfile, outfolder = outfolder)
189
+
190
+def plot_chrom_coverage(vcf_entries, chr_id):
191
+    chr_name = list(vcf_entries.keys())[chr_id]
192
+    chr_entries = vcf_entries[chr_name]
193
+    coverage = vcf_utils.compute_coverage(chr_entries)
194
+    barplot(coverage[0], coverage[1], ylab = "coverage (X)",
195
+            xlab = "pos. in ref.",
196
+            title = "Coverage for chr "+str(chr_id+1)+":'"+chr_name+"'")

+ 133 - 0
sfs_tools.py View File

@@ -0,0 +1,133 @@
1
+#!/usr/bin/env python3
2
+
3
+"""
4
+FOREST Thomas (thomas.forest@college-de-france.fr)
5
+
6
+Caution : At the moment for gzipped files only.
7
+
8
+ARGS
9
+--------
10
+
11
+standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
12
+
13
+"""
14
+
15
+import gzip
16
+import sys
17
+import matplotlib.pyplot as plt
18
+
19
+def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
20
+
21
+    """
22
+    Multiplication de deux nombres entiers.
23
+    Cette fonction ne sert pas à grand chose.
24
+
25
+    Parameters
26
+    ----------
27
+    n : int
28
+        Nb of individuals in sample.
29
+    vcf_file : str
30
+        SNPs in VCF file format.
31
+
32
+        Used to generate a Site Frequency Spectrum (SFS) from a VCF.
33
+
34
+    Returns
35
+    -------
36
+    dict
37
+        Site Frequency Spectrum (SFS)
38
+
39
+
40
+    """
41
+    
42
+    if diploid and not folded:
43
+        n *= 2
44
+    # initiate SFS_values with a zeros dict
45
+    SFS_values = dict.fromkeys(range(n),0)
46
+    count_pluriall = 0
47
+    with gzip.open(vcf_file, "rb") as inputgz:
48
+        line = inputgz.readline()
49
+        genotypes = []
50
+        print("Parsing VCF", vcf_file, "... Please wait...")
51
+        while line:
52
+            # decode gzipped binary lines
53
+            line = line.decode('utf-8').strip()
54
+            # every snp line, not comment or header
55
+            if not line.startswith("##") and not line.startswith("#"):
56
+                FIELDS = line.split("\t")
57
+                # REF is col 4 of VCF
58
+                REF = FIELDS[3].split(",")
59
+                # ALT is col 5 of VCF
60
+                ALT = FIELDS[4].split(",")            
61
+                FORMAT = line.split("\t")[8:9]
62
+                SAMPLES = line.split("\t")[9:]
63
+                snp_genotypes = []
64
+                allele_counts = {}
65
+                allele_counts_list = []
66
+                # SKIP the SNP if :
67
+                # 1 : missing
68
+                # 2 : deletion among REF
69
+                # 3 : deletion among ALT
70
+                if "./.:." in line \
71
+                   or len(ALT[0]) > 1 \
72
+                   or len(REF[0]) > 1:
73
+                    line = inputgz.readline()
74
+                    continue
75
+                for sample in SAMPLES:
76
+                    if not phased:
77
+                        # for UNPHASED data
78
+                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
79
+                    else:
80
+                        # for PHASED
81
+                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
82
+                    nb_alleles = set(smpl_genotype)
83
+                    snp_genotypes += smpl_genotype
84
+                # skip if all individuals have the same genotype
85
+                if len(set(snp_genotypes)) == 1:
86
+                    line = inputgz.readline()
87
+                    continue
88
+                for k in set(snp_genotypes):
89
+                    allele_counts[snp_genotypes.count(k)] = k
90
+                    allele_counts_list.append(snp_genotypes.count(k))
91
+                if folded and len(ALT) >= 2:
92
+                    #pass
93
+                    count_pluriall +=1
94
+                    # TODO - work in progress
95
+                    # for al in range(len(ALT)-1):
96
+                    #     SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
97
+                    #     allele_counts_list.remove(min(allele_counts_list))
98
+                else:
99
+                    SFS_values[min(allele_counts_list)-1] += 1
100
+            line = inputgz.readline()
101
+            if verbose:
102
+                print("SFS=", SFS_values)
103
+        print("Pluriallelic sites =", count_pluriall)
104
+    return SFS_values
105
+
106
+def barplot_sfs(sfs, folded=True, title = "Barplot"):
107
+    sfs_val = []
108
+    n = len(sfs.values())
109
+    for k in range(1, n):
110
+        ksi = list(sfs.values())[k-1]
111
+        # k+1 because k starts from 0
112
+        if folded:
113
+            sfs_val.append(ksi * k * (n - k))
114
+        else:
115
+            sfs_val.append(ksi * k)
116
+    #terminal case, same for folded or unfolded
117
+    sfs_val.append(list(sfs.values())[n-1] * n)
118
+    #build the plot
119
+    title = title+" [folded="+str(folded)+"]"
120
+    plt.title(title)
121
+    plt.bar(sfs.keys(), sfs_val)
122
+    plt.show()
123
+
124
+if __name__ == "__main__":
125
+            
126
+    if len(sys.argv) != 3:
127
+        print("Need 2 args")
128
+        exit(0)
129
+
130
+    # PARAM : Nb of indiv
131
+    n = int(sys.argv[2])
132
+    sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False)
133
+    print(sfs)

+ 0 - 123
vcf_to_sfs.c View File

@@ -1,123 +0,0 @@
1
-# include <stdio.h>
2
-# include <stdlib.h>
3
-# include <zlib.h>
4
-#include <string.h>
5
-#include <stdbool.h>
6
-
7
-bool StartsWith(const char *a, const char *b)
8
-{
9
-   if(strncmp(a, b, strlen(b)) == 0) return 1;
10
-   return 0;
11
-}
12
-
13
-void slice_str(const char * str, char * buffer, size_t start, size_t end)
14
-{
15
-    size_t j = 0;
16
-    for ( size_t i = start; i <= end; ++i ) {
17
-        buffer[j++] = str[i];
18
-    }
19
-    buffer[j] = 0;
20
-}
21
-
22
-int min(int * array, int size){
23
-    //Consider first element as smallest
24
-   int smallest = array[0];
25
-   int i;
26
-   for (i = 0; i < num; i++) {
27
-      if (a[i] < smallest) {
28
-         smallest = a[i];
29
-      }
30
-   }
31
-}
32
-
33
-int countDistinct(int a[], int n)      //Function Definition
34
-{
35
-   int i, j, count = 0;
36
-   //Traverse the array
37
-   for (i = 1; i < n; i++)      //hold an array element
38
-   {
39
-      for (j = 0; j < i; j++)   
40
-      {
41
-         if (a[i] == a[j])    //Check for duplicate elements
42
-         {
43
-            break;             //If duplicate elements found then break
44
-         }
45
-      }
46
-      if (i == j)
47
-      {
48
-         count++;     //increment the number of distinct elements
49
-      }
50
-   }
51
-   return count;      //Return the number of distinct elements
52
-}
53
-
54
-# define LL 8192   /* line length maximum */
55
-# define DIPLOID true
56
-# define FOLDED true
57
-# define IGNORED_FIELDS 9
58
-
59
-int main ( int argc, char *argv[] ){
60
-    if ( argc < 3) {
61
-	printf("Need 2 args!\n");
62
-	return 1;
63
-    }
64
-    gzFile fp;
65
-    char line[LL];
66
-    int N;
67
-    char delim[] = "\t";
68
-    fp = gzopen( argv[1], "r" );
69
-
70
-    // pop of size 2N when diploid
71
-    if (DIPLOID == true && FOLDED == false) {
72
-	N = 2 * atoi(argv[2]);
73
-	    } else {
74
-	N = atoi(argv[2]);
75
-    }
76
-
77
-    int snp_genotypes[N];
78
-    int SFS_values[N];
79
-
80
-    gzgets( fp, line, LL );
81
-    while ( ! gzeof( fp ) ){
82
-	int k = 0;
83
-	if ( StartsWith(line, "##") || ( StartsWith(line, "#") ) || (strstr(line, "./.:.") != NULL)){
84
-	    gzgets( fp, line, LL );
85
-	    continue;
86
-	}
87
-	
88
-	char *vcf_field = strtok(line, delim);
89
-	while(vcf_field != NULL){
90
-	    k++;
91
-	    if (k > IGNORED_FIELDS) {
92
-		const size_t len = strlen(vcf_field);
93
-		char buffer[len + 1];
94
-		//printf("'%s'\n", ptr);
95
-		slice_str(vcf_field, buffer, 0, 0);
96
-		//printf("%d %s      ", N, buffer);
97
-		snp_genotypes[k-IGNORED_FIELDS] = atoi(buffer);
98
-		//printf("%d ", smpl_genotype[k-9]);
99
-	    }
100
-	    vcf_field = strtok(NULL, delim);
101
-	    int c= countDistinct(snp_genotypes, N);
102
-            // skip if all individuals have the same genotype
103
-	    if (c == 1) {
104
-		continue;
105
-		gzgets( fp, line, LL );
106
-	    }
107
-	    /* int i; */
108
-	    /* for (i = 1; i < N; ++i) */
109
-	    /* 	{ */
110
-	    /* 	    printf("%d ", snp_genotypes[i]); */
111
-	    /* 	} */
112
-	    int allele_counts[c];
113
-	    
114
-	    min(allele_counts, N);
115
-	}
116
-	// printf("%s", line );
117
-	// loads the next line
118
-	gzgets( fp, line, LL );
119
-    }
120
-
121
-    gzclose( fp );
122
-    return 0;
123
-}

+ 96 - 64
vcf_to_sfs.py View File

@@ -8,74 +8,106 @@ Caution : At the moment for gzipped files only.
8 8
 ARGS
9 9
 --------
10 10
 
11
-usage : vcf_to_sfs.py VCF.gz nb_indiv
11
+standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
12 12
 
13 13
 """
14 14
 
15 15
 import gzip
16 16
 import sys
17 17
 
18
-# default folded SFS
19
-folded = True
20
-diploid = True
21
-phased = False
22
-
23
-# PARAM : Nb of indiv
24
-n = int(sys.argv[2])
25
-
26
-if diploid and not folded:
27
-    n *= 2
28
-# initiate SFS_values with a zeros dict
29
-SFS_values = dict.fromkeys(range(n),0)
30
-
31
-with gzip.open(sys.argv[1], "rb") as inputgz:
32
-    line = inputgz.readline()
33
-    genotypes = []
34
-    while line:
35
-        # decode gzipped binary lines
36
-        line = line.decode('utf-8').strip()
37
-        # every snp line, not comment or header
38
-        if not line.startswith("##") and not line.startswith("#"):
39
-            FIELDS = line.split("\t")
40
-            # REF is col 4 of VCF
41
-            REF = FIELDS[3].split(",")
42
-            # ALT is col 5 of VCF
43
-            ALT = FIELDS[4].split(",")            
44
-            FORMAT = line.split("\t")[8:9]
45
-            SAMPLES = line.split("\t")[9:]
46
-            snp_genotypes = []
47
-            allele_counts = {}
48
-            allele_counts_list = []
49
-            # SKIP the SNP if :
50
-            # 1 : missing
51
-            # 2 : deletion among REF
52
-            # 3 : deletion among ALT
53
-            if "./.:." in line \
54
-               or len(ALT[0]) > 1 \
55
-               or len(REF[0]) > 1:
56
-                line = inputgz.readline()
57
-                continue
58
-            for sample in SAMPLES:
59
-                if not phased:
60
-                    # for UNPHASED data
61
-                    smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
62
-                else:
63
-                    # for PHASED
64
-                    smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
65
-                nb_alleles = set(smpl_genotype)
66
-                snp_genotypes += smpl_genotype
67
-            # skip if all individuals have the same genotype
68
-            if len(set(snp_genotypes)) == 1:
69
-                line = inputgz.readline()
70
-                continue
71
-            for k in set(snp_genotypes):
72
-                allele_counts[snp_genotypes.count(k)] = k
73
-                allele_counts_list.append(snp_genotypes.count(k))
74
-            if folded and len(ALT) >= 2:
75
-                for al in range(len(ALT)-1):
76
-                    SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
77
-                    allele_counts_list.remove(min(allele_counts_list))
78
-            else:
79
-                SFS_values[min(allele_counts_list)-1] += 1
18
+def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
19
+
20
+    """
21
+    Multiplication de deux nombres entiers.
22
+    Cette fonction ne sert pas à grand chose.
23
+
24
+    Parameters
25
+    ----------
26
+    n : int
27
+        Nb of individuals in sample.
28
+    vcf_file : str
29
+        SNPs in VCF file format.
30
+
31
+        Used to generate a Site Frequency Spectrum (SFS) from a VCF.
32
+
33
+    Returns
34
+    -------
35
+    dict
36
+        Site Frequency Spectrum (SFS)
37
+
38
+
39
+    """
40
+    
41
+    if diploid and not folded:
42
+        n *= 2
43
+    # initiate SFS_values with a zeros dict
44
+    SFS_values = dict.fromkeys(range(n),0)
45
+
46
+    with gzip.open(vcf_file, "rb") as inputgz:
80 47
         line = inputgz.readline()
81
-        print(SFS_values)
48
+        genotypes = []
49
+        print("Parsing VCF", vcf_file, "... Please wait...")
50
+        while line:
51
+            # decode gzipped binary lines
52
+            line = line.decode('utf-8').strip()
53
+            # every snp line, not comment or header
54
+            if not line.startswith("##") and not line.startswith("#"):
55
+                FIELDS = line.split("\t")
56
+                # REF is col 4 of VCF
57
+                REF = FIELDS[3].split(",")
58
+                # ALT is col 5 of VCF
59
+                ALT = FIELDS[4].split(",")            
60
+                FORMAT = line.split("\t")[8:9]
61
+                SAMPLES = line.split("\t")[9:]
62
+                snp_genotypes = []
63
+                allele_counts = {}
64
+                allele_counts_list = []
65
+                # SKIP the SNP if :
66
+                # 1 : missing
67
+                # 2 : deletion among REF
68
+                # 3 : deletion among ALT
69
+                if "./.:." in line \
70
+                   or len(ALT[0]) > 1 \
71
+                   or len(REF[0]) > 1:
72
+                    line = inputgz.readline()
73
+                    continue
74
+                for sample in SAMPLES:
75
+                    if not phased:
76
+                        # for UNPHASED data
77
+                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
78
+                    else:
79
+                        # for PHASED
80
+                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
81
+                    nb_alleles = set(smpl_genotype)
82
+                    snp_genotypes += smpl_genotype
83
+                # skip if all individuals have the same genotype
84
+                if len(set(snp_genotypes)) == 1:
85
+                    line = inputgz.readline()
86
+                    continue
87
+                for k in set(snp_genotypes):
88
+                    allele_counts[snp_genotypes.count(k)] = k
89
+                    allele_counts_list.append(snp_genotypes.count(k))
90
+                if folded and len(ALT) >= 2:
91
+                    pass
92
+                    # TODO - work in progress
93
+                    # for al in range(len(ALT)-1):
94
+                    #     SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
95
+                    #     allele_counts_list.remove(min(allele_counts_list))
96
+                else:
97
+                    SFS_values[min(allele_counts_list)-1] += 1
98
+            line = inputgz.readline()
99
+            if verbose:
100
+                print(SFS_values)
101
+    return SFS_values
102
+
103
+if __name__ == "__main__":
104
+            
105
+    if len(sys.argv) != 3:
106
+        print("Need 2 args")
107
+        exit(0)
108
+
109
+    # PARAM : Nb of indiv
110
+    n = int(sys.argv[2])
111
+
112
+    sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False)
113
+    print(sfs)

+ 218 - 0
vcf_utils.py View File

@@ -0,0 +1,218 @@
1
+#!/usr/bin/env python3
2
+
3
+"""
4
+FOREST Thomas (thomas.forest@college-de-france.fr)
5
+
6
+Caution : At the moment for gzipped files only.
7
+
8
+ARGS
9
+--------
10
+
11
+standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
12
+
13
+"""
14
+import gzip
15
+import sys
16
+import numpy as np
17
+from frst import customgraphics
18
+import json 
19
+import time
20
+import datetime
21
+
22
+def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
23
+    start = time.time()
24
+    with gzip.open(vcf_file, "rb") as inputgz:
25
+            byte_line = inputgz.readline()
26
+            genotypes = []
27
+            noGenotype = []
28
+            pos = 0
29
+            pluriall_counts = 0
30
+            entries = {}
31
+            chrom = {}
32
+            nb_site = 0
33
+            print("Parsing VCF {} ... Please wait...".format(vcf_file))
34
+            #print("Parsing VCF", vcf_file, "... Please wait...")
35
+            while byte_line:
36
+                #print(line)
37
+                # decode gzipped binary lines
38
+                line = byte_line.decode('utf-8').strip()
39
+                nb_site += 1
40
+                #  # every snp line, not comment or header
41
+                if not line.startswith("##") and not line.startswith("#"):
42
+                    FIELDS = line.split("\t")
43
+                    CHROM = FIELDS[0]
44
+                    POS = int(FIELDS[1])
45
+                    if stop_at:
46
+                        if POS > stop_at:
47
+                            break
48
+                    # REF is col 4 of VCF
49
+                    REF = FIELDS[3].split(",")
50
+                    # ALT is col 5 of VCF
51
+                    ALT = FIELDS[4].split(",")
52
+                    FORMAT = line.split("\t")[8:9]
53
+                    SAMPLES = line.split("\t")[9:]
54
+                    QUALITY = float(FIELDS[5])
55
+                    INFO = FIELDS[7]
56
+                    INFOS = {}
57
+                    for info in INFO.split(";"):
58
+                        try :
59
+                            INFOS[info.split('=')[0]] = info.split('=')[1]
60
+                        except:
61
+                            INFOS[info] = info
62
+                    GENOTYPE = []
63
+                    LIKELIHOOD = []
64
+                    # SKIP the SNP if :
65
+                    # 1 : missing
66
+                    # 2 : deletion among REF
67
+                    # 3 : deletion among ALT
68
+                    if "./.:." in line \
69
+                       or len(ALT[0]) > 1 \
70
+                       or len(REF[0]) > 1:
71
+                        # sites that are not kept
72
+                        # if at least one missing data, remember it
73
+                        noGenotype.append(POS)
74
+                        byte_line = inputgz.readline()
75
+                        continue
76
+                    elif len(ALT) > 1:
77
+                        # pluriall sites
78
+                        # remember that the gestion of PL is very diff with pluriall
79
+                        pluriall_counts += 1
80
+                        noGenotype.append(POS)
81
+                        byte_line = inputgz.readline()
82
+                        continue
83
+                    else:
84
+                        # for unphased and with only two fields : GT:PL
85
+                        for sample in SAMPLES:
86
+                            if not phased:
87
+                                # for UNPHASED data
88
+                                sample_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
89
+                            else:
90
+                                # for PHASED
91
+                                sample_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
92
+                            # list of PL : [prob of 0/0, prob of 0/1, prob of 1/1]
93
+                            sample_likelihood =  sample.split(':')[1].split(',')
94
+                            sample_likelihood = [pow(10, -int(sample_likelihood[0])/10),
95
+                                                 pow(10, -int(sample_likelihood[1])/10), pow(10, -int(sample_likelihood[2])/10)]
96
+                            GENOTYPE += sample_genotype
97
+                            LIKELIHOOD.append(sample_likelihood)
98
+                        # from log phred score to probability of error, E
99
+                        #LIKELIHOOD = pow(10, -int(LIKELIHOOD[0])/10)
100
+                        #print(LIKELIHOOD)
101
+                        entries = {
102
+                            'POS':POS,
103
+                            'CHR':CHROM,
104
+                            'FIELDS':FIELDS,
105
+                            'REF':REF,
106
+                            'ALT':ALT,
107
+                            'FORMAT':FORMAT,
108
+                            'INFOS':INFOS,
109
+                            'SAMPLES':SAMPLES,
110
+                            'QUALITY':QUALITY,
111
+                            'GENOTYPE':GENOTYPE,
112
+                            'LIKELIHOOD':LIKELIHOOD
113
+                        }
114
+                        if CHROM.startswith(chr_starts_with):
115
+                        # keep if chr name starts with filter
116
+                        # default : *, every chr is kept
117
+                            if CHROM not in chrom:
118
+                                 chrom[CHROM] = {}
119
+                            chrom[CHROM][POS] = entries
120
+                byte_line = inputgz.readline()
121
+    end = time.time()
122
+    print("Parsed", nb_site, "sites in", str(datetime.timedelta(seconds=end - start)))
123
+
124
+    return chrom
125
+
126
+def build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit = False, normalize = False, xlim=None):
127
+    """ Take infos out of parsing to build a coverage matrix of probability 
128
+    of error, E, at each position
129
+    """
130
+    # last pos without any missing data ./.:.
131
+    last_pos = list(entries.keys())[-1]
132
+    # last genotyped SNP with missing data, usually bigger than last_pos
133
+    last_genotyped = noGenotype[-1]
134
+    mat = []
135
+    if diploid:
136
+        # k=N if diploids, k = 2N if haploids 
137
+        k = int(len(entries[last_pos]['GENOTYPE'])/2)
138
+    else:
139
+        k = len(entries[last_pos]['GENOTYPE'])
140
+    for _ in range(k):
141
+        mat.append([])
142
+    #display_max = last_pos
143
+    if xlim:
144
+        display_max = xlim
145
+    else:
146
+        if na_omit:
147
+            display_max = last_pos
148
+        else:
149
+            display_max = last_genotyped
150
+    for pos in range(display_max):
151
+        if pos in noGenotype or pos not in entries.keys():
152
+            if na_omit:
153
+                continue
154
+            else:
155
+                for k in range(len(mat)):
156
+                    # if missing, prob=1, worst case
157
+                    mat[k].append(1)
158
+        else:
159
+            for k in range(len(mat)):
160
+                best_prob = min(entries[pos]['LIKELIHOOD'][k])
161
+                #print(ind, best_prob)
162
+                mat[k].append(best_prob)
163
+                    
164
+    mat = np.array(mat)
165
+    if normalize:
166
+        row_sums = mat.sum(axis=1)
167
+        mat = mat / row_sums[:, np.newaxis]
168
+    return mat
169
+
170
+def genotyping_continuity_plot(vcf_entries, verbose=False):
171
+    last_pos = int(sorted(list(vcf_entries.keys()))[-1])
172
+    x = 0
173
+    y = 1
174
+    coords = [[], []]
175
+    print(last_pos, "sites to scan")
176
+    for k, pos in enumerate(range(last_pos)):
177
+        if verbose:
178
+            progress = round(k/int(last_pos))*100
179
+            if progress % 10 == 0:
180
+                print(progress, "%")
181
+        # if pos is genotyped
182
+        if k in vcf_entries:
183
+            y+=1
184
+        x+=1
185
+        coords[0].append(x)
186
+        coords[1].append(y)
187
+    return coords
188
+
189
+def compute_coverage(vcf_entries, verbose=False):
190
+    last_pos = list(vcf_entries.keys())[-1]
191
+    x = 0
192
+    y = 1
193
+    coords = [[], []]
194
+    print(last_pos, "sites to scan")
195
+    for entry in vcf_entries.values():
196
+        y = int(entry['INFOS']['DP'])
197
+        x = entry['POS']
198
+        coords[0].append(x)
199
+        coords[1].append(y)
200
+    return coords
201
+
202
+if __name__ == "__main__":
203
+    # check args
204
+    if len(sys.argv) !=2:
205
+        print("Need 1 arg")
206
+        exit(0)
207
+    # main
208
+    vcf_file = sys.argv[1]
209
+
210
+    # # without missing data
211
+    # entries, noGenotype = parse_vcf(vcf_file, stop_at = 20000)
212
+    # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=True, normalize=False, xlim = None)
213
+    # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")
214
+
215
+    # # with missing data
216
+    # entries, noGenotype = parse_vcf(vcf_file, stop_at = 500)
217
+    # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=False, normalize=False, xlim = None)
218
+    # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")