3 years ago · 7a08bb7b41
--- a/__init__.py
+++ b/__init__.py
@@ -0,0 +1,3 @@
 
				+from frst import sfs_tools
			
 
				+from frst import customgraphics
			
 
				+from frst import vcf_utils
			
--- a/compile.sh
+++ b/compile.sh
@@ -1,2 +0,0 @@
 
				-#!/bin/sh
			
 
				-gcc -Wall -pthread vcf_to_sfs.c -lm -lz -std=c99 -Wextra -o vcf_to_sfs
			
--- a/customgraphics.py
+++ b/customgraphics.py
@@ -0,0 +1,196 @@
 
				+""" Custom graphics lib for pop gen or genomics
			
 
				+
			
 
				+FOREST Thomas (thomas.forest@college-de-france.fr)
			
 
				+
			
 
				+
			
 
				+
			
 
				+"""
			
 
				+
			
 
				+import matplotlib.pyplot as plt
			
 
				+import matplotlib.ticker as ticker
			
 
				+import numpy as np
			
 
				+from frst import vcf_utils
			
 
				+
			
 
				+def heatmap(data, row_labels=None, col_labels=None, ax=None,
			
 
				+            cbar_kw={}, cbarlabel="", **kwargs):
			
 
				+    """
			
 
				+    Create a heatmap from a numpy array and two lists of labels.
			
 
				+    (from the matplotlib doc)
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    data
			
 
				+        A 2D numpy array of shape (M, N).
			
 
				+    row_labels
			
 
				+        A list or array of length M with the labels for the rows.
			
 
				+    col_labels
			
 
				+        A list or array of length N with the labels for the columns.
			
 
				+    ax
			
 
				+        A `matplotlib.axes.Axes` instance to which the heatmap is plotted.  If
			
 
				+        not provided, use current axes or create a new one.  Optional.
			
 
				+    cbar_kw
			
 
				+        A dictionary with arguments to `matplotlib.Figure.colorbar`.  Optional.
			
 
				+    cbarlabel
			
 
				+        The label for the colorbar.  Optional.
			
 
				+    **kwargs
			
 
				+        All other arguments are forwarded to `imshow`.
			
 
				+    """
			
 
				+
			
 
				+    if not ax:
			
 
				+        ax = plt.gca()
			
 
				+
			
 
				+    # Plot the heatmap
			
 
				+    im = ax.imshow(data, **kwargs)
			
 
				+
			
 
				+    # Create colorbar
			
 
				+    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
			
 
				+    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
			
 
				+
			
 
				+    # Show all ticks and label them with the respective list entries.
			
 
				+    if col_labels:
			
 
				+        ax.set_xticks(col_labels)
			
 
				+    if row_labels:
			
 
				+        ax.set_yticks(row_labels)
			
 
				+    
			
 
				+    # Let the horizontal axes labeling appear on top.
			
 
				+    ax.tick_params(top=True, bottom=False,
			
 
				+                   labeltop=True, labelbottom=False)
			
 
				+
			
 
				+    # Rotate the tick labels and set their alignment.
			
 
				+    plt.setp(ax.get_xticklabels(), rotation=-30, ha="right",
			
 
				+             rotation_mode="anchor")
			
 
				+
			
 
				+    # Turn spines off and create white grid.
			
 
				+    ax.spines[:].set_visible(False)
			
 
				+
			
 
				+    ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True)
			
 
				+    ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True)
			
 
				+    ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
			
 
				+    ax.tick_params(which="minor", bottom=False, left=False)
			
 
				+
			
 
				+    return im, cbar
			
 
				+
			
 
				+def annotate_heatmap(im, data=None, valfmt="{x:.2f}",
			
 
				+                     textcolors=("black", "white"),
			
 
				+                     threshold=None, **textkw):
			
 
				+    """
			
 
				+    A function to annotate a heatmap.
			
 
				+     (from the matplotlib doc)
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    im
			
 
				+        The AxesImage to be labeled.
			
 
				+    data
			
 
				+        Data used to annotate.  If None, the image's data is used.  Optional.
			
 
				+    valfmt
			
 
				+        The format of the annotations inside the heatmap.  This should either
			
 
				+        use the string format method, e.g. "$ {x:.2f}", or be a
			
 
				+        `matplotlib.ticker.Formatter`.  Optional.
			
 
				+    textcolors
			
 
				+        A pair of colors.  The first is used for values below a threshold,
			
 
				+        the second for those above.  Optional.
			
 
				+    threshold
			
 
				+        Value in data units according to which the colors from textcolors are
			
 
				+        applied.  If None (the default) uses the middle of the colormap as
			
 
				+        separation.  Optional.
			
 
				+    **kwargs
			
 
				+        All other arguments are forwarded to each call to `text` used to create
			
 
				+        the text labels.
			
 
				+    """
			
 
				+
			
 
				+    if not isinstance(data, (list, np.ndarray)):
			
 
				+        data = im.get_array()
			
 
				+
			
 
				+    # Normalize the threshold to the images color range.
			
 
				+    if threshold is not None:
			
 
				+        threshold = im.norm(threshold)
			
 
				+    else:
			
 
				+        threshold = im.norm(data.max())/2.
			
 
				+
			
 
				+    # Set default alignment to center, but allow it to be
			
 
				+    # overwritten by textkw.
			
 
				+    kw = dict(horizontalalignment="center",
			
 
				+              verticalalignment="center")
			
 
				+    kw.update(textkw)
			
 
				+
			
 
				+    # Get the formatter in case a string is supplied
			
 
				+    if isinstance(valfmt, str):
			
 
				+        valfmt = ticker.StrMethodFormatter(valfmt)
			
 
				+
			
 
				+    # Loop over the data and create a `Text` for each "pixel".
			
 
				+    # Change the text's color depending on the data.
			
 
				+    texts = []
			
 
				+    for i in range(data.shape[0]):
			
 
				+        for j in range(data.shape[1]):
			
 
				+            kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
			
 
				+            text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
			
 
				+            texts.append(text)
			
 
				+
			
 
				+    return texts
			
 
				+
			
 
				+def plot_matrix(mat, legend=None, color_scale_type="YlGn", cbarlabel = "qt", title=None):
			
 
				+         
			
 
				+    fig, ax = plt.subplots(figsize=(10,8))
			
 
				+    if legend:
			
 
				+        row_labels = [k for k in range(len(mat))]
			
 
				+        col_labels = [k for k in range(len(mat[0]))]
			
 
				+        im, cbar = heatmap(mat, row_labels, col_labels, ax=ax,
			
 
				+                           cmap=color_scale_type, cbarlabel=cbarlabel)
			
 
				+    else:
			
 
				+        im, cbar = heatmap(mat, ax=ax,
			
 
				+                           cmap=color_scale_type, cbarlabel=cbarlabel)
			
 
				+    #texts = annotate_heatmap(im, valfmt="{x:.5f}")
			
 
				+    if title:
			
 
				+        ax.set_title(title)
			
 
				+    fig.tight_layout()
			
 
				+    plt.show()
			
 
				+
			
 
				+def plot(x, y, outfile = None, outfolder = None, ylab=None, xlab=None, title=None):
			
 
				+    plt.plot(x, y)
			
 
				+    if ylab:
			
 
				+        plt.ylabel(ylab)
			
 
				+    if xlab:
			
 
				+        plt.xlabel(xlab)
			
 
				+    if title:
			
 
				+        plt.title(title)
			
 
				+    if outfile:
			
 
				+        plt.savefig(outfile)
			
 
				+    else:
			
 
				+        plt.show()
			
 
				+
			
 
				+def scatter(x, y, ylab=None, xlab=None, title=None):
			
 
				+    plt.scatter(x, y)
			
 
				+    if ylab:
			
 
				+        plt.ylabel(ylab)
			
 
				+    if xlab:
			
 
				+        plt.xlabel(xlab)
			
 
				+    if title:
			
 
				+        plt.title(title)
			
 
				+    plt.show()
			
 
				+
			
 
				+def barplot(x, y, ylab=None, xlab=None, title=None):
			
 
				+    plt.bar(x, y)
			
 
				+    if ylab:
			
 
				+        plt.ylabel(ylab)
			
 
				+    if xlab:
			
 
				+        plt.xlabel(xlab)
			
 
				+    if title:
			
 
				+        plt.title(title)
			
 
				+    plt.show()
			
 
				+
			
 
				+def plot_chrom_continuity(vcf_entries, chr_id, outfile = None, outfolder = None):
			
 
				+    chr_name = list(vcf_entries.keys())[chr_id]
			
 
				+    chr_entries = vcf_entries[chr_name]
			
 
				+    genotyped_pos = vcf_utils.genotyping_continuity_plot(chr_entries)
			
 
				+    plot(genotyped_pos[0], genotyped_pos[1], ylab = "genotyped pos.",
			
 
				+         xlab = "pos. in ref.",
			
 
				+         title = "Genotyped pos in chr "+str(chr_id+1)+":'"+chr_name+"'",
			
 
				+         outfile = outfile, outfolder = outfolder)
			
 
				+
			
 
				+def plot_chrom_coverage(vcf_entries, chr_id):
			
 
				+    chr_name = list(vcf_entries.keys())[chr_id]
			
 
				+    chr_entries = vcf_entries[chr_name]
			
 
				+    coverage = vcf_utils.compute_coverage(chr_entries)
			
 
				+    barplot(coverage[0], coverage[1], ylab = "coverage (X)",
			
 
				+            xlab = "pos. in ref.",
			
 
				+            title = "Coverage for chr "+str(chr_id+1)+":'"+chr_name+"'")
			
--- a/sfs_tools.py
+++ b/sfs_tools.py
@@ -0,0 +1,133 @@
 
				+#!/usr/bin/env python3
			
 
				+
			
 
				+"""
			
 
				+FOREST Thomas (thomas.forest@college-de-france.fr)
			
 
				+
			
 
				+Caution : At the moment for gzipped files only.
			
 
				+
			
 
				+ARGS
			
 
				+--------
			
 
				+
			
 
				+standalone usage : sfs_tools.py VCF.gz nb_indiv
			
 
				+
			
 
				+"""
			
 
				+
			
 
				+import gzip
			
 
				+import sys
			
 
				+import matplotlib.pyplot as plt
			
 
				+
			
 
				+def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
			
 
				+
			
 
				+    """
			
 
				+    Multiplication de deux nombres entiers.
			
 
				+    Cette fonction ne sert pas à grand chose.
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    n : int
			
 
				+        Nb of individuals in sample.
			
 
				+    vcf_file : str
			
 
				+        SNPs in VCF file format.
			
 
				+
			
 
				+        Used to generate a Site Frequency Spectrum (SFS) from a VCF.
			
 
				+
			
 
				+    Returns
			
 
				+    -------
			
 
				+    dict
			
 
				+        Site Frequency Spectrum (SFS)
			
 
				+
			
 
				+
			
 
				+    """
			
 
				+    
			
 
				+    if diploid and not folded:
			
 
				+        n *= 2
			
 
				+    # initiate SFS_values with a zeros dict
			
 
				+    SFS_values = dict.fromkeys(range(n),0)
			
 
				+    count_pluriall = 0
			
 
				+    with gzip.open(vcf_file, "rb") as inputgz:
			
 
				+        line = inputgz.readline()
			
 
				+        genotypes = []
			
 
				+        print("Parsing VCF", vcf_file, "... Please wait...")
			
 
				+        while line:
			
 
				+            # decode gzipped binary lines
			
 
				+            line = line.decode('utf-8').strip()
			
 
				+            # every snp line, not comment or header
			
 
				+            if not line.startswith("##") and not line.startswith("#"):
			
 
				+                FIELDS = line.split("\t")
			
 
				+                # REF is col 4 of VCF
			
 
				+                REF = FIELDS[3].split(",")
			
 
				+                # ALT is col 5 of VCF
			
 
				+                ALT = FIELDS[4].split(",")            
			
 
				+                FORMAT = line.split("\t")[8:9]
			
 
				+                SAMPLES = line.split("\t")[9:]
			
 
				+                snp_genotypes = []
			
 
				+                allele_counts = {}
			
 
				+                allele_counts_list = []
			
 
				+                # SKIP the SNP if :
			
 
				+                # 1 : missing
			
 
				+                # 2 : deletion among REF
			
 
				+                # 3 : deletion among ALT
			
 
				+                if "./.:." in line \
			
 
				+                   or len(ALT[0]) > 1 \
			
 
				+                   or len(REF[0]) > 1:
			
 
				+                    line = inputgz.readline()
			
 
				+                    continue
			
 
				+                for sample in SAMPLES:
			
 
				+                    if not phased:
			
 
				+                        # for UNPHASED data
			
 
				+                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
			
 
				+                    else:
			
 
				+                        # for PHASED
			
 
				+                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
			
 
				+                    nb_alleles = set(smpl_genotype)
			
 
				+                    snp_genotypes += smpl_genotype
			
 
				+                # skip if all individuals have the same genotype
			
 
				+                if len(set(snp_genotypes)) == 1:
			
 
				+                    line = inputgz.readline()
			
 
				+                    continue
			
 
				+                for k in set(snp_genotypes):
			
 
				+                    allele_counts[snp_genotypes.count(k)] = k
			
 
				+                    allele_counts_list.append(snp_genotypes.count(k))
			
 
				+                if folded and len(ALT) >= 2:
			
 
				+                    #pass
			
 
				+                    count_pluriall +=1
			
 
				+                    # TODO - work in progress
			
 
				+                    # for al in range(len(ALT)-1):
			
 
				+                    #     SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
			
 
				+                    #     allele_counts_list.remove(min(allele_counts_list))
			
 
				+                else:
			
 
				+                    SFS_values[min(allele_counts_list)-1] += 1
			
 
				+            line = inputgz.readline()
			
 
				+            if verbose:
			
 
				+                print("SFS=", SFS_values)
			
 
				+        print("Pluriallelic sites =", count_pluriall)
			
 
				+    return SFS_values
			
 
				+
			
 
				+def barplot_sfs(sfs, folded=True, title = "Barplot"):
			
 
				+    sfs_val = []
			
 
				+    n = len(sfs.values())
			
 
				+    for k in range(1, n):
			
 
				+        ksi = list(sfs.values())[k-1]
			
 
				+        # k+1 because k starts from 0
			
 
				+        if folded:
			
 
				+            sfs_val.append(ksi * k * (n - k))
			
 
				+        else:
			
 
				+            sfs_val.append(ksi * k)
			
 
				+    #terminal case, same for folded or unfolded
			
 
				+    sfs_val.append(list(sfs.values())[n-1] * n)
			
 
				+    #build the plot
			
 
				+    title = title+" [folded="+str(folded)+"]"
			
 
				+    plt.title(title)
			
 
				+    plt.bar(sfs.keys(), sfs_val)
			
 
				+    plt.show()
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+            
			
 
				+    if len(sys.argv) != 3:
			
 
				+        print("Need 2 args")
			
 
				+        exit(0)
			
 
				+
			
 
				+    # PARAM : Nb of indiv
			
 
				+    n = int(sys.argv[2])
			
 
				+    sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False)
			
 
				+    print(sfs)
			
--- a/vcf_to_sfs.c
+++ b/vcf_to_sfs.c
@@ -1,123 +0,0 @@
 
				-# include <stdio.h>
			
 
				-# include <stdlib.h>
			
 
				-# include <zlib.h>
			
 
				-#include <string.h>
			
 
				-#include <stdbool.h>
			
 
				-
			
 
				-bool StartsWith(const char *a, const char *b)
			
 
				-{
			
 
				-   if(strncmp(a, b, strlen(b)) == 0) return 1;
			
 
				-   return 0;
			
 
				-}
			
 
				-
			
 
				-void slice_str(const char * str, char * buffer, size_t start, size_t end)
			
 
				-{
			
 
				-    size_t j = 0;
			
 
				-    for ( size_t i = start; i <= end; ++i ) {
			
 
				-        buffer[j++] = str[i];
			
 
				-    }
			
 
				-    buffer[j] = 0;
			
 
				-}
			
 
				-
			
 
				-int min(int * array, int size){
			
 
				-    //Consider first element as smallest
			
 
				-   int smallest = array[0];
			
 
				-   int i;
			
 
				-   for (i = 0; i < num; i++) {
			
 
				-      if (a[i] < smallest) {
			
 
				-         smallest = a[i];
			
 
				-      }
			
 
				-   }
			
 
				-}
			
 
				-
			
 
				-int countDistinct(int a[], int n)      //Function Definition
			
 
				-{
			
 
				-   int i, j, count = 0;
			
 
				-   //Traverse the array
			
 
				-   for (i = 1; i < n; i++)      //hold an array element
			
 
				-   {
			
 
				-      for (j = 0; j < i; j++)   
			
 
				-      {
			
 
				-         if (a[i] == a[j])    //Check for duplicate elements
			
 
				-         {
			
 
				-            break;             //If duplicate elements found then break
			
 
				-         }
			
 
				-      }
			
 
				-      if (i == j)
			
 
				-      {
			
 
				-         count++;     //increment the number of distinct elements
			
 
				-      }
			
 
				-   }
			
 
				-   return count;      //Return the number of distinct elements
			
 
				-}
			
 
				-
			
 
				-# define LL 8192   /* line length maximum */
			
 
				-# define DIPLOID true
			
 
				-# define FOLDED true
			
 
				-# define IGNORED_FIELDS 9
			
 
				-
			
 
				-int main ( int argc, char *argv[] ){
			
 
				-    if ( argc < 3) {
			
 
				-	printf("Need 2 args!\n");
			
 
				-	return 1;
			
 
				-    }
			
 
				-    gzFile fp;
			
 
				-    char line[LL];
			
 
				-    int N;
			
 
				-    char delim[] = "\t";
			
 
				-    fp = gzopen( argv[1], "r" );
			
 
				-
			
 
				-    // pop of size 2N when diploid
			
 
				-    if (DIPLOID == true && FOLDED == false) {
			
 
				-	N = 2 * atoi(argv[2]);
			
 
				-	    } else {
			
 
				-	N = atoi(argv[2]);
			
 
				-    }
			
 
				-
			
 
				-    int snp_genotypes[N];
			
 
				-    int SFS_values[N];
			
 
				-
			
 
				-    gzgets( fp, line, LL );
			
 
				-    while ( ! gzeof( fp ) ){
			
 
				-	int k = 0;
			
 
				-	if ( StartsWith(line, "##") || ( StartsWith(line, "#") ) || (strstr(line, "./.:.") != NULL)){
			
 
				-	    gzgets( fp, line, LL );
			
 
				-	    continue;
			
 
				-	}
			
 
				-	
			
 
				-	char *vcf_field = strtok(line, delim);
			
 
				-	while(vcf_field != NULL){
			
 
				-	    k++;
			
 
				-	    if (k > IGNORED_FIELDS) {
			
 
				-		const size_t len = strlen(vcf_field);
			
 
				-		char buffer[len + 1];
			
 
				-		//printf("'%s'\n", ptr);
			
 
				-		slice_str(vcf_field, buffer, 0, 0);
			
 
				-		//printf("%d %s      ", N, buffer);
			
 
				-		snp_genotypes[k-IGNORED_FIELDS] = atoi(buffer);
			
 
				-		//printf("%d ", smpl_genotype[k-9]);
			
 
				-	    }
			
 
				-	    vcf_field = strtok(NULL, delim);
			
 
				-	    int c= countDistinct(snp_genotypes, N);
			
 
				-            // skip if all individuals have the same genotype
			
 
				-	    if (c == 1) {
			
 
				-		continue;
			
 
				-		gzgets( fp, line, LL );
			
 
				-	    }
			
 
				-	    /* int i; */
			
 
				-	    /* for (i = 1; i < N; ++i) */
			
 
				-	    /* 	{ */
			
 
				-	    /* 	    printf("%d ", snp_genotypes[i]); */
			
 
				-	    /* 	} */
			
 
				-	    int allele_counts[c];
			
 
				-	    
			
 
				-	    min(allele_counts, N);
			
 
				-	}
			
 
				-	// printf("%s", line );
			
 
				-	// loads the next line
			
 
				-	gzgets( fp, line, LL );
			
 
				-    }
			
 
				-
			
 
				-    gzclose( fp );
			
 
				-    return 0;
			
 
				-}
			
--- a/vcf_to_sfs.py
+++ b/vcf_to_sfs.py
@@ -8,74 +8,106 @@ Caution : At the moment for gzipped files only.
 
				 ARGS
			
 
				 --------
			
 
				 
			
 
				-usage : vcf_to_sfs.py VCF.gz nb_indiv
			
 
				+standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
			
 
				 
			
 
				 """
			
 
				 
			
 
				 import gzip
			
 
				 import sys
			
 
				 
			
 
				-# default folded SFS
			
 
				-folded = True
			
 
				-diploid = True
			
 
				-phased = False
			
 
				-
			
 
				-# PARAM : Nb of indiv
			
 
				-n = int(sys.argv[2])
			
 
				-
			
 
				-if diploid and not folded:
			
 
				-    n *= 2
			
 
				-# initiate SFS_values with a zeros dict
			
 
				-SFS_values = dict.fromkeys(range(n),0)
			
 
				-
			
 
				-with gzip.open(sys.argv[1], "rb") as inputgz:
			
 
				-    line = inputgz.readline()
			
 
				-    genotypes = []
			
 
				-    while line:
			
 
				-        # decode gzipped binary lines
			
 
				-        line = line.decode('utf-8').strip()
			
 
				-        # every snp line, not comment or header
			
 
				-        if not line.startswith("##") and not line.startswith("#"):
			
 
				-            FIELDS = line.split("\t")
			
 
				-            # REF is col 4 of VCF
			
 
				-            REF = FIELDS[3].split(",")
			
 
				-            # ALT is col 5 of VCF
			
 
				-            ALT = FIELDS[4].split(",")            
			
 
				-            FORMAT = line.split("\t")[8:9]
			
 
				-            SAMPLES = line.split("\t")[9:]
			
 
				-            snp_genotypes = []
			
 
				-            allele_counts = {}
			
 
				-            allele_counts_list = []
			
 
				-            # SKIP the SNP if :
			
 
				-            # 1 : missing
			
 
				-            # 2 : deletion among REF
			
 
				-            # 3 : deletion among ALT
			
 
				-            if "./.:." in line \
			
 
				-               or len(ALT[0]) > 1 \
			
 
				-               or len(REF[0]) > 1:
			
 
				-                line = inputgz.readline()
			
 
				-                continue
			
 
				-            for sample in SAMPLES:
			
 
				-                if not phased:
			
 
				-                    # for UNPHASED data
			
 
				-                    smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
			
 
				-                else:
			
 
				-                    # for PHASED
			
 
				-                    smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
			
 
				-                nb_alleles = set(smpl_genotype)
			
 
				-                snp_genotypes += smpl_genotype
			
 
				-            # skip if all individuals have the same genotype
			
 
				-            if len(set(snp_genotypes)) == 1:
			
 
				-                line = inputgz.readline()
			
 
				-                continue
			
 
				-            for k in set(snp_genotypes):
			
 
				-                allele_counts[snp_genotypes.count(k)] = k
			
 
				-                allele_counts_list.append(snp_genotypes.count(k))
			
 
				-            if folded and len(ALT) >= 2:
			
 
				-                for al in range(len(ALT)-1):
			
 
				-                    SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
			
 
				-                    allele_counts_list.remove(min(allele_counts_list))
			
 
				-            else:
			
 
				-                SFS_values[min(allele_counts_list)-1] += 1
			
 
				+def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
			
 
				+
			
 
				+    """
			
 
				+    Multiplication de deux nombres entiers.
			
 
				+    Cette fonction ne sert pas à grand chose.
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    n : int
			
 
				+        Nb of individuals in sample.
			
 
				+    vcf_file : str
			
 
				+        SNPs in VCF file format.
			
 
				+
			
 
				+        Used to generate a Site Frequency Spectrum (SFS) from a VCF.
			
 
				+
			
 
				+    Returns
			
 
				+    -------
			
 
				+    dict
			
 
				+        Site Frequency Spectrum (SFS)
			
 
				+
			
 
				+
			
 
				+    """
			
 
				+    
			
 
				+    if diploid and not folded:
			
 
				+        n *= 2
			
 
				+    # initiate SFS_values with a zeros dict
			
 
				+    SFS_values = dict.fromkeys(range(n),0)
			
 
				+
			
 
				+    with gzip.open(vcf_file, "rb") as inputgz:
			
 
				         line = inputgz.readline()
			
 
				-        print(SFS_values)
			
 
				+        genotypes = []
			
 
				+        print("Parsing VCF", vcf_file, "... Please wait...")
			
 
				+        while line:
			
 
				+            # decode gzipped binary lines
			
 
				+            line = line.decode('utf-8').strip()
			
 
				+            # every snp line, not comment or header
			
 
				+            if not line.startswith("##") and not line.startswith("#"):
			
 
				+                FIELDS = line.split("\t")
			
 
				+                # REF is col 4 of VCF
			
 
				+                REF = FIELDS[3].split(",")
			
 
				+                # ALT is col 5 of VCF
			
 
				+                ALT = FIELDS[4].split(",")            
			
 
				+                FORMAT = line.split("\t")[8:9]
			
 
				+                SAMPLES = line.split("\t")[9:]
			
 
				+                snp_genotypes = []
			
 
				+                allele_counts = {}
			
 
				+                allele_counts_list = []
			
 
				+                # SKIP the SNP if :
			
 
				+                # 1 : missing
			
 
				+                # 2 : deletion among REF
			
 
				+                # 3 : deletion among ALT
			
 
				+                if "./.:." in line \
			
 
				+                   or len(ALT[0]) > 1 \
			
 
				+                   or len(REF[0]) > 1:
			
 
				+                    line = inputgz.readline()
			
 
				+                    continue
			
 
				+                for sample in SAMPLES:
			
 
				+                    if not phased:
			
 
				+                        # for UNPHASED data
			
 
				+                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
			
 
				+                    else:
			
 
				+                        # for PHASED
			
 
				+                        smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
			
 
				+                    nb_alleles = set(smpl_genotype)
			
 
				+                    snp_genotypes += smpl_genotype
			
 
				+                # skip if all individuals have the same genotype
			
 
				+                if len(set(snp_genotypes)) == 1:
			
 
				+                    line = inputgz.readline()
			
 
				+                    continue
			
 
				+                for k in set(snp_genotypes):
			
 
				+                    allele_counts[snp_genotypes.count(k)] = k
			
 
				+                    allele_counts_list.append(snp_genotypes.count(k))
			
 
				+                if folded and len(ALT) >= 2:
			
 
				+                    pass
			
 
				+                    # TODO - work in progress
			
 
				+                    # for al in range(len(ALT)-1):
			
 
				+                    #     SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
			
 
				+                    #     allele_counts_list.remove(min(allele_counts_list))
			
 
				+                else:
			
 
				+                    SFS_values[min(allele_counts_list)-1] += 1
			
 
				+            line = inputgz.readline()
			
 
				+            if verbose:
			
 
				+                print(SFS_values)
			
 
				+    return SFS_values
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+            
			
 
				+    if len(sys.argv) != 3:
			
 
				+        print("Need 2 args")
			
 
				+        exit(0)
			
 
				+
			
 
				+    # PARAM : Nb of indiv
			
 
				+    n = int(sys.argv[2])
			
 
				+
			
 
				+    sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False)
			
 
				+    print(sfs)
			
--- a/vcf_utils.py
+++ b/vcf_utils.py
@@ -0,0 +1,218 @@
 
				+#!/usr/bin/env python3
			
 
				+
			
 
				+"""
			
 
				+FOREST Thomas (thomas.forest@college-de-france.fr)
			
 
				+
			
 
				+Caution : At the moment for gzipped files only.
			
 
				+
			
 
				+ARGS
			
 
				+--------
			
 
				+
			
 
				+standalone usage : vcf_utils.py VCF.gz
			
 
				+
			
 
				+"""
			
 
				+import gzip
			
 
				+import sys
			
 
				+import numpy as np
			
 
				+from frst import customgraphics
			
 
				+import json 
			
 
				+import time
			
 
				+import datetime
			
 
				+
			
 
				+def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
				+    """Parse a gzipped VCF into a ``{CHROM: {POS: entry}}`` dictionary.
				+
				+    A site is kept only when all of the following hold:
				+
				+    * the line does not contain the missing-genotype marker ``./.:.``;
				+    * REF and the first ALT allele are one character long (no indel);
				+    * a single ALT allele is listed (multiallelic sites are skipped);
				+    * the chromosome name matches ``chr_starts_with``.
				+
				+    Parameters
				+    ----------
				+    vcf_file
				+        Path to a gzip-compressed VCF file.
				+    phased
				+        If True, genotypes are split on ``|`` (phased), otherwise on ``/``.
				+    stop_at
				+        Optional position; parsing stops at the first site whose POS is
				+        greater than this value.
				+    chr_starts_with
				+        Prefix a chromosome name must start with to be kept. The default
				+        ``"*"`` is a wildcard: every chromosome is kept.
				+
				+    Returns
				+    -------
				+    dict
				+        ``chrom[CHROM][POS]`` is a dict with the keys ``POS``, ``CHR``,
				+        ``FIELDS``, ``REF``, ``ALT``, ``FORMAT``, ``INFOS``, ``SAMPLES``,
				+        ``QUALITY`` (float, or None when QUAL is ``.``), ``GENOTYPE`` (flat
				+        list of allele indices over all samples) and ``LIKELIHOOD`` (one
				+        list per sample, each PL value converted with ``10 ** (-PL / 10)``).
				+    """
				+    start = time.time()
				+    # genotype separator: '|' for phased data, '/' for unphased data
				+    allele_separator = '|' if phased else '/'
				+    # "*" cannot be handled by str.startswith, so treat it explicitly
				+    keep_every_chrom = chr_starts_with == "*"
				+    chrom = {}
				+    nb_site = 0
				+    with gzip.open(vcf_file, "rt", encoding="utf-8") as inputgz:
				+        print("Parsing VCF {} ... Please wait...".format(vcf_file))
				+        for raw_line in inputgz:
				+            line = raw_line.strip()
				+            # skip blank lines, meta-information (##) and the header (#)
				+            if not line or line.startswith("#"):
				+                continue
				+            FIELDS = line.split("\t")
				+            CHROM = FIELDS[0]
				+            POS = int(FIELDS[1])
				+            if stop_at and POS > stop_at:
				+                break
				+            # only data lines inside the requested range count as sites
				+            nb_site += 1
				+            # REF is col 4 of VCF
				+            REF = FIELDS[3].split(",")
				+            # ALT is col 5 of VCF
				+            ALT = FIELDS[4].split(",")
				+            # SKIP the SNP if :
				+            # 1 : missing genotype for at least one sample
				+            # 2 : REF longer than one base
				+            # 3 : first ALT longer than one base
				+            # 4 : more than one ALT allele (PL layout differs there)
				+            if "./.:." in line \
				+               or len(ALT[0]) > 1 \
				+               or len(REF[0]) > 1 \
				+               or len(ALT) > 1:
				+                continue
				+            # keep if chr name starts with filter (or filter is the wildcard)
				+            if not (keep_every_chrom or CHROM.startswith(chr_starts_with)):
				+                continue
				+            FORMAT = FIELDS[8:9]
				+            SAMPLES = FIELDS[9:]
				+            # QUAL may be the missing value "."
				+            QUALITY = None if FIELDS[5] == "." else float(FIELDS[5])
				+            INFOS = {}
				+            for info in FIELDS[7].split(";"):
				+                key, has_value, value = info.partition("=")
				+                # flags without "=" are stored under their own name
				+                INFOS[key] = value if has_value else info
				+            # locate PL from the FORMAT column; fall back to the second
				+            # sub-field, which is correct for the plain GT:PL layout
				+            format_keys = FORMAT[0].split(":") if FORMAT else []
				+            pl_index = format_keys.index("PL") if "PL" in format_keys else 1
				+            GENOTYPE = []
				+            LIKELIHOOD = []
				+            for sample in SAMPLES:
				+                sample_fields = sample.split(':')
				+                GENOTYPE += [int(a) for a in sample_fields[0].split(allele_separator)
				+                             if a != '.']
				+                # list of PL : [0/0, 0/1, 1/1], converted from phred scale
				+                phred_values = sample_fields[pl_index].split(',')[:3]
				+                LIKELIHOOD.append([pow(10, -int(pl) / 10) for pl in phred_values])
				+            chrom.setdefault(CHROM, {})[POS] = {
				+                'POS': POS,
				+                'CHR': CHROM,
				+                'FIELDS': FIELDS,
				+                'REF': REF,
				+                'ALT': ALT,
				+                'FORMAT': FORMAT,
				+                'INFOS': INFOS,
				+                'SAMPLES': SAMPLES,
				+                'QUALITY': QUALITY,
				+                'GENOTYPE': GENOTYPE,
				+                'LIKELIHOOD': LIKELIHOOD,
				+            }
				+    end = time.time()
				+    print("Parsed", nb_site, "sites in", str(datetime.timedelta(seconds=end - start)))
				+
				+    return chrom
			
 
				+
			
 
				+def build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit = False, normalize = False, xlim=None):
				+    """Build a (rows x positions) matrix from parsed likelihoods.
				+
				+    For every scanned position, each row receives the smallest value of
				+    the corresponding ``LIKELIHOOD`` list of that position. Positions that
				+    are listed in ``noGenotype`` or absent from ``entries`` are either
				+    skipped (``na_omit=True``) or filled with 1, the worst case.
				+
				+    Parameters
				+    ----------
				+    entries
				+        Mapping ``{POS: entry}`` where each entry has the keys
				+        ``GENOTYPE`` and ``LIKELIHOOD``. Positions are expected to be ints.
				+    noGenotype
				+        Positions that were skipped because of missing data; may be empty.
				+    diploid
				+        If True the number of rows is half the number of alleles in
				+        ``GENOTYPE``; otherwise it is the number of alleles.
				+    na_omit
				+        If True, positions without data produce no column.
				+    normalize
				+        If True, each row is divided by its sum.
				+    xlim
				+        Optional exclusive upper bound of the scanned positions
				+        (``range(xlim)``). When omitted, the scan runs up to and including
				+        the last known position.
				+
				+    Returns
				+    -------
				+    numpy.ndarray
				+        One row per individual/allele, one column per kept position.
				+
				+    Raises
				+    ------
				+    ValueError
				+        If ``entries`` is empty.
				+    """
				+    if not entries:
				+        raise ValueError("entries is empty: cannot build a coverage matrix")
				+    # set membership is O(1); the list would be rescanned at every position
				+    missing = set(noGenotype)
				+    # last pos without any missing data ./.:.
				+    last_pos = max(entries)
				+    nb_alleles = len(entries[last_pos]['GENOTYPE'])
				+    # k=N if diploids, k = 2N if haploids
				+    nb_rows = nb_alleles // 2 if diploid else nb_alleles
				+    mat = [[] for _ in range(nb_rows)]
				+    if xlim:
				+        display_max = xlim
				+    elif na_omit:
				+        # +1 so that the last genotyped position itself is scanned
				+        display_max = last_pos + 1
				+    else:
				+        # cover both the last kept site and the last missing site
				+        display_max = max(last_pos, max(missing, default=last_pos)) + 1
				+    for pos in range(display_max):
				+        if pos in missing or pos not in entries:
				+            if not na_omit:
				+                # if missing, prob=1, worst case
				+                for row in mat:
				+                    row.append(1)
				+            continue
				+        likelihoods = entries[pos]['LIKELIHOOD']
				+        for index, row in enumerate(mat):
				+            row.append(min(likelihoods[index]))
				+
				+    mat = np.array(mat)
				+    if normalize:
				+        row_sums = mat.sum(axis=1)
				+        mat = mat / row_sums[:, np.newaxis]
				+    return mat
			
 
				+
			
 
				+def genotyping_continuity_plot(vcf_entries, verbose=False):
				+    """Return the cumulative count of genotyped positions along the sequence.
				+
				+    Parameters
				+    ----------
				+    vcf_entries
				+        Mapping whose keys are the genotyped positions.
				+    verbose
				+        If True, print the progress each time a new 10 % step is reached.
				+
				+    Returns
				+    -------
				+    list
				+        ``[xs, ys]`` where ``xs`` is ``1..last_pos`` and ``ys`` is a counter
				+        that starts at 1 and is incremented whenever position ``x - 1`` is
				+        a key of ``vcf_entries``. Both lists are empty for an empty input.
				+    """
				+    # default=0 makes an empty mapping yield empty coordinates
				+    last_pos = int(max(vcf_entries, default=0))
				+    y = 1
				+    coords = [[], []]
				+    print(last_pos, "sites to scan")
				+    last_step = None
				+    for pos in range(last_pos):
				+        if verbose:
				+            # integer tenth of the scan already done (0..9)
				+            step = 10 * pos // last_pos
				+            if step != last_step:
				+                print(step * 10, "%")
				+                last_step = step
				+        # if pos is genotyped
				+        if pos in vcf_entries:
				+            y += 1
				+        coords[0].append(pos + 1)
				+        coords[1].append(y)
				+    return coords
			
 
				+
			
 
				+def compute_coverage(vcf_entries, verbose=False):
				+    """Collect the depth (INFO ``DP``) of every parsed site.
				+
				+    Parameters
				+    ----------
				+    vcf_entries
				+        Mapping of parsed sites; each value has a ``POS`` key and an
				+        ``INFOS`` dict that holds ``DP``.
				+    verbose
				+        Accepted but not used by this function.
				+
				+    Returns
				+    -------
				+    list
				+        ``[positions, depths]``, in the iteration order of ``vcf_entries``.
				+    """
				+    # key of the last inserted site, only used for the message below
				+    final_key = [*vcf_entries][-1]
				+    print(final_key, "sites to scan")
				+    positions = []
				+    depths = []
				+    for record in vcf_entries.values():
				+        # DP is stored as text in INFOS, hence the conversion
				+        depth = int(record['INFOS']['DP'])
				+        positions.append(record['POS'])
				+        depths.append(depth)
				+    return [positions, depths]
			
 
				+
			
 
				+if __name__ == "__main__":
				+    # check args: exactly one argument, the path of the gzipped VCF
				+    if len(sys.argv) != 2:
				+        # usage errors go to stderr with a non-zero exit status so that
				+        # calling scripts can detect the failure
				+        print("Need 1 arg", file=sys.stderr)
				+        sys.exit(1)
				+    # main
				+    vcf_file = sys.argv[1]
				+
				+    # NOTE: the usage examples below are kept for reference but are stale:
				+    # parse_vcf() returns a single {CHROM: {POS: entry}} dict, so the
				+    # two-value unpacking has to be adapted before they can be re-enabled.
				+
				+    # # without missing data
				+    # entries, noGenotype = parse_vcf(vcf_file, stop_at = 20000)
				+    # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=True, normalize=False, xlim = None)
				+    # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")
				+
				+    # # with missing data
				+    # entries, noGenotype = parse_vcf(vcf_file, stop_at = 500)
				+    # mat = build_polymorph_coverage_matrix(entries, noGenotype, diploid=True, na_omit=False, normalize=False, xlim = None)
				+    # customgraphics.plot_matrix(mat, color_scale_type="autumn", cbarlabel = "prob. of error, E", title="Prob. of error (E) of genotyping, at each position, lower the better")