#!/usr/bin/env python3 """ FOREST Thomas (thomas.forest@college-de-france.fr) Caution : At the moment for gzipped files only. ARGS -------- standalone usage : vcf_to_sfs.py VCF.gz nb_indiv """ import gzip import sys import matplotlib.pyplot as plt def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False): """ Generates a Site Frequency Spectrum from a gzipped VCF file format. Parameters ---------- n : int Nb of individuals in sample. vcf_file : str SNPs in VCF file format. Used to generate a Site Frequency Spectrum (SFS) from a VCF. Returns ------- dict Site Frequency Spectrum (SFS) """ if diploid and not folded: n *= 2 # initiate SFS_values with a zeros dict SFS_values = dict.fromkeys(range(n),0) count_pluriall = 0 with gzip.open(vcf_file, "rb") as inputgz: line = inputgz.readline() genotypes = [] print("Parsing VCF", vcf_file, "... Please wait...") while line: # decode gzipped binary lines line = line.decode('utf-8').strip() # every snp line, not comment or header if not line.startswith("##") and not line.startswith("#"): FIELDS = line.split("\t") # REF is col 4 of VCF REF = FIELDS[3].split(",") # ALT is col 5 of VCF ALT = FIELDS[4].split(",") FORMAT = line.split("\t")[8:9] SAMPLES = line.split("\t")[9:] snp_genotypes = [] allele_counts = {} allele_counts_list = [] # SKIP the SNP if : # 1 : missing # 2 : deletion among REF # 3 : deletion among ALT if "./.:." in line \ or len(ALT[0]) > 1 \ or len(REF[0]) > 1: line = inputgz.readline() continue for sample in SAMPLES: if not phased: # for UNPHASED data smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.'] else: # for PHASED smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.'] nb_alleles = set(smpl_genotype) snp_genotypes += smpl_genotype # skip if all individuals have the same genotype if len(set(snp_genotypes)) == 1: line = inputgz.readline() continue for k in set(snp_genotypes): allele_counts[snp_genotypes.count(k)] = k allele_counts_list.append(snp_genotypes.count(k)) if folded and len(ALT) >= 2: #pass count_pluriall +=1 # TODO - work in progress # for al in range(len(ALT)-1): # SFS_values[min(allele_counts_list)-1] += 1/len(ALT) # allele_counts_list.remove(min(allele_counts_list)) else: SFS_values[min(allele_counts_list)-1] += 1 line = inputgz.readline() if verbose: print("SFS=", SFS_values) print("Pluriallelic sites =", count_pluriall) return SFS_values def barplot_sfs(sfs, folded=True, title = "Barplot"): sfs_val = [] n = len(sfs.values()) for k in range(1, n): ksi = list(sfs.values())[k-1] # k+1 because k starts from 0 if folded: sfs_val.append(ksi * k * (n - k)) else: sfs_val.append(ksi * k) #terminal case, same for folded or unfolded sfs_val.append(list(sfs.values())[n-1] * n) #build the plot title = title+" [folded="+str(folded)+"]" plt.title(title) plt.bar(sfs.keys(), sfs_val) plt.show() if __name__ == "__main__": if len(sys.argv) != 3: print("Need 2 args") exit(0) # PARAM : Nb of indiv n = int(sys.argv[2]) sfs = sfs_from_vcf(n, sys.argv[1], folded = True, diploid = True, phased = False) print(sfs)