#!/usr/bin/env python3
# richard / geinf

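"""
Compute a folded site frequency spectrum (SFS) from a VCF file.

Caution: at the moment this works for gzipped VCF files only.

ARGS
--------

usage: vcf_to_sfs.py VCF.gz nb_indiv

    VCF.gz    path to the gzip-compressed VCF file
    nb_indiv  number of individuals

"""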

import gzip
import sys
from collections import Counter

# Script-level settings.
# Only the folded spectrum is implemented; `diploid` is only consulted to size
# the (not yet implemented) unfolded spectrum, and `phased` is informational:
# both "/" and "|" genotype separators are accepted when parsing.
folded = True
diploid = True
phased = False


def _parse_alleles(sample_field):
    """Return the called allele indices found in one VCF sample column.

    The genotype is taken as the first ':'-separated sub-field.  Missing
    alleles ('.') and empty strings are dropped, so './.' yields [].
    """
    genotype = sample_field.split(":", 1)[0]
    # Normalise the phased separator so "0|1" parses like "0/1".
    calls = genotype.replace("|", "/").split("/")
    return [int(a) for a in calls if a not in ("", ".")]


def count_sfs(lines, n, fold=True):
    """Build the site frequency spectrum from an iterable of VCF text lines.

    Parameters
    ----------
    lines : iterable of str
        Lines of a VCF file; lines starting with '#' and blank lines are
        ignored.  Sample columns are the tab-separated fields from the 10th on.
    n : int
        Number of frequency classes; the result has keys 0 .. n-1.
    fold : bool
        When true, each polymorphic site adds 1 to the class
        ``(count of the rarest allele) - 1``.  The unfolded spectrum is not
        implemented: with ``fold=False`` every class stays at 0.

    Returns
    -------
    dict
        Mapping class index -> number of sites.

    Raises
    ------
    KeyError
        If the rarest-allele count of a site exceeds ``n``.
    ValueError
        If a genotype contains a non-integer allele other than '.'.
    """
    sfs = dict.fromkeys(range(n), 0)
    for raw in lines:
        line = raw.strip()
        # Skip blank lines as well as meta-information and header lines.
        if not line or line.startswith("#"):
            continue
        allele_counts = Counter()
        for sample in line.split("\t")[9:]:
            allele_counts.update(_parse_alleles(sample))
        # Fewer than two distinct alleles: the site is monomorphic or has no
        # called genotype at all, so it carries no frequency information.
        if len(allele_counts) < 2:
            continue
        if fold:
            sfs[min(allele_counts.values()) - 1] += 1
    return sfs


def main(argv):
    """Run the script; ``argv`` is ``[program, VCF.gz, nb_indiv, ...]``."""
    if len(argv) < 3:
        sys.exit("usage : vcf_to_sfs.py VCF.gz nb_indiv")
    # PARAM : number of individuals
    n = int(argv[2])
    if diploid and not folded:
        n *= 2
    # Text mode decodes the gzipped bytes once, at the I/O boundary.
    with gzip.open(argv[1], "rt", encoding="utf-8") as inputgz:
        sfs_values = count_sfs(inputgz, n, fold=folded)
    # Print the spectrum once, after the whole file has been read.
    print(sfs_values)


if __name__ == "__main__":
    main(sys.argv)