#!/usr/bin/env python3 """ Caution : At the moment for gzipped files only. """ import gzip import sys # default folded SFS folded = True diploid = True # PARAM : Nb of indiv n = int(sys.argv[2]) if diploid and not folded: n *= 2 SFS_values = dict.fromkeys(range(n),0) with gzip.open(sys.argv[1], "rb") as inputgz: line = inputgz.readline() genotypes = [] #SFS_values = {} while line: line = line.decode('utf-8').strip() if not line.startswith("##") and not line.startswith("#"): FORMAT = line.split("\t")[8:9] SAMPLES = line.split("\t")[9:] snp_genotypes = [] allele_counts = {} for sample in SAMPLES: # for UNPHASED data smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.'] nb_alleles = set(smpl_genotype) snp_genotypes += smpl_genotype for k in set(snp_genotypes): allele_counts[snp_genotypes.count(k)] = k if folded : for count in allele_counts.keys(): if count <= len(snp_genotypes)/2 : SFS_values[count-1] += 1 else: SFS_values[len(snp_genotypes)-count] += 1 line = inputgz.readline() print(SFS_values)