123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- #!/usr/bin/env python3
-
- """
- Caution : At the moment for gzipped files only.
-
-
- """
-
- import gzip
- import sys
-
- # default folded SFS
- folded = True
- diploid = True
-
- # PARAM : Nb of indiv
- n = int(sys.argv[2])
-
- if diploid and not folded:
- n *= 2
-
- SFS_values = dict.fromkeys(range(n),0)
-
-
- with gzip.open(sys.argv[1], "rb") as inputgz:
- line = inputgz.readline()
- genotypes = []
- #SFS_values = {}
- while line:
- line = line.decode('utf-8').strip()
- if not line.startswith("##") and not line.startswith("#"):
- FORMAT = line.split("\t")[8:9]
- SAMPLES = line.split("\t")[9:]
- snp_genotypes = []
- allele_counts = {}
- for sample in SAMPLES:
- # for UNPHASED data
- smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
- nb_alleles = set(smpl_genotype)
- snp_genotypes += smpl_genotype
- for k in set(snp_genotypes):
- allele_counts[snp_genotypes.count(k)] = k
- if folded :
- for count in allele_counts.keys():
- if count <= len(snp_genotypes)/2 :
- SFS_values[count-1] += 1
- else:
- SFS_values[len(snp_genotypes)-count] += 1
- line = inputgz.readline()
- print(SFS_values)
|