123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- #!/usr/bin/env python3
-
- """
- FOREST Thomas (thomas.forest@college-de-france.fr)
-
- Caution : At the moment for gzipped files only.
-
- ARGS
- --------
-
- standalone usage : vcf_to_sfs.py VCF.gz nb_indiv
-
- """
-
- import gzip
- import sys
-
- def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False):
-
- """ Returns an SFS from a VCF file.
-
- Parameters
- ----------
- n : int
- Nb of individuals in sample.
- vcf_file : str
- SNPs in VCF file format.
-
- Used to generate a Site Frequency Spectrum (SFS) from a VCF.
-
- Returns
- -------
- dict
- Site Frequency Spectrum (SFS)
-
-
- """
-
- if diploid and not folded:
- n *= 2
- # initiate SFS_values with a zeros dict
- SFS_values = dict.fromkeys(range(n),0)
- # store nb polyallellic sites
- polyall = 0
- with gzip.open(vcf_file, "rb") as inputgz:
- line = inputgz.readline()
- genotypes = []
- print("Parsing VCF", vcf_file, "... Please wait...")
- while line:
- # decode gzipped binary lines
- line = line.decode('utf-8').strip()
- # every snp line, not comment or header
- if not line.startswith("##") and not line.startswith("#"):
- FIELDS = line.split("\t")
- # REF is col 4 of VCF
- REF = FIELDS[3].split(",")
- # ALT is col 5 of VCF
- ALT = FIELDS[4].split(",")
- FORMAT = line.split("\t")[8:9]
- SAMPLES = line.split("\t")[9:]
- snp_genotypes = []
- allele_counts = {}
- allele_counts_list = []
- # SKIP the SNP if :
- # 1 : missing
- # 2 : deletion among REF
- # 3 : deletion among ALT
- if "./.:." in line \
- or len(ALT[0]) > 1 \
- or len(REF[0]) > 1:
- line = inputgz.readline()
- continue
- for sample in SAMPLES:
- if not phased:
- # for UNPHASED data
- smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
- else:
- # for PHASED
- smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
- nb_alleles = set(smpl_genotype)
- snp_genotypes += smpl_genotype
- # skip if all individuals have the same genotype
- if len(set(snp_genotypes)) == 1:
- line = inputgz.readline()
- continue
- for k in set(snp_genotypes):
- allele_counts[snp_genotypes.count(k)] = k
- allele_counts_list.append(snp_genotypes.count(k))
- if folded and len(ALT) >= 2:
- polyall += 1
- else:
- SFS_values[min(allele_counts_list)-1] += 1
- line = inputgz.readline()
- if verbose:
- print(SFS_values)
- return SFS_values, polyall
-
- if __name__ == "__main__":
-
- if len(sys.argv) != 3:
- print("Need 2 args")
- exit(0)
-
- # PARAM : vcf_file
- vcf_file = sys.argv[1]
- # PARAM : Nb of indiv
- n = int(sys.argv[2])
-
- sfs, nb_polyall = sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False)
- print(sfs)
|