|
@@ -102,6 +102,67 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
|
102
|
102
|
print("Pluriallelic sites =", count_pluriall)
|
103
|
103
|
return SFS_values, count_pluriall
|
104
|
104
|
|
|
105
|
+
|
|
106
|
+def sfs_from_parsed_vcf(n, vcf_dict, folded = True, diploid = True, phased = False, verbose = False):
|
|
107
|
+
|
|
108
|
+ """
|
|
109
|
+ Generates a Site Frequency Spectrum from a gzipped VCF file format.
|
|
110
|
+
|
|
111
|
+ Parameters
|
|
112
|
+ ----------
|
|
113
|
+ n : int
|
|
114
|
+ Nb of individuals in sample.
|
|
115
|
+ vcf_file : str
|
|
116
|
+ SNPs in VCF file format.
|
|
117
|
+
|
|
118
|
+ Used to generate a Site Frequency Spectrum (SFS) from a VCF.
|
|
119
|
+
|
|
120
|
+ Returns
|
|
121
|
+ -------
|
|
122
|
+ dict
|
|
123
|
+ Site Frequency Spectrum (SFS)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+ """
|
|
127
|
+
|
|
128
|
+ if diploid and not folded:
|
|
129
|
+ n *= 2
|
|
130
|
+ # initiate SFS_values with a zeros dict
|
|
131
|
+ SFS_values = dict.fromkeys(range(n),0)
|
|
132
|
+ count_pluriall = 0
|
|
133
|
+
|
|
134
|
+ for CHROM in vcf_dict:
|
|
135
|
+ for SNP in vcf_dict[CHROM]:
|
|
136
|
+ snp_genotypes = []
|
|
137
|
+ allele_counts = {}
|
|
138
|
+ allele_counts_list = []
|
|
139
|
+ print(CHROM, SNP)
|
|
140
|
+ for sample in vcf_dict[CHROM][SNP]["SAMPLES"]:
|
|
141
|
+ if not phased:
|
|
142
|
+ # for UNPHASED data
|
|
143
|
+ smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
|
|
144
|
+ else:
|
|
145
|
+ # for PHASED
|
|
146
|
+ smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
|
|
147
|
+ nb_alleles = set(smpl_genotype)
|
|
148
|
+ snp_genotypes += smpl_genotype
|
|
149
|
+ # skip if all individuals have the same genotype
|
|
150
|
+ if len(set(snp_genotypes)) == 1:
|
|
151
|
+ continue
|
|
152
|
+ for k in set(snp_genotypes):
|
|
153
|
+ allele_counts[snp_genotypes.count(k)] = k
|
|
154
|
+ allele_counts_list.append(snp_genotypes.count(k))
|
|
155
|
+ SFS_values[min(allele_counts_list)-1] += 1
|
|
156
|
+ # sum pluriall counts for this CHR to the rest
|
|
157
|
+ count_pluriall += vcf_dict[CHROM]['NB_PLURIALL']
|
|
158
|
+
|
|
159
|
+ if verbose:
|
|
160
|
+ print("SFS=", SFS_values)
|
|
161
|
+ print("Pluriallelic sites =", count_pluriall)
|
|
162
|
+
|
|
163
|
+ return SFS_values, count_pluriall
|
|
164
|
+
|
|
165
|
+
|
105
|
166
|
def barplot_sfs(sfs, folded=True, title = "Barplot"):
|
106
|
167
|
sfs_val = []
|
107
|
168
|
n = len(sfs.values())
|