Bladeren bron

improve gestion of already parsed vcf for other functions

tforest 2 jaren geleden
bovenliggende
commit
856e20d46e
2 gewijzigde bestanden met toevoegingen van 67 en 2 verwijderingen
  1. 61 0
      sfs_tools.py
  2. 6 2
      vcf_utils.py

+ 61 - 0
sfs_tools.py Bestand weergeven

@@ -102,6 +102,67 @@ def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, ver
102 102
         print("Pluriallelic sites =", count_pluriall)
103 103
     return SFS_values, count_pluriall
104 104
 
105
+
106
+def sfs_from_parsed_vcf(n, vcf_dict, folded = True, diploid = True, phased = False, verbose = False):
107
+
108
+    """
109
+    Generates a Site Frequency Spectrum from a gzipped VCF file format.
110
+
111
+    Parameters
112
+    ----------
113
+    n : int
114
+        Nb of individuals in sample.
115
+    vcf_file : str
116
+        SNPs in VCF file format.
117
+
118
+        Used to generate a Site Frequency Spectrum (SFS) from a VCF.
119
+
120
+    Returns
121
+    -------
122
+    dict
123
+        Site Frequency Spectrum (SFS)
124
+
125
+
126
+    """
127
+    
128
+    if diploid and not folded:
129
+        n *= 2
130
+    # initiate SFS_values with a zeros dict
131
+    SFS_values = dict.fromkeys(range(n),0)
132
+    count_pluriall = 0
133
+
134
+    for CHROM in vcf_dict:
135
+        for SNP in vcf_dict[CHROM]:
136
+            snp_genotypes = []
137
+            allele_counts = {}
138
+            allele_counts_list = []
139
+            print(CHROM, SNP)
140
+            for sample in vcf_dict[CHROM][SNP]["SAMPLES"]:
141
+                if not phased:
142
+                    # for UNPHASED data
143
+                    smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
144
+                else:
145
+                    # for PHASED
146
+                    smpl_genotype = [int(a) for a in sample.split(':')[0].split('|') if a != '.']
147
+                nb_alleles = set(smpl_genotype)
148
+                snp_genotypes += smpl_genotype
149
+            # skip if all individuals have the same genotype
150
+            if len(set(snp_genotypes)) == 1:
151
+                continue
152
+            for k in set(snp_genotypes):
153
+                allele_counts[snp_genotypes.count(k)] = k
154
+                allele_counts_list.append(snp_genotypes.count(k))
155
+            SFS_values[min(allele_counts_list)-1] += 1
156
+        # sum pluriall counts for this CHR to the rest
157
+        count_pluriall += vcf_dict[CHROM]['NB_PLURIALL']
158
+                    
159
+    if verbose:
160
+        print("SFS=", SFS_values)
161
+    print("Pluriallelic sites =", count_pluriall)
162
+    
163
+    return SFS_values, count_pluriall
164
+
165
+
105 166
 def barplot_sfs(sfs, folded=True, title = "Barplot"):
106 167
     sfs_val = []
107 168
     n = len(sfs.values())

+ 6 - 2
vcf_utils.py Bestand weergeven

@@ -105,6 +105,7 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
105 105
                         entries = {
106 106
                             'POS':POS,
107 107
                             'CHR':CHROM,
108
+                            # removed to save some space
108 109
                             #'FIELDS':FIELDS,
109 110
                             'REF':REF,
110 111
                             'ALT':ALT,
@@ -113,13 +114,16 @@ def parse_vcf(vcf_file, phased=False, stop_at=None, chr_starts_with="*"):
113 114
                             'SAMPLES':SAMPLES,
114 115
                             'QUALITY':QUALITY,
115 116
                             'GENOTYPE':GENOTYPE,
116
-                            'LIKELIHOOD':LIKELIHOOD
117
+                            'LIKELIHOOD':LIKELIHOOD,
118
+                            'NB_PLURIALL':pluriall_counts
117 119
                         }
118 120
                         if CHROM.startswith(chr_starts_with):
119 121
                         # keep if chr name starts with filter
120 122
                         # default : *, every chr is kept
121 123
                             if CHROM not in chrom:
122
-                                 chrom[CHROM] = {}
124
+                                # resent when changing chrom
125
+                                pluriall_counts = 0
126
+                                chrom[CHROM] = {}
123 127
                             chrom[CHROM][POS] = entries
124 128
                 byte_line = inputgz.readline()
125 129
     end = time.time()