瀏覽代碼

first commit

tforest 2 年之前
當前提交
598fb21351
共有 1 個文件被更改,包括 41 次插入0 次删除
  1. 41 0
      vcf_to_sfs.py

+ 41 - 0
vcf_to_sfs.py 查看文件

@@ -0,0 +1,41 @@
1
+#!/usr/bin/env python3
2
+
3
+"""
4
+Caution : At the moment for gzipped files only.
5
+
6
+
7
+"""
8
+
9
+import gzip
10
+import sys
11
+
12
+# default folded SFS
13
+folded = True
14
+
15
+with gzip.open(sys.argv[1], "rb") as inputgz:
16
+    line = inputgz.readline()
17
+    genotypes = []
18
+    SFS_values = {}
19
+    while line:
20
+        line = line.decode('utf-8').strip()
21
+        if not line.startswith("##") and not line.startswith("#"):
22
+            FORMAT = line.split("\t")[8:9]
23
+            SAMPLES = line.split("\t")[9:]
24
+            snp_genotypes = []
25
+            for sample in SAMPLES:
26
+                # for UNPHASED data
27
+                smpl_genotype = [int(a) for a in sample.split(':')[0].split('/') if a != '.']
28
+                #if not folded:
29
+                    
30
+                print(smpl_genotype)
31
+                nb_alleles = len(set(smpl_genotype))
32
+                snp_genotypes.append(nb_alleles)
33
+            print(snp_genotypes)
34
+            nb_derived_allele = len([val for val in snp_genotypes if val != 0])
35
+            print("nb derived allele", nb_derived_allele)
36
+            if nb_derived_allele not in SFS_values.keys():
37
+                SFS_values[nb_derived_allele] = 1
38
+            else:
39
+                SFS_values[nb_derived_allele] += 1
40
+        line = inputgz.readline()
41
+        print(SFS_values)