浏览代码

Add C version start

tforest 3 年前
父节点
当前提交
86f84a676d
共有 3 个文件被更改,包括 68 次插入3 次删除
  1. 2 0
      compile.sh
  2. 62 0
      vcf_to_sfs.c
  3. 4 3
      vcf_to_sfs.py

+ 2 - 0
compile.sh 查看文件

@@ -0,0 +1,2 @@
1
+#!/bin/sh
2
+gcc -Wall -pthread vcf_to_sfs.c -lm -lz -std=c99 -Wextra -o vcf_to_sfs

+ 62 - 0
vcf_to_sfs.c 查看文件

@@ -0,0 +1,62 @@
1
+# include <stdio.h>
2
+# include <stdlib.h>
3
+# include <zlib.h>
4
+#include <string.h>
5
+#include <stdbool.h>
6
+
7
+bool StartsWith(const char *a, const char *b)
8
+{
9
+   if(strncmp(a, b, strlen(b)) == 0) return 1;
10
+   return 0;
11
+}
12
+
13
+void slice_str(const char * str, char * buffer, size_t start, size_t end)
14
+{
15
+    size_t j = 0;
16
+    for ( size_t i = start; i <= end; ++i ) {
17
+        buffer[j++] = str[i];
18
+    }
19
+    buffer[j] = 0;
20
+}
21
+
22
+
23
+# define LL 8192   /* line length maximum */
24
+
25
+int main ( int argc, char *argv[] ){
26
+    if ( argc < 3) {
27
+	printf("Need 2 args!\n");
28
+	return 1;
29
+    }
30
+    gzFile fp;
31
+    char line[LL];
32
+    char delim[] = "\t";
33
+    fp = gzopen( argv[1], "r" );
34
+
35
+    gzgets( fp, line, LL );
36
+    while ( ! gzeof( fp ) ){
37
+	int k = 0;
38
+	if ( StartsWith(line, "##") || ( StartsWith(line, "#") ) || (strstr(line, "./.:.") != NULL)){
39
+	    gzgets( fp, line, LL );
40
+	    continue;
41
+	}
42
+	
43
+	char *vcf_field = strtok(line, delim);
44
+	while(vcf_field != NULL){
45
+	    k++;
46
+	    if (k > 9) {
47
+		const size_t len = strlen(vcf_field);
48
+		char buffer[len + 1];
49
+		//printf("'%s'\n", ptr);
50
+		slice_str(vcf_field, buffer, 0, 0);
51
+		printf("%s      ", buffer);
52
+	    }
53
+	    vcf_field = strtok(NULL, delim);
54
+	}
55
+	// printf("%s", line );
56
+	// loads the next line
57
+	gzgets( fp, line, LL );
58
+    }
59
+
60
+    gzclose( fp );
61
+    return 0;
62
+}

+ 4 - 3
vcf_to_sfs.py 查看文件

@@ -71,10 +71,11 @@ with gzip.open(sys.argv[1], "rb") as inputgz:
71 71
             for k in set(snp_genotypes):
72 72
                 allele_counts[snp_genotypes.count(k)] = k
73 73
                 allele_counts_list.append(snp_genotypes.count(k))
74
-            if folded :
75
-                for al in range(polyallelic-1):
74
+            if folded and len(ALT) >= 2:
75
+                for al in range(len(ALT)-1):
76 76
                     SFS_values[min(allele_counts_list)-1] += 1/len(ALT)
77 77
                     allele_counts_list.remove(min(allele_counts_list))
78
+            else:
79
+                SFS_values[min(allele_counts_list)-1] += 1
78 80
         line = inputgz.readline()
79 81
         print(SFS_values)
80
-print(polycount)