tforest 3 anni fa
parent
commit
91915e5fa8
1 ha cambiato i file con 63 aggiunte e 2 eliminazioni
  1. 63 2
      vcf_to_sfs.c

+ 63 - 2
vcf_to_sfs.c Vedi File

@@ -19,8 +19,42 @@ void slice_str(const char * str, char * buffer, size_t start, size_t end)
19 19
     buffer[j] = 0;
20 20
 }
21 21
 
22
+int min(int * array, int size){
23
+    //Consider first element as smallest
24
+   int smallest = array[0];
25
+   int i;
26
+   for (i = 0; i < num; i++) {
27
+      if (a[i] < smallest) {
28
+         smallest = a[i];
29
+      }
30
+   }
31
+}
32
+
33
+int countDistinct(int a[], int n)      //Function Definition
34
+{
35
+   int i, j, count = 0;
36
+   //Traverse the array
37
+   for (i = 1; i < n; i++)      //hold an array element
38
+   {
39
+      for (j = 0; j < i; j++)   
40
+      {
41
+         if (a[i] == a[j])    //Check for duplicate elements
42
+         {
43
+            break;             //If duplicate elements found then break
44
+         }
45
+      }
46
+      if (i == j)
47
+      {
48
+         count++;     //increment the number of distinct elements
49
+      }
50
+   }
51
+   return count;      //Return the number of distinct elements
52
+}
22 53
 
23 54
 # define LL 8192   /* line length maximum */
55
+# define DIPLOID true
56
+# define FOLDED true
57
+# define IGNORED_FIELDS 9
24 58
 
25 59
 int main ( int argc, char *argv[] ){
26 60
     if ( argc < 3) {
@@ -29,9 +63,20 @@ int main ( int argc, char *argv[] ){
29 63
     }
30 64
     gzFile fp;
31 65
     char line[LL];
66
+    int N;
32 67
     char delim[] = "\t";
33 68
     fp = gzopen( argv[1], "r" );
34 69
 
70
+    // pop of size 2N when diploid
71
+    if (DIPLOID == true && FOLDED == false) {
72
+	N = 2 * atoi(argv[2]);
73
+	    } else {
74
+	N = atoi(argv[2]);
75
+    }
76
+
77
+    int snp_genotypes[N];
78
+    int SFS_values[N];
79
+
35 80
     gzgets( fp, line, LL );
36 81
     while ( ! gzeof( fp ) ){
37 82
 	int k = 0;
@@ -43,14 +88,30 @@ int main ( int argc, char *argv[] ){
43 88
 	char *vcf_field = strtok(line, delim);
44 89
 	while(vcf_field != NULL){
45 90
 	    k++;
46
-	    if (k > 9) {
91
+	    if (k > IGNORED_FIELDS) {
47 92
 		const size_t len = strlen(vcf_field);
48 93
 		char buffer[len + 1];
49 94
 		//printf("'%s'\n", ptr);
50 95
 		slice_str(vcf_field, buffer, 0, 0);
51
-		printf("%s      ", buffer);
96
+		//printf("%d %s      ", N, buffer);
97
+		snp_genotypes[k-IGNORED_FIELDS] = atoi(buffer);
98
+		//printf("%d ", smpl_genotype[k-9]);
52 99
 	    }
53 100
 	    vcf_field = strtok(NULL, delim);
101
+	    int c= countDistinct(snp_genotypes, N);
102
+            // skip if all individuals have the same genotype
103
+	    if (c == 1) {
104
+		continue;
105
+		gzgets( fp, line, LL );
106
+	    }
107
+	    /* int i; */
108
+	    /* for (i = 1; i < N; ++i) */
109
+	    /* 	{ */
110
+	    /* 	    printf("%d ", snp_genotypes[i]); */
111
+	    /* 	} */
112
+	    int allele_counts[c];
113
+	    
114
+	    min(allele_counts, N);
54 115
 	}
55 116
 	// printf("%s", line );
56 117
 	// loads the next line