vcf_to_sfs.c 2.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. # include <stdio.h>
  2. # include <stdlib.h>
  3. # include <zlib.h>
  4. #include <string.h>
  5. #include <stdbool.h>
  6. bool StartsWith(const char *a, const char *b)
  7. {
  8. if(strncmp(a, b, strlen(b)) == 0) return 1;
  9. return 0;
  10. }
  11. void slice_str(const char * str, char * buffer, size_t start, size_t end)
  12. {
  13. size_t j = 0;
  14. for ( size_t i = start; i <= end; ++i ) {
  15. buffer[j++] = str[i];
  16. }
  17. buffer[j] = 0;
  18. }
  19. int min(int * array, int size){
  20. //Consider first element as smallest
  21. int smallest = array[0];
  22. int i;
  23. for (i = 0; i < num; i++) {
  24. if (a[i] < smallest) {
  25. smallest = a[i];
  26. }
  27. }
  28. }
  29. int countDistinct(int a[], int n) //Function Definition
  30. {
  31. int i, j, count = 0;
  32. //Traverse the array
  33. for (i = 1; i < n; i++) //hold an array element
  34. {
  35. for (j = 0; j < i; j++)
  36. {
  37. if (a[i] == a[j]) //Check for duplicate elements
  38. {
  39. break; //If duplicate elements found then break
  40. }
  41. }
  42. if (i == j)
  43. {
  44. count++; //increment the number of distinct elements
  45. }
  46. }
  47. return count; //Return the number of distinct elements
  48. }
  49. # define LL 8192 /* line length maximum */
  50. # define DIPLOID true
  51. # define FOLDED true
  52. # define IGNORED_FIELDS 9
  53. int main ( int argc, char *argv[] ){
  54. if ( argc < 3) {
  55. printf("Need 2 args!\n");
  56. return 1;
  57. }
  58. gzFile fp;
  59. char line[LL];
  60. int N;
  61. char delim[] = "\t";
  62. fp = gzopen( argv[1], "r" );
  63. // pop of size 2N when diploid
  64. if (DIPLOID == true && FOLDED == false) {
  65. N = 2 * atoi(argv[2]);
  66. } else {
  67. N = atoi(argv[2]);
  68. }
  69. int snp_genotypes[N];
  70. int SFS_values[N];
  71. gzgets( fp, line, LL );
  72. while ( ! gzeof( fp ) ){
  73. int k = 0;
  74. if ( StartsWith(line, "##") || ( StartsWith(line, "#") ) || (strstr(line, "./.:.") != NULL)){
  75. gzgets( fp, line, LL );
  76. continue;
  77. }
  78. char *vcf_field = strtok(line, delim);
  79. while(vcf_field != NULL){
  80. k++;
  81. if (k > IGNORED_FIELDS) {
  82. const size_t len = strlen(vcf_field);
  83. char buffer[len + 1];
  84. //printf("'%s'\n", ptr);
  85. slice_str(vcf_field, buffer, 0, 0);
  86. //printf("%d %s ", N, buffer);
  87. snp_genotypes[k-IGNORED_FIELDS] = atoi(buffer);
  88. //printf("%d ", smpl_genotype[k-9]);
  89. }
  90. vcf_field = strtok(NULL, delim);
  91. int c= countDistinct(snp_genotypes, N);
  92. // skip if all individuals have the same genotype
  93. if (c == 1) {
  94. continue;
  95. gzgets( fp, line, LL );
  96. }
  97. /* int i; */
  98. /* for (i = 1; i < N; ++i) */
  99. /* { */
  100. /* printf("%d ", snp_genotypes[i]); */
  101. /* } */
  102. int allele_counts[c];
  103. min(allele_counts, N);
  104. }
  105. // printf("%s", line );
  106. // loads the next line
  107. gzgets( fp, line, LL );
  108. }
  109. gzclose( fp );
  110. return 0;
  111. }