swp2.py 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. import matplotlib.pyplot as plt
  2. import os
  3. import numpy as np
  4. def return_x_y_from_stwp_theta_file(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False):
  5. with open(stwp_theta_file, "r") as swp_file:
  6. # Read the first line
  7. line = swp_file.readline()
  8. L = float(line.split()[2])
  9. # Process lines until the end of the file
  10. while line:
  11. # check at each line
  12. if line.startswith("dim") :
  13. dim = int(line.split()[1])
  14. if dim == breaks+1:
  15. likelihood = line.split()[5]
  16. groups = line.split()[6:6+dim]
  17. theta_site = line.split()[6+dim:6+dim+1+dim]
  18. elif dim < breaks+1:
  19. line = swp_file.readline()
  20. continue
  21. elif dim > breaks+1:
  22. break
  23. #return 0,0,0
  24. # Read the next line
  25. line = swp_file.readline()
  26. #### END of parsing
  27. # quit this file if the number of dimensions is incorrect
  28. if dim < breaks+1:
  29. return 0,0,0
  30. # get n, the last bin of the last group
  31. # revert the list of groups as the most recent times correspond
  32. # to the closest and last leafs of the coal. tree.
  33. groups = groups[::-1]
  34. theta_site = theta_site[::-1]
  35. # initiate the dict of times
  36. t = {}
  37. # list of thetas
  38. theta_L = []
  39. sum_t = 0
  40. for group_nb, group in enumerate(groups):
  41. ###print(group_nb, group, theta_site[group_nb], len(theta_site))
  42. # store all the thetas one by one, with one theta per group
  43. theta_L.append(float(theta_site[group_nb]))
  44. # if the group is of size 1
  45. if len(group.split(',')) == 1:
  46. i = int(group)
  47. # if the group size is >1, take the first elem of the group
  48. # i is the first bin of each group, straight after a breakpoint
  49. else:
  50. i = int(group.split(",")[0])
  51. j = int(group.split(",")[-1])
  52. t[i] = 0
  53. #t =
  54. if len(group.split(',')) == 1:
  55. k = i
  56. if relative_theta_scale:
  57. t[i] += ((theta_L[group_nb] ) / (k*(k-1)))
  58. else:
  59. t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu
  60. else:
  61. for k in range(j, i-1, -1 ):
  62. if relative_theta_scale:
  63. t[i] += ((theta_L[group_nb] ) / (k*(k-1)))
  64. else:
  65. t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu
  66. # we add the cumulative times at the end
  67. t[i] += sum_t
  68. sum_t = t[i]
  69. # build the y axis (sizes)
  70. y = []
  71. for theta in theta_L:
  72. if relative_theta_scale:
  73. size = theta
  74. else:
  75. # with size N = theta/4mu
  76. size = theta / (4*mu)
  77. y.append(size)
  78. y.append(size)
  79. # build the time x axis
  80. x = [0]
  81. for time in range(0, len(t.values())-1):
  82. x.append(list(t.values())[time])
  83. x.append(list(t.values())[time])
  84. x.append(list(t.values())[len(t.values())-1])
  85. # if relative_theta_scale:
  86. # # rescale
  87. # #N0 = y[0]
  88. # # for i in range(len(y)):
  89. # # # divide by N0
  90. # # y[i] = y[i]/N0
  91. # # x[i] = x[i]/N0
  92. return x,y,likelihood
  93. def return_x_y_from_stwp_theta_file_as_is(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False):
  94. with open(stwp_theta_file, "r") as swp_file:
  95. # Read the first line
  96. line = swp_file.readline()
  97. L = float(line.split()[2])
  98. # Process lines until the end of the file
  99. while line:
  100. # check at each line
  101. if line.startswith("dim") :
  102. dim = int(line.split()[1])
  103. if dim == breaks+1:
  104. likelihood = line.split()[5]
  105. groups = line.split()[6:6+dim]
  106. theta_site = line.split()[6+dim:6+dim+1+dim]
  107. elif dim < breaks+1:
  108. line = swp_file.readline()
  109. continue
  110. elif dim > breaks+1:
  111. break
  112. #return 0,0,0
  113. # Read the next line
  114. line = swp_file.readline()
  115. #### END of parsing
  116. # quit this file if the number of dimensions is incorrect
  117. if dim < breaks+1:
  118. return 0,0,0
  119. # get n, the last bin of the last group
  120. # revert the list of groups as the most recent times correspond
  121. # to the closest and last leafs of the coal. tree.
  122. groups = groups[::-1]
  123. theta_site = theta_site[::-1]
  124. thetas = {}
  125. for i in range(len(groups)):
  126. groups[i] = groups[i].split(',')
  127. #print(groups[i], len(groups[i]))
  128. thetas[i] = [float(theta_site[i]), groups[i], likelihood]
  129. return thetas
  130. def plot_k_epochs_thetafolder(folder_path, mu, tgen, breaks = 2, title = "Title", theta_scale = True):
  131. scenari = {}
  132. cpt = 0
  133. for file_name in os.listdir(folder_path):
  134. if os.path.isfile(os.path.join(folder_path, file_name)):
  135. # Perform actions on each file
  136. x,y,likelihood = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
  137. tgen = tgen,
  138. mu = mu, relative_theta_scale = theta_scale)
  139. if x == 0 or y == 0:
  140. continue
  141. cpt +=1
  142. scenari[likelihood] = x,y
  143. print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n")
  144. print(cpt, "theta file(s) have been scanned.")
  145. # sort starting by the smallest -log(Likelihood)
  146. print(scenari)
  147. best10_scenari = (sorted(list(scenari.keys())))[:10]
  148. print("10 greatest Likelihoods", best10_scenari)
  149. greatest_likelihood = best10_scenari[0]
  150. x, y = scenari[greatest_likelihood]
  151. my_dpi = 300
  152. plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  153. plt.plot(x, y, 'r-', lw=2, label = 'Lik='+greatest_likelihood)
  154. plt.xlim(1e-3, 1)
  155. plt.ylim(0, 10)
  156. #plt.yscale('log')
  157. plt.xscale('log')
  158. plt.grid(True,which="both", linestyle='--', alpha = 0.3)
  159. for scenario in best10_scenari[1:]:
  160. x,y = scenari[scenario]
  161. #print("\n---- Lik:",scenario,"\n\nt=", x,"\n\nN=",y, "\n\n")
  162. plt.plot(x, y, '--', lw=1, label = 'Lik='+scenario)
  163. if theta_scale:
  164. plt.xlabel("Coal. time")
  165. plt.ylabel("Pop. size scaled by N0")
  166. recent_scale_lower_bound = y[0] * 0.01
  167. recent_scale_upper_bound = y[0] * 0.1
  168. plt.axvline(x=recent_scale_lower_bound)
  169. plt.axvline(x=recent_scale_upper_bound)
  170. else:
  171. # years
  172. plt.xlabel("Time (years)")
  173. plt.ylabel("Individuals (N)")
  174. plt.legend(loc='upper right')
  175. plt.title(title)
  176. plt.savefig(title+'_b'+str(breaks)+'.pdf')
  177. def plot_all_epochs_thetafolder(folder_path, mu, tgen, title = "Title", theta_scale = True):
  178. #scenari = {}
  179. cpt = 0
  180. epochs = {}
  181. for file_name in os.listdir(folder_path):
  182. breaks = 0
  183. cpt +=1
  184. if os.path.isfile(os.path.join(folder_path, file_name)):
  185. x, y, likelihood = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
  186. tgen = tgen,
  187. mu = mu, relative_theta_scale = theta_scale)
  188. while not (x == 0 and y == 0):
  189. if breaks not in epochs.keys():
  190. epochs[breaks] = {}
  191. epochs[breaks][likelihood] = x,y
  192. breaks += 1
  193. x,y,likelihood = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
  194. tgen = tgen,
  195. mu = mu, relative_theta_scale = theta_scale)
  196. print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n")
  197. print(cpt, "theta file(s) have been scanned.")
  198. # intialize figure
  199. my_dpi = 300
  200. plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  201. plt.xlim(1e-3, 1)
  202. #plt.ylim(0, 10)
  203. #plt.yscale('log')
  204. plt.xscale('log')
  205. plt.grid(True,which="both", linestyle='--', alpha = 0.3)
  206. brkpt_lik = []
  207. for epoch, scenari in epochs.items():
  208. # sort starting by the smallest -log(Likelihood)
  209. best10_scenari = (sorted(list(scenari.keys())))[:10]
  210. greatest_likelihood = best10_scenari[0]
  211. # store the tuple breakpoints and likelihood for later plot
  212. brkpt_lik.append((epoch, greatest_likelihood))
  213. x, y = scenari[greatest_likelihood]
  214. #without breakpoint
  215. if epoch == 0:
  216. # do something with the theta without bp and skip the plotting
  217. N0 = y[0]
  218. #continue
  219. for i in range(len(y)):
  220. # divide by N0
  221. y[i] = y[i]/N0
  222. x[i] = x[i]/N0
  223. plt.plot(x, y, '-', alpha=0.75, lw=2, label = str(epoch)+' BrkPt | Lik='+greatest_likelihood)
  224. if theta_scale:
  225. plt.xlabel("Coal. time")
  226. plt.ylabel("Pop. size scaled by N0")
  227. recent_scale_lower_bound = 0.01
  228. recent_scale_upper_bound = 0.1
  229. #print(recent_scale_lower_bound, recent_scale_upper_bound)
  230. plt.axvline(x=recent_scale_lower_bound)
  231. plt.axvline(x=recent_scale_upper_bound)
  232. else:
  233. # years
  234. plt.xlabel("Time (years)")
  235. plt.ylabel("Individuals (N)")
  236. plt.xlim(1e-5, 1)
  237. plt.legend(loc='upper right')
  238. plt.title(title)
  239. plt.savefig(title+'_b'+str(breaks)+'.pdf')
  240. # plot likelihood against nb of breakpoints
  241. plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  242. plt.rcParams['font.size'] = '18'
  243. AIC = 2*(len(brkpt_lik)+1)+2*np.array(brkpt_lik)[:, 1].astype(float)
  244. plt.plot(np.array(brkpt_lik)[:, 0], AIC, 'o', linestyle = "dotted", lw=2)
  245. plt.axhline(y=106)
  246. plt.yscale('log')
  247. plt.xlabel("# breakpoints", fontsize=20)
  248. plt.ylabel("$-\log\mathcal{L}$")
  249. #plt.legend(loc='upper right')
  250. plt.title(title)
  251. plt.savefig(title+'_Breakpts_Likelihood.pdf')
  252. def plot_test_theta(folder_path, mu, tgen, title = "Title", theta_scale = True, breaks_max = 6):
  253. cpt = 0
  254. epochs = {}
  255. for file_name in os.listdir(folder_path):
  256. cpt +=1
  257. if os.path.isfile(os.path.join(folder_path, file_name)):
  258. for k in range(breaks_max):
  259. thetas = return_x_y_from_stwp_theta_file_as_is(folder_path+file_name, breaks = k,
  260. tgen = tgen,
  261. mu = mu, relative_theta_scale = theta_scale)
  262. if thetas[0] == 0:
  263. continue
  264. epochs[k] = thetas
  265. print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(k)+"\n*******\n")
  266. print(cpt, "theta file(s) have been scanned.")
  267. # intialize figure
  268. my_dpi = 300
  269. plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  270. for epoch, theta in epochs.items():
  271. groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist()
  272. x = []
  273. y = []
  274. thetas = np.array(list(theta.values()), dtype=object)[:, 0]
  275. for i,group in enumerate(groups):
  276. x += group[::-1]
  277. y += list(np.repeat(thetas[i], len(group)))
  278. if epoch == 0:
  279. N0 = y[0]
  280. for i in range(len(y)):
  281. y[i] = y[i]/N0
  282. plt.plot(x, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks')
  283. plt.xlabel("# breaks")
  284. plt.ylabel("theta")
  285. plt.legend(loc='upper right')
  286. plt.savefig(title+'_test'+str(k)+'.pdf')
  287. # fig 2
  288. plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  289. for epoch, theta in epochs.items():
  290. groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist()
  291. x = []
  292. y = []
  293. thetas = np.array(list(theta.values()), dtype=object)[:, 0]
  294. for i,group in enumerate(groups):
  295. x += group[::-1]
  296. y += list(np.repeat(thetas[i], len(group)))
  297. if epoch == 0:
  298. N0 = y[0]
  299. for i in range(len(y)):
  300. y[i] = y[i]/N0
  301. #
  302. x_2 = []
  303. T = 0
  304. # k allant de de 14 à 2
  305. for i in range(len(x)):
  306. x[i] = int(x[i])
  307. #print(x[2])
  308. for i in range(0, len(x)):
  309. k = x[i]
  310. #print(k, y[k-2])
  311. #theta_k = y[k] / (k*(k-1))
  312. T += y[i] / (x[i]*(x[i]-1))
  313. x_2.append(T)
  314. plt.plot(x_2, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks')
  315. plt.xscale('log')
  316. plt.xlabel("# breaks")
  317. plt.ylabel("theta")
  318. plt.legend(loc='upper right')
  319. plt.savefig(title+'_test'+str(k)+'.pdf')
  320. #
  321. if __name__ == "__main__":
  322. if len(sys.argv) != 4:
  323. print("Need 3 args: ThetaFolder MutationRate GenerationTime")
  324. exit(0)
  325. folder_path = sys.argv[1]
  326. mu = sys.argv[2]
  327. tgen = sys.argv[3]
  328. plot_all_epochs_thetafolder(folder_path, mu, tgen)