# swp2.py
import json
import math
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
  6. def log_facto(k):
  7. """
  8. Using the Stirling's approximation
  9. """
  10. k = int(k)
  11. if k > 1e6:
  12. return k * np.log(k) - k + np.log(2*math.pi*k)/2
  13. val = 0
  14. for i in range(2, k+1):
  15. val += np.log(i)
  16. return val
  17. def parse_stwp_theta_file(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False):
  18. with open(stwp_theta_file, "r") as swp_file:
  19. # Read the first line
  20. line = swp_file.readline()
  21. L = float(line.split()[2])
  22. rands = swp_file.readline()
  23. line = swp_file.readline()
  24. # skip empty lines before SFS
  25. while line == "\n":
  26. line = swp_file.readline()
  27. sfs = np.array(line.split()).astype(float)
  28. # Process lines until the end of the file
  29. while line:
  30. # check at each line
  31. if line.startswith("dim") :
  32. dim = int(line.split()[1])
  33. if dim == breaks+1:
  34. likelihood = line.split()[5]
  35. groups = line.split()[6:6+dim]
  36. theta_site = line.split()[6+dim:6+dim+1+dim]
  37. elif dim < breaks+1:
  38. line = swp_file.readline()
  39. continue
  40. elif dim > breaks+1:
  41. break
  42. #return 0,0,0
  43. # Read the next line
  44. line = swp_file.readline()
  45. #### END of parsing
  46. # quit this file if the number of dimensions is incorrect
  47. if dim < breaks+1:
  48. return 0,0,0,0,0,0
  49. # get n, the last bin of the last group
  50. # revert the list of groups as the most recent times correspond
  51. # to the closest and last leafs of the coal. tree.
  52. groups = groups[::-1]
  53. theta_site = theta_site[::-1]
  54. # store thetas for later use
  55. grps = groups.copy()
  56. thetas = {}
  57. for i in range(len(groups)):
  58. grps[i] = grps[i].split(',')
  59. thetas[i] = [float(theta_site[i]), grps[i], likelihood]
  60. # initiate the dict of times
  61. t = {}
  62. # list of thetas
  63. theta_L = []
  64. sum_t = 0
  65. for group_nb, group in enumerate(groups):
  66. ###print(group_nb, group, theta_site[group_nb], len(theta_site))
  67. # store all the thetas one by one, with one theta per group
  68. theta_L.append(float(theta_site[group_nb]))
  69. # if the group is of size 1
  70. if len(group.split(',')) == 1:
  71. i = int(group)
  72. # if the group size is >1, take the first elem of the group
  73. # i is the first bin of each group, straight after a breakpoint
  74. else:
  75. i = int(group.split(",")[0])
  76. j = int(group.split(",")[-1])
  77. t[i] = 0
  78. #t =
  79. if len(group.split(',')) == 1:
  80. k = i
  81. if relative_theta_scale:
  82. t[i] += ((theta_L[group_nb] ) / (k*(k-1)))
  83. else:
  84. t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu
  85. else:
  86. for k in range(j, i-1, -1 ):
  87. if relative_theta_scale:
  88. t[i] += ((theta_L[group_nb] ) / (k*(k-1)))
  89. else:
  90. t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu
  91. # we add the cumulative times at the end
  92. t[i] += sum_t
  93. sum_t = t[i]
  94. # build the y axis (sizes)
  95. y = []
  96. for theta in theta_L:
  97. if relative_theta_scale:
  98. size = theta
  99. else:
  100. # with size N = theta/4mu
  101. size = theta / (4*mu)
  102. y.append(size)
  103. y.append(size)
  104. # build the time x axis
  105. x = [0]
  106. for time in range(0, len(t.values())-1):
  107. x.append(list(t.values())[time])
  108. x.append(list(t.values())[time])
  109. x.append(list(t.values())[len(t.values())-1])
  110. return x,y,likelihood,thetas,sfs,L
  111. def plot_straight_x_y(x,y):
  112. x_1 = [x[0]]
  113. y_1 = []
  114. for i in range(0, len(y)-1):
  115. x_1.append(x[i])
  116. x_1.append(x[i])
  117. y_1.append(y[i])
  118. y_1.append(y[i])
  119. y_1 = y_1+[y[-1],y[-1]]
  120. x_1.append(x[-1])
  121. return x_1, y_1
  122. def plot_all_epochs_thetafolder(full_dict, mu, tgen, title = "Title",
  123. theta_scale = True, ax = None, input = None, output = None):
  124. my_dpi = 500
  125. L = full_dict["L"]
  126. if ax is None:
  127. # intialize figure
  128. #my_dpi = 300
  129. fnt_size = 18
  130. # plt.rcParams['font.size'] = fnt_size
  131. fig, ax1 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  132. else:
  133. fnt_size = 12
  134. # plt.rcParams['font.size'] = fnt_size
  135. ax1 = ax[1][0,0]
  136. ax1.set_yscale('log')
  137. ax1.set_xscale('log')
  138. plot_handles = []
  139. best_plot = full_dict['all_epochs']['best']
  140. p0, = ax1.plot(best_plot[0], best_plot[1], linestyle = "-",
  141. alpha=1, lw=2, label = str(best_plot[2])+' brks | Lik='+best_plot[3])
  142. plot_handles.append(p0)
  143. #ax1.grid(True,which="both", linestyle='--', alpha = 0.3)
  144. for k, plot_Lk in enumerate(full_dict['all_epochs']['plots']):
  145. plot_Lk = str(full_dict['all_epochs']['plots'][k][3])
  146. # plt.rcParams['font.size'] = fnt_size
  147. p, = ax1.plot(full_dict['all_epochs']['plots'][k][0], full_dict['all_epochs']['plots'][k][1], linestyle = "-",
  148. alpha=1/(k+1), lw=1.5, label = str(full_dict['all_epochs']['plots'][k][2])+' brks | Lik='+plot_Lk)
  149. plot_handles.append(p)
  150. if theta_scale:
  151. ax1.set_xlabel("Coal. time", fontsize=fnt_size)
  152. ax1.set_ylabel("Pop. size scaled by N0", fontsize=fnt_size)
  153. # recent_scale_lower_bound = 0.01
  154. # recent_scale_upper_bound = 0.1
  155. # ax1.axvline(x=recent_scale_lower_bound)
  156. # ax1.axvline(x=recent_scale_upper_bound)
  157. else:
  158. # years
  159. if ax is not None:
  160. plt.set_xlabel("Time (years)", fontsize=fnt_size)
  161. plt.set_ylabel("Effective pop. size (Ne)", fontsize=fnt_size)
  162. else:
  163. plt.xlabel("Time (years)", fontsize=fnt_size)
  164. plt.ylabel("Effective pop. size (Ne)", fontsize=fnt_size)
  165. # x_ticks = ax1.get_xticks()
  166. # ax1.set_xticklabels([f'{k:.0e}\n{k/(mu):.0e}\n{k/(mu)*tgen:.0e}' for k in x_ticks], fontsize = fnt_size*0.5)
  167. # ax1.set_xticklabels([f'{k}\n{k/(mu)}\n{k/(mu)*tgen}' for k in x_ticks], fontsize = fnt_size*0.8)
  168. # plt.rcParams['font.size'] = fnt_size
  169. # print(fnt_size, "rcParam font.size=", plt.rcParams['font.size'])
  170. ax1.legend(handles = plot_handles, loc='best', fontsize = fnt_size*0.5)
  171. ax1.set_title(title)
  172. breaks = len(full_dict['all_epochs']['plots'])
  173. if ax is None:
  174. plt.savefig(title+'_'+str(breaks+1)+'_epochs.pdf')
  175. # plot likelihood against nb of breakpoints
  176. if ax is None:
  177. fig, ax2 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  178. # plt.rcParams['font.size'] = fnt_size
  179. else:
  180. #plt.rcParams['font.size'] = fnt_size
  181. ax2 = ax[0][0,1]
  182. # Retrieve the default color cycle from rcParams
  183. default_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
  184. # Create an array of colors from the default color cycle
  185. colors = [default_colors[i % len(default_colors)] for i in range(len(full_dict['Ln_Brks'][0]))]
  186. ax2.plot(full_dict['Ln_Brks'][0], full_dict['Ln_Brks'][1], "--", lw=1, color="black", zorder=1)
  187. ax2.scatter(full_dict['Ln_Brks'][0], full_dict['Ln_Brks'][1], s=50, c=colors, marker='o', zorder=2)
  188. ax2.axhline(y=full_dict['best_Ln'], linestyle = "-.", color = "red", label = "$-\log\mathcal{L}$ = "+str(round(full_dict['best_Ln'], 2)))
  189. ax2.set_yscale('log')
  190. ax2.set_xlabel("# breakpoints", fontsize=fnt_size)
  191. ax2.set_ylabel("$-\log\mathcal{L}$", fontsize=fnt_size)
  192. ax2.legend(loc='best', fontsize = fnt_size*0.5)
  193. ax2.set_title(title+" Likelihood gain from # breakpoints")
  194. if ax is None:
  195. plt.savefig(title+'_Breakpts_Likelihood.pdf')
  196. # AIC
  197. if ax is None:
  198. fig, ax3 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  199. # plt.rcParams['font.size'] = '18'
  200. else:
  201. #plt.rcParams['font.size'] = fnt_size
  202. ax3 = ax[1][0,1]
  203. AIC = full_dict['AIC_Brks']
  204. # ax3.plot(AIC[0], AIC[1], 'o', linestyle = "dotted", lw=2)
  205. ax3.plot(AIC[0], AIC[1], "--", lw=1, color="black", zorder=1)
  206. ax3.scatter(AIC[0], AIC[1], s=50, c=colors, marker='o', zorder=2)
  207. ax3.axhline(y=full_dict['best_AIC'], linestyle = "-.", color = "red",
  208. label = "Min. AIC = "+str(round(full_dict['best_AIC'], 2)))
  209. ax3.set_yscale('log')
  210. ax3.set_xlabel("# breakpoints", fontsize=fnt_size)
  211. ax3.set_ylabel("AIC")
  212. ax3.legend(loc='best', fontsize = fnt_size*0.5)
  213. ax3.set_title(title+" AIC")
  214. if ax is None:
  215. plt.savefig(title+'_Breakpts_Likelihood_AIC.pdf')
  216. else:
  217. # return plots
  218. return ax[0], ax[1]
  219. def save_all_epochs_thetafolder(folder_path, mu, tgen, title = "Title", theta_scale = True, input = None, output = None):
  220. #scenari = {}
  221. cpt = 0
  222. epochs = {}
  223. plots = {}
  224. # store ['best'], and [0] for epoch 0 etc...
  225. for file_name in os.listdir(folder_path):
  226. breaks = 0
  227. cpt +=1
  228. if os.path.isfile(os.path.join(folder_path, file_name)):
  229. x, y, likelihood, theta, sfs, L = parse_stwp_theta_file(folder_path+file_name, breaks = breaks,
  230. tgen = tgen,
  231. mu = mu, relative_theta_scale = theta_scale)
  232. SFS_stored = sfs
  233. L_stored = L
  234. while not (x == 0 and y == 0):
  235. if breaks not in epochs.keys():
  236. epochs[breaks] = {}
  237. epochs[breaks][likelihood] = x,y
  238. breaks += 1
  239. x,y,likelihood,theta,sfs,L = parse_stwp_theta_file(folder_path+file_name, breaks = breaks,
  240. tgen = tgen,
  241. mu = mu, relative_theta_scale = theta_scale)
  242. if x == 0:
  243. # last break did not work, then breaks = breaks-1
  244. breaks -= 1
  245. print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n")
  246. print(cpt, "theta file(s) have been scanned.")
  247. brkpt_lik = []
  248. top_plots = {}
  249. for epoch, scenari in epochs.items():
  250. # sort starting by the smallest -log(Likelihood)
  251. best10_scenari = (sorted(list(scenari.keys())))[:10]
  252. greatest_likelihood = best10_scenari[0]
  253. # store the tuple breakpoints and likelihood for later plot
  254. brkpt_lik.append((epoch, greatest_likelihood))
  255. x, y = scenari[greatest_likelihood]
  256. #without breakpoint
  257. if epoch == 0:
  258. # do something with the theta without bp and skip the plotting
  259. N0 = y[0]
  260. #continue
  261. if theta_scale:
  262. for i in range(len(y)):
  263. # divide by N0
  264. y[i] = y[i]/N0
  265. x[i] = x[i]/N0
  266. top_plots[greatest_likelihood] = x,y,epoch
  267. plots_likelihoods = list(top_plots.keys())
  268. for i in range(len(plots_likelihoods)):
  269. plots_likelihoods[i] = float(plots_likelihoods[i])
  270. best10_plots = sorted(plots_likelihoods)[:10]
  271. top_plot_lik = str(best10_plots[0])
  272. # store x,y,brks,likelihood
  273. plots['best'] = (top_plots[top_plot_lik][0], top_plots[top_plot_lik][1], str(top_plots[top_plot_lik][2]), top_plot_lik)
  274. plots['plots'] = []
  275. for k, plot_Lk in enumerate(best10_plots[1:]):
  276. plot_Lk = str(plot_Lk)
  277. plots['plots'].append([top_plots[plot_Lk][0], top_plots[plot_Lk][1], str(top_plots[plot_Lk][2]), plot_Lk])
  278. # plot likelihood against nb of breakpoints
  279. # best possible likelihood from SFS
  280. # Segregating sites
  281. S = sum(SFS_stored)
  282. # Number of kept sites from which the SFS is computed
  283. L = L_stored
  284. # number of monomorphic sites
  285. S0 = L-S
  286. # print("SFS", SFS_stored)
  287. # print("S", S, "L", L, "S0=", S0)
  288. # compute Ln
  289. Ln = log_facto(S+S0) - log_facto(S0) + np.log(float(S0)/(S+S0)) * S0
  290. for xi in range(0, len(SFS_stored)):
  291. p_i = SFS_stored[xi] / float(S+S0)
  292. Ln += np.log(p_i) * SFS_stored[xi] - log_facto(SFS_stored[xi])
  293. # basic plot likelihood
  294. Ln_Brks = [list(np.array(brkpt_lik)[:, 0]), list(np.array(brkpt_lik)[:, 1].astype(float))]
  295. best_Ln = -Ln
  296. AIC = []
  297. for brk in np.array(brkpt_lik)[:, 0]:
  298. brk = int(brk)
  299. AIC.append((2*brk+1)+2*np.array(brkpt_lik)[brk, 1].astype(float))
  300. AIC_Brks = [list(np.array(brkpt_lik)[:, 0]), AIC]
  301. # AIC = 2*k - 2ln(L) ; where k is the number of parameters, here brks+1
  302. AIC_ln = 2*(len(brkpt_lik)+1) - 2*Ln
  303. best_AIC = AIC_ln
  304. selected_brks_nb = AIC.index(min(AIC))
  305. # to return : plots ; Ln_Brks ; AIC_Brks ; best_Ln ; best_AIC
  306. # 'plots' dict keys: 'best', {epochs}('0', '1',...)
  307. if input == None:
  308. saved_plots = {"S":S, "S0":S0, "L":L, "mu":mu, "tgen":tgen,
  309. "all_epochs":plots, "Ln_Brks":Ln_Brks,
  310. "AIC_Brks":AIC_Brks, "best_Ln":best_Ln,
  311. "best_AIC":best_AIC, "best_epoch_by_AIC":selected_brks_nb}
  312. else:
  313. # if the dict has to be loaded from input
  314. with open(input, 'r') as json_file:
  315. saved_plots = json.load(json_file)
  316. saved_plots["S"] = S
  317. saved_plots["S0"] = S0
  318. saved_plots["L"] = L
  319. saved_plots["mu"] = mu
  320. saved_plots["tgen"] = tgen
  321. saved_plots["all_epochs"] = plots
  322. saved_plots["Ln_Brks"] = Ln_Brks
  323. saved_plots["AIC_Brks"] = AIC_Brks
  324. saved_plots["best_Ln"] = best_Ln
  325. saved_plots["best_AIC"] = best_AIC
  326. saved_plots["best_epoch_by_AIC"] = selected_brks_nb
  327. if output == None:
  328. output = title+"_plotdata.json"
  329. with open(output, 'w') as json_file:
  330. json.dump(saved_plots, json_file)
  331. return saved_plots
  332. def save_k_theta(folder_path, mu, tgen, title = "Title", theta_scale = True,
  333. breaks_max = 10, input = None, output = None):
  334. """
  335. Save theta values as is to do basic plots.
  336. """
  337. cpt = 0
  338. epochs = {}
  339. len_sfs = 0
  340. for file_name in os.listdir(folder_path):
  341. cpt +=1
  342. if os.path.isfile(os.path.join(folder_path, file_name)):
  343. for k in range(breaks_max+1):
  344. x,y,likelihood,thetas,sfs,L = parse_stwp_theta_file(folder_path+file_name, breaks = k,
  345. tgen = tgen,
  346. mu = mu, relative_theta_scale = theta_scale)
  347. if thetas == 0:
  348. continue
  349. if len(thetas)-1 != k:
  350. continue
  351. if k not in epochs.keys():
  352. epochs[k] = {}
  353. likelihood = str(eval(thetas[k][2]))
  354. epochs[k][likelihood] = thetas
  355. #epochs[k] = thetas
  356. print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(k)+"\n*******\n")
  357. print(cpt, "theta file(s) have been scanned.")
  358. plots = []
  359. best_epochs = {}
  360. for epoch in epochs:
  361. likelihoods = []
  362. for key in epochs[epoch].keys():
  363. likelihoods.append(key)
  364. likelihoods.sort()
  365. minLogLn = str(likelihoods[0])
  366. best_epochs[epoch] = epochs[epoch][minLogLn]
  367. for epoch, theta in best_epochs.items():
  368. groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist()
  369. x = []
  370. y = []
  371. thetas = np.array(list(theta.values()), dtype=object)[:, 0]
  372. for i,group in enumerate(groups):
  373. x += group[::-1]
  374. y += list(np.repeat(thetas[i], len(group)))
  375. if epoch == 0:
  376. N0 = y[0]
  377. # compute the proportion of information used at each bin of the SFS
  378. sum_theta_i = 0
  379. for i in range(2, len(y)+2):
  380. sum_theta_i+=y[i-2] / (i-1)
  381. prop = []
  382. for k in range(2, len(y)+2):
  383. prop.append(y[k-2] / (k - 1) / sum_theta_i)
  384. prop = prop[::-1]
  385. if theta_scale :
  386. # normalise to N0 (N0 of epoch1)
  387. for i in range(len(y)):
  388. y[i] = y[i]/N0
  389. # x_plot, y_plot = plot_straight_x_y(x, y)
  390. p = x, y
  391. # add plot to the list of all plots to superimpose
  392. plots.append(p)
  393. cumul = 0
  394. prop_cumul = []
  395. for val in prop:
  396. prop_cumul.append(val+cumul)
  397. cumul = val+cumul
  398. prop = prop_cumul
  399. lines_fig2 = []
  400. for epoch, theta in best_epochs.items():
  401. groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist()
  402. x = []
  403. y = []
  404. thetas = np.array(list(theta.values()), dtype=object)[:, 0]
  405. for i,group in enumerate(groups):
  406. x += group[::-1]
  407. y += list(np.repeat(thetas[i], len(group)))
  408. if epoch == 0:
  409. N0 = y[0]
  410. if theta_scale :
  411. for i in range(len(y)):
  412. y[i] = y[i]/N0
  413. x_2 = []
  414. T = 0
  415. for i in range(len(x)):
  416. x[i] = int(x[i])
  417. # compute the times as: theta_k / (k*(k-1))
  418. for i in range(0, len(x)):
  419. T += y[i] / (x[i]*(x[i]-1))
  420. x_2.append(T)
  421. # Save plotting (fig 2)
  422. x_2 = [0]+x_2
  423. y = [y[0]]+y
  424. # x2_plot, y2_plot = plot_straight_x_y(x_2, y)
  425. p2 = x_2, y
  426. lines_fig2.append(p2)
  427. if input == None:
  428. saved_plots = {"raw_stairs":plots, "scaled_stairs":lines_fig2,
  429. "prop":prop}
  430. else:
  431. # if the dict has to be loaded from input
  432. with open(input, 'r') as json_file:
  433. saved_plots = json.load(json_file)
  434. saved_plots["raw_stairs"] = plots
  435. saved_plots["scaled_stairs"] = lines_fig2
  436. saved_plots["prop"] = prop
  437. if output == None:
  438. output = title+"_plotdata.json"
  439. with open(output, 'w') as json_file:
  440. json.dump(saved_plots, json_file)
  441. return saved_plots
  442. def plot_scaled_theta(plot_lines, prop, title, mu, tgen, swp2_lines = None, ax = None, n_ticks = 10, subset = None, theta_scale = False):
  443. # nb of plot_lines represent the number of epochs stored (len(plot_lines) = #breaks+1)
  444. nb_epochs = len(plot_lines)
  445. # fig 2 & 3
  446. if ax is None:
  447. my_dpi = 500
  448. fnt_size = 18
  449. fig2, ax2 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  450. fig3, ax3 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  451. else:
  452. # plt.rcParams['font.size'] = fnt_size
  453. fnt_size = 12
  454. # place of plots on the grid
  455. ax2 = ax[1,0]
  456. ax3 = ax[1,1]
  457. lines_fig2 = []
  458. lines_fig3 = []
  459. #plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  460. if swp2_lines:
  461. for k in range(len(swp2_lines[0])):
  462. swp2_lines[0][k] = swp2_lines[0][k]/tgen*mu
  463. for k in range(len(swp2_lines[1])):
  464. swp2_lines[1][k] = swp2_lines[1][k]*4*mu
  465. x2_plot, y2_plot = plot_straight_x_y(swp2_lines[0],swp2_lines[1])
  466. p2, = ax2.plot(x2_plot, y2_plot, linestyle="-", alpha=0.75, lw=2, label = 'swp2', color="black")
  467. lines_fig2.append(p2)
  468. # Plotting (fig 3) which is the same but log scale for x
  469. p3, = ax3.plot(x2_plot, y2_plot, linestyle="-", alpha=0.75, lw=2, label = 'swp2', color="black")
  470. lines_fig3.append(p3)
  471. for breaks, plot in enumerate(plot_lines):
  472. x,y=plot
  473. x2_plot, y2_plot = plot_straight_x_y(x,y)
  474. if subset is not None:
  475. if breaks in subset:
  476. masking_alpha = 0.75
  477. else:
  478. masking_alpha = 0
  479. p2, = ax2.plot(x2_plot, y2_plot, 'o', linestyle="-", alpha=masking_alpha, lw=2, label = str(breaks)+' brks')
  480. # Plotting (fig 3) which is the same but log scale for x
  481. p3, = ax3.plot(x2_plot, y2_plot, 'o', linestyle="-", alpha=masking_alpha, lw=2, label = str(breaks)+' brks')
  482. if subset is not None and breaks in subset:
  483. # store for legend
  484. lines_fig2.append(p2)
  485. lines_fig3.append(p3)
  486. ax3.axvline(x=500/tgen*mu, linestyle="--")
  487. if theta_scale:
  488. xlabel = "Theta scaled by N0"
  489. ylabel = "Theta scaled by N0"
  490. else:
  491. xlabel = "t"
  492. ylabel = r"$\theta$"
  493. if ax is None:
  494. # if not ax, then use the plt syntax, not ax...
  495. plt.xlabel(xlabel, fontsize=fnt_size)
  496. plt.ylabel(ylabel, fontsize=fnt_size)
  497. plt.xlim(left=0)
  498. x_ticks = list(plt.xticks())[0]
  499. plt.gca().set_xticks(x_ticks)
  500. plt.gca().set_xticklabels([f'{k:.0e}\n{k/(mu):.0e}\n{k/(mu)*tgen:.0e}' for k in x_ticks], fontsize = fnt_size*0.5)
  501. plt.title(title, fontsize=fnt_size)
  502. plt.legend(handles=lines_fig2, loc='best', fontsize = fnt_size*0.5)
  503. plt.text(-0.13, -0.135, 'Coal. time\nGen. time\nYears', ha='left', va='bottom', transform=ax3.transAxes)
  504. plt.subplots_adjust(bottom=0.2) # Adjust the value as needed
  505. plt.savefig(title+'_plotB_'+str(nb_epochs)+'_epochs.pdf')
  506. # close fig2 to save memory
  507. plt.close(fig2)
  508. else:
  509. # when ax subplotting is used
  510. ax2.set_xlabel(xlabel, fontsize=fnt_size)
  511. ax2.set_ylabel(ylabel, fontsize=fnt_size)
  512. ax2.set_title(title, fontsize=fnt_size)
  513. ax2.legend(handles=lines_fig2, loc='best', fontsize = fnt_size*0.5)
  514. ax3.set_xscale('log')
  515. ax3.set_yscale('log')
  516. ax3.set_xlabel("time log scale", fontsize=fnt_size)
  517. ax3.set_ylabel("theta", fontsize=fnt_size)
  518. ax3.set_title(title, fontsize=fnt_size)
  519. ax3.legend(handles=lines_fig3, loc='best', fontsize = fnt_size*0.5)
  520. x_ticks = list(ax3.get_xticks())
  521. ax3.set_xlim(left=min(x_ticks))
  522. ax3.set_xticks(x_ticks)
  523. ax3.set_xticklabels([f'{k:.0e}\n{k/(mu):.0e}\n{k/(mu)*tgen:.0e}' for k in x_ticks], fontsize = fnt_size*0.5)
  524. plt.text(-0.13, -0.135, 'Coal. time\nGen. time\nYears', ha='left', va='bottom', transform=ax3.transAxes)
  525. plt.subplots_adjust(bottom=0.2) # Adjust the value as needed
  526. if ax is None:
  527. # nb of plot_lines represent the number of epochs stored (len(plot_lines) = #breaks+1)
  528. plt.savefig(title+'_plotC_'+str(nb_epochs)+'_epochs_log.pdf')
  529. # close fig3 to save memory
  530. plt.close(fig3)
  531. return ax
  532. def plot_raw_stairs(plot_lines, prop, title, ax = None, n_ticks = 10, rescale = False, subset = None, max_breaks = None):
  533. if max_breaks:
  534. nb_breaks = max_breaks
  535. else:
  536. nb_breaks = len(plot_lines)+1
  537. # multiple fig
  538. if ax is None:
  539. # intialize figure 1
  540. my_dpi = 500
  541. fnt_size = 18
  542. # plt.rcParams['font.size'] = fnt_size
  543. fig, ax1 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
  544. plt.subplots_adjust(bottom=0.2) # Adjust the value as needed
  545. else:
  546. fnt_size = 12
  547. # plt.rcParams['font.size'] = fnt_size
  548. ax1 = ax[0, 0]
  549. plt.subplots_adjust(wspace=0.3, hspace=0.3)
  550. plots = []
  551. for breaks, plot in enumerate(plot_lines):
  552. if max_breaks and breaks > max_breaks:
  553. # stop plotting if it exceeds the limit
  554. continue
  555. x,y = plot
  556. x_plot, y_plot = plot_straight_x_y(x,y)
  557. p, = ax1.plot(x_plot, y_plot, 'o', linestyle="-", alpha=0.75, lw=2, label = str(breaks)+' brks')
  558. # add plot to the list of all plots to superimpose
  559. plots.append(p)
  560. x_ticks = x
  561. # print(x_ticks)
  562. #print(prop, "\n", sum(prop))
  563. #ax.legend(handles=[p0]+plots)
  564. ax1.set_xlabel("# bin & cumul. prop. of sites", fontsize=fnt_size)
  565. # Set the x-axis locator to reduce the number of ticks to 10
  566. ax1.set_ylabel(r'$\theta_k$', fontsize=fnt_size, rotation = 90)
  567. ax1.set_title(title, fontsize=fnt_size)
  568. ax1.legend(handles=plots, loc='best', fontsize = fnt_size*0.5)
  569. ax1.set_xticks(x_ticks)
  570. step = len(x_ticks)//(n_ticks-1)
  571. values = x_ticks[::step]
  572. new_prop = []
  573. for val in values:
  574. new_prop.append(prop[int(val)-2])
  575. new_prop = new_prop[::-1]
  576. ax1.set_xticks(values)
  577. ax1.set_xticklabels([f'{values[k]}\n{val:.2f}' for k, val in enumerate(new_prop)], fontsize = fnt_size*0.8)
  578. if ax is None:
  579. # nb of plot_lines represent the number of epochs stored (len(plot_lines) = #breaks+1)
  580. plt.savefig(title+'_raw_'+str(nb_breaks)+'_breaks.pdf')
  581. plt.close(fig)
  582. # return plots
  583. return ax
  584. def combined_plot(folder_path, mu, tgen, breaks, title = "Title", theta_scale = False, selected_breaks = []):
  585. my_dpi = 300
  586. saved_plots_dict = save_all_epochs_thetafolder(folder_path, mu, tgen, title, theta_scale, output = title+"_plotdata.json")
  587. nb_of_epochs = len(saved_plots_dict["all_epochs"]["plots"])
  588. best_epoch = saved_plots_dict["best_epoch_by_AIC"]
  589. print("Best epoch based on AIC =", best_epoch)
  590. save_k_theta(folder_path, mu, tgen, title, theta_scale, breaks_max = nb_of_epochs, input = title+"_plotdata.json", output = title+"_plotdata.json")
  591. with open(title+"_plotdata.json", 'r') as json_file:
  592. loaded_data = json.load(json_file)
  593. # START OF COMBINED PLOT CODE
  594. # # plot page 1 of summary
  595. # fig1, ax1 = plt.subplots(2, 2, figsize=(5000/my_dpi, 2970/my_dpi), dpi=my_dpi)
  596. # # fig1.tight_layout()
  597. # # Adjust absolute space between the top and bottom rows
  598. # fig1.subplots_adjust(hspace=0.35) # Adjust this value based on your requirement
  599. # # plot page 2 of summary
  600. # fig2, ax2 = plt.subplots(2, 2, figsize=(5000/my_dpi, 2970/my_dpi), dpi=my_dpi)
  601. # # fig2.tight_layout()
  602. # ax1 = plot_raw_stairs(plot_lines = loaded_data['raw_stairs'],
  603. # prop = loaded_data['prop'], title = title, ax = ax1)
  604. # ax1 = plot_scaled_theta(plot_lines = loaded_data['scaled_stairs'],
  605. # prop = loaded_data['prop'], title = title, ax = ax1, subset=[loaded_data['best_epoch_by_AIC']]+selected_breaks)
  606. # ax2 = plot_scaled_theta(plot_lines = loaded_data['scaled_stairs'],
  607. # prop = loaded_data['prop'], title = title, ax = ax2)
  608. # ax1, ax2 = plot_all_epochs_thetafolder(loaded_data, mu, tgen, title, theta_scale, ax = [ax1, ax2])
  609. # fig1.savefig(title+'_combined_p1.pdf')
  610. # print("Wrote", title+'_combined_p1.pdf')
  611. # fig2.savefig(title+'_combined_p2.pdf')
  612. # print("Wrote", title+'_combined_p2.pdf')
  613. # END OF COMBINED PLOT CODE
  614. # Start of Parsing real swp2 output
  615. folder_splitted = folder_path.split("/")
  616. swp2_summary = "/".join(folder_splitted[:-2])+'/'+folder_splitted[-3]+".final.summary"
  617. swp2_vals = parse_stairwayplot_output_summary(stwplt_out = swp2_summary)
  618. swp2_x, swp2_y = swp2_vals[0], swp2_vals[1]
  619. # End of Parsing real swp2 output
  620. plot_raw_stairs(plot_lines = loaded_data['raw_stairs'],
  621. prop = loaded_data['prop'], title = title, ax = None, max_breaks = breaks)
  622. plot_scaled_theta(plot_lines = loaded_data['scaled_stairs'], mu = mu, tgen = tgen, subset=[loaded_data['best_epoch_by_AIC']]+selected_breaks,
  623. # plot_scaled_theta(plot_lines = loaded_data['scaled_stairs'], subset=list(range(0,3))+[loaded_data['best_epoch_by_AIC']]+selected_breaks,
  624. prop = loaded_data['prop'], title = title, swp2_lines = [swp2_x, swp2_y], ax = None)
  625. plot_all_epochs_thetafolder(loaded_data, mu, tgen, title, theta_scale, ax = None)
  626. # plt.close(fig1)
  627. # plt.close(fig2)
  628. def parse_stairwayplot_output_summary(stwplt_out, xlim = None, ylim = None, title = "default title", plot = False):
  629. #col 5
  630. year = []
  631. # col 6
  632. ne_median = []
  633. ne_2_5 = []
  634. ne_97_5 = []
  635. ne_12_5 = []
  636. # col 10
  637. ne_87_5 = []
  638. with open(stwplt_out, "r") as stwplt_stream:
  639. for line in stwplt_stream:
  640. ## Line format
  641. # mutation_per_site n_estimation theta_per_site_median theta_per_site_2.5% theta_per_site_97.5% year Ne_median Ne_2.5% Ne_97.5% Ne_12.5% Ne_87.5%
  642. if not line.startswith("mutation_per_site"):
  643. #not header
  644. values = line.strip().split()
  645. year.append(float(values[5]))
  646. ne_median.append(float(values[6]))
  647. ne_2_5.append(float(values[7]))
  648. ne_97_5.append(float(values[8]))
  649. ne_12_5.append(float(values[9]))
  650. ne_87_5.append(float(values[10]))
  651. vals = [year, ne_median, ne_2_5, ne_97_5, ne_12_5, ne_87_5]
  652. if plot :
  653. # plot parsed data
  654. label = ["Ne median", "Ne 2.5%", "Ne 97.5%", "Ne 12.5%", "Ne 87.5%"]
  655. for i in range(1, 5):
  656. fig, = plt.plot(year, vals[i], '--', alpha = 0.4)
  657. fig.set_label(label[i])
  658. # # last plot is median
  659. fig, = plt.plot(year, ne_median, 'r-', lw=2)
  660. fig.set_label(label[0])
  661. plt.legend()
  662. plt.ylabel("Individuals (Ne)")
  663. plt.xlabel("Time (years)")
  664. if xlim:
  665. plt.xlim(xlim)
  666. if ylim:
  667. plt.ylim(ylim)
  668. plt.title(title)
  669. plt.show()
  670. plt.close()
  671. return vals
  672. if __name__ == "__main__":
  673. if len(sys.argv) != 4:
  674. print("Need 3 args: ThetaFolder MutationRate GenerationTime")
  675. exit(0)
  676. folder_path = sys.argv[1]
  677. mu = sys.argv[2]
  678. tgen = sys.argv[3]
  679. plot_all_epochs_thetafolder(folder_path, mu, tgen)