123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429 |
- import matplotlib.pyplot as plt
- import os
- import numpy as np
- import math
- from scipy.special import gammaln
- from matplotlib.backends.backend_pdf import PdfPages
-
- def log_facto(k):
- k = int(k)
- if k > 1e6:
- return k * np.log(k) - k + np.log(2*math.pi*k)/2
- val = 0
- for i in range(2, k+1):
- val += np.log(i)
- return val
-
- def log_facto_1(k):
- startf = 1 # start of factorial sequence
- stopf = int(k+1) # end of of factorial sequence
-
- q = gammaln(range(startf+1, stopf+1)) # n! = G(n+1)
-
- return q[-1]
-
- def return_x_y_from_stwp_theta_file(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False):
- with open(stwp_theta_file, "r") as swp_file:
- # Read the first line
- line = swp_file.readline()
- L = float(line.split()[2])
- rands = swp_file.readline()
- line = swp_file.readline()
- # skip empty lines before SFS
- while line == "\n":
- line = swp_file.readline()
- sfs = np.array(line.split()).astype(float)
- # Process lines until the end of the file
- while line:
- # check at each line
- if line.startswith("dim") :
- dim = int(line.split()[1])
- if dim == breaks+1:
- likelihood = line.split()[5]
- groups = line.split()[6:6+dim]
- theta_site = line.split()[6+dim:6+dim+1+dim]
- elif dim < breaks+1:
- line = swp_file.readline()
- continue
- elif dim > breaks+1:
- break
- #return 0,0,0
- # Read the next line
- line = swp_file.readline()
- #### END of parsing
- # quit this file if the number of dimensions is incorrect
- if dim < breaks+1:
- return 0,0,0,0,0
- # get n, the last bin of the last group
- # revert the list of groups as the most recent times correspond
- # to the closest and last leafs of the coal. tree.
- groups = groups[::-1]
- theta_site = theta_site[::-1]
- # initiate the dict of times
- t = {}
- # list of thetas
- theta_L = []
- sum_t = 0
- for group_nb, group in enumerate(groups):
- ###print(group_nb, group, theta_site[group_nb], len(theta_site))
- # store all the thetas one by one, with one theta per group
- theta_L.append(float(theta_site[group_nb]))
- # if the group is of size 1
- if len(group.split(',')) == 1:
- i = int(group)
- # if the group size is >1, take the first elem of the group
- # i is the first bin of each group, straight after a breakpoint
- else:
- i = int(group.split(",")[0])
- j = int(group.split(",")[-1])
- t[i] = 0
- #t =
- if len(group.split(',')) == 1:
- k = i
- if relative_theta_scale:
- t[i] += ((theta_L[group_nb] ) / (k*(k-1)))
- else:
- t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu
- else:
- for k in range(j, i-1, -1 ):
- if relative_theta_scale:
- t[i] += ((theta_L[group_nb] ) / (k*(k-1)))
- else:
- t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu
- # we add the cumulative times at the end
- t[i] += sum_t
- sum_t = t[i]
- # build the y axis (sizes)
- y = []
- for theta in theta_L:
- if relative_theta_scale:
- size = theta
- else:
- # with size N = theta/4mu
- size = theta / (4*mu)
- y.append(size)
- y.append(size)
- # build the time x axis
- x = [0]
- for time in range(0, len(t.values())-1):
- x.append(list(t.values())[time])
- x.append(list(t.values())[time])
- x.append(list(t.values())[len(t.values())-1])
- # if relative_theta_scale:
- # # rescale
- # #N0 = y[0]
- # # for i in range(len(y)):
- # # # divide by N0
- # # y[i] = y[i]/N0
- # # x[i] = x[i]/N0
- return x,y,likelihood,sfs,L
-
- def return_x_y_from_stwp_theta_file_as_is(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False):
- with open(stwp_theta_file, "r") as swp_file:
- # Read the first line
- line = swp_file.readline()
- L = float(line.split()[2])
- rands = swp_file.readline()
- line = swp_file.readline()
- # skip empty lines before SFS
- while line == "\n":
- line = swp_file.readline()
- sfs = np.array(line.split()).astype(float)
- # Process lines until the end of the file
- while line:
- # check at each line
- if line.startswith("dim") :
- dim = int(line.split()[1])
- if dim == breaks+1:
- likelihood = line.split()[5]
- groups = line.split()[6:6+dim]
- theta_site = line.split()[6+dim:6+dim+1+dim]
- elif dim < breaks+1:
- line = swp_file.readline()
- continue
- elif dim > breaks+1:
- break
- #return 0,0,0
- # Read the next line
- line = swp_file.readline()
- #### END of parsing
- # quit this file if the number of dimensions is incorrect
- if dim < breaks+1:
- return 0,0
- # get n, the last bin of the last group
- # revert the list of groups as the most recent times correspond
- # to the closest and last leafs of the coal. tree.
- groups = groups[::-1]
- theta_site = theta_site[::-1]
-
- thetas = {}
-
- for i in range(len(groups)):
- groups[i] = groups[i].split(',')
- #print(groups[i], len(groups[i]))
- thetas[i] = [float(theta_site[i]), groups[i], likelihood]
- return thetas, sfs
-
- def plot_k_epochs_thetafolder(folder_path, mu, tgen, breaks = 2, title = "Title", theta_scale = True):
- scenari = {}
- cpt = 0
- for file_name in os.listdir(folder_path):
- if os.path.isfile(os.path.join(folder_path, file_name)):
- # Perform actions on each file
- x,y,likelihood,sfs,L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
- tgen = tgen,
- mu = mu, relative_theta_scale = theta_scale)
- if x == 0 or y == 0:
- continue
- cpt +=1
- scenari[likelihood] = x,y
- print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n")
- print(cpt, "theta file(s) have been scanned.")
- # sort starting by the smallest -log(Likelihood)
- print(scenari)
- best10_scenari = (sorted(list(scenari.keys())))[:10]
- print("10 greatest Likelihoods", best10_scenari)
- greatest_likelihood = best10_scenari[0]
- x, y = scenari[greatest_likelihood]
- my_dpi = 300
- plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
- plt.plot(x, y, 'r-', lw=2, label = 'Lik='+greatest_likelihood)
- plt.xlim(1e-3, 1)
- plt.ylim(0, 10)
- #plt.yscale('log')
- plt.xscale('log')
- plt.grid(True,which="both", linestyle='--', alpha = 0.3)
-
- for scenario in best10_scenari[1:]:
- x,y = scenari[scenario]
- #print("\n---- Lik:",scenario,"\n\nt=", x,"\n\nN=",y, "\n\n")
- plt.plot(x, y, '--', lw=1, label = 'Lik='+scenario)
- if theta_scale:
- plt.xlabel("Coal. time")
- plt.ylabel("Pop. size scaled by N0")
- recent_scale_lower_bound = y[0] * 0.01
- recent_scale_upper_bound = y[0] * 0.1
- plt.axvline(x=recent_scale_lower_bound)
- plt.axvline(x=recent_scale_upper_bound)
- else:
- # years
- plt.xlabel("Time (years)")
- plt.ylabel("Individuals (N)")
- plt.legend(loc='upper right')
- plt.title(title)
- plt.savefig(title+'_b'+str(breaks)+'.pdf')
-
- def plot_all_epochs_thetafolder(folder_path, mu, tgen, title = "Title", theta_scale = True):
- #scenari = {}
- cpt = 0
- epochs = {}
- for file_name in os.listdir(folder_path):
- breaks = 0
- cpt +=1
- if os.path.isfile(os.path.join(folder_path, file_name)):
- x, y, likelihood, sfs, L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
- tgen = tgen,
- mu = mu, relative_theta_scale = theta_scale)
- SFS_stored = sfs
- L_stored = L
- while not (x == 0 and y == 0):
- if breaks not in epochs.keys():
- epochs[breaks] = {}
- epochs[breaks][likelihood] = x,y
- breaks += 1
- x,y,likelihood,sfs,L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
- tgen = tgen,
- mu = mu, relative_theta_scale = theta_scale)
- print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n")
- print(cpt, "theta file(s) have been scanned.")
-
- # intialize figure
- my_dpi = 300
- plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
- plt.xlim(1e-3, 1)
- #plt.ylim(0, 10)
- plt.yscale('log')
- plt.xscale('log')
- plt.grid(True,which="both", linestyle='--', alpha = 0.3)
- brkpt_lik = []
- for epoch, scenari in epochs.items():
- # sort starting by the smallest -log(Likelihood)
- best10_scenari = (sorted(list(scenari.keys())))[:10]
- greatest_likelihood = best10_scenari[0]
- # store the tuple breakpoints and likelihood for later plot
- brkpt_lik.append((epoch, greatest_likelihood))
- x, y = scenari[greatest_likelihood]
- #without breakpoint
- if epoch == 0:
- # do something with the theta without bp and skip the plotting
- N0 = y[0]
- #continue
- for i in range(len(y)):
- # divide by N0
- y[i] = y[i]/N0
- x[i] = x[i]/N0
- sum_theta_i = 0
- print(epoch, x, y)
- for i in range(2, len(y)-1):
- sum_theta_i=y[i] / (i-1)
- prop = []
- for k in range(2, len(y)-1):
- prop.append(y[k+1] / (k - 1) / sum_theta_i)
- #print(epoch, prop)
- plt.plot(x, y, 'o', linestyle = "-", alpha=0.75, lw=2, label = str(epoch)+' BrkPt | Lik='+greatest_likelihood)
- if theta_scale:
- plt.xlabel("Coal. time")
- plt.ylabel("Pop. size scaled by N0")
- recent_scale_lower_bound = 0.01
- recent_scale_upper_bound = 0.1
- #print(recent_scale_lower_bound, recent_scale_upper_bound)
- plt.axvline(x=recent_scale_lower_bound)
- plt.axvline(x=recent_scale_upper_bound)
- else:
- # years
- plt.xlabel("Time (years)")
- plt.ylabel("Individuals (N)")
- plt.xlim(1e-5, 1)
- plt.legend(loc='upper right')
- plt.title(title)
- plt.savefig(title+'_b'+str(breaks)+'.pdf')
- # plot likelihood against nb of breakpoints
- # best possible likelihood from SFS
- # Segregating sites
- S = sum(SFS_stored)
- # number of monomorphic sites
- L = L_stored
- S0 = L-S
- print("SFS", SFS_stored)
- print("S", S, "L", L, "S0=", S0)
- # compute Ln
- Ln = log_facto(S+S0) - log_facto(S0) + np.log(float(S0)/(S+S0)) * S0
- for xi in range(0, len(SFS_stored)):
- p_i = SFS_stored[xi] / float(S+S0)
- Ln += np.log(p_i) * SFS_stored[xi] - log_facto(SFS_stored[xi])
- res = Ln
- print(res)
- # basic plot likelihood
- plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
- plt.rcParams['font.size'] = '18'
- plt.plot(np.array(brkpt_lik)[:, 0], np.array(brkpt_lik)[:, 1].astype(float), 'o', linestyle = "dotted", lw=2)
- # plt.ylim(0,100)
- # plt.axhline(y=res)
- plt.yscale('log')
- plt.xlabel("# breakpoints", fontsize=20)
- plt.ylabel("$-\log\mathcal{L}$")
- #plt.legend(loc='upper right')
- plt.title(title)
- plt.savefig(title+'_Breakpts_Likelihood.pdf')
- # AIC
- plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
- plt.rcParams['font.size'] = '18'
- AIC = 2*(len(brkpt_lik)+1)+2*np.array(brkpt_lik)[:, 1].astype(float)
- plt.plot(np.array(brkpt_lik)[:, 0], AIC, 'o', linestyle = "dotted", lw=2)
- # plt.axhline(y=106)
- plt.yscale('log')
- plt.xlabel("# breakpoints", fontsize=20)
- plt.ylabel("AIC")
- #plt.legend(loc='upper right')
- plt.title(title)
- plt.savefig(title+'_Breakpts_Likelihood_AIC.pdf')
-
- def plot_test_theta(folder_path, mu, tgen, title = "Title", theta_scale = True, breaks_max = 5):
- """
- Use theta values as is to do basic plots.
- """
- cpt = 0
- epochs = {}
- for file_name in os.listdir(folder_path):
- cpt +=1
- if os.path.isfile(os.path.join(folder_path, file_name)):
- for k in range(breaks_max):
- thetas,sfs = return_x_y_from_stwp_theta_file_as_is(folder_path+file_name, breaks = k,
- tgen = tgen,
- mu = mu, relative_theta_scale = theta_scale)
- if thetas == 0:
- continue
- epochs[k] = thetas
- print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(k)+"\n*******\n")
- print(cpt, "theta file(s) have been scanned.")
-
- # intialize figure 1
- my_dpi = 300
- plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
- for epoch, theta in epochs.items():
- groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist()
- x = []
- y = []
- thetas = np.array(list(theta.values()), dtype=object)[:, 0]
- for i,group in enumerate(groups):
- x += group[::-1]
- y += list(np.repeat(thetas[i], len(group)))
- if epoch == 0:
- N0 = y[0]
- for i in range(len(y)):
- y[i] = y[i]/N0
- plt.plot(x, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks')
- plt.xlabel("# breaks")
- plt.ylabel("theta")
- plt.legend(loc='upper right')
- plt.savefig(title+'_test'+str(k)+'.pdf')
- # fig 2 & 3
- plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
- for epoch, theta in epochs.items():
- groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist()
- x = []
- y = []
- thetas = np.array(list(theta.values()), dtype=object)[:, 0]
- for i,group in enumerate(groups):
- x += group[::-1]
- y += list(np.repeat(thetas[i], len(group)))
- if epoch == 0:
- N0 = y[0]
- for i in range(len(y)):
- y[i] = y[i]/N0
- x_2 = []
- T = 0
- for i in range(len(x)):
- x[i] = int(x[i])
- # compute the times as: theta_k / (k*(k-1))
- for i in range(0, len(x)):
- T += y[i] / (x[i]*(x[i]-1))
- x_2.append(T)
- # Plotting (fig 2)
- plt.plot(x_2, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks')
- plt.xlabel("# breaks")
- plt.ylabel("theta")
- plt.legend(loc='upper right')
- plt.savefig(title+'_test'+str(k)+'.pdf')
- # Plotting (fig 3) which is the same but log scale for x
- plt.plot(x_2, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks')
- plt.xscale('log')
- plt.xlabel("# breaks")
- plt.ylabel("theta")
- plt.legend(loc='upper right')
- plt.savefig(title+'_test'+str(k)+'_log.pdf')
-
- def save_multi_image(filename):
- pp = PdfPages(filename)
- fig_nums = plt.get_fignums()
- figs = [plt.figure(n) for n in fig_nums]
- for fig in figs:
- fig.savefig(pp, format='pdf')
- pp.close()
-
- def combined_plot(folder_path, mu, tgen, breaks, title = "Title", theta_scale = True):
- plot_all_epochs_thetafolder(folder_path, mu, tgen, title, theta_scale)
- plot_test_theta(folder_path, mu, tgen, title, theta_scale, breaks_max = breaks)
- save_multi_image(title+"_combined.pdf")
- if __name__ == "__main__":
-
- if len(sys.argv) != 4:
- print("Need 3 args: ThetaFolder MutationRate GenerationTime")
- exit(0)
-
- folder_path = sys.argv[1]
- mu = sys.argv[2]
- tgen = sys.argv[3]
-
- plot_all_epochs_thetafolder(folder_path, mu, tgen)
|