import matplotlib.pyplot as plt import os import numpy as np import math from scipy.special import gammaln from matplotlib.backends.backend_pdf import PdfPages def log_facto(k): k = int(k) if k > 1e6: return k * np.log(k) - k + np.log(2*math.pi*k)/2 val = 0 for i in range(2, k+1): val += np.log(i) return val def log_facto_1(k): startf = 1 # start of factorial sequence stopf = int(k+1) # end of of factorial sequence q = gammaln(range(startf+1, stopf+1)) # n! = G(n+1) return q[-1] def return_x_y_from_stwp_theta_file(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False): with open(stwp_theta_file, "r") as swp_file: # Read the first line line = swp_file.readline() L = float(line.split()[2]) rands = swp_file.readline() line = swp_file.readline() # skip empty lines before SFS while line == "\n": line = swp_file.readline() sfs = np.array(line.split()).astype(float) # Process lines until the end of the file while line: # check at each line if line.startswith("dim") : dim = int(line.split()[1]) if dim == breaks+1: likelihood = line.split()[5] groups = line.split()[6:6+dim] theta_site = line.split()[6+dim:6+dim+1+dim] elif dim < breaks+1: line = swp_file.readline() continue elif dim > breaks+1: break #return 0,0,0 # Read the next line line = swp_file.readline() #### END of parsing # quit this file if the number of dimensions is incorrect if dim < breaks+1: return 0,0,0,0,0 # get n, the last bin of the last group # revert the list of groups as the most recent times correspond # to the closest and last leafs of the coal. tree. groups = groups[::-1] theta_site = theta_site[::-1] # initiate the dict of times t = {} # list of thetas theta_L = [] sum_t = 0 for group_nb, group in enumerate(groups): ###print(group_nb, group, theta_site[group_nb], len(theta_site)) # store all the thetas one by one, with one theta per group theta_L.append(float(theta_site[group_nb])) # if the group is of size 1 if len(group.split(',')) == 1: i = int(group) # if the group size is >1, take the first elem of the group # i is the first bin of each group, straight after a breakpoint else: i = int(group.split(",")[0]) j = int(group.split(",")[-1]) t[i] = 0 #t = if len(group.split(',')) == 1: k = i if relative_theta_scale: t[i] += ((theta_L[group_nb] ) / (k*(k-1))) else: t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu else: for k in range(j, i-1, -1 ): if relative_theta_scale: t[i] += ((theta_L[group_nb] ) / (k*(k-1))) else: t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu # we add the cumulative times at the end t[i] += sum_t sum_t = t[i] # build the y axis (sizes) y = [] for theta in theta_L: if relative_theta_scale: size = theta else: # with size N = theta/4mu size = theta / (4*mu) y.append(size) y.append(size) # build the time x axis x = [0] for time in range(0, len(t.values())-1): x.append(list(t.values())[time]) x.append(list(t.values())[time]) x.append(list(t.values())[len(t.values())-1]) # if relative_theta_scale: # # rescale # #N0 = y[0] # # for i in range(len(y)): # # # divide by N0 # # y[i] = y[i]/N0 # # x[i] = x[i]/N0 return x,y,likelihood,sfs,L def return_x_y_from_stwp_theta_file_as_is(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False): with open(stwp_theta_file, "r") as swp_file: # Read the first line line = swp_file.readline() L = float(line.split()[2]) rands = swp_file.readline() line = swp_file.readline() # skip empty lines before SFS while line == "\n": line = swp_file.readline() sfs = np.array(line.split()).astype(float) # Process lines until the end of the file while line: # check at each line if line.startswith("dim") : dim = int(line.split()[1]) if dim == breaks+1: likelihood = line.split()[5] groups = line.split()[6:6+dim] theta_site = line.split()[6+dim:6+dim+1+dim] elif dim < breaks+1: line = swp_file.readline() continue elif dim > breaks+1: break #return 0,0,0 # Read the next line line = swp_file.readline() #### END of parsing # quit this file if the number of dimensions is incorrect if dim < breaks+1: return 0,0 # get n, the last bin of the last group # revert the list of groups as the most recent times correspond # to the closest and last leafs of the coal. tree. groups = groups[::-1] theta_site = theta_site[::-1] thetas = {} for i in range(len(groups)): groups[i] = groups[i].split(',') #print(groups[i], len(groups[i])) thetas[i] = [float(theta_site[i]), groups[i], likelihood] return thetas, sfs def plot_k_epochs_thetafolder(folder_path, mu, tgen, breaks = 2, title = "Title", theta_scale = True): scenari = {} cpt = 0 for file_name in os.listdir(folder_path): if os.path.isfile(os.path.join(folder_path, file_name)): # Perform actions on each file x,y,likelihood,sfs,L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks, tgen = tgen, mu = mu, relative_theta_scale = theta_scale) if x == 0 or y == 0: continue cpt +=1 scenari[likelihood] = x,y print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n") print(cpt, "theta file(s) have been scanned.") # sort starting by the smallest -log(Likelihood) print(scenari) best10_scenari = (sorted(list(scenari.keys())))[:10] print("10 greatest Likelihoods", best10_scenari) greatest_likelihood = best10_scenari[0] x, y = scenari[greatest_likelihood] my_dpi = 300 plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi) plt.plot(x, y, 'r-', lw=2, label = 'Lik='+greatest_likelihood) plt.xlim(1e-3, 1) plt.ylim(0, 10) #plt.yscale('log') plt.xscale('log') plt.grid(True,which="both", linestyle='--', alpha = 0.3) for scenario in best10_scenari[1:]: x,y = scenari[scenario] #print("\n---- Lik:",scenario,"\n\nt=", x,"\n\nN=",y, "\n\n") plt.plot(x, y, '--', lw=1, label = 'Lik='+scenario) if theta_scale: plt.xlabel("Coal. time") plt.ylabel("Pop. size scaled by N0") recent_scale_lower_bound = y[0] * 0.01 recent_scale_upper_bound = y[0] * 0.1 plt.axvline(x=recent_scale_lower_bound) plt.axvline(x=recent_scale_upper_bound) else: # years plt.xlabel("Time (years)") plt.ylabel("Individuals (N)") plt.legend(loc='upper right') plt.title(title) plt.savefig(title+'_b'+str(breaks)+'.pdf') def plot_all_epochs_thetafolder(folder_path, mu, tgen, title = "Title", theta_scale = True): #scenari = {} cpt = 0 epochs = {} for file_name in os.listdir(folder_path): breaks = 0 cpt +=1 if os.path.isfile(os.path.join(folder_path, file_name)): x, y, likelihood, sfs, L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks, tgen = tgen, mu = mu, relative_theta_scale = theta_scale) SFS_stored = sfs L_stored = L while not (x == 0 and y == 0): if breaks not in epochs.keys(): epochs[breaks] = {} epochs[breaks][likelihood] = x,y breaks += 1 x,y,likelihood,sfs,L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks, tgen = tgen, mu = mu, relative_theta_scale = theta_scale) print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n") print(cpt, "theta file(s) have been scanned.") # intialize figure my_dpi = 300 plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi) plt.xlim(1e-3, 1) #plt.ylim(0, 10) plt.yscale('log') plt.xscale('log') plt.grid(True,which="both", linestyle='--', alpha = 0.3) brkpt_lik = [] for epoch, scenari in epochs.items(): # sort starting by the smallest -log(Likelihood) best10_scenari = (sorted(list(scenari.keys())))[:10] greatest_likelihood = best10_scenari[0] # store the tuple breakpoints and likelihood for later plot brkpt_lik.append((epoch, greatest_likelihood)) x, y = scenari[greatest_likelihood] #without breakpoint if epoch == 0: # do something with the theta without bp and skip the plotting N0 = y[0] #continue for i in range(len(y)): # divide by N0 y[i] = y[i]/N0 x[i] = x[i]/N0 sum_theta_i = 0 print(epoch, x, y) for i in range(2, len(y)-1): sum_theta_i=y[i] / (i-1) prop = [] for k in range(2, len(y)-1): prop.append(y[k+1] / (k - 1) / sum_theta_i) #print(epoch, prop) plt.plot(x, y, 'o', linestyle = "-", alpha=0.75, lw=2, label = str(epoch)+' BrkPt | Lik='+greatest_likelihood) if theta_scale: plt.xlabel("Coal. time") plt.ylabel("Pop. size scaled by N0") recent_scale_lower_bound = 0.01 recent_scale_upper_bound = 0.1 #print(recent_scale_lower_bound, recent_scale_upper_bound) plt.axvline(x=recent_scale_lower_bound) plt.axvline(x=recent_scale_upper_bound) else: # years plt.xlabel("Time (years)") plt.ylabel("Individuals (N)") plt.xlim(1e-5, 1) plt.legend(loc='upper right') plt.title(title) plt.savefig(title+'_b'+str(breaks)+'.pdf') # plot likelihood against nb of breakpoints # best possible likelihood from SFS # Segregating sites S = sum(SFS_stored) # number of monomorphic sites L = L_stored S0 = L-S print("SFS", SFS_stored) print("S", S, "L", L, "S0=", S0) # compute Ln Ln = log_facto(S+S0) - log_facto(S0) + np.log(float(S0)/(S+S0)) * S0 for xi in range(0, len(SFS_stored)): p_i = SFS_stored[xi] / float(S+S0) Ln += np.log(p_i) * SFS_stored[xi] - log_facto(SFS_stored[xi]) res = Ln print(res) # basic plot likelihood plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi) plt.rcParams['font.size'] = '18' plt.plot(np.array(brkpt_lik)[:, 0], np.array(brkpt_lik)[:, 1].astype(float), 'o', linestyle = "dotted", lw=2) # plt.ylim(0,100) # plt.axhline(y=res) plt.yscale('log') plt.xlabel("# breakpoints", fontsize=20) plt.ylabel("$-\log\mathcal{L}$") #plt.legend(loc='upper right') plt.title(title) plt.savefig(title+'_Breakpts_Likelihood.pdf') # AIC plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi) plt.rcParams['font.size'] = '18' AIC = 2*(len(brkpt_lik)+1)+2*np.array(brkpt_lik)[:, 1].astype(float) plt.plot(np.array(brkpt_lik)[:, 0], AIC, 'o', linestyle = "dotted", lw=2) # plt.axhline(y=106) plt.yscale('log') plt.xlabel("# breakpoints", fontsize=20) plt.ylabel("AIC") #plt.legend(loc='upper right') plt.title(title) plt.savefig(title+'_Breakpts_Likelihood_AIC.pdf') def plot_test_theta(folder_path, mu, tgen, title = "Title", theta_scale = True, breaks_max = 5): """ Use theta values as is to do basic plots. """ cpt = 0 epochs = {} for file_name in os.listdir(folder_path): cpt +=1 if os.path.isfile(os.path.join(folder_path, file_name)): for k in range(breaks_max): thetas,sfs = return_x_y_from_stwp_theta_file_as_is(folder_path+file_name, breaks = k, tgen = tgen, mu = mu, relative_theta_scale = theta_scale) if thetas == 0: continue epochs[k] = thetas print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(k)+"\n*******\n") print(cpt, "theta file(s) have been scanned.") # intialize figure 1 my_dpi = 300 plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi) for epoch, theta in epochs.items(): groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist() x = [] y = [] thetas = np.array(list(theta.values()), dtype=object)[:, 0] for i,group in enumerate(groups): x += group[::-1] y += list(np.repeat(thetas[i], len(group))) if epoch == 0: N0 = y[0] for i in range(len(y)): y[i] = y[i]/N0 plt.plot(x, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks') plt.xlabel("# breaks") plt.ylabel("theta") plt.legend(loc='upper right') plt.savefig(title+'_test'+str(k)+'.pdf') # fig 2 & 3 plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi) for epoch, theta in epochs.items(): groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist() x = [] y = [] thetas = np.array(list(theta.values()), dtype=object)[:, 0] for i,group in enumerate(groups): x += group[::-1] y += list(np.repeat(thetas[i], len(group))) if epoch == 0: N0 = y[0] for i in range(len(y)): y[i] = y[i]/N0 x_2 = [] T = 0 for i in range(len(x)): x[i] = int(x[i]) # compute the times as: theta_k / (k*(k-1)) for i in range(0, len(x)): T += y[i] / (x[i]*(x[i]-1)) x_2.append(T) # Plotting (fig 2) plt.plot(x_2, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks') plt.xlabel("# breaks") plt.ylabel("theta") plt.legend(loc='upper right') plt.savefig(title+'_test'+str(k)+'.pdf') # Plotting (fig 3) which is the same but log scale for x plt.plot(x_2, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks') plt.xscale('log') plt.xlabel("# breaks") plt.ylabel("theta") plt.legend(loc='upper right') plt.savefig(title+'_test'+str(k)+'_log.pdf') def save_multi_image(filename): pp = PdfPages(filename) fig_nums = plt.get_fignums() figs = [plt.figure(n) for n in fig_nums] for fig in figs: fig.savefig(pp, format='pdf') pp.close() def combined_plot(folder_path, mu, tgen, breaks, title = "Title", theta_scale = True): plot_all_epochs_thetafolder(folder_path, mu, tgen, title, theta_scale) plot_test_theta(folder_path, mu, tgen, title, theta_scale, breaks_max = breaks) save_multi_image(title+"_combined.pdf") if __name__ == "__main__": if len(sys.argv) != 4: print("Need 3 args: ThetaFolder MutationRate GenerationTime") exit(0) folder_path = sys.argv[1] mu = sys.argv[2] tgen = sys.argv[3] plot_all_epochs_thetafolder(folder_path, mu, tgen)