richard
/
geinf


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
							import matplotlib.pyplot as plt
import os
import numpy as np
import math
from scipy.special import gammaln
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.ticker import MaxNLocator
import io
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

def log_facto(k):
    k = int(k)
    if k > 1e6:
        return k * np.log(k) - k + np.log(2*math.pi*k)/2
    val = 0
    for i in range(2, k+1):
        val += np.log(i)
    return val

def log_facto_1(k):
    startf = 1 # start of factorial sequence
    stopf  = int(k+1) # end of of factorial sequence

    q = gammaln(range(startf+1, stopf+1)) # n! = G(n+1)

    return q[-1]

def return_x_y_from_stwp_theta_file(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False):
    with open(stwp_theta_file, "r") as swp_file:
        # Read the first line
        line = swp_file.readline()
        L = float(line.split()[2])
        rands = swp_file.readline()
        line = swp_file.readline()
        # skip empty lines before SFS
        while line == "\n":
            line = swp_file.readline()
        sfs = np.array(line.split()).astype(float)
        # Process lines until the end of the file
        while line:
            # check at each line
            if line.startswith("dim") :
                dim = int(line.split()[1])
                if dim == breaks+1:
                    likelihood = line.split()[5]
                    groups = line.split()[6:6+dim]
                    theta_site = line.split()[6+dim:6+dim+1+dim]
                elif dim < breaks+1:
                    line = swp_file.readline()
                    continue
                elif dim > breaks+1:
                    break
                    #return 0,0,0
            # Read the next line
            line = swp_file.readline()
    #### END of parsing
    # quit this file if the number of dimensions is incorrect
    if dim < breaks+1:
        return 0,0,0,0,0
    # get n, the last bin of the last group
    # revert the list of groups as the most recent times correspond
    # to the closest and last leafs of the coal. tree.
    groups = groups[::-1]
    theta_site = theta_site[::-1]
    # initiate the dict of times
    t = {}
    # list of thetas
    theta_L = []
    sum_t = 0
    for group_nb, group in enumerate(groups):
        ###print(group_nb, group, theta_site[group_nb], len(theta_site))
        # store all the thetas one by one, with one theta per group
        theta_L.append(float(theta_site[group_nb]))
        # if the group is of size 1
        if len(group.split(',')) == 1:
            i = int(group)
        # if the group size is >1, take the first elem of the group
        # i is the first bin of each group, straight after a breakpoint
        else:
            i = int(group.split(",")[0])
            j = int(group.split(",")[-1])
        t[i] = 0
        #t =
        if len(group.split(',')) == 1:
            k = i
            if relative_theta_scale:
                t[i] += ((theta_L[group_nb] ) / (k*(k-1)))
            else:
                t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu
        else:
            for k in range(j, i-1, -1 ):
                if relative_theta_scale:
                    t[i] += ((theta_L[group_nb] ) / (k*(k-1)))
                else:
                    t[i] += ((theta_L[group_nb] ) / (k*(k-1)) * tgen) / mu
        # we add the cumulative times at the end
        t[i] += sum_t
        sum_t = t[i]
    # build the y axis (sizes)
    y = []
    for theta in theta_L:
        if relative_theta_scale:
            size = theta
        else:
            # with size N = theta/4mu
            size = theta / (4*mu)
        y.append(size)
        y.append(size)
    # build the time x axis
    x = [0]
    for time in range(0, len(t.values())-1):
        x.append(list(t.values())[time])
        x.append(list(t.values())[time])
    x.append(list(t.values())[len(t.values())-1])
    # if relative_theta_scale:
    #     # rescale
    #     #N0 = y[0]
    #     # for i in range(len(y)):
    #     #     # divide by N0
    #     #     y[i] = y[i]/N0
    #     #     x[i] = x[i]/N0
    return x,y,likelihood,sfs,L

def return_x_y_from_stwp_theta_file_as_is(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False):
    with open(stwp_theta_file, "r") as swp_file:
        # Read the first line
        line = swp_file.readline()
        L = float(line.split()[2])
        rands = swp_file.readline()
        line = swp_file.readline()
        # skip empty lines before SFS
        while line == "\n":
            line = swp_file.readline()
        sfs = np.array(line.split()).astype(float)
        # Process lines until the end of the file
        while line:
            # check at each line
            if line.startswith("dim") :
                dim = int(line.split()[1])
                if dim == breaks+1:
                    likelihood = line.split()[5]
                    groups = line.split()[6:6+dim]
                    theta_site = line.split()[6+dim:6+dim+1+dim]
                elif dim < breaks+1:
                    line = swp_file.readline()
                    continue
                elif dim > breaks+1:
                    break
                    #return 0,0,0
            # Read the next line
            line = swp_file.readline()
    #### END of parsing
    # quit this file if the number of dimensions is incorrect
    if dim < breaks+1:
        return 0,0
    # get n, the last bin of the last group
    # revert the list of groups as the most recent times correspond
    # to the closest and last leafs of the coal. tree.
    groups = groups[::-1]
    theta_site = theta_site[::-1]

    thetas = {}

    for i in range(len(groups)):
        groups[i] = groups[i].split(',')
        #print(groups[i], len(groups[i]))
        thetas[i] = [float(theta_site[i]), groups[i], likelihood]
    return thetas, sfs

def plot_k_epochs_thetafolder(folder_path, mu, tgen, breaks = 2, title = "Title", theta_scale = True):
    scenari = {}
    cpt = 0
    for file_name in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, file_name)):
            # Perform actions on each file
            x,y,likelihood,sfs,L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
                                                             tgen = tgen,
                                     mu = mu, relative_theta_scale = theta_scale)
            if x == 0 or y == 0:
                continue
            cpt +=1
            scenari[likelihood] = x,y
    print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n")
    print(cpt, "theta file(s) have been scanned.")
    # sort starting by the smallest -log(Likelihood)
    print(scenari)
    best10_scenari = (sorted(list(scenari.keys())))[:10]
    print("10 greatest Likelihoods", best10_scenari)
    greatest_likelihood = best10_scenari[0]
    x, y = scenari[greatest_likelihood]
    my_dpi = 300
    plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
    plt.plot(x, y, 'r-', lw=2, label = 'Lik='+greatest_likelihood)
    plt.xlim(1e-3, 1)
    plt.ylim(0, 10)
    #plt.yscale('log')
    plt.xscale('log')
    plt.grid(True,which="both", linestyle='--', alpha = 0.3)

    for scenario in best10_scenari[1:]:
        x,y = scenari[scenario]
        #print("\n----  Lik:",scenario,"\n\nt=", x,"\n\nN=",y, "\n\n")
        plt.plot(x, y, '--', lw=1, label = 'Lik='+scenario)
    if theta_scale:
        plt.xlabel("Coal. time")
        plt.ylabel("Pop. size scaled by N0")
        recent_scale_lower_bound = y[0] * 0.01
        recent_scale_upper_bound = y[0] * 0.1
        plt.axvline(x=recent_scale_lower_bound)
        plt.axvline(x=recent_scale_upper_bound)
    else:
        # years
        plt.xlabel("Time (years)")
        plt.ylabel("Individuals (N)")
    plt.legend(loc='upper right')
    plt.title(title)
    plt.savefig(title+'_b'+str(breaks)+'.pdf')

def plot_straight_x_y(x,y):
    x_1 = [x[0]]
    y_1 = []
    for i in range(0, len(y)-1):
        x_1.append(x[i])
        x_1.append(x[i])
        y_1.append(y[i])
        y_1.append(y[i])
    y_1 = y_1+[y[-1],y[-1]]
    x_1.append(x[-1])
    return x_1, y_1

def plot_all_epochs_thetafolder(folder_path, mu, tgen, title = "Title", theta_scale = True, ax = None):
    #scenari = {}
    cpt = 0
    epochs = {}
    for file_name in os.listdir(folder_path):
        breaks = 0
        cpt +=1
        if os.path.isfile(os.path.join(folder_path, file_name)):
            x, y, likelihood, sfs, L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
                                                             tgen = tgen,
                                                             mu = mu, relative_theta_scale = theta_scale)
            SFS_stored = sfs
            L_stored = L
            while not (x == 0 and y == 0):
                if breaks not in epochs.keys():
                    epochs[breaks] = {}
                epochs[breaks][likelihood] = x,y
                breaks += 1
                x,y,likelihood,sfs,L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
                                                                 tgen = tgen,
                                                                  mu = mu, relative_theta_scale = theta_scale)
            if x == 0:
                # last break did not work, then breaks = breaks-1
                breaks -= 1
    print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n")
    print(cpt, "theta file(s) have been scanned.")
    my_dpi = 300
    if ax is None:
        # intialize figure
        my_dpi = 300
        fnt_size = 18
        fig, ax1 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
    else:
        fnt_size = 12
        plt.rcParams['font.size'] = fnt_size
        ax1 = ax[0,0]
    #ax1.set_xlim(1e-3, 1)
    ax1.set_yscale('log')
    ax1.set_xscale('log')
    ax1.grid(True,which="both", linestyle='--', alpha = 0.3)
    brkpt_lik = []
    top_plots = {}
    for epoch, scenari in epochs.items():
        # sort starting by the smallest -log(Likelihood)
        best10_scenari = (sorted(list(scenari.keys())))[:10]
        greatest_likelihood = best10_scenari[0]
        # store the tuple breakpoints and likelihood for later plot
        brkpt_lik.append((epoch, greatest_likelihood))
        x, y = scenari[greatest_likelihood]
        #without breakpoint
        if epoch == 0:
            # do something with the theta without bp and skip the plotting
            N0 = y[0]
            #continue
        for i in range(len(y)):
            # divide by N0
            y[i] = y[i]/N0
            x[i] = x[i]/N0
        top_plots[greatest_likelihood] = x,y,epoch
    plots_likelihoods = list(top_plots.keys())
    for i in range(len(plots_likelihoods)):
        plots_likelihoods[i] = float(plots_likelihoods[i])
    best10_plots = sorted(plots_likelihoods)[:10]
    top_plot_lik = str(best10_plots[0])
    plot_handles = []
    p0, = ax1.plot(top_plots[top_plot_lik][0], top_plots[top_plot_lik][1], 'o', linestyle = "-",
    alpha=1, lw=2, label = str(top_plots[top_plot_lik][2])+' epoch | Lik='+top_plot_lik)
    plot_handles.append(p0)
    for k, plot_Lk in enumerate(best10_plots[1:]):
        plot_Lk = str(plot_Lk)
        p, = ax1.plot(top_plots[plot_Lk][0], top_plots[plot_Lk][1], 'o', linestyle = "--",
        alpha=1/(k+1), lw=1.5, label = str(top_plots[plot_Lk][2])+' epoch | Lik='+plot_Lk)
        plot_handles.append(p)
    if theta_scale:
        ax1.set_xlabel("Coal. time")
        ax1.set_ylabel("Pop. size scaled by N0")
        # recent_scale_lower_bound = 0.01
        # recent_scale_upper_bound = 0.1
        # ax1.axvline(x=recent_scale_lower_bound)
        # ax1.axvline(x=recent_scale_upper_bound)
    else:
        # years
        plt.set_xlabel("Time (years)")
        plt.set_ylabel("Individuals (N)")
        ax1.set_xlim(1e-5, 1)
    ax1.legend(handles = plot_handles, loc='best', fontsize = fnt_size*0.5)
    ax1.set_title(title)
    if ax is None:
        plt.savefig(title+'_b'+str(breaks)+'.pdf')
    # plot likelihood against nb of breakpoints
    # best possible likelihood from SFS
    # Segregating sites
    S = sum(SFS_stored)
    # Number of kept sites from which the SFS is computed
    L = L_stored
    # number of monomorphic sites
    S0 = L-S
    # print("SFS", SFS_stored)
    # print("S", S, "L", L, "S0=", S0)
    # compute Ln
    Ln = log_facto(S+S0) - log_facto(S0) + np.log(float(S0)/(S+S0)) * S0
    for xi in range(0, len(SFS_stored)):
        p_i = SFS_stored[xi] / float(S+S0)
        Ln += np.log(p_i) * SFS_stored[xi] - log_facto(SFS_stored[xi])
    # basic plot likelihood
    if ax is None:
        fig, ax2 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
        plt.rcParams['font.size'] = '18'
    else:
        plt.rcParams['font.size'] = fnt_size
        ax2 = ax[2,0]
    ax2.plot(np.array(brkpt_lik)[:, 0], np.array(brkpt_lik)[:, 1].astype(float), 'o', linestyle = "dotted", lw=2)
    ax2.axhline(y=-Ln, linestyle = "-.", color = "red", label = "$-\log\mathcal{L}$ = "+str(round(-Ln, 2)))
    ax2.set_yscale('log')
    ax2.set_xlabel("# breakpoints", fontsize=fnt_size)
    ax2.set_ylabel("$-\log\mathcal{L}$")
    ax2.legend(loc='best', fontsize = fnt_size*0.8)
    ax2.set_title(title+" Likelihood gain from # breakpoints")
    if ax is None:
        plt.savefig(title+'_Breakpts_Likelihood.pdf')
    # AIC
    if ax is None:
        fig, ax3 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
        plt.rcParams['font.size'] = '18'
    else:
        ax3 = ax[2,1]
    AIC = 2*(len(brkpt_lik)+1)+2*np.array(brkpt_lik)[:, 1].astype(float)
    ax3.plot(np.array(brkpt_lik)[:, 0], AIC, 'o', linestyle = "dotted", lw=2)
    AIC_ln = 2*(len(brkpt_lik)+1)-2*Ln
    ax3.axhline(y=AIC_ln, linestyle = "-.", color = "red",
    label = "Min. AIC = "+str(round(AIC_ln, 2)))
    ax3.set_yscale('log')
    ax3.set_xlabel("# breakpoints", fontsize=fnt_size)
    ax3.set_ylabel("AIC")
    ax3.legend(loc='best', fontsize = fnt_size*0.8)
    ax3.set_title(title+" AIC")
    if ax is None:
        plt.savefig(title+'_Breakpts_Likelihood_AIC.pdf')
    return ax
def plot_test_theta(folder_path, mu, tgen, title = "Title", theta_scale = True, breaks_max = 10, ax = None):
    """
    Use theta values as is to do basic plots.
    """
    cpt = 0
    epochs = {}
    for file_name in os.listdir(folder_path):
        cpt +=1
        if os.path.isfile(os.path.join(folder_path, file_name)):
            for k in range(breaks_max):
                thetas,sfs = return_x_y_from_stwp_theta_file_as_is(folder_path+file_name, breaks = k,
                                                                 tgen = tgen,
                                                                 mu = mu, relative_theta_scale = theta_scale)
                if thetas == 0:
                    continue
                epochs[k] = thetas
    print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(k)+"\n*******\n")
    print(cpt, "theta file(s) have been scanned.")
    # multiple fig
    if ax is None:
        # intialize figure 1
        my_dpi = 300
        fnt_size = 18
        fig, ax1 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
        # Add some extra space for the second axis at the bottom
        fig.subplots_adjust(bottom=0.15)
    else:
        fnt_size = 12
        ax1 = ax[0, 1]
        plt.subplots_adjust(wspace=0.3, hspace=0.3)

    twin = ax1.twiny()

    plots = []
    for epoch, theta in epochs.items():
        groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist()
        x = []
        y = []
        thetas = np.array(list(theta.values()), dtype=object)[:, 0]
        for i,group in enumerate(groups):
            x += group[::-1]
            y += list(np.repeat(thetas[i], len(group)))
            if epoch == 0:
                N0 = y[0]
                # compute the proportion of information used at each bin of the SFS
                sum_theta_i = 0
                for i in range(2, len(y)+2):
                    sum_theta_i+=y[i-2] / (i-1)
                prop = []
                for k in range(2, len(y)+2):
                    prop.append(y[k-2] / (k - 1) / sum_theta_i)
                prop = prop[::-1]
                # print(prop, "\n", sum(prop))
        # normalise to N0 (N0 of epoch1)
        for i in range(len(y)):
            y[i] = y[i]/N0
        # plot
        x_plot, y_plot = plot_straight_x_y(x, y)
        #plt.plot(x, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks')
        p, = ax1.plot(x_plot, y_plot, 'o', linestyle="-.", alpha=0.75, lw=2, label = str(epoch)+' brks')
        # add plot to the list of all plots to superimpose
        plots.append(p)
    # virtual line to get the second x axis for proportions
    p0, = twin.plot(prop, y, alpha = 0, label="Proportion")
    # Move twinned axis ticks and label from top to bottom
    twin.xaxis.set_ticks_position("bottom")
    twin.xaxis.set_label_position("bottom")
    # Offset the twin axis below the host
    if ax is None:
        # arrange differently the second x axis if the plot is plain
        twin.spines["bottom"].set_position(("axes", -0.15))
    else:
        # in a combined plot, more space between the fig and the axis
        twin.spines["bottom"].set_position(("axes", -0.35))
    #ax.legend(handles=[p0]+plots)
    ax1.set_xlabel("# bin")
    # Set the x-axis locator to reduce the number of ticks to 10
    ax1.xaxis.set_major_locator(MaxNLocator(nbins=10))
    twin.xaxis.set_major_locator(MaxNLocator(nbins=10))
    ax1.set_ylabel("theta")
    twin.set_ylabel("Proportion")
    ax1.set_title("Title")
    ax1.legend(handles=plots, loc='best', fontsize = fnt_size*0.5)
    if ax is None:
        plt.savefig(title+'_raw'+str(k)+'.pdf')
    # fig 2 & 3
    if ax is None:
        fig2, ax2 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
        fig3, ax3 = plt.subplots(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
    else:
        # place of plots on the grid
        ax2 = ax[1,0]
        ax3 = ax[1,1]
    lines_fig2 = []
    lines_fig3 = []
    #plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
    for epoch, theta in epochs.items():
        groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist()
        x = []
        y = []
        thetas = np.array(list(theta.values()), dtype=object)[:, 0]
        for i,group in enumerate(groups):
            x += group[::-1]
            y += list(np.repeat(thetas[i], len(group)))
            if epoch == 0:
                N0 = y[0]
        for i in range(len(y)):
            y[i] = y[i]/N0
        x_2 = []
        T = 0
        for i in range(len(x)):
            x[i] = int(x[i])
        # compute the times as: theta_k / (k*(k-1))
        for i in range(0, len(x)):
            T += y[i] / (x[i]*(x[i]-1))
            x_2.append(T)
        # Plotting (fig 2)
        x_2 = [0]+x_2
        y = [y[0]]+y
        x2_plot, y2_plot = plot_straight_x_y(x_2, y)
        p2, = ax2.plot(x2_plot, y2_plot, 'o', linestyle="-.", alpha=0.75, lw=2, label = str(epoch)+' brks')
        lines_fig2.append(p2)
        # Plotting (fig 3) which is the same but log scale for x
        p3, = ax3.plot(x2_plot, y2_plot, 'o', linestyle="-.", alpha=0.75, lw=2, label = str(epoch)+' brks')
        lines_fig3.append(p3)
    ax2.set_xlabel("Relative scale")
    ax2.set_ylabel("theta")
    ax2.set_title("Title")
    ax2.legend(handles=lines_fig2, loc='best', fontsize = fnt_size*0.5)
    if ax is None:
        plt.savefig(title+'_plot2_'+str(k)+'.pdf')
    ax3.set_xscale('log')
    ax3.set_xlabel("log Relative scale")
    ax3.set_ylabel("theta")
    ax3.set_title("Title")
    ax3.legend(handles=lines_fig3, loc='best', fontsize = fnt_size*0.5)
    if ax is None:
        plt.savefig(title+'_plot3_'+str(k)+'_log.pdf')
    # return plots
    return ax

def combined_plot(folder_path, mu, tgen, breaks, title = "Title", theta_scale = True):
    my_dpi = 300
    # Add some extra space for the second axis at the bottom
    fig, axs = plt.subplots(3, 2, figsize=(5000/my_dpi, 2970/my_dpi), dpi=my_dpi)
    ax = plot_all_epochs_thetafolder(folder_path, mu, tgen, title, theta_scale, ax = axs)
    ax = plot_test_theta(folder_path, mu, tgen, title, theta_scale, breaks_max = breaks, ax = axs)
    # Adjust layout to prevent clipping of titles
    plt.tight_layout()
    # Adjust absolute space between the top and bottom rows
    plt.subplots_adjust(hspace=0.7)  # Adjust this value based on your requirement
    # Save the entire grid as a single figure
    plt.savefig(title+'_combined.pdf')
    plt.close()
    # second call for individual plots
    plot_all_epochs_thetafolder(folder_path, mu, tgen, title, theta_scale, ax = None)
    plot_test_theta(folder_path, mu, tgen, title, theta_scale, breaks_max = breaks, ax = None)

if __name__ == "__main__":

    if len(sys.argv) != 4:
        print("Need 3 args: ThetaFolder MutationRate GenerationTime")
        exit(0)

    folder_path = sys.argv[1]
    mu = sys.argv[2]
    tgen = sys.argv[3]

    plot_all_epochs_thetafolder(folder_path, mu, tgen)