Browse Source

theoritical logLn swp2

tforest 5 months ago
parent
commit
2253ce5f67
1 changed files with 118 additions and 32 deletions
  1. 118 32
      swp2.py

+ 118 - 32
swp2.py View File

@@ -1,12 +1,38 @@
1 1
 import matplotlib.pyplot as plt
2 2
 import os
3 3
 import numpy as np
4
+import math
5
+from scipy.special import gammaln
6
+from matplotlib.backends.backend_pdf import PdfPages
7
+
8
+def log_facto(k):
9
+    k = int(k)
10
+    if k > 1e6:
11
+        return k * np.log(k) - k + np.log(2*math.pi*k)/2
12
+    val = 0
13
+    for i in range(2, k+1):
14
+        val += np.log(i)
15
+    return val
16
+
17
+def log_facto_1(k):
18
+    startf = 1 # start of factorial sequence
19
+    stopf  = int(k+1) # end of of factorial sequence
20
+
21
+    q = gammaln(range(startf+1, stopf+1)) # n! = G(n+1)
22
+
23
+    return q[-1]
4 24
 
5 25
 def return_x_y_from_stwp_theta_file(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False):
6 26
     with open(stwp_theta_file, "r") as swp_file:
7 27
         # Read the first line
8 28
         line = swp_file.readline()
9 29
         L = float(line.split()[2])
30
+        rands = swp_file.readline()
31
+        line = swp_file.readline()
32
+        # skip empty lines before SFS
33
+        while line == "\n":
34
+            line = swp_file.readline()
35
+        sfs = np.array(line.split()).astype(float)
10 36
         # Process lines until the end of the file
11 37
         while line:
12 38
             # check at each line
@@ -27,7 +53,7 @@ def return_x_y_from_stwp_theta_file(stwp_theta_file, breaks, mu, tgen, relative_
27 53
     #### END of parsing
28 54
     # quit this file if the number of dimensions is incorrect
29 55
     if dim < breaks+1:
30
-        return 0,0,0
56
+        return 0,0,0,0,0
31 57
     # get n, the last bin of the last group
32 58
     # revert the list of groups as the most recent times correspond
33 59
     # to the closest and last leafs of the coal. tree.
@@ -90,13 +116,19 @@ def return_x_y_from_stwp_theta_file(stwp_theta_file, breaks, mu, tgen, relative_
90 116
     #     #     # divide by N0
91 117
     #     #     y[i] = y[i]/N0
92 118
     #     #     x[i] = x[i]/N0
93
-    return x,y,likelihood
119
+    return x,y,likelihood,sfs,L
94 120
 
95 121
 def return_x_y_from_stwp_theta_file_as_is(stwp_theta_file, breaks, mu, tgen, relative_theta_scale = False):
96 122
     with open(stwp_theta_file, "r") as swp_file:
97 123
         # Read the first line
98 124
         line = swp_file.readline()
99 125
         L = float(line.split()[2])
126
+        rands = swp_file.readline()
127
+        line = swp_file.readline()
128
+        # skip empty lines before SFS
129
+        while line == "\n":
130
+            line = swp_file.readline()
131
+        sfs = np.array(line.split()).astype(float)
100 132
         # Process lines until the end of the file
101 133
         while line:
102 134
             # check at each line
@@ -117,7 +149,7 @@ def return_x_y_from_stwp_theta_file_as_is(stwp_theta_file, breaks, mu, tgen, rel
117 149
     #### END of parsing
118 150
     # quit this file if the number of dimensions is incorrect
119 151
     if dim < breaks+1:
120
-        return 0,0,0
152
+        return 0,0
121 153
     # get n, the last bin of the last group
122 154
     # revert the list of groups as the most recent times correspond
123 155
     # to the closest and last leafs of the coal. tree.
@@ -130,7 +162,7 @@ def return_x_y_from_stwp_theta_file_as_is(stwp_theta_file, breaks, mu, tgen, rel
130 162
         groups[i] = groups[i].split(',')
131 163
         #print(groups[i], len(groups[i]))
132 164
         thetas[i] = [float(theta_site[i]), groups[i], likelihood]
133
-    return thetas
165
+    return thetas, sfs
134 166
 
135 167
 def plot_k_epochs_thetafolder(folder_path, mu, tgen, breaks = 2, title = "Title", theta_scale = True):
136 168
     scenari = {}
@@ -138,9 +170,9 @@ def plot_k_epochs_thetafolder(folder_path, mu, tgen, breaks = 2, title = "Title"
138 170
     for file_name in os.listdir(folder_path):
139 171
         if os.path.isfile(os.path.join(folder_path, file_name)):
140 172
             # Perform actions on each file
141
-            x,y,likelihood = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
173
+            x,y,likelihood,sfs,L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
142 174
                                                              tgen = tgen,
143
-                                                             mu = mu, relative_theta_scale = theta_scale)
175
+                                     mu = mu, relative_theta_scale = theta_scale)
144 176
             if x == 0 or y == 0:
145 177
                 continue
146 178
             cpt +=1
@@ -189,15 +221,17 @@ def plot_all_epochs_thetafolder(folder_path, mu, tgen, title = "Title", theta_sc
189 221
         breaks = 0
190 222
         cpt +=1
191 223
         if os.path.isfile(os.path.join(folder_path, file_name)):
192
-            x, y, likelihood = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
224
+            x, y, likelihood, sfs, L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
193 225
                                                              tgen = tgen,
194 226
                                                              mu = mu, relative_theta_scale = theta_scale)
227
+            SFS_stored = sfs
228
+            L_stored = L
195 229
             while not (x == 0 and y == 0):
196 230
                 if breaks not in epochs.keys():
197 231
                     epochs[breaks] = {}
198 232
                 epochs[breaks][likelihood] = x,y
199 233
                 breaks += 1
200
-                x,y,likelihood = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
234
+                x,y,likelihood,sfs,L = return_x_y_from_stwp_theta_file(folder_path+file_name, breaks = breaks,
201 235
                                                                  tgen = tgen,
202 236
                                                                   mu = mu, relative_theta_scale = theta_scale)
203 237
     print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(breaks)+"\n*******\n")
@@ -208,7 +242,7 @@ def plot_all_epochs_thetafolder(folder_path, mu, tgen, title = "Title", theta_sc
208 242
     plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
209 243
     plt.xlim(1e-3, 1)
210 244
     #plt.ylim(0, 10)
211
-    #plt.yscale('log')
245
+    plt.yscale('log')
212 246
     plt.xscale('log')
213 247
     plt.grid(True,which="both", linestyle='--', alpha = 0.3)
214 248
     brkpt_lik = []
@@ -228,7 +262,15 @@ def plot_all_epochs_thetafolder(folder_path, mu, tgen, title = "Title", theta_sc
228 262
             # divide by N0
229 263
             y[i] = y[i]/N0
230 264
             x[i] = x[i]/N0
231
-        plt.plot(x, y, '-', alpha=0.75, lw=2, label = str(epoch)+' BrkPt | Lik='+greatest_likelihood)
265
+        sum_theta_i = 0
266
+        print(epoch, x, y)
267
+        for i in range(2, len(y)-1):
268
+            sum_theta_i=y[i] / (i-1)
269
+        prop = []
270
+        for k in range(2, len(y)-1):
271
+            prop.append(y[k+1] / (k - 1) / sum_theta_i)
272
+        #print(epoch, prop)
273
+        plt.plot(x, y, 'o', linestyle = "-", alpha=0.75, lw=2, label = str(epoch)+' BrkPt | Lik='+greatest_likelihood)
232 274
         if theta_scale:
233 275
             plt.xlabel("Coal. time")
234 276
             plt.ylabel("Pop. size scaled by N0")
@@ -246,35 +288,67 @@ def plot_all_epochs_thetafolder(folder_path, mu, tgen, title = "Title", theta_sc
246 288
         plt.title(title)
247 289
         plt.savefig(title+'_b'+str(breaks)+'.pdf')
248 290
     # plot likelihood against nb of breakpoints
291
+
292
+    # best possible likelihood from SFS
293
+    # Segregating sites
294
+    S = sum(SFS_stored)
295
+    # number of monomorphic sites
296
+    L = L_stored
297
+    S0 = L-S
298
+    print("SFS", SFS_stored)
299
+    print("S", S, "L", L, "S0=", S0)
300
+    # compute Ln
301
+    Ln = log_facto(S+S0) - log_facto(S0) + np.log(float(S0)/(S+S0)) * S0
302
+    for xi in range(0, len(SFS_stored)):
303
+        p_i = SFS_stored[xi] / float(S+S0)
304
+        Ln += np.log(p_i) * SFS_stored[xi] - log_facto(SFS_stored[xi])
305
+    res = Ln
306
+    print(res)
307
+    # basic plot likelihood
249 308
     plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
250 309
     plt.rcParams['font.size'] = '18'
251
-    AIC = 2*(len(brkpt_lik)+1)+2*np.array(brkpt_lik)[:, 1].astype(float)
252
-    plt.plot(np.array(brkpt_lik)[:, 0], AIC, 'o', linestyle = "dotted", lw=2)
253
-    plt.axhline(y=106)
310
+    plt.plot(np.array(brkpt_lik)[:, 0], np.array(brkpt_lik)[:, 1].astype(float), 'o', linestyle = "dotted", lw=2)
311
+    # plt.ylim(0,100)
312
+    # plt.axhline(y=res)
254 313
     plt.yscale('log')
255 314
     plt.xlabel("# breakpoints", fontsize=20)
256 315
     plt.ylabel("$-\log\mathcal{L}$")
257 316
     #plt.legend(loc='upper right')
258 317
     plt.title(title)
259 318
     plt.savefig(title+'_Breakpts_Likelihood.pdf')
319
+    # AIC
320
+    plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
321
+    plt.rcParams['font.size'] = '18'
322
+    AIC = 2*(len(brkpt_lik)+1)+2*np.array(brkpt_lik)[:, 1].astype(float)
323
+    plt.plot(np.array(brkpt_lik)[:, 0], AIC, 'o', linestyle = "dotted", lw=2)
324
+    # plt.axhline(y=106)
325
+    plt.yscale('log')
326
+    plt.xlabel("# breakpoints", fontsize=20)
327
+    plt.ylabel("AIC")
328
+    #plt.legend(loc='upper right')
329
+    plt.title(title)
330
+    plt.savefig(title+'_Breakpts_Likelihood_AIC.pdf')
260 331
 
261
-def plot_test_theta(folder_path, mu, tgen, title = "Title", theta_scale = True, breaks_max = 6):
332
+def plot_test_theta(folder_path, mu, tgen, title = "Title", theta_scale = True, breaks_max = 5):
333
+    """
334
+    Use theta values as is to do basic plots.
335
+    """
262 336
     cpt = 0
263 337
     epochs = {}
264 338
     for file_name in os.listdir(folder_path):
265 339
         cpt +=1
266 340
         if os.path.isfile(os.path.join(folder_path, file_name)):
267 341
             for k in range(breaks_max):
268
-                thetas = return_x_y_from_stwp_theta_file_as_is(folder_path+file_name, breaks = k,
342
+                thetas,sfs = return_x_y_from_stwp_theta_file_as_is(folder_path+file_name, breaks = k,
269 343
                                                                  tgen = tgen,
270 344
                                                                  mu = mu, relative_theta_scale = theta_scale)
271
-                if thetas[0] == 0:
345
+                if thetas == 0:
272 346
                     continue
273 347
                 epochs[k] = thetas
274 348
     print("\n*******\n"+title+"\n--------\n"+"mu="+str(mu)+"\ntgen="+str(tgen)+"\nbreaks="+str(k)+"\n*******\n")
275 349
     print(cpt, "theta file(s) have been scanned.")
276 350
 
277
-    # intialize figure
351
+    # intialize figure 1
278 352
     my_dpi = 300
279 353
     plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
280 354
     for epoch, theta in epochs.items():
@@ -294,7 +368,7 @@ def plot_test_theta(folder_path, mu, tgen, title = "Title", theta_scale = True,
294 368
         plt.ylabel("theta")
295 369
         plt.legend(loc='upper right')
296 370
         plt.savefig(title+'_test'+str(k)+'.pdf')
297
-    # fig 2
371
+    # fig 2 & 3
298 372
     plt.figure(figsize=(5000/my_dpi, 2800/my_dpi), dpi=my_dpi)
299 373
     for epoch, theta in epochs.items():
300 374
         groups = np.array(list(theta.values()), dtype=object)[:, 1].tolist()
@@ -308,29 +382,41 @@ def plot_test_theta(folder_path, mu, tgen, title = "Title", theta_scale = True,
308 382
                 N0 = y[0]
309 383
         for i in range(len(y)):
310 384
             y[i] = y[i]/N0
311
-        #
312 385
         x_2 = []
313 386
         T = 0
314
-        # k allant de de 14 à 2
315 387
         for i in range(len(x)):
316 388
             x[i] = int(x[i])
317
-        #print(x[2])
389
+        # compute the times as: theta_k / (k*(k-1))
318 390
         for i in range(0, len(x)):
319
-            k = x[i]
320
-            #print(k, y[k-2])
321
-
322
-            #theta_k = y[k] / (k*(k-1))
323 391
             T += y[i] / (x[i]*(x[i]-1))
324 392
             x_2.append(T)
393
+    # Plotting (fig 2)
394
+    plt.plot(x_2, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks')
395
+    plt.xlabel("# breaks")
396
+    plt.ylabel("theta")
397
+    plt.legend(loc='upper right')
398
+    plt.savefig(title+'_test'+str(k)+'.pdf')
325 399
 
326
-        plt.plot(x_2, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks')
327
-        plt.xscale('log')
328
-        plt.xlabel("# breaks")
329
-        plt.ylabel("theta")
330
-        plt.legend(loc='upper right')
331
-        plt.savefig(title+'_test'+str(k)+'.pdf')
332
-        #
400
+    # Plotting (fig 3) which is the same but log scale for x
401
+    plt.plot(x_2, y, 'o', linestyle="dotted", alpha=0.75, lw=2, label = str(epoch)+' brks')
402
+    plt.xscale('log')
403
+    plt.xlabel("# breaks")
404
+    plt.ylabel("theta")
405
+    plt.legend(loc='upper right')
406
+    plt.savefig(title+'_test'+str(k)+'_log.pdf')
407
+
408
+def save_multi_image(filename):
409
+    pp = PdfPages(filename)
410
+    fig_nums = plt.get_fignums()
411
+    figs = [plt.figure(n) for n in fig_nums]
412
+    for fig in figs:
413
+        fig.savefig(pp, format='pdf')
414
+    pp.close()
333 415
 
416
+def combined_plot(folder_path, mu, tgen, breaks, title = "Title", theta_scale = True):
417
+    plot_all_epochs_thetafolder(folder_path, mu, tgen, title, theta_scale)
418
+    plot_test_theta(folder_path, mu, tgen, title, theta_scale, breaks_max = breaks)
419
+    save_multi_image(title+"_combined.pdf")
334 420
 if __name__ == "__main__":
335 421
 
336 422
     if len(sys.argv) != 4: