|
@@ -23,6 +23,54 @@ import matplotlib.pyplot as plt
|
23
|
23
|
from frst import customgraphics
|
24
|
24
|
import numpy as np
|
25
|
25
|
|
|
26
|
+def parse_sfs(sfs_file):
|
|
27
|
+ """
|
|
28
|
+ Parse a Site Frequency Spectrum (SFS) file and return a masked spectrum.
|
|
29
|
+
|
|
30
|
+ This function reads an SFS file, extracts the spectrum data, and applies a mask to it.
|
|
31
|
+ The mask excludes specific bins from the spectrum, resulting in a masked SFS.
|
|
32
|
+
|
|
33
|
+ Parameters:
|
|
34
|
+ - sfs_file (str): The path to the SFS file to be parsed, in dadi's .fs format.
|
|
35
|
+
|
|
36
|
+ Returns:
|
|
37
|
+ - masked_spectrum (list): A masked SFS as a list of integers.
|
|
38
|
+
|
|
39
|
+ Raises:
|
|
40
|
+ - FileNotFoundError: If the specified SFS file is not found.
|
|
41
|
+ - ValueError: If there are inconsistencies in the file format or data.
|
|
42
|
+
|
|
43
|
+ Note: The actual structure of the SFS file is based on dadi's fs format.
|
|
44
|
+ """
|
|
45
|
+ try:
|
|
46
|
+ with open(sfs_file, 'r') as file:
|
|
47
|
+ # Read the first line which contains information about the file
|
|
48
|
+ num_individuals, mode, species_name = file.readline().strip().split()
|
|
49
|
+ num_individuals = int(num_individuals)
|
|
50
|
+ # Read the spectrum data
|
|
51
|
+ spectrum_data = list(map(int, file.readline().strip().split()))
|
|
52
|
+ # Check if the number of bins in the spectrum matches the expected number
|
|
53
|
+ if len(spectrum_data) != num_individuals:
|
|
54
|
+ raise ValueError("Error: Number of bins in the spectrum doesn't match the expected number of individuals.")
|
|
55
|
+ # Read the mask data
|
|
56
|
+ mask_data = list(map(int, file.readline().strip().split()))
|
|
57
|
+
|
|
58
|
+ # Check if the size of the mask matches the number of bins in the spectrum
|
|
59
|
+ if len(mask_data) != num_individuals:
|
|
60
|
+ raise ValueError("Error: Size of the mask doesn't match the number of bins in the spectrum.")
|
|
61
|
+ # Apply the mask to the spectrum
|
|
62
|
+ masked_spectrum = [spectrum_data[i] for i in range(num_individuals) if not mask_data[i]]
|
|
63
|
+ # Error handling
|
|
64
|
+ except FileNotFoundError:
|
|
65
|
+ print(f"Error: File not found - {sfs_file}")
|
|
66
|
+ except ValueError as ve:
|
|
67
|
+ print(f"Error: {ve}")
|
|
68
|
+ except Exception as e:
|
|
69
|
+ print(f"Error: {e}")
|
|
70
|
+ # final return of SFS as a list
|
|
71
|
+ return masked_spectrum
|
|
72
|
+
|
|
73
|
+
|
26
|
74
|
def sfs_from_vcf(n, vcf_file, folded = True, diploid = True, phased = False, verbose = False,
|
27
|
75
|
strip = False, count_ext = False):
|
28
|
76
|
|
|
@@ -193,7 +241,7 @@ def sfs_from_parsed_vcf(n, vcf_dict, folded = True, diploid = True, phased = Fal
|
193
|
241
|
return SFS_values, count_pluriall
|
194
|
242
|
|
195
|
243
|
|
196
|
|
-def barplot_sfs(sfs, xlab, ylab, folded=True, title = "Barplot", transformed = False, normalized = False, ploidy = 2):
|
|
244
|
+def barplot_sfs(sfs, xlab, ylab, folded=True, title = "Barplot", transformed = False, normalized = False, ploidy = 2, output = None):
|
197
|
245
|
sfs_val = []
|
198
|
246
|
n = len(sfs.values())
|
199
|
247
|
sum_sites = sum(list(sfs.values()))
|
|
@@ -242,7 +290,8 @@ def barplot_sfs(sfs, xlab, ylab, folded=True, title = "Barplot", transformed =
|
242
|
290
|
else:
|
243
|
291
|
# the spectrum is n-1 long when unfolded
|
244
|
292
|
n_title = n+1
|
245
|
|
-
|
|
293
|
+ original_title = title
|
|
294
|
+ # reformat title and add infos
|
246
|
295
|
title = title+" (n="+str(n_title)+") [folded="+str(folded)+"]"+" [transformed="+str(transformed)+"]"
|
247
|
296
|
print("SFS =", sfs)
|
248
|
297
|
|
|
@@ -252,7 +301,7 @@ def barplot_sfs(sfs, xlab, ylab, folded=True, title = "Barplot", transformed =
|
252
|
301
|
if transformed:
|
253
|
302
|
print("Transformed SFS ( n =",n_title, ") :", sfs_val)
|
254
|
303
|
#plt.axhline(y=1/n, color='r', linestyle='-')
|
255
|
|
- plt.bar([x+0.2 for x in list(sfs.keys())], [1/n]*n, color='r', linestyle='-', width = 0.4, label= "H0 Theoric constant")
|
|
304
|
+ plt.bar([x+0.2 for x in list(sfs.keys())], [1/n]*n, fill=False, hatch="///", linestyle='-', width = 0.4, label= "H0 Theoric constant")
|
256
|
305
|
|
257
|
306
|
else:
|
258
|
307
|
if normalized:
|
|
@@ -260,10 +309,19 @@ def barplot_sfs(sfs, xlab, ylab, folded=True, title = "Barplot", transformed =
|
260
|
309
|
sum_expected = sum([(1/(i+1)) for i,x in enumerate(list(sfs.keys()))])
|
261
|
310
|
expected_y = [(1/(i+1))/sum_expected for i,x in enumerate(list(sfs.keys()))]
|
262
|
311
|
print(expected_y)
|
263
|
|
- plt.bar([x+0.2 for x in list(sfs.keys())], expected_y, color='r', linestyle='-', width = 0.4, label= "H0 Theoric constant")
|
|
312
|
+ plt.bar([x+0.2 for x in list(sfs.keys())], expected_y, fill=False, hatch="///", linestyle='-', width = 0.4, label= "H0 Theoric constant")
|
264
|
313
|
print(sum(expected_y))
|
265
|
|
- customgraphics.barplot(x = [x-0.2 for x in X_axis], width=0.4, y= sfs_val, xlab = xlab, ylab = ylab, title = title, label = "H1 Observed spectrum", xticks =list(sfs.keys()) )
|
266
|
|
- plt.show()
|
|
314
|
+ if output is not None:
|
|
315
|
+ # if write in a file, don't open the window dynamically
|
|
316
|
+ plot = False
|
|
317
|
+ else:
|
|
318
|
+ plot = True
|
|
319
|
+ customgraphics.barplot(x = [x-0.2 for x in X_axis], width=0.4, y= sfs_val, xlab = xlab, ylab = ylab, title = title, label = "H1 Observed spectrum", xticks =list(sfs.keys()), plot = plot )
|
|
320
|
+ if output:
|
|
321
|
+ plt.savefig(f"{output}/{original_title}_SFS.pdf")
|
|
322
|
+ else:
|
|
323
|
+ plt.show()
|
|
324
|
+ plt.close()
|
267
|
325
|
|
268
|
326
|
if __name__ == "__main__":
|
269
|
327
|
|