|
@@ -0,0 +1,116 @@
|
|
1
|
+#! /usr/bin/env python3
|
|
2
|
+import urllib3
|
|
3
|
+import sys, os, re
|
|
4
|
+import shutil
|
|
5
|
+import certifi
|
|
6
|
+
|
|
7
|
+from bs4 import BeautifulSoup
|
|
8
|
+#import lxml
|
|
9
|
+
|
|
10
|
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
11
|
+
|
|
12
|
# Guard: this script needs exactly two CLI arguments (URL + target directory).
if len(sys.argv) != 3:
    print("This program allows you to download audio files likes concerts etc " +
          "from the website archive.org.\nUsage in cli is :\n" +
          "\"./archive-download.py 'url' 'directory'\" with :\n" +
          "\t- url : the url of the files you want to download\n" +
          "\t- directory : a path where you want to folder to be created" +
          " e.g '~/Downloads'"
          )
    # Bug fix: bare exit() returned status 0 on a usage error; exit non-zero
    # so shells and callers can detect the failure.
    sys.exit(1)
|
|
21
|
+
|
|
22
|
def getHTML(URL):
    """
    Fetch URL over HTTPS (certificate-verified via certifi) and return the
    raw response body as bytes.

    Exits the program with an error message if the connection cannot be
    established (SSL or other transport failure).
    """
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    # Bug fix: the original compared the response object to the SSLError
    # *class* (`r == urllib3.exceptions.SSLError`), which is always False —
    # urllib3 raises transport errors, it never returns them. Catch the
    # exception instead.
    try:
        r = http.request('GET', URL)
    except urllib3.exceptions.HTTPError:  # SSLError and friends derive from HTTPError
        print("Unable to establish connection.")
        sys.exit(1)
    return r.data
|
|
32
|
+
|
|
33
|
+
|
|
34
|
def getTracks(URL):
    """
    Scrape an archive.org details page and return a dict mapping numbered
    track file names ("NN - Title.mp3") to their download links.
    """
    soup = BeautifulSoup(getHTML(URL), features="lxml")
    bloc = soup.find("div", {"class": "container container-ia width-max"})
    raw_names = bloc.find_all("meta", {"itemprop": "name"})
    raw_links = bloc.find_all("link", {"itemprop": "associatedMedia"})
    # Two-digit zero-padded prefix keeps the files sorted in play order.
    track_names = [
        "{:02d} - {}.mp3".format(i, raw_name.get('content'))
        for i, raw_name in enumerate(raw_names, start=1)
    ]
    # The page lists each track twice (two media entries); keep every
    # other link — presumably name/link pairs alternate formats. TODO
    # confirm against a live page.
    track_links = [link.get('href') for link in raw_links[::2]]
    return dict(zip(track_names, track_links))
|
|
58
|
+
|
|
59
|
def getFolderName(URL):
    """
    Derive the name of the folder that will hold the downloaded tracks
    from the page's screen-reader-only <h1> heading.
    """
    soup = BeautifulSoup(getHTML(URL), features="lxml")
    heading = soup.find("h1", {"class": "sr-only"})
    name = heading.get_text()
    # Strip newlines, collapse runs of spaces into single underscores,
    # then trim any underscores left at either end of the name.
    name = re.sub(r'\n', '', name)
    name = re.sub(r' +', '_', name)
    return name.strip('_')
|
|
73
|
+
|
|
74
|
+
|
|
75
|
def writeTrack(track_name, track_link):
    """
    Download one track and write it into the current working directory.
    """
    pool = urllib3.PoolManager()
    # '/' is illegal in file names, so substitute '-' before opening.
    safe_name = track_name.replace("/", "-")
    # preload_content=False streams the body so large files are copied
    # chunk by chunk instead of being held entirely in memory.
    with pool.request('GET', track_link, preload_content=False) as response, \
            open(safe_name, 'wb') as destination:
        shutil.copyfileobj(response, destination)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
def writeAll(URL):
    """
    Download every track found at URL into a freshly created sub-directory
    of the current working directory, printing progress as it goes.
    """
    tracks = getTracks(URL)
    print("Successfully located track links and names.")
    folder = getFolderName(URL)
    os.mkdir(folder)   # sub-directory that will hold the whole set
    os.chdir(folder)   # download relative to the new folder
    print("Successfully created a sub-directory to download tracks in.")
    total = len(tracks)
    print("Downloading tracks, this might take a while depending on your connection and the size of your request.")
    for count, (track_name, track_link) in enumerate(tracks.items(), start=1):
        writeTrack(track_name, track_link)
        print("Wrote {}/{} tracks".format(count, total))  # progress report
    print("All tracks successfully written. Bye")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
|
|
104
|
# Entry point: read the two CLI arguments, move into the requested
# directory (expanding '~'), then download everything from the URL —
# writeAll creates its own sub-folder inside the target directory.
url, path = sys.argv[1], sys.argv[2]

os.chdir(os.path.expanduser(path))

writeAll(url)
|
|
110
|
+
|
|
111
|
+#url = "https://archive.org/details/gd72-08-27.sbd.orf.3328.sbeok.shnf/gd1972-08-27d1t01-orf.shn"
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
|