Browse Source

first commit

gordon 5 years ago
commit
d65f0e4fbf
3 changed files with 132 additions and 0 deletions
  1. 14 0
      README.md
  2. 116 0
      archive-download.py
  3. 2 0
      install_dependencies.sh

+ 14 - 0
README.md View File

1
+This program runs with python3.
2
+Required packages are :
3
+    - BeautifulSoup4
4
+    - lxml
5
+    - urllib3
+    - certifi
6
+
7
+
8
+This program lets you download audio files, such as concert recordings, from the website archive.org.
9
+Usage in cli is :
10
+        "./archive-download.py 'url' 'directory'" with :
11
+            - url : the url of the files you want to download
12
+            - directory : the path where you want the folder to be created, e.g. '~/Downloads'
13
+
14
+

+ 116 - 0
archive-download.py View File

1
+#! /usr/bin/env python3
2
+import urllib3
3
+import sys, os, re
4
+import shutil
5
+import certifi
6
+
7
+from bs4 import BeautifulSoup
8
+#import lxml
9
+
10
# Silence urllib3's InsecureRequestWarning for the whole program.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Bail out early with a usage message unless exactly two command-line
# arguments (url and directory) were supplied.
if len(sys.argv) != 3:
    print("This program allows you to download audio files likes concerts etc " +
        "from the website archive.org.\nUsage in cli is :\n" +
        "\"./archive-download.py 'url' 'directory'\" with :\n" +
        "\t- url : the url of the files you want to download\n" +
        "\t- directory : a path where you want to folder to be created" +
        " e.g '~/Downloads'"
        )
    # sys.exit() instead of the site-provided exit() helper (which is not
    # guaranteed to exist), with a non-zero status so that calling shells
    # can tell the invocation was wrong.
    sys.exit(1)
21
+
22
def getHTML(URL):
    """
    Return the body of the document found at URL.

    The request is made with certificate verification enabled, using the
    CA bundle shipped with certifi.

    If urllib3 cannot complete the request (SSL failure, unreachable host,
    retries exhausted, ...) the message "Unable to establish connection."
    is printed and the program exits with a non-zero status.
    """
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    try:
        r = http.request('GET', URL)
    except urllib3.exceptions.HTTPError:
        # urllib3 signals failures by raising, never by returning an exception
        # class, so comparing the response to SSLError could not detect
        # anything. HTTPError is the base class of urllib3's own exceptions,
        # which covers SSLError as well as the MaxRetryError that wraps it
        # once retries are exhausted.
        print("Unable to establish connection.")
        sys.exit(1)
    return r.data
32
+    
33
+
34
def getTracks(URL):
    """
    Return a dictionary with key = track file name and value = track link.

    Names are read from the "meta" tags carrying itemprop="name" and links
    from the "link" tags carrying itemprop="associatedMedia", both searched
    inside the page's "container container-ia width-max" div. Each name is
    prefixed with its 1-based position (zero-padded to two digits) and
    suffixed with ".mp3".

    Raises ValueError when the container div cannot be found or when the
    page yields fewer usable links than track names.
    """
    soup = BeautifulSoup(getHTML(URL), features="lxml")
    bloc = soup.find("div", {"class":"container container-ia width-max"})
    if bloc is None:
        # Without this guard the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'find_all'".
        raise ValueError("Unable to locate the track list in the page at {}".format(URL))

    raw_names = bloc.find_all("meta", {"itemprop":"name"})
    raw_links = bloc.find_all("link", {"itemprop":"associatedMedia"})

    track_names = [
        "{:02d} - ".format(position) + raw_name.get('content') + ".mp3"
        for position, raw_name in enumerate(raw_names, start=1)
    ]

    # Keep only the links at even positions; an unpaired trailing link is
    # ignored.
    # NOTE(review): presumably the page lists every track twice (one link per
    # format) -- confirm against the archive.org markup.
    paired_count = len(raw_links) // 2
    track_links = [raw_link.get('href') for raw_link in raw_links[0:2 * paired_count:2]]

    if len(track_links) < len(track_names):
        raise ValueError(
            "Found {} track names but only {} track links at {}".format(
                len(track_names), len(track_links), URL))

    # zip stops at the shorter sequence, so surplus links are dropped.
    return dict(zip(track_names, track_links))
58
+
59
def getFolderName(URL):
    """
    Return a string representing the name of the folder which will be
    created to store all tracks.

    The name is the text of the page's h1 element with class "sr-only",
    with newlines removed, every run of spaces collapsed into one
    underscore, and leading/trailing underscores stripped. Any "/" is
    replaced by "-" so that the result is always a single path component
    (the same substitution writeTrack applies to track names).

    Raises ValueError when the page has no such heading.
    """
    soup = BeautifulSoup(getHTML(URL), features="lxml")
    heading = soup.find("h1", {"class":"sr-only"})
    if heading is None:
        raise ValueError("Unable to locate the title of the page at {}".format(URL))

    folderName = heading.get_text().replace('\n', '')
    folderName = re.sub(r' +', '_', folderName).strip('_')
    # The text comes from a web page: never let it introduce extra path levels.
    return folderName.replace("/", "-")
73
+    
74
+    
75
def writeTrack(track_name, track_link):
    """
    Download track_link and write it out as a file in the current directory.

    The file is named track_name, with any "/" replaced by "-" so that the
    name cannot point into another directory. The response is streamed to
    disk (preload_content=False) rather than loaded into memory first.
    """
    # Same certificate-verification settings as getHTML, so that downloads
    # are checked against the certifi CA bundle as well.
    c = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    file_name = track_name.replace("/", "-")

    with c.request('GET', track_link, preload_content=False) as resp, open(file_name, 'wb') as out_file:
        shutil.copyfileobj(resp, out_file)
84
+
85
+
86
def writeAll(URL):
    """
    Download every track found at URL into a sub-directory of the current
    directory.

    The sub-directory is named by getFolderName(URL). As a side effect the
    process's current working directory is changed to that sub-directory.
    Progress is reported on standard output after each track.
    """
    tracks = getTracks(URL)
    print("Successfully located track links and names.")
    folder = getFolderName(URL)
    try:
        os.mkdir(folder) # creates a sub-directory in current directory to store the live
    except FileExistsError:
        # Left over from an earlier (possibly interrupted) run: reuse it
        # instead of crashing.
        pass
    os.chdir(folder) # writeTrack writes into the current working directory
    print("Successfully created a sub-directory to download tracks in.")
    n = len(tracks)
    print("Downloading tracks, this might take a while depending on your connection and the size of your request.")
    for i, (track_name, track_link) in enumerate(tracks.items(), start=1):
        writeTrack(track_name, track_link)
        print("Wrote {}/{} tracks".format(i,n)) # progress indicator
    print("All tracks successfully written. Bye")
101
+
102
+
103
+    
104
# Script entry: the two command-line arguments are the page to download from
# and the directory in which the tracks folder is created.
url = sys.argv[1]
path = sys.argv[2]

try:
    # expanduser so that a quoted '~/Downloads' argument works too.
    os.chdir(os.path.expanduser(path))
except OSError as err:
    # Missing directory, not a directory, no permission, ...: report it
    # plainly instead of showing a traceback.
    print("Unable to use directory '{}': {}".format(path, err))
    sys.exit(1)

writeAll(url)
110
+           
111
+#url = "https://archive.org/details/gd72-08-27.sbd.orf.3328.sbeok.shnf/gd1972-08-27d1t01-orf.shn"
112
+    
113
+
114
+
115
+
116
+

+ 2 - 0
install_dependencies.sh View File

1
#!/bin/bash
# Install the third-party packages imported by archive-download.py
# (certifi included: the script imports it for its CA bundle).
pip3 install BeautifulSoup4 lxml urllib3 certifi