Python 3 program for downloading live recordings (e.g. concert audio) from archive.org

archive-download.py 3.5KB

  1. #! /usr/bin/env python3
  2. import urllib3
  3. import sys, os, re
  4. import shutil
  5. import certifi
  6. from bs4 import BeautifulSoup
  7. #import lxml
  8. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  9. if len(sys.argv) != 3:
  10. print("This program allows you to download audio files likes concerts etc " +
  11. "from the website archive.org.\nUsage in cli is :\n" +
  12. "\"./archive-download.py 'url' 'directory'\" with :\n" +
  13. "\t- url : the url of the files you want to download\n" +
  14. "\t- directory : a path where you want to folder to be created" +
  15. " e.g '~/Downloads'"
  16. )
  17. exit()
  18. def getHTML(URL):
  19. """
  20. Return a html document from a URL link.
  21. """
  22. http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',ca_certs=certifi.where())
  23. r = http.request('GET',URL)
  24. if r == urllib3.exceptions.SSLError:
  25. print("Unable to establish connection.")
  26. exit()
  27. return r.data
  28. def getTracks(URL):
  29. """
  30. Return a dictionnary with key = track_names and values = track_links.
  31. """
  32. soup = BeautifulSoup(getHTML(URL), features="lxml")
  33. bloc = soup.find("div", {"class":"container container-ia width-max"})
  34. raw_names = bloc.find_all("meta", {"itemprop":"name"})
  35. raw_links = bloc.find_all("link", {"itemprop":"associatedMedia"})
  36. track_names = []
  37. track_links = []
  38. i = 0
  39. for raw_name in raw_names:
  40. i += 1
  41. if i < 10:
  42. nb = "0" + str(i)
  43. else:
  44. nb = str(i)
  45. track_names.append(nb + " - " + raw_name.get('content') + ".mp3")
  46. for i in range(len(raw_links)//2):
  47. track_links.append(raw_links[i*2].get('href'))
  48. tracks = {}
  49. for track in range(len(track_names)):
  50. tracks[track_names[track]] = track_links[track]
  51. return tracks
  52. def getFolderName(URL):
  53. """
  54. Return a string representing the name of the folder wich will be
  55. created to store all tracks.
  56. """
  57. soup = BeautifulSoup(getHTML(URL), features="lxml")
  58. folderName = soup.find("h1", {"class":"sr-only"})
  59. folderName = folderName.get_text()
  60. folderName = re.sub('\n','',folderName)
  61. folderName = re.sub(' +', '_', folderName)
  62. folderName = re.sub('^_+','',folderName)
  63. folderName = re.sub('_+$','',folderName)
  64. return folderName
  65. def writeTrack(track_name, track_link):
  66. """
  67. Writes out a track in the current directory.
  68. """
  69. url = track_link
  70. c = urllib3.PoolManager()
  71. with c.request('GET',url, preload_content=False) as resp, open(track_name.replace("/", "-"), 'wb') as out_file:
  72. shutil.copyfileobj(resp, out_file)
  73. def writeAll(URL):
  74. tracks = getTracks(URL)
  75. print("Successfully located track links and names.")
  76. folder = getFolderName(URL)
  77. os.mkdir(folder) # creates a sub-directory in current directory to store the live
  78. os.chdir(folder) # making the created folder the current working folder
  79. print("Successfully created a sub-directory to download tracks in.")
  80. n = len(tracks)
  81. i = 0
  82. print("Downloading tracks, this might take a while depending on your connection and the size of your request.")
  83. for track_name, track_link in tracks.items():
  84. i += 1
  85. writeTrack(track_name, track_link)
  86. print("Wrote {}/{} tracks".format(i,n)) # to know how the progress
  87. print("All tracks successfully written. Bye")
  88. url = sys.argv[1]
  89. path = sys.argv[2]
  90. os.chdir(os.path.expanduser(path))
  91. writeAll(url)
  92. #url = "https://archive.org/details/gd72-08-27.sbd.orf.3328.sbeok.shnf/gd1972-08-27d1t01-orf.shn"