Python 3 program for downloading live recordings (e.g. concert audio) from archive.org

archive-download.py 3.5KB

  1. #! /usr/bin/env python3
  2. import urllib3
  3. import sys, os, re
  4. import shutil
  5. import certifi
  6. from bs4 import BeautifulSoup
  7. #import lxml
  8. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  9. if len(sys.argv) != 3:
  10. print("This program allows you to download audio files likes concerts etc " +
  11. "from the website archive.org.\nUsage in cli is :\n" +
  12. "\"./archive-download.py 'url' 'directory'\" with :\n" +
  13. "\t- url : the url of the files you want to download\n" +
  14. "\t- directory : a path where you want to folder to be created" +
  15. " e.g '~/Downloads'"
  16. )
  17. exit()
  18. def getHTML(URL):
  19. """
  20. Return a html document from a URL link.
  21. """
  22. http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',ca_certs=certifi.where())
  23. r = http.request('GET',URL)
  24. if r == urllib3.exceptions.SSLError:
  25. print("Unable to establish connection.")
  26. exit()
  27. return r.data
  28. def getTracks(URL):
  29. """
  30. Return a dictionnary with key = track_names and values = track_links.
  31. """
  32. soup = BeautifulSoup(getHTML(URL), features="lxml")
  33. bloc = soup.find("div", {"class":"container container-ia width-max"})
  34. raw_names = bloc.find_all("meta", {"itemprop":"name"})
  35. raw_links = bloc.find_all("link", {"itemprop":"associatedMedia"})
  36. track_names = []
  37. track_links = []
  38. i = 0
  39. for raw_name in raw_names:
  40. i += 1
  41. if i < 10:
  42. nb = "0" + str(i)
  43. else:
  44. nb = str(i)
  45. track_names.append(nb + " - " + raw_name.get('content') + ".mp3")
  46. for i in range(len(raw_links)//2):
  47. track_links.append(raw_links[i*2].get('href'))
  48. tracks = {}
  49. for track in range(len(track_names)):
  50. tracks[track_names[track]] = track_links[track]
  51. return tracks
  52. def getFolderName(URL):
  53. """
  54. Return a string representing the name of the folder wich will be
  55. created to store all tracks.
  56. """
  57. soup = BeautifulSoup(getHTML(URL), features="lxml")
  58. folderName = soup.find("h1", {"class":"sr-only"})
  59. folderName = folderName.get_text()
  60. folderName = re.sub('\n','',folderName)
  61. folderName = re.sub(' +', '_', folderName)
  62. folderName = re.sub('^_+','',folderName)
  63. folderName = re.sub('_+$','',folderName)
  64. return folderName
  65. def writeTrack(track_name, track_link):
  66. """
  67. Writes out a track in the current directory.
  68. """
  69. url = track_link
  70. c = urllib3.PoolManager()
  71. with c.request('GET',url, preload_content=False) as resp, open(track_name.replace("/", "-"), 'wb') as out_file:
  72. shutil.copyfileobj(resp, out_file)
  73. def writeAll(URL):
  74. tracks = getTracks(URL)
  75. print("Successfully located track links and names.")
  76. folder = getFolderName(URL)
  77. os.mkdir(folder) # creates a sub-directory in current directory to store the live
  78. os.chdir(folder) # making the created folder the current working folder
  79. print("Successfully created a sub-directory to download tracks in.")
  80. n = len(tracks)
  81. i = 0
  82. print("Downloading tracks, this might take a while depending on your connection and the size of your request.")
  83. for track_name, track_link in tracks.items():
  84. i += 1
  85. writeTrack(track_name, track_link)
  86. print("Wrote {}/{} tracks".format(i,n)) # to know how the progress
  87. print("All tracks successfully written. Bye")
  88. url = sys.argv[1]
  89. path = sys.argv[2]
  90. os.chdir(os.path.expanduser(path))
  91. writeAll(url)
  92. #url = "https://archive.org/details/gd72-08-27.sbd.orf.3328.sbeok.shnf/gd1972-08-27d1t01-orf.shn"