Leech/update scripts for zenius-i-vanisher.com
import argparse
import functools
import os
import os.path
import pickle
import queue
import urllib
import urllib.parse
import zipfile

import requests
import requests.exceptions
from bs4 import BeautifulSoup
from frozendict import frozendict
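
# Command-line handling.  By default only the 'latest20official' category
# page is mirrored, with zips cached in ./zips and songs extracted to ./songs.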
def parse_args(*args):
    parser = argparse.ArgumentParser(description="Mirror simfiles from ZIV")
    parser.add_argument('categories', type=str, nargs='*',
                        help='ZIV category pages to mirror',
                        default=['https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'])
    parser.add_argument('--songdir', type=str, help="Directory to keep songs in", default="songs")
    parser.add_argument('--zipdir', type=str, help="Directory to keep downloaded zip files in", default="zips")
    parser.add_argument('--recurse', '-r',
                        help='Recursively fetch the main categories for each song',
                        action='store_true')

    feature = parser.add_mutually_exclusive_group(required=False)
    feature.add_argument('--dry-run', '-n',
                         help="Only perform a dry run; don't download any files",
                         dest='dry_run', action='store_true')
    feature.add_argument('--no-dry-run',
                         help="Download all files",
                         dest='dry_run', action='store_false')
    feature.set_defaults(dry_run=False)

    return parser.parse_args(*args)

# (url, filename, message) tuples recorded by retrieve() for the summary
# printed at the end of the run.
errors = []
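
# Download `url` to `filename` (written via a .part temp file), optionally
# extract the resulting zip into `extract`, and pickle the response headers
# to `save_headers` so a later run can make a conditional request.  Download
# failures are recorded in `errors` instead of aborting the whole mirror
# (except for KeyboardInterrupt).  Because of lru_cache, all arguments must
# be hashable, which is why callers pass headers as a frozendict.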
@functools.lru_cache()
def retrieve(url, filename, save_headers=None, extract=None, **kwargs):
    print(f'Downloading {url} -> {filename}')

    req = None
    remove = False

    def record_error(message):
        errors.append((url, filename, message))
        print(message)

    try:
        req = requests.get(url, **kwargs, stream=True)
        if req.status_code == 200:
            with open(f'{filename}.part', 'wb') as output:
                for chunk in req.iter_content(1024):
                    output.write(chunk)
            os.replace(f'{filename}.part', filename)

            if extract:
                with zipfile.ZipFile(filename, 'r') as zip:
                    print(f'Extracting into {extract}')
                    os.makedirs(extract, exist_ok=True)
                    zip.extractall(extract)

            if save_headers:
                with open(save_headers, 'wb') as data:
                    pickle.dump(req.headers, data)
        elif req.status_code == 304:
            print("Not modified")
        else:
            record_error(f"Error: {req.status_code} {req.text}")
    except requests.exceptions.BaseHTTPError as e:
        record_error(f'Error downloading: {e}')
    except zipfile.BadZipFile:
        record_error(f'Not a zip file: {filename}')
        remove = True
    except KeyboardInterrupt as e:
        record_error('Download aborting...')
        remove = True
        raise e
    except Exception as e:
        record_error(f'Unhandled error: {e}')
        remove = True
    finally:
        if remove:
            if os.path.isfile(filename):
                print(f'Removing {filename}')
                os.remove(filename)
            if save_headers and os.path.isfile(save_headers):
                print(f'Removing {save_headers}')
                os.remove(save_headers)

    # req stays None if the request itself failed before completing
    return req.headers if req is not None else None
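
# Fetch and parse a page; memoized so each category/group page is only
# requested once per run.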
@functools.lru_cache()
def get_page(cat_url):
    request = requests.get(cat_url)
    return BeautifulSoup(request.text, features="html.parser")
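
# Build If-None-Match / If-Modified-Since request headers from a previously
# pickled response, so files that have not changed come back as 304s.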
def load_prev_headers(filename, header_file):
    req_headers = {}
    if os.path.isfile(header_file) and os.path.isfile(filename):
        with open(header_file, 'rb') as data:
            prev_headers = pickle.load(data)
        if 'etag' in prev_headers:
            req_headers['If-None-Match'] = prev_headers['etag']
        if 'last-modified' in prev_headers:
            req_headers['If-Modified-Since'] = prev_headers['last-modified']
    return req_headers
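
# Mirror a single category or group page: download every simfile zip it
# links to, fetch each group's banner image, and return the group page URLs
# that were discovered (used for --recurse).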
def mirror(cat_url, args):
    page = get_page(cat_url)

    group_urls = {}

    if 'viewsimfilecategory.php' in cat_url:
        simgroup = page.find('div', {'class': 'headertop'}).h1
        group_url = cat_url
    else:
        simgroup = None

    for row in page.find_all('tr'):
        simfile = row.find(
            "a", href=lambda href: href and "viewsimfile.php" in href)
        group_link = row.find(
            "a", href=lambda href: href and "viewsimfilecategory.php" in href)

        if group_link:
            simgroup = group_link
            group_url = group_link['href']

        if not (simfile and simgroup):
            continue

        songname = ' '.join(simfile.get_text().replace('/', '-').split())
        groupname = ' '.join(simgroup.get_text().replace('/', '-').split())

        print(f"collection: '{groupname}' simfile: '{songname}'")

        simlink = simfile['href']
        try:
            sim_id = urllib.parse.parse_qs(urllib.parse.urlparse(
                simfile['href']).query)['simfileid'][0]
        except KeyError:
            print(f"WARNING: no simfileid found on URL {simlink}")
            continue

        group_urls[groupname] = urllib.parse.urljoin(cat_url, group_url)

        url = f'https://zenius-i-vanisher.com/v5.2/download.php?type=ddrsimfile&simfileid={sim_id}'

        if args.dry_run:
            print(f"Dry run requested, not downloading {url}")
            continue

        filename = os.path.join(args.zipdir, f'{sim_id}.zip')
        headers = os.path.join(args.zipdir, f'{sim_id}.headers')

        req_headers = load_prev_headers(filename, headers)

        retrieve(url, filename, extract=os.path.join(args.songdir, groupname),
                 headers=frozendict(req_headers), save_headers=headers)

    for groupname, group_url in group_urls.items():
        page = get_page(group_url)
        banner_urls = {urllib.parse.urljoin(group_url, banner['src'])
                       for banner in page.select('p.centre img')
                       if 'simfileNoBanner.png' not in banner['src']}
        for url in banner_urls:
            filename = os.path.join(args.songdir, groupname, 'banner.png')
            headers = os.path.join(args.zipdir, f'{groupname}-banner.headers')
            req_headers = load_prev_headers(filename, headers)
            retrieve(url, filename, extract=False,
                     headers=frozendict(req_headers), save_headers=headers)

    return group_urls.values()
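
# Walk the requested category pages with a work queue, optionally feeding
# newly discovered group pages back into the queue when --recurse is given.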
def main():
    args = parse_args()

    os.makedirs(args.songdir, exist_ok=True)
    os.makedirs(args.zipdir, exist_ok=True)

    seen_cats = set()

    pending = queue.Queue()
    for url in args.categories:
        pending.put(url)

    while not pending.empty():
        url = pending.get()
        found = None
        if url not in seen_cats:
            seen_cats.add(url)
            found = mirror(url, args)
        # found stays None for categories we have already visited
        if args.recurse and found:
            for url in found:
                print(f'Scheduling discovered category {url}')
                pending.put(url)

if __name__ == "__main__":
    main()

    if errors:
        print('Some downloads had errors:')
        for url, filename, message in errors:
            print(f'{url} ({filename}): {message}')
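
Typical usage, assuming the script above is saved as ziv.py (the page does not show the actual file name): running

    python3 ziv.py --recurse 'https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'

mirrors the default "latest20official" category plus any group pages it links to, caching zips in ./zips and extracting songs into ./songs; add -n for a dry run that only lists what would be downloaded.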