Leech/update scripts for zenius-i-vanisher.com
import argparse
import functools
import os
import os.path
import pickle
import queue
import urllib
import urllib.parse
import zipfile
from urllib.request import urlretrieve

import requests
import requests.exceptions
from bs4 import BeautifulSoup
from frozendict import frozendict
def parse_args(*args):
    parser = argparse.ArgumentParser(description="Mirror simfiles from ZIV")
    parser.add_argument('categories', type=str, nargs='*', help='ZIV category pages to mirror',
                        default=['https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'])
    parser.add_argument('--songdir', type=str, help="Directory to keep songs in", default="songs")
    parser.add_argument('--zipdir', type=str, help="Directory to keep downloaded zip files in", default="zips")
    parser.add_argument('--recurse', '-r',
                        help='Recursively fetch the main categories for each song',
                        action='store_true')

    feature = parser.add_mutually_exclusive_group(required=False)
    feature.add_argument('--dry-run', '-n',
                         help="Only perform a dry run; don't download any files",
                         dest='dry_run', action='store_true')
    feature.add_argument('--no-dry-run',
                         help="Download files normally (this is the default)",
                         dest='dry_run', action='store_false')
    feature.set_defaults(dry_run=False)

    return parser.parse_args(*args)
@functools.lru_cache()
def retrieve(url, filename, save_headers=None, extract=None, **kwargs):
    print(f'Downloading {url} -> {filename}')
    try:
        req = requests.get(url, **kwargs, stream=True)
        if req.status_code == 200:
            # Stream into a .part file, then atomically rename once complete
            with open(f'{filename}.part', 'wb') as output:
                for chunk in req.iter_content(1024):
                    output.write(chunk)
            os.replace(f'{filename}.part', filename)
            if extract:
                with zipfile.ZipFile(filename, 'r') as zip:
                    print(f'Extracting into {extract}')
                    os.makedirs(extract, exist_ok=True)
                    zip.extractall(extract)
            if save_headers:
                # Keep the response headers so the next run can make a conditional request
                pickle.dump(req.headers, open(save_headers, 'wb'))
        elif req.status_code == 304:
            print("Not modified")
        else:
            print(f"Error: {req.status_code} {req.text}")
    except requests.exceptions.BaseHTTPError as e:
        print(f'Error downloading: {e}')
    except zipfile.BadZipFile:
        print(f'Not a zip file: {filename}')
    except KeyboardInterrupt:
        print('Download aborting...')
        if os.path.isfile(filename):
            print(f'Removing partial file {filename}')
            os.unlink(filename)
        raise
    return req.headers
@functools.lru_cache()
def get_page(cat_url):
    request = requests.get(cat_url)
    return BeautifulSoup(request.text, features="html.parser")
def load_prev_headers(filename, header_file):
    # Build conditional-request headers (If-None-Match / If-Modified-Since)
    # from the headers saved alongside a previously downloaded file
    req_headers = {}
    if os.path.isfile(header_file) and os.path.isfile(filename):
        prev_headers = pickle.load(open(header_file, 'rb'))
        if 'etag' in prev_headers:
            req_headers['If-None-Match'] = prev_headers['etag']
        if 'last-modified' in prev_headers:
            req_headers['If-Modified-Since'] = prev_headers['last-modified']
    return req_headers
def mirror(cat_url, args):
    page = get_page(cat_url)
    group_urls = {}
    if 'viewsimfilecategory.php' in cat_url:
        # A category page already identifies its group in the page header
        simgroup = page.find('div', {'class': 'headertop'}).h1
        group_url = cat_url
    else:
        simgroup = None
    for row in page.find_all('tr'):
        simfile = row.find(
            "a", href=lambda href: href and "viewsimfile.php" in href)
        group_link = row.find(
            "a", href=lambda href: href and "viewsimfilecategory.php" in href)
        if group_link:
            simgroup = group_link
            group_url = group_link['href']
        if not (simfile and simgroup):
            continue
        songname = ' '.join(simfile.get_text().replace('/', '-').split())
        groupname = ' '.join(simgroup.get_text().replace('/', '-').split())
        print(f"collection: '{groupname}' simfile: '{songname}'")
        simlink = simfile['href']
        try:
            sim_id = urllib.parse.parse_qs(urllib.parse.urlparse(
                simfile['href']).query)['simfileid'][0]
        except KeyError:
            print(f"WARNING: no simfileid found on URL {simlink}")
            continue
        group_urls[groupname] = urllib.parse.urljoin(cat_url, group_url)
        url = f'https://zenius-i-vanisher.com/v5.2/download.php?type=ddrsimfile&simfileid={sim_id}'
        if args.dry_run:
            print(f"Dry run requested, not downloading {url}")
            continue
        filename = os.path.join(args.zipdir, f'{sim_id}.zip')
        headers = os.path.join(args.zipdir, f'{sim_id}.headers')
        req_headers = load_prev_headers(filename, headers)
        retrieve(url, filename, extract=os.path.join(args.songdir, groupname),
                 headers=frozendict(req_headers), save_headers=headers)
    # Fetch each group's banner image after mirroring its simfiles
    for groupname, group_url in group_urls.items():
        page = get_page(group_url)
        banner_urls = {urllib.parse.urljoin(group_url, banner['src'])
                       for banner in page.select('p.centre img')
                       if 'simfileNoBanner.png' not in banner['src']}
        for url in banner_urls:
            filename = os.path.join(args.songdir, groupname, 'banner.png')
            headers = os.path.join(args.zipdir, f'{groupname}-banner.headers')
            req_headers = load_prev_headers(filename, headers)
            # retrieve() is lru_cached, so the headers argument must be hashable
            retrieve(url, filename, extract=False,
                     headers=frozendict(req_headers), save_headers=headers)
    return group_urls.values()
if __name__ == "__main__":
    args = parse_args()
    os.makedirs(args.songdir, exist_ok=True)
    os.makedirs(args.zipdir, exist_ok=True)
    seen_cats = set()
    categories = args.categories
    while categories:
        found_cats = set()
        for url in {cat for cat in categories if cat not in seen_cats}:
            found = mirror(url, args)
            seen_cats.add(url)
            if args.recurse:
                print(f'Recursively fetching categories: {list(found)}')
                found_cats.update(found)
        # Continue only with newly discovered categories so the loop terminates
        categories = found_cats
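
The script is normally run directly, passing ZIV category URLs as positional arguments. The snippet below is a minimal sketch of driving the same code from another Python module; the module name ziv_mirror is an assumption (the file's actual name isn't shown here), while parse_args() and mirror() are the functions defined above.

    import os

    from ziv_mirror import mirror, parse_args

    # Same as: python ziv_mirror.py --songdir songs --zipdir zips <category URL>
    args = parse_args(['--songdir', 'songs', '--zipdir', 'zips',
                       'https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'])
    os.makedirs(args.songdir, exist_ok=True)
    os.makedirs(args.zipdir, exist_ok=True)

    for category in args.categories:
        # mirror() downloads the simfile zips and banners for one category page
        # and returns the group (category) URLs it discovered along the way.
        for group_url in mirror(category, args):
            print('found group page:', group_url)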