Leech/update scripts for zenius-i-vanisher.com

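# Mirrors simfiles from zenius-i-vanisher.com (ZIV) category pages into a local
# songs directory, caching the downloaded zips (and their response headers, for
# conditional re-downloads) in a zips directory. Example invocation, assuming the
# script is saved as ziv.py (hypothetical filename):
#
#   python3 ziv.py --recurse 'https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'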
import argparse
import functools
import os
import os.path
import pickle
import urllib.parse
import zipfile

import requests
import requests.exceptions
from bs4 import BeautifulSoup
from frozendict import frozendict
def parse_args(*args):
    parser = argparse.ArgumentParser(description="Mirror simfiles from ZIV")

    parser.add_argument('categories', type=str, nargs='*', help='ZIV category pages to mirror',
                        default=['https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'])

    parser.add_argument('--songdir', type=str, help="Directory to keep songs in", default="songs")
    parser.add_argument('--zipdir', type=str, help="Directory to keep downloaded zip files in", default="zips")

    parser.add_argument('--recurse', '-r',
                        help='Recursively fetch the main categories for each song',
                        action='store_true')

    feature = parser.add_mutually_exclusive_group(required=False)
    feature.add_argument('--dry-run', '-n',
                         help="Only perform a dry run; don't download any files",
                         dest='dry_run', action='store_true')
    feature.add_argument('--no-dry-run',
                         help="Download files normally (negates --dry-run)",
                         dest='dry_run', action='store_false')
    feature.set_defaults(dry_run=False)

    return parser.parse_args(*args)
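# retrieve() is memoized with lru_cache so repeated calls for the same URL within a
# run are no-ops; its keyword arguments therefore have to be hashable (hence the
# frozendict wrapping at the call sites). It streams the response to a .part file,
# moves it into place, optionally extracts it as a zip into `extract`, and
# optionally pickles the response headers to `save_headers`.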
@functools.lru_cache()
def retrieve(url, filename, save_headers=None, extract=None, **kwargs):
    print(f'Downloading {url} -> {filename}')
    req = None
    try:
        req = requests.get(url, **kwargs, stream=True)
        if req.status_code == 200:
            # Download to a temporary .part file, then move it into place atomically
            with open(f'{filename}.part', 'wb') as output:
                for chunk in req.iter_content(1024):
                    output.write(chunk)
            os.replace(f'{filename}.part', filename)

            if extract:
                with zipfile.ZipFile(filename, 'r') as zip_file:
                    print(f'Extracting into {extract}')
                    os.makedirs(extract, exist_ok=True)
                    zip_file.extractall(extract)

            if save_headers:
                # Keep the response headers so later runs can make conditional requests
                with open(save_headers, 'wb') as header_output:
                    pickle.dump(req.headers, header_output)
        elif req.status_code == 304:
            print("Not modified")
        else:
            print(f"Error: {req.status_code} {req.text}")
    except requests.exceptions.RequestException as e:
        print(f'Error downloading: {e}')
    except zipfile.BadZipFile:
        print(f'Not a zip file: {filename}')
    except KeyboardInterrupt:
        print('Download aborting...')
        if os.path.isfile(filename):
            print(f'Removing partial file {filename}')
            os.unlink(filename)
        raise

    return req.headers if req is not None else None
@functools.lru_cache()
def get_page(cat_url):
    request = requests.get(cat_url)
    return BeautifulSoup(request.text, features="html.parser")
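# Build conditional-request headers (If-None-Match / If-Modified-Since) from the
# pickled headers of a previous download, so unchanged files come back as 304s.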
def load_prev_headers(filename, header_file):
    req_headers = {}
    if os.path.isfile(header_file) and os.path.isfile(filename):
        with open(header_file, 'rb') as header_input:
            prev_headers = pickle.load(header_input)
        if 'etag' in prev_headers:
            req_headers['If-None-Match'] = prev_headers['etag']
        if 'last-modified' in prev_headers:
            req_headers['If-Modified-Since'] = prev_headers['last-modified']
    return req_headers
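# Mirror a single ZIV category or simfile-group page: download every simfile zip
# linked from it, extract each into <songdir>/<group>/, and fetch group banners.
# Returns the group page URLs found, for optional recursion.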
def mirror(cat_url, args):
    page = get_page(cat_url)

    group_urls = {}

    if 'viewsimfilecategory.php' in cat_url:
        simgroup = page.find('div', {'class': 'headertop'}).h1
        group_url = cat_url
    else:
        simgroup = None

    for row in page.find_all('tr'):
        simfile = row.find(
            "a", href=lambda href: href and "viewsimfile.php" in href)
        group_link = row.find(
            "a", href=lambda href: href and "viewsimfilecategory.php" in href)

        if group_link:
            simgroup = group_link
            group_url = group_link['href']

        if not (simfile and simgroup):
            continue

        songname = ' '.join(simfile.get_text().replace('/', '-').split())
        groupname = ' '.join(simgroup.get_text().replace('/', '-').split())

        print(f"collection: '{groupname}' simfile: '{songname}'")

        simlink = simfile['href']
        try:
            sim_id = urllib.parse.parse_qs(urllib.parse.urlparse(
                simfile['href']).query)['simfileid'][0]
        except KeyError:
            print(f"WARNING: no simfileid found on URL {simlink}")
            continue

        group_urls[groupname] = urllib.parse.urljoin(cat_url, group_url)

        url = f'https://zenius-i-vanisher.com/v5.2/download.php?type=ddrsimfile&simfileid={sim_id}'

        if args.dry_run:
            print(f"Dry run requested, not downloading {url}")
            continue

        filename = os.path.join(args.zipdir, f'{sim_id}.zip')
        headers = os.path.join(args.zipdir, f'{sim_id}.headers')
        req_headers = load_prev_headers(filename, headers)

        retrieve(url, filename, extract=os.path.join(args.songdir, groupname),
                 headers=frozendict(req_headers), save_headers=headers)

    for groupname, group_url in group_urls.items():
        page = get_page(group_url)
        banner_urls = {urllib.parse.urljoin(group_url, banner['src'])
                       for banner in page.select('p.centre img')
                       if 'simfileNoBanner.png' not in banner['src']}

        for url in banner_urls:
            filename = os.path.join(args.songdir, groupname, 'banner.png')
            headers = os.path.join(args.zipdir, f'{groupname}-banner.headers')
            req_headers = load_prev_headers(filename, headers)
            # frozendict keeps the headers kwarg hashable for retrieve()'s lru_cache
            retrieve(url, filename, extract=False,
                     headers=frozendict(req_headers), save_headers=headers)

    return group_urls.values()
if __name__ == "__main__":
    args = parse_args()

    os.makedirs(args.songdir, exist_ok=True)
    os.makedirs(args.zipdir, exist_ok=True)

    seen_cats = set()

    categories = args.categories
    while categories:
        found = []
        for url in {cat for cat in categories if cat not in seen_cats}:
            found = mirror(url, args)
            seen_cats.add(url)

        if args.recurse:
            print(f'Recursively fetching categories: {list(found)}')
            categories = found
        else:
            # Without --recurse a single pass over the requested categories is enough
            break