Leech/update scripts for zenius-i-vanisher.com
import argparse
import functools
import os
import pickle
import urllib.parse
import zipfile

import requests
import requests.exceptions
from bs4 import BeautifulSoup
def parse_args(*args):
    parser = argparse.ArgumentParser(description="Mirror simfiles from ZIV")
    parser.add_argument('categories', type=str, nargs='*', help="ZIV category pages to mirror",
                        default=['https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'])
    parser.add_argument('--songdir', type=str, help="Directory to keep songs in", default="songs")
    parser.add_argument('--zipdir', type=str, help="Directory to keep downloaded zip files in", default="zips")

    feature = parser.add_mutually_exclusive_group(required=False)
    feature.add_argument('--dry-run', '-n',
                         help="Only perform a dry run; don't download anything",
                         dest='dry_run', action='store_true')
    feature.add_argument('--no-dry-run',
                         help="Download normally",
                         dest='dry_run', action='store_false')
    feature.set_defaults(dry_run=False)

    return parser.parse_args(*args)
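
# Example invocations (the script filename "ziv.py" is an assumption, not
# taken from the repository):
#
#   python ziv.py                     # mirror the default "latest 20 official" category
#   python ziv.py --dry-run           # list what would be downloaded, fetch nothing
#   python ziv.py --songdir s --zipdir z <category-url> [<category-url> ...]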
def retrieve(url, filename, save_headers=None, extract=None, **kwargs):
    """Download url to filename; optionally unzip it and pickle the response headers."""
    print(f'Downloading {url} -> {filename}')
    req = None
    try:
        req = requests.get(url, **kwargs, stream=True)
        if req.status_code == 200:
            with open(filename, 'wb') as output:
                for chunk in req.iter_content(1024):
                    output.write(chunk)
            if extract:
                with zipfile.ZipFile(filename, 'r') as archive:
                    print(f'Extracting into {extract}')
                    os.makedirs(extract, exist_ok=True)
                    archive.extractall(extract)
            if save_headers:
                # Save the ETag/Last-Modified validators for conditional requests later.
                with open(save_headers, 'wb') as header_file:
                    pickle.dump(req.headers, header_file)
        elif req.status_code == 304:
            print("Not modified")
        else:
            print(f"Error: {req.status_code} {req.text}")
    except requests.exceptions.RequestException as e:
        print(f'Error downloading: {e}')
    except zipfile.BadZipFile:
        print(f'Not a zip file: {filename}')
    except KeyboardInterrupt:
        print('Download aborting...')
        if os.path.isfile(filename):
            print(f'Removing partial file {filename}')
            os.unlink(filename)
        raise
    return req.headers if req is not None else None
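
# Example (hypothetical URL and paths): a one-off download-and-unpack, with no
# caching headers involved:
#   retrieve('https://example.com/pack.zip', 'zips/pack.zip', extract='songs/Pack')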
@functools.lru_cache()
def get_page(cat_url):
    # Cached so a page hit by both loops in mirror() is only fetched once.
    response = requests.get(cat_url)
    return BeautifulSoup(response.text, features="html.parser")
def load_prev_headers(filename, header_file):
    """Build conditional-request headers from a previous response's validators."""
    req_headers = {}
    # Only send validators if both the saved headers and the file itself exist.
    if os.path.isfile(header_file) and os.path.isfile(filename):
        with open(header_file, 'rb') as saved:
            prev_headers = pickle.load(saved)
        if 'etag' in prev_headers:
            req_headers['If-None-Match'] = prev_headers['etag']
        if 'last-modified' in prev_headers:
            req_headers['If-Modified-Since'] = prev_headers['last-modified']
    return req_headers
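
# The two functions above form a standard HTTP conditional-GET round trip.
# Sketch of a second run for an already-downloaded simfile (the id 12345 and
# header values are made up):
#   hdrs = load_prev_headers('zips/12345.zip', 'zips/12345.headers')
#   # hdrs is now e.g. {'If-None-Match': '"abc123"', 'If-Modified-Since': '...'}
#   retrieve('https://zenius-i-vanisher.com/v5.2/download.php?type=ddrsimfile&simfileid=12345',
#            'zips/12345.zip', extract='songs/Some Group',
#            headers=hdrs, save_headers='zips/12345.headers')
#   # -> the server answers 304 Not Modified and nothing is rewritten.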
def mirror(cat_url, args):
    page = get_page(cat_url)
    group_urls = {}
    if 'viewsimfilecategory.php' in cat_url:
        # The page is a single group; its heading names the group.
        simgroup = page.find('div', {'class': 'headertop'}).h1
        group_url = cat_url
    else:
        simgroup = None
    for row in page.find_all('tr'):
        simfile = row.find(
            "a", href=lambda href: href and "viewsimfile.php" in href)
        group_link = row.find(
            "a", href=lambda href: href and "viewsimfilecategory.php" in href)
        if group_link:
            simgroup = group_link
            group_url = group_link['href']
        if not (simfile and simgroup):
            continue
        # Normalize whitespace and strip '/' so the names are safe as directory names.
        songname = ' '.join(simfile.get_text().replace('/', '-').split())
        groupname = ' '.join(simgroup.get_text().replace('/', '-').split())
        print(f"collection: '{groupname}' simfile: '{songname}'")
        simlink = simfile['href']
        try:
            sim_id = urllib.parse.parse_qs(urllib.parse.urlparse(
                simfile['href']).query)['simfileid'][0]
        except KeyError:
            print(f"WARNING: no simfileid found on URL {simlink}")
            continue
        group_urls[groupname] = urllib.parse.urljoin(cat_url, group_url)
        url = f'https://zenius-i-vanisher.com/v5.2/download.php?type=ddrsimfile&simfileid={sim_id}'
        if args.dry_run:
            print(f"Dry run requested, not downloading {url}")
            continue
        filename = os.path.join(args.zipdir, f'{sim_id}.zip')
        headers = os.path.join(args.zipdir, f'{sim_id}.headers')
        req_headers = load_prev_headers(filename, headers)
        retrieve(url, filename, extract=os.path.join(args.songdir, groupname),
                 headers=req_headers, save_headers=headers)

    if args.dry_run:
        # Skip banner downloads on a dry run as well.
        return
    for groupname, group_url in group_urls.items():
        page = get_page(group_url)
        banner_urls = {urllib.parse.urljoin(group_url, banner['src'])
                       for banner in page.select('p.centre img')
                       if 'simfileNoBanner.png' not in banner['src']}
        for url in banner_urls:
            filename = os.path.join(args.songdir, groupname, 'banner.png')
            headers = os.path.join(args.zipdir, f'{groupname}-banner.headers')
            req_headers = load_prev_headers(filename, headers)
            retrieve(url, filename, extract=None,
                     headers=req_headers, save_headers=headers)
if __name__ == "__main__":
    args = parse_args()
    os.makedirs(args.songdir, exist_ok=True)
    os.makedirs(args.zipdir, exist_ok=True)
    for url in args.categories:
        mirror(url, args)
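
The same logic can also be driven from Python rather than the command line. A minimal sketch, assuming the file above is saved as ziv.py (the module name is an assumption); parse_args() forwards its arguments to argparse, so an explicit argv list works:

import os

from ziv import parse_args, mirror  # module name is an assumption

args = parse_args(['--dry-run',
                   'https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'])
os.makedirs(args.songdir, exist_ok=True)
os.makedirs(args.zipdir, exist_ok=True)
for url in args.categories:
    mirror(url, args)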