import argparse
import functools
import os
import os.path
import pickle
import queue
import urllib
import urllib.parse
import zipfile

import requests
import requests.exceptions
from bs4 import BeautifulSoup
from frozendict import frozendict

def parse_args(*args):
    """Parse the command-line arguments."""
    parser = argparse.ArgumentParser(description="Mirror simfiles from ZIV")

    parser.add_argument('categories', type=str, nargs='*',
                        help='ZIV category pages to mirror',
                        default=['https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'])

    parser.add_argument('--songdir', type=str, help="Directory to keep songs in", default="songs")
    parser.add_argument('--zipdir', type=str, help="Directory to keep downloaded zip files in", default="zips")

    parser.add_argument('--recurse', '-r',
                        help='Recursively fetch the main categories for each song',
                        action='store_true')

    feature = parser.add_mutually_exclusive_group(required=False)
    feature.add_argument('--dry-run', '-n',
                         help="Only perform a dry run; don't download any files",
                         dest='dry_run', action='store_true')
    feature.add_argument('--no-dry-run',
                         help="Download all files",
                         dest='dry_run', action='store_false')
    feature.set_defaults(dry_run=False)

    return parser.parse_args(*args)


# Errors collected during downloads, reported at the end of the run
errors = []

@functools.lru_cache()
def retrieve(url, filename, save_headers=None, extract=None, **kwargs):
    """Download url to filename; optionally extract it as a zip and pickle the response headers."""
    print(f'Downloading {url} -> {filename}')

    req = None
    remove = False

    def record_error(message):
        errors.append((url, filename, message))
        print(message)

    try:
        req = requests.get(url, **kwargs, stream=True)
        if req.status_code == 200:
            # Download to a temporary file, then move it into place atomically
            with open(f'{filename}.part', 'wb') as output:
                for chunk in req.iter_content(1024):
                    output.write(chunk)
            os.replace(f'{filename}.part', filename)

            if extract:
                with zipfile.ZipFile(filename, 'r') as zip_file:
                    print(f'Extracting into {extract}')
                    os.makedirs(extract, exist_ok=True)
                    zip_file.extractall(extract)

            if save_headers:
                with open(save_headers, 'wb') as data:
                    pickle.dump(req.headers, data)
        elif req.status_code == 304:
            print("Not modified")
        else:
            record_error(f"Error: {req.status_code} {req.text}")
    except requests.exceptions.RequestException as e:
        record_error(f'Error downloading: {e}')
    except zipfile.BadZipFile:
        record_error(f'Not a zip file: {filename}')
        remove = True
    except KeyboardInterrupt as e:
        record_error('Download aborting...')
        remove = True
        raise e
    except Exception as e:
        record_error(f'Unhandled error: {e}')
        remove = True
    finally:
        # Clean up partial or corrupt downloads so they get retried on the next run
        if remove:
            if os.path.isfile(filename):
                print(f'Removing {filename}')
                os.remove(filename)
            if save_headers and os.path.isfile(save_headers):
                print(f'Removing {save_headers}')
                os.remove(save_headers)

    return req.headers if req is not None else None

@functools.lru_cache()
def get_page(cat_url):
    """Fetch a page and parse it, caching the result."""
    request = requests.get(cat_url)
    return BeautifulSoup(request.text, features="html.parser")


def load_prev_headers(filename, header_file):
    """Build conditional-request headers from a previously saved response, if any."""
    req_headers = {}
    if os.path.isfile(header_file) and os.path.isfile(filename):
        with open(header_file, 'rb') as data:
            prev_headers = pickle.load(data)
        if 'etag' in prev_headers:
            req_headers['If-None-Match'] = prev_headers['etag']
        if 'last-modified' in prev_headers:
            req_headers['If-Modified-Since'] = prev_headers['last-modified']
    return req_headers

def mirror(cat_url, args):
    """Mirror all simfiles on a category page; return the group URLs it links to."""
    page = get_page(cat_url)

    group_urls = {}

    if 'viewsimfilecategory.php' in cat_url:
        simgroup = page.find('div', {'class': 'headertop'}).h1
        group_url = cat_url
    else:
        simgroup = None

    for row in page.find_all('tr'):
        simfile = row.find(
            "a", href=lambda href: href and "viewsimfile.php" in href)
        group_link = row.find(
            "a", href=lambda href: href and "viewsimfilecategory.php" in href)
        if group_link:
            simgroup = group_link
            group_url = group_link['href']
        if not (simfile and simgroup):
            continue

        songname = ' '.join(simfile.get_text().replace('/', '-').split())
        groupname = ' '.join(simgroup.get_text().replace('/', '-').split())
        print(f"collection: '{groupname}' simfile: '{songname}'")

        simlink = simfile['href']
        try:
            sim_id = urllib.parse.parse_qs(urllib.parse.urlparse(
                simfile['href']).query)['simfileid'][0]
        except KeyError:
            print(f"WARNING: no simfileid found on URL {simlink}")
            continue

        group_urls[groupname] = urllib.parse.urljoin(cat_url, group_url)

        url = f'https://zenius-i-vanisher.com/v5.2/download.php?type=ddrsimfile&simfileid={sim_id}'
        if args.dry_run:
            print(f"Dry run requested, not downloading {url}")
            continue

        filename = os.path.join(args.zipdir, f'{sim_id}.zip')
        headers = os.path.join(args.zipdir, f'{sim_id}.headers')

        req_headers = load_prev_headers(filename, headers)

        retrieve(url, filename, extract=os.path.join(args.songdir, groupname),
                 headers=frozendict(req_headers), save_headers=headers)

    # Fetch each group's banner image as well
    for groupname, group_url in group_urls.items():
        page = get_page(group_url)
        banner_urls = {urllib.parse.urljoin(group_url, banner['src'])
                       for banner in page.select('p.centre img')
                       if 'simfileNoBanner.png' not in banner['src']}

        for url in banner_urls:
            filename = os.path.join(args.songdir, groupname, 'banner.png')
            headers = os.path.join(args.zipdir, f'{groupname}-banner.headers')

            req_headers = load_prev_headers(filename, headers)

            retrieve(url, filename, extract=False,
                     headers=frozendict(req_headers), save_headers=headers)

    return group_urls.values()

def main():
    args = parse_args()

    os.makedirs(args.songdir, exist_ok=True)
    os.makedirs(args.zipdir, exist_ok=True)

    seen_cats = set()
    pending = queue.Queue()
    for url in args.categories:
        pending.put(url)

    while not pending.empty():
        url = pending.get()
        found = None
        if url not in seen_cats:
            seen_cats.add(url)
            found = mirror(url, args)

        if args.recurse and found:
            for url in found:
                print(f'Scheduling discovered category {url}')
                pending.put(url)

if __name__ == "__main__":
    main()

    if errors:
        print('Downloading finished with errors:')
        for url, filename, message in errors:
            print(f'{url} ({filename}): {message}')