Leech/update scripts for zenius-i-vanisher.com
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

213 lines
7.0 KiB

import argparse
import functools
import os
import os.path
import pickle
import queue
import urllib
import urllib.parse
import zipfile
import requests
import requests.exceptions
from bs4 import BeautifulSoup
from frozendict import frozendict
def parse_args(*args):
    """Parse command-line arguments for the mirror script.

    Accepts an optional argv list (as argparse does) so tests can drive it;
    with no arguments it reads sys.argv.
    """
    parser = argparse.ArgumentParser(description="Mirror simfiles from ZIV")

    parser.add_argument('categories', type=str, nargs='*',
                        help='ZIV category pages to mirror')
    parser.add_argument('--songdir', type=str, help="Directory to keep songs in", default="songs")
    parser.add_argument('--zipdir', type=str, help="Directory to keep downloaded zip files in", default="zips")
    parser.add_argument('--recurse', '-r',
                        help='Recursively fetch the main categories for each song',
                        action='store_true')

    # --dry-run / --no-dry-run are mutually exclusive toggles on one dest;
    # the first-added action's default (False) wins when neither is given.
    feature = parser.add_mutually_exclusive_group(required=False)
    feature.add_argument('--dry-run', '-n',
                        help="Only perform a dry run; don't download any files",
                        dest='dry_run', action='store_true')
    feature.add_argument('--no-dry-run',
                        help="Download all files",
                        dest='dry_run', action='store_false')

    return parser.parse_args(*args)
10 months ago
# Accumulated (url, filename, message) tuples for every failed download,
# reported at the end of the run.
errors = []


def retrieve(url, filename, save_headers=None, extract=None, **kwargs):
    """Download *url* to *filename*, optionally extracting and caching headers.

    The file is streamed to ``<filename>.part`` and atomically renamed on
    success so an interrupted download never leaves a truncated target.
    If *extract* is a directory path, the downloaded zip is unpacked there.
    If *save_headers* is a path, the response headers are pickled to it
    (used later for If-None-Match / If-Modified-Since revalidation).
    Extra keyword arguments are passed through to ``requests.get``.

    Returns the response headers, or None when the request itself failed
    before a response was obtained. Errors are recorded in the module-level
    ``errors`` list rather than raised (best-effort mirroring), except
    KeyboardInterrupt which is re-raised after cleanup.
    """
    print(f'Downloading {url} -> {filename}')
    remove = False
    req = None  # stays None if requests.get itself raises

    def record_error(message):
        errors.append((url, filename, message))

    try:
        req = requests.get(url, **kwargs, stream=True)
        if req.status_code == 200:
            with open(f'{filename}.part', 'wb') as output:
                for chunk in req.iter_content(1024):
                    output.write(chunk)
            # Atomic rename: only a complete download replaces the target.
            os.replace(f'{filename}.part', filename)
            if extract:
                with zipfile.ZipFile(filename, 'r') as archive:
                    print(f'Extracting into {extract}')
                    os.makedirs(extract, exist_ok=True)
                    archive.extractall(extract)
            if save_headers:
                with open(save_headers, 'wb') as data:
                    pickle.dump(req.headers, data)
        elif req.status_code == 304:
            # Conditional request said our cached copy is still current.
            print("Not modified")
        else:
            record_error(f"Error: {req.status_code} {req.text}")
    except requests.exceptions.RequestException as e:
        # Base class of all requests errors (connection, timeout, ...).
        record_error(f'Error downloading: {e}')
    except zipfile.BadZipFile:
        record_error(f'Not a zip file: {filename}')
        remove = True
    except KeyboardInterrupt as e:
        record_error('Download aborting...')
        remove = True
        raise e
    except Exception as e:
        record_error(f'Unhandled error: {e}')
        remove = True

    if remove:
        # Drop the corrupt download and its cached headers so the next run
        # retries from scratch instead of revalidating bad data.
        if os.path.isfile(filename):
            print(f'Removing {filename}')
            os.unlink(filename)
        if save_headers and os.path.isfile(save_headers):
            print(f'Removing {save_headers}')
            os.unlink(save_headers)

    return req.headers if req is not None else None
def get_page(cat_url):
    """Fetch *cat_url* over HTTP and return the parsed BeautifulSoup tree."""
    response = requests.get(cat_url)
    soup = BeautifulSoup(response.text, features="html.parser")
    return soup
def load_prev_headers(filename, header_file):
    """Build conditional-request headers from a previously saved response.

    Returns a dict for requests' ``headers=`` argument carrying
    If-None-Match / If-Modified-Since when the pickled headers from the
    last successful download include an ETag / Last-Modified. Yields an
    empty dict unless both the downloaded file and the header cache exist
    (no file means there is nothing to revalidate against).
    """
    conditional = {}
    if not (os.path.isfile(header_file) and os.path.isfile(filename)):
        return conditional
    with open(header_file, 'rb') as data:
        saved = pickle.load(data)
    for response_key, request_key in (('etag', 'If-None-Match'),
                                      ('last-modified', 'If-Modified-Since')):
        if response_key in saved:
            conditional[request_key] = saved[response_key]
    return conditional
def mirror(cat_url, args):
    """Mirror every simfile listed on *cat_url*, then the group banners.

    Walks the table rows of a ZIV category page, downloading each simfile
    zip into ``args.zipdir`` and extracting it under
    ``args.songdir/<groupname>``. Afterwards fetches each discovered
    group page and saves its banner image. Honors ``args.dry_run``.

    Returns the set of group-page URLs discovered (for --recurse).
    """
    page = get_page(cat_url)
    group_urls = {}

    if 'viewsimfilecategory.php' in cat_url:
        # A category page names its own group in the header; rows may not
        # repeat the group link.
        simgroup = page.find('div', {'class': 'headertop'}).h1
        group_url = cat_url
    else:
        simgroup = None

    for row in page.find_all('tr'):
        simfile = row.find(
            "a", href=lambda href: href and "viewsimfile.php" in href)
        group_link = row.find(
            "a", href=lambda href: href and "viewsimfilecategory.php" in href)
        if group_link:
            # Rows carrying a group link change the current group context.
            simgroup = group_link
            group_url = group_link['href']
        if not (simfile and simgroup):
            continue

        # Collapse whitespace and strip '/' so names are safe path parts.
        songname = ' '.join(simfile.get_text().replace('/', '-').split())
        groupname = ' '.join(simgroup.get_text().replace('/', '-').split())
        print(f"collection: '{groupname}' simfile: '{songname}'")

        simlink = simfile['href']
        try:
            sim_id = urllib.parse.parse_qs(
                urllib.parse.urlparse(simlink).query)['simfileid'][0]
        except KeyError:
            print(f"WARNING: no simfileid found on URL {simlink}")
            continue

        group_urls[groupname] = urllib.parse.urljoin(cat_url, group_url)

        url = f'https://zenius-i-vanisher.com/v5.2/download.php?type=ddrsimfile&simfileid={sim_id}'
        if args.dry_run:
            print(f"Dry run requested, not downloading {url}")
            continue

        filename = os.path.join(args.zipdir, f'{sim_id}.zip')
        headers = os.path.join(args.zipdir, f'{sim_id}.headers')
        req_headers = load_prev_headers(filename, headers)
        retrieve(url, filename, extract=os.path.join(args.songdir, groupname),
                 headers=frozendict(req_headers), save_headers=headers)

    # Second pass: grab each group's banner image (skipping the site's
    # placeholder "no banner" graphic).
    for groupname, group_url in group_urls.items():
        page = get_page(group_url)
        banner_urls = {urllib.parse.urljoin(group_url, banner['src'])
                       for banner in page.select('p.centre img')
                       if 'simfileNoBanner.png' not in banner['src']}
        for url in banner_urls:
            filename = os.path.join(args.songdir, groupname, 'banner.png')
            headers = os.path.join(args.zipdir, f'{groupname}-banner.headers')
            req_headers = load_prev_headers(filename, headers)
            retrieve(url, filename, extract=False,
                     headers=frozendict(req_headers), save_headers=headers)

    return group_urls.values()
10 months ago
def main():
    """Entry point: mirror every requested category, breadth-first.

    Seeds a work queue with the CLI categories; when --recurse is set,
    group pages discovered by mirror() are enqueued too. ``seen_cats``
    prevents re-fetching a category reachable from multiple pages.
    """
    args = parse_args()

    os.makedirs(args.songdir, exist_ok=True)
    os.makedirs(args.zipdir, exist_ok=True)

    seen_cats = set()
    pending = queue.Queue()
    for url in args.categories:
        pending.put(url)

    while not pending.empty():
        url = pending.get()
        found = None
        if url not in seen_cats:
            found = mirror(url, args)
            seen_cats.add(url)
        if found and args.recurse:
            for url in found:
                print(f'Scheduling discovered category {url}')
                pending.put(url)
if __name__ == "__main__":
    main()

    # Summarize every download failure recorded during the run.
    if errors:
        print('Downloading got errors:')
        for url, filename, message in errors:
            print(f'{url} ({filename}): {message}')