
Add group banner retrieval
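For each simfile group seen while mirroring, fetch the group's page and mirror its banner image as well. Page fetches are memoized via functools.lru_cache, the ETag/Last-Modified caching logic moves into a shared load_prev_headers() helper, and retrieve() gains an extract flag so banner downloads skip the unzip step. Also adds autopep8 and isort as dev dependencies and normalizes import order.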

Branch: main
fluffy committed 7 months ago · commit e1ec40c7e0

3 changed files with 123 additions and 31 deletions:
  1. poetry.lock     +58 -1
  2. pyproject.toml  +2  -0
  3. zenius.py       +63 -30

poetry.lock  +58 -1

@@ -1,3 +1,15 @@
+[[package]]
+name = "autopep8"
+version = "1.5.5"
+description = "A tool that automatically formats Python code to conform to the PEP 8 style guide"
+category = "dev"
+optional = false
+python-versions = "*"

+[package.dependencies]
+pycodestyle = ">=2.6.0"
+toml = "*"

[[package]]
name = "beautifulsoup4"
version = "4.9.3"
@@ -37,6 +49,27 @@ category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"

+[[package]]
+name = "isort"
+version = "5.7.0"
+description = "A Python utility / library to sort Python imports."
+category = "dev"
+optional = false
+python-versions = ">=3.6,<4.0"

+[package.extras]
+pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
+requirements_deprecated_finder = ["pipreqs", "pip-api"]
+colors = ["colorama (>=0.4.3,<0.5.0)"]

+[[package]]
+name = "pycodestyle"
+version = "2.7.0"
+description = "Python style guide checker"
+category = "dev"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"

[[package]]
name = "requests"
version = "2.25.1"
@@ -63,6 +96,14 @@ category = "main"
optional = false
python-versions = ">=3.5"

+[[package]]
+name = "toml"
+version = "0.10.2"
+description = "Python Library for Tom's Obvious, Minimal Language"
+category = "dev"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"

[[package]]
name = "urllib3"
version = "1.26.2"
@@ -79,9 +120,13 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
-content-hash = "c39469c7911fdacba7ee26ed25cb96c250190f44ebd1c14b904cb824dbc7d35d"
+content-hash = "d5d1aa29435a94184496000e498fbd82838e4b8a0f0f8ad117fedb18cbf2021d"

[metadata.files]
+autopep8 = [
+    {file = "autopep8-1.5.5-py2.py3-none-any.whl", hash = "sha256:9e136c472c475f4ee4978b51a88a494bfcd4e3ed17950a44a988d9e434837bea"},
+    {file = "autopep8-1.5.5.tar.gz", hash = "sha256:cae4bc0fb616408191af41d062d7ec7ef8679c7f27b068875ca3a9e2878d5443"},
+]
beautifulsoup4 = [
{file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"},
{file = "beautifulsoup4-4.9.3-py3-none-any.whl", hash = "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"},
@@ -99,6 +144,14 @@ idna = [
{file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"},
{file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
]
+isort = [
+    {file = "isort-5.7.0-py3-none-any.whl", hash = "sha256:fff4f0c04e1825522ce6949973e83110a6e907750cd92d128b0d14aaaadbffdc"},
+    {file = "isort-5.7.0.tar.gz", hash = "sha256:c729845434366216d320e936b8ad6f9d681aab72dc7cbc2d51bedc3582f3ad1e"},
+]
+pycodestyle = [
+    {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"},
+    {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"},
+]
requests = [
{file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"},
{file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"},
@@ -107,6 +160,10 @@ soupsieve = [
{file = "soupsieve-2.1-py3-none-any.whl", hash = "sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851"},
{file = "soupsieve-2.1.tar.gz", hash = "sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"},
]
+toml = [
+    {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
+    {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
+]
urllib3 = [
{file = "urllib3-1.26.2-py2.py3-none-any.whl", hash = "sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"},
{file = "urllib3-1.26.2.tar.gz", hash = "sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08"},


pyproject.toml  +2 -0

@@ -10,6 +10,8 @@ beautifulsoup4 = "^4.9.3"
requests = "^2.25.1"

[tool.poetry.dev-dependencies]
+autopep8 = "^1.5.5"
+isort = "^5.7.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
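(With the new dev dependencies installed via poetry install, the reformatting seen in zenius.py below should be reproducible with poetry run isort zenius.py followed by poetry run autopep8 --in-place zenius.py; both tools are assumed here to run with their default settings.)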


zenius.py  +63 -30

@@ -1,19 +1,22 @@
+import argparse
+import functools
+import os
+import os.path
+import pickle
+import urllib
+import urllib.parse
+import zipfile
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup
-import urllib
-import os.path
-import zipfile
-import os
-import argparse
-import pickle


def parse_args(*args):
    parser = argparse.ArgumentParser(description="Mirror simfiles from ZIV")

    parser.add_argument('categories', type=str, nargs='*', help='ZIV category pages to mirror',
-            default=['https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'])
+                        default=['https://zenius-i-vanisher.com/v5.2/simfiles.php?category=latest20official'])

    feature = parser.add_mutually_exclusive_group(required=False)
    feature.add_argument('--dry-run', '-n',
@@ -26,22 +29,22 @@ def parse_args(*args):

    return parser.parse_args(*args)


-def retrieve(url, filename, save_headers=None, **kwargs):
+def retrieve(url, filename, save_headers=None, extract=True, groupname=None, **kwargs):
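+    # groupname names the songs/ subdirectory used for extraction; it is passed in
+    # explicitly since retrieve() has no scope of its own to find it in.
+    # extract=False (used for banners) saves the downloaded file without unzipping.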
    print(f'Downloading {url} -> {filename}')
    try:
        os.makedirs('zips', exist_ok=True)

        req = requests.get(url, **kwargs, stream=True)
        if req.status_code == 200:
            with open(filename, 'wb') as output:
                for chunk in req.iter_content(1024):
                    output.write(chunk)

-            with zipfile.ZipFile(filename, 'r') as zip:
-                songdir = f'songs/{groupname}'
-                print(f'Extracting into {songdir}')
-                os.makedirs(songdir, exist_ok=True)
-                zip.extractall(songdir)
+            if extract:
+                with zipfile.ZipFile(filename, 'r') as zip:
+                    songdir = f'songs/{groupname}'
+                    print(f'Extracting into {songdir}')
+                    os.makedirs(songdir, exist_ok=True)
+                    zip.extractall(songdir)

            if save_headers:
                pickle.dump(req.headers, open(save_headers, 'wb'))
@@ -62,18 +65,43 @@ def retrieve(url, filename, save_headers=None, **kwargs):

    return req.headers


-def mirror(cat_url, args):
+@functools.lru_cache()
+def get_page(cat_url):
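+    # Memoized: each category/group page is fetched and parsed at most once per run.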
    request = requests.get(cat_url)
-    page = BeautifulSoup(request.text, features="html.parser")
+    return BeautifulSoup(request.text, features="html.parser")


+def load_prev_headers(filename, header_file):
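+    # Rebuild conditional-GET headers from the response headers saved on the
+    # previous run, so the server can answer 304 Not Modified for unchanged files.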
+    req_headers = {}
+    if os.path.isfile(header_file) and os.path.isfile(filename):
+        prev_headers = pickle.load(open(header_file, 'rb'))
+        if 'etag' in prev_headers:
+            req_headers['If-None-Match'] = prev_headers['etag']
+        if 'last-modified' in prev_headers:
+            req_headers['If-Modified-Since'] = prev_headers['last-modified']
+    return req_headers


+def mirror(cat_url, args):
+    page = get_page(cat_url)

+    group_urls = {}

    if 'viewsimfilecategory.php' in cat_url:
        simgroup = page.find('div', {'class': 'headertop'}).h1
+        group_url = cat_url
    else:
        simgroup = None

    for row in page.find_all('tr'):
-        simfile = row.find("a", href=lambda href: href and "viewsimfile.php" in href)
-        simgroup = simgroup or row.find("a", href=lambda href: href and "viewsimfilecategory.php" in href)
+        simfile = row.find(
+            "a", href=lambda href: href and "viewsimfile.php" in href)
+        group_link = row.find(
+            "a", href=lambda href: href and "viewsimfilecategory.php" in href)
+        if group_link:
+            simgroup = group_link
+            group_url = group_link['href']

        if not (simfile and simgroup):
            continue
@@ -85,11 +113,14 @@ def mirror(cat_url, args):

        simlink = simfile['href']
        try:
-            sim_id = urllib.parse.parse_qs(urllib.parse.urlparse(simfile['href']).query)['simfileid'][0]
+            sim_id = urllib.parse.parse_qs(urllib.parse.urlparse(
+                simfile['href']).query)['simfileid'][0]
        except KeyError:
            print(f"WARNING: no simfileid found on URL {simlink}")
            continue

+        group_urls[groupname] = urllib.parse.urljoin(cat_url, group_url)

        url = f'https://zenius-i-vanisher.com/v5.2/download.php?type=ddrsimfile&simfileid={sim_id}'

        if args.dry_run:
@@ -98,19 +129,21 @@ def mirror(cat_url, args):

        filename = f'zips/{sim_id}.zip'
        headers = f'zips/{sim_id}.headers'
-        if os.path.isfile(headers):
-            prev_headers = pickle.load(open(headers, 'rb'))
-            req_headers = {}
-            if os.path.isfile(filename):
-                if 'etag' in prev_headers:
-                    req_headers['If-None-Match'] = prev_headers['etag']
-                if 'last-modified' in prev_headers:
-                    req_headers['If-Modified-Since'] = prev_headers['last-modified']
-        else:
-            prev_headers = {}
+        req_headers = load_prev_headers(filename, headers)

        retrieve(url, filename, groupname=groupname, headers=req_headers, save_headers=headers)

+    for groupname, group_url in group_urls.items():
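+        # Second pass: mirror each group's banner image with the same
+        # conditional-download treatment; extract=False keeps the image as-is.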
+        page = get_page(group_url)
+        banner_urls = {urllib.parse.urljoin(group_url, banner['src'])
+                       for banner in page.select('p.centre img')}
+        for url in banner_urls:
+            filename = f'songs/{groupname}/banner.png'
+            headers = f'zips/{groupname}-banner.headers'
+            req_headers = load_prev_headers(filename, headers)
+            retrieve(url, filename, extract=False, headers=req_headers, save_headers=headers)


if __name__ == "__main__":
    args = parse_args()
    for url in args.categories:
        mirror(url, args)

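For reference, a minimal standalone sketch of the conditional-download cycle that load_prev_headers() and retrieve() implement together. The URL and file names here are placeholders for illustration, not anything from this repo:

import os
import pickle

import requests

URL = 'https://example.com/banner.png'   # placeholder
DEST, HDRS = 'banner.png', 'banner.headers'

# Echo the validators the server sent on the last successful download.
req_headers = {}
if os.path.isfile(DEST) and os.path.isfile(HDRS):
    with open(HDRS, 'rb') as f:
        prev = pickle.load(f)
    if 'etag' in prev:
        req_headers['If-None-Match'] = prev['etag']
    if 'last-modified' in prev:
        req_headers['If-Modified-Since'] = prev['last-modified']

resp = requests.get(URL, headers=req_headers)
if resp.status_code == 304:
    print('Not modified; keeping cached copy')
elif resp.status_code == 200:
    with open(DEST, 'wb') as f:
        f.write(resp.content)
    with open(HDRS, 'wb') as f:
        pickle.dump(resp.headers, f)   # save validators for the next run

Because requests returns a case-insensitive headers mapping, the lowercase 'etag' and 'last-modified' lookups keep working after the headers are pickled and reloaded, which is the same assumption the script itself makes.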