humbleParser/bundle_parser.py

46 lines
1.6 KiB
Python

# bundle_parser.py
import requests
import json
from bs4 import BeautifulSoup
import logging
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class BundleParser:
    """Fetch a bundle page and extract its items from the embedded JSON block.

    The page at ``url`` is expected to contain a
    ``<script id="webpack-bundle-page-data" type="application/json">`` tag
    holding a JSON document. The entries under ``bundleData -> items`` of that
    document are converted into flat ``title`` / ``category`` / ``details``
    records by :meth:`parse_items`.
    """

    # Seconds to wait for the server. Without a timeout ``requests.get`` can
    # block indefinitely on an unresponsive host.
    DEFAULT_TIMEOUT = 10

    def __init__(self, url, category=None, timeout=DEFAULT_TIMEOUT):
        """Store the page URL and the fallback category.

        Args:
            url: Address of the bundle page to download.
            category: Category assigned to items that carry no ``category``
                key of their own. ``None`` (or any falsy value) means
                ``"Unbekannt"`` is used instead.
            timeout: Timeout in seconds handed to ``requests.get``.
        """
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch_data(self):
        """Download the page and return the decoded embedded JSON document.

        Returns:
            The object produced by ``json.loads`` for the content of the
            ``webpack-bundle-page-data`` script tag.

        Raises:
            requests.HTTPError: If the server answers with an error status
                (via ``raise_for_status``).
            ValueError: If the script tag is missing, has no single text
                content, or does not contain valid JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        logger.info("Rufe Bundle-Daten von %s ab...", self.url)
        response = requests.get(self.url, timeout=self.timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        script_tag = soup.find(
            "script",
            {"id": "webpack-bundle-page-data", "type": "application/json"},
        )
        if script_tag is None:
            logger.error("Kein JSON-Datenblock 'webpack-bundle-page-data' gefunden!")
            raise ValueError("Kein JSON-Datenblock gefunden!")

        # ``.string`` is None for an empty tag or one with several children;
        # json.loads(None) would raise TypeError, so report it as ValueError
        # like the "tag missing" case above.
        raw_json = script_tag.string
        if raw_json is None:
            logger.error("JSON-Datenblock 'webpack-bundle-page-data' ist leer!")
            raise ValueError("JSON-Datenblock ist leer!")

        data = json.loads(raw_json)
        logger.debug("Erhaltener JSON-Block (gekürzt): %s ...", str(data)[:200])
        return data

    def get_relevant_bundle_data(self):
        """Return the ``bundleData`` section of the fetched document.

        Returns:
            The value stored under ``"bundleData"``, or an empty dict when
            the key is absent or its value is ``null``/empty.
        """
        # ``or {}`` also covers an explicit JSON ``null``, which a plain
        # ``.get(key, {})`` would pass through as None.
        return self.fetch_data().get("bundleData") or {}

    def parse_items(self):
        """Fetch the bundle and convert each item into a flat record.

        Returns:
            A list with one dict per entry of ``bundleData["items"]``:

            * ``"title"``: the item's ``title`` or ``"Unbekannt"``.
            * ``"category"``: the item's ``category``, else the parser's
              ``category``, else ``"Unbekannt"``.
            * ``"details"``: the whole item serialized as JSON with sorted
              keys and non-ASCII characters kept as-is.

            The list is empty when there are no items.
        """
        items = self.get_relevant_bundle_data().get("items") or []
        # The fallback does not depend on the item, so compute it once.
        fallback_category = self.category or "Unbekannt"
        return [
            {
                "title": item.get("title", "Unbekannt"),
                "category": item.get("category", fallback_category),
                "details": json.dumps(item, sort_keys=True, ensure_ascii=False),
            }
            for item in items
        ]