# 82 lines, 3.4 KiB, Python
# bundle_parser.py
|
||
import json
import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
|
||
|
||
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
||
|
||
class BundleParser:
    """Download a bundle page and extract item data from its embedded JSON.

    The page is expected to carry its data in a
    ``<script type="application/json">`` element whose id is either
    ``webpack-bundle-page-data`` (detail page) or ``landingPage-json-data``
    (overview page).

    Args:
        url: Address of the page to download.
        category: Key looked up under ``"data"`` in an overview payload; also
            the fallback category assigned to parsed items.
        timeout: Seconds to wait for the HTTP response before giving up.
    """

    # (script element id, label used in the debug log), in lookup order:
    # the detail-page block is tried first, then the overview block.
    _JSON_SCRIPTS = (
        ("webpack-bundle-page-data", "detail"),
        ("landingPage-json-data", "overview"),
    )

    def __init__(self, url, category=None, timeout=30):
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch_data(self):
        """Download ``self.url`` and return its embedded JSON payload.

        Returns:
            The decoded JSON of the first known script element that has
            content.

        Raises:
            requests.RequestException: On network failure, timeout, or an
                HTTP error status.
            ValueError: If no known script element with content is present.
                Malformed JSON raises ``json.JSONDecodeError``, which is a
                ``ValueError`` subclass.
        """
        logger.info("Fetching data from %s ...", self.url)
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(self.url, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        for script_id, label in self._JSON_SCRIPTS:
            script_tag = soup.find(
                "script", {"id": script_id, "type": "application/json"}
            )
            # ``.string`` is None for an empty tag; passing that to
            # json.loads would raise TypeError, so treat it as "not found".
            raw = script_tag.string if script_tag else None
            if not raw or not raw.strip():
                continue
            data = json.loads(raw)
            # Only stringify the (possibly large) payload when it is logged.
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(
                    "Found %s (%s): %s ...", script_id, label, str(data)[:200]
                )
            return data

        logger.error("Kein JSON-Datenblock gefunden!")
        raise ValueError("Kein JSON-Datenblock auf der Seite gefunden.")

    def _iter_products(self, data):
        """Yield each product in the mosaics of ``self.category``, in order."""
        if not self.category or "data" not in data:
            return
        # ``or`` fallbacks tolerate keys that are present but set to null.
        category_data = data["data"].get(self.category) or {}
        for section in category_data.get("mosaic") or []:
            yield from section.get("products") or []

    def get_relevant_bundle_data(self):
        """Return the bundle record that items should be parsed from.

        Returns:
            ``data["bundleData"]`` when the payload has that key. Otherwise
            the first product found in the mosaics of ``self.category``, or
            an empty dict when there is none.
        """
        data = self.fetch_data()
        if "bundleData" in data:
            return data["bundleData"]
        # Overview page: fall back to the first product of the category.
        return next(self._iter_products(data), {})

    def parse_items(self):
        """Return one summary dict per item of the relevant bundle.

        Returns:
            A list of dicts with the keys ``title``, ``category`` and
            ``details``; ``details`` is the whole item serialized as JSON
            with sorted keys.
        """
        bundle_data = self.get_relevant_bundle_data()
        fallback_category = self.category or "Unbekannt"
        return [
            {
                "title": item.get("title", "Unbekannt"),
                "category": item.get("category", fallback_category),
                "details": json.dumps(item, sort_keys=True, ensure_ascii=False),
            }
            for item in bundle_data.get("items") or []
        ]

    def get_bundle_urls(self):
        """Collect the detail-page URLs listed on an overview page.

        Returns:
            A list of absolute URLs, one per product that has a non-empty
            ``product_url``. Relative values are resolved against
            ``self.url``.
        """
        data = self.fetch_data()
        urls = []
        for product in self._iter_products(data):
            url = product.get("product_url", "")
            if not url:
                continue
            # Match the full scheme prefix: a bare "http" test would mistake
            # a relative path such as "http-pack/x" for an absolute URL.
            if url.startswith(("http://", "https://")):
                urls.append(url)
            else:
                urls.append(urljoin(self.url, url))
        return urls
|