humbleParser/bundle_parser.py

82 lines
3.4 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# bundle_parser.py
import json
import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class BundleParser:
    """Read bundle data from the JSON embedded in a web page.

    The page at ``url`` is downloaded and searched for a
    ``<script type="application/json">`` element whose id is either
    ``webpack-bundle-page-data`` (detail page) or ``landingPage-json-data``
    (overview page). The decoded JSON is then inspected by the other methods.

    NOTE(review): the JSON layout used below (``bundleData``, ``data`` ->
    category -> ``mosaic`` -> ``products``) is taken from the keys this class
    accesses; confirm it against the live pages.
    """

    # Seconds to wait for the server before giving up on the request.
    DEFAULT_TIMEOUT = 30

    # Script element ids probed by fetch_data(), in priority order, paired
    # with the label that appears in the debug log message.
    _SCRIPT_BLOCKS = (
        ("webpack-bundle-page-data", "detail"),
        ("landingPage-json-data", "overview"),
    )

    def __init__(self, url, category=None, *, timeout=DEFAULT_TIMEOUT):
        """Store the page URL, the optional category key and the timeout.

        Args:
            url: Address of the page to download.
            category: Key used to select a category inside overview data;
                also the fallback category for parsed items.
            timeout: Seconds passed to ``requests.get`` as ``timeout``.
        """
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch_data(self):
        """Download the page and return its embedded JSON data.

        The detail-page script block is tried first, then the overview
        block. A block that exists but has no text content is skipped.

        Returns:
            The decoded JSON of the first usable script block.

        Raises:
            requests.RequestException: On network errors, timeouts or an
                HTTP error status.
            ValueError: If no usable JSON block is found, or (as
                ``json.JSONDecodeError``) if its content is not valid JSON.
        """
        logger.info("Fetching data from %s ...", self.url)
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(self.url, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        for script_id, label in self._SCRIPT_BLOCKS:
            script_tag = soup.find(
                "script", {"id": script_id, "type": "application/json"}
            )
            if not script_tag:
                continue
            # ``.string`` is None when the element has no single text child
            # (e.g. it is empty); json.loads(None) would raise TypeError.
            raw = script_tag.string
            if not raw or not raw.strip():
                logger.debug("Script block %s has no content, skipping", script_id)
                continue
            data = json.loads(raw)
            logger.debug("Found %s (%s): %s ...", script_id, label, str(data)[:200])
            return data

        logger.error("Kein JSON-Datenblock gefunden!")
        raise ValueError("Kein JSON-Datenblock auf der Seite gefunden.")

    def _iter_category_products(self, data):
        """Yield each product listed for ``self.category`` in overview data.

        Yields nothing when no category is set or when the expected
        ``data`` -> category -> ``mosaic`` -> ``products`` path is missing
        or null.
        """
        if not self.category or not isinstance(data, dict):
            return
        overview = data.get("data")
        if not isinstance(overview, dict):
            return
        category_data = overview.get(self.category) or {}
        for section in category_data.get("mosaic") or ():
            yield from section.get("products") or ()

    def get_relevant_bundle_data(self):
        """Return the bundle dict this parser should work on.

        On a detail page this is the value stored under ``bundleData``. On
        an overview page it is the first product found for the configured
        category.

        Returns:
            The selected dict, or ``{}`` when nothing matches.
        """
        data = self.fetch_data()
        if isinstance(data, dict) and "bundleData" in data:
            return data["bundleData"]
        # Overview page: fall back to the first product of the category.
        return next(self._iter_category_products(data), {})

    def parse_items(self):
        """Return a simplified record for every entry under ``items``.

        Returns:
            A list of dicts with the keys ``title``, ``category`` and
            ``details``; ``details`` is the item serialized as JSON with
            sorted keys.
        """
        bundle_data = self.get_relevant_bundle_data()
        fallback_category = self.category if self.category else "Unbekannt"
        return [
            {
                "title": item.get("title", "Unbekannt"),
                "category": item.get("category", fallback_category),
                "details": json.dumps(item, sort_keys=True, ensure_ascii=False),
            }
            for item in bundle_data.get("items") or []
        ]

    def get_bundle_urls(self):
        """Collect the detail-page URLs listed on an overview page.

        Returns:
            Absolute URLs of all products of the configured category, in
            page order. Products without a ``product_url`` are skipped.
        """
        data = self.fetch_data()
        urls = []
        for product in self._iter_category_products(data):
            url = product.get("product_url", "")
            if not url:
                continue
            # Keep real absolute URLs verbatim; a bare ``startswith("http")``
            # would also match relative paths such as "httpie-bundle".
            if url.startswith(("http://", "https://")):
                urls.append(url)
            else:
                urls.append(urljoin(self.url, url))
        return urls