From 4c0b6512f36de645bed75ce6750e98bfab5ddbbb Mon Sep 17 00:00:00 2001 From: Czechman Date: Sun, 16 Feb 2025 15:49:23 +0100 Subject: [PATCH] refactoring --- bundle_checker.py | 32 ++++++++++++++++++-------- bundle_parser.py | 58 ++++++++++++++++++++++++++++++++++++++--------- models.py | 21 +++++++++++++++-- 3 files changed, 88 insertions(+), 23 deletions(-) diff --git a/bundle_checker.py b/bundle_checker.py index 5b967f0..e98c07e 100644 --- a/bundle_checker.py +++ b/bundle_checker.py @@ -2,20 +2,32 @@ import logging from bundle_parser import BundleParser from models import Base, Bundle, BundleVersion, BundleSalesHistory, BundleItem +import requests - +# Konfiguriere das Logging +logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s") logger = logging.getLogger(__name__) def main(): - url = "https://www.humblebundle.com/bundles" - parser = BundleParser(url, category="books") - try: - bundle_data = parser.get_relevant_bundle_data() - logger.info(f"Erhaltene Bundle-Daten: {bundle_data}") - items = parser.parse_items() - logger.info(f"Gefundene Items: {items}") - except Exception as e: - logger.error(f"Fehler: {e}") + overview_url = "https://www.humblebundle.com/bundles" + # Hier kannst du zwischen "books", "games" und "software" wählen: + category = "books" + + logger.info("Extrahiere Bundle-URLs von der Übersichtsseite ...") + overview_parser = BundleParser(overview_url, category=category) + bundle_urls = overview_parser.get_bundle_urls() + logger.info(f"Gefundene {len(bundle_urls)} Bundle-URLs für Kategorie '{category}'.") + + for url in bundle_urls: + logger.info(f"Verarbeite Bundle: {url}") + parser = BundleParser(url) + try: + bundle_data = parser.get_relevant_bundle_data() + logger.info(f"Detaildaten: {bundle_data}") + items = parser.parse_items() + logger.info(f"Extrahierte Items: {items}") + except Exception as e: + logger.error(f"Fehler bei {url}: {e}") if __name__ == "__main__": main() diff --git a/bundle_parser.py b/bundle_parser.py index 826f5d0..a2643d1 100644 --- a/bundle_parser.py +++ b/bundle_parser.py @@ -1,5 +1,4 @@ # bundle_parser.py - import requests import json from bs4 import BeautifulSoup @@ -13,21 +12,40 @@ class BundleParser: self.category = category def fetch_data(self): - logger.info(f"Rufe Bundle-Daten von {self.url} ab...") + logger.info(f"Fetching data from {self.url} ...") response = requests.get(self.url) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") + # Zunächst versuchen wir, den Detailseiten-Skriptblock zu finden: script_tag = soup.find("script", {"id": "webpack-bundle-page-data", "type": "application/json"}) - if not script_tag: - logger.error("Kein JSON-Datenblock 'webpack-bundle-page-data' gefunden!") - raise ValueError("Kein JSON-Datenblock gefunden!") - data = json.loads(script_tag.string) - logger.debug(f"Erhaltener JSON-Block (gekürzt): {str(data)[:200]} ...") - return data + if script_tag: + data = json.loads(script_tag.string) + logger.debug(f"Found webpack-bundle-page-data (detail): {str(data)[:200]} ...") + return data + # Falls nicht gefunden, versuchen wir den Übersichts-Skriptblock: + script_tag = soup.find("script", {"id": "landingPage-json-data", "type": "application/json"}) + if script_tag: + data = json.loads(script_tag.string) + logger.debug(f"Found landingPage-json-data (overview): {str(data)[:200]} ...") + return data + logger.error("Kein JSON-Datenblock gefunden!") + raise ValueError("Kein JSON-Datenblock auf der Seite gefunden.") def get_relevant_bundle_data(self): data = self.fetch_data() - return data.get("bundleData", {}) + if "bundleData" in data: + return data["bundleData"] + # Falls wir auf einer Übersichtsseite sind, versuchen wir, anhand der Kategorie + # den ersten Eintrag aus dem entsprechenden Mosaic zu verwenden: + if "data" in data and self.category: + category_data = data["data"].get(self.category, {}) + mosaics = category_data.get("mosaic", []) + for section in mosaics: + products = section.get("products", []) + if products: + # Hier kannst du anpassen – z. B. alle Produkte verarbeiten oder den ersten wählen. + return products[0] + return {} def parse_items(self): bundle_data = self.get_relevant_bundle_data() @@ -35,11 +53,29 @@ class BundleParser: parsed_items = [] for item in items: title = item.get("title", "Unbekannt") - category = item.get("category", self.category if self.category else "Unbekannt") + cat = item.get("category", self.category if self.category else "Unbekannt") details = json.dumps(item, sort_keys=True, ensure_ascii=False) parsed_items.append({ "title": title, - "category": category, + "category": cat, "details": details }) return parsed_items + + def get_bundle_urls(self): + # Funktion zum Extrahieren von Detailseiten-URLs aus der Übersichtsseite + data = self.fetch_data() + urls = [] + if "data" in data and self.category: + category_data = data["data"].get(self.category, {}) + mosaics = category_data.get("mosaic", []) + for section in mosaics: + for product in section.get("products", []): + url = product.get("product_url", "") + if url: + if url.startswith("http"): + full_url = url + else: + full_url = requests.compat.urljoin(self.url, url) + urls.append(full_url) + return urls diff --git a/models.py b/models.py index 622d4d9..fb7cb72 100644 --- a/models.py +++ b/models.py @@ -1,5 +1,7 @@ +# models.py from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, ForeignKey, Text from sqlalchemy.orm import declarative_base, relationship +from datetime import datetime Base = declarative_base() @@ -20,7 +22,22 @@ class BundleVersion(Base): bundle_id = Column(Integer, ForeignKey('bundles.id')) version_hash = Column(String) version_data = Column(Text) - timestamp = Column(DateTime) + timestamp = Column(DateTime, default=datetime.utcnow) bundle = relationship("Bundle", back_populates="versions", foreign_keys=[bundle_id]) -# ... weitere Modelle (BundleSalesHistory, BundleItem, ...) +class BundleSalesHistory(Base): + __tablename__ = 'bundle_sales_history' + id = Column(Integer, primary_key=True) + bundle_id = Column(Integer, ForeignKey('bundles.id')) + bundles_sold = Column(Float) + timestamp = Column(DateTime, default=datetime.utcnow) + bundle = relationship("Bundle", back_populates="sales_history") + +class BundleItem(Base): + __tablename__ = 'bundle_items' + id = Column(Integer, primary_key=True) + bundle_id = Column(Integer, ForeignKey('bundles.id')) + title = Column(String) + category = Column(String) + details = Column(Text) + bundle = relationship("Bundle", back_populates="items")