refactoring

Czechman 2025-02-16 15:49:23 +01:00
parent c5715c1062
commit 4c0b6512f3
3 changed files with 88 additions and 23 deletions


@@ -2,20 +2,32 @@
 import logging
 from bundle_parser import BundleParser
 from models import Base, Bundle, BundleVersion, BundleSalesHistory, BundleItem
 import requests
 # Configure logging
 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s")
 logger = logging.getLogger(__name__)
 def main():
-    url = "https://www.humblebundle.com/bundles"
-    parser = BundleParser(url, category="books")
+    overview_url = "https://www.humblebundle.com/bundles"
+    # Choose between "books", "games" and "software" here:
+    category = "books"
+    logger.info("Extracting bundle URLs from the overview page ...")
+    overview_parser = BundleParser(overview_url, category=category)
+    bundle_urls = overview_parser.get_bundle_urls()
+    logger.info(f"Found {len(bundle_urls)} bundle URLs for category '{category}'.")
+    for url in bundle_urls:
+        logger.info(f"Processing bundle: {url}")
+        parser = BundleParser(url)
         try:
             bundle_data = parser.get_relevant_bundle_data()
-            logger.info(f"Received bundle data: {bundle_data}")
+            logger.info(f"Detail data: {bundle_data}")
             items = parser.parse_items()
-            logger.info(f"Found items: {items}")
+            logger.info(f"Extracted items: {items}")
         except Exception as e:
-            logger.error(f"Error: {e}")
+            logger.error(f"Error for {url}: {e}")
 if __name__ == "__main__":
     main()
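
Outside the diff above, the new flow in main() comes down to two passes over the same parser class: one overview parse (with a category) that collects bundle URLs, then one detail parse per URL. A minimal standalone sketch of that flow, with logging and the database imports left out:

from bundle_parser import BundleParser

# Pass 1: the overview page, filtered by category, yields the detail-page URLs.
overview = BundleParser("https://www.humblebundle.com/bundles", category="books")
bundle_urls = overview.get_bundle_urls()

# Pass 2: each detail page is parsed on its own, without a category.
for bundle_url in bundle_urls:
    detail = BundleParser(bundle_url)
    print(detail.get_relevant_bundle_data())
    print(detail.parse_items())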


@@ -1,5 +1,4 @@
 # bundle_parser.py
 import requests
 import json
 from bs4 import BeautifulSoup
@@ -13,21 +12,40 @@ class BundleParser:
         self.category = category
     def fetch_data(self):
-        logger.info(f"Fetching bundle data from {self.url}...")
+        logger.info(f"Fetching data from {self.url} ...")
         response = requests.get(self.url)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, "html.parser")
+        # First, try to find the detail-page script block:
         script_tag = soup.find("script", {"id": "webpack-bundle-page-data", "type": "application/json"})
-        if not script_tag:
-            logger.error("No 'webpack-bundle-page-data' JSON data block found!")
-            raise ValueError("No JSON data block found!")
+        if script_tag:
             data = json.loads(script_tag.string)
-            logger.debug(f"Received JSON block (truncated): {str(data)[:200]} ...")
+            logger.debug(f"Found webpack-bundle-page-data (detail): {str(data)[:200]} ...")
             return data
+        # If that is missing, try the overview-page script block:
+        script_tag = soup.find("script", {"id": "landingPage-json-data", "type": "application/json"})
+        if script_tag:
+            data = json.loads(script_tag.string)
+            logger.debug(f"Found landingPage-json-data (overview): {str(data)[:200]} ...")
+            return data
+        logger.error("No JSON data block found!")
+        raise ValueError("No JSON data block found on the page.")
     def get_relevant_bundle_data(self):
         data = self.fetch_data()
-        return data.get("bundleData", {})
+        if "bundleData" in data:
+            return data["bundleData"]
+        # If we are on an overview page, try to use the category
+        # to pick the first entry from the corresponding mosaic:
+        if "data" in data and self.category:
+            category_data = data["data"].get(self.category, {})
+            mosaics = category_data.get("mosaic", [])
+            for section in mosaics:
+                products = section.get("products", [])
+                if products:
+                    # Adjust as needed, e.g. process all products or just pick the first one.
+                    return products[0]
+        return {}
     def parse_items(self):
         bundle_data = self.get_relevant_bundle_data()
@@ -35,11 +53,29 @@ class BundleParser:
         parsed_items = []
         for item in items:
             title = item.get("title", "Unknown")
-            category = item.get("category", self.category if self.category else "Unknown")
+            cat = item.get("category", self.category if self.category else "Unknown")
             details = json.dumps(item, sort_keys=True, ensure_ascii=False)
             parsed_items.append({
                 "title": title,
-                "category": category,
+                "category": cat,
                 "details": details
             })
         return parsed_items
+    def get_bundle_urls(self):
+        # Extract detail-page URLs from the overview page.
+        data = self.fetch_data()
+        urls = []
+        if "data" in data and self.category:
+            category_data = data["data"].get(self.category, {})
+            mosaics = category_data.get("mosaic", [])
+            for section in mosaics:
+                for product in section.get("products", []):
+                    url = product.get("product_url", "")
+                    if url:
+                        if url.startswith("http"):
+                            full_url = url
+                        else:
+                            full_url = requests.compat.urljoin(self.url, url)
+                        urls.append(full_url)
+        return urls
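
Not part of the commit: get_bundle_urls() assumes the overview JSON roughly has the shape data["data"][category]["mosaic"][i]["products"][j]["product_url"]. A small offline sketch of the same traversal over an invented sample, including the urljoin fallback for relative links:

import requests

# Invented sample mirroring the structure the parser walks.
sample = {
    "data": {
        "books": {
            "mosaic": [
                {"products": [
                    {"product_url": "/books/example-bundle"},
                    {"product_url": "https://www.humblebundle.com/books/other-bundle"},
                ]}
            ]
        }
    }
}

overview_url = "https://www.humblebundle.com/bundles"
urls = []
for section in sample["data"]["books"]["mosaic"]:
    for product in section.get("products", []):
        url = product.get("product_url", "")
        if url:
            # Relative links are resolved against the overview URL, as in get_bundle_urls().
            urls.append(url if url.startswith("http") else requests.compat.urljoin(overview_url, url))

print(urls)
# ['https://www.humblebundle.com/books/example-bundle',
#  'https://www.humblebundle.com/books/other-bundle']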


@@ -1,5 +1,7 @@
 # models.py
 from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, ForeignKey, Text
 from sqlalchemy.orm import declarative_base, relationship
+from datetime import datetime
 Base = declarative_base()
@@ -20,7 +22,22 @@ class BundleVersion(Base):
     bundle_id = Column(Integer, ForeignKey('bundles.id'))
     version_hash = Column(String)
     version_data = Column(Text)
-    timestamp = Column(DateTime)
+    timestamp = Column(DateTime, default=datetime.utcnow)
     bundle = relationship("Bundle", back_populates="versions", foreign_keys=[bundle_id])
-# ... further models (BundleSalesHistory, BundleItem, ...)
+class BundleSalesHistory(Base):
+    __tablename__ = 'bundle_sales_history'
+    id = Column(Integer, primary_key=True)
+    bundle_id = Column(Integer, ForeignKey('bundles.id'))
+    bundles_sold = Column(Float)
+    timestamp = Column(DateTime, default=datetime.utcnow)
+    bundle = relationship("Bundle", back_populates="sales_history")
+class BundleItem(Base):
+    __tablename__ = 'bundle_items'
+    id = Column(Integer, primary_key=True)
+    bundle_id = Column(Integer, ForeignKey('bundles.id'))
+    title = Column(String)
+    category = Column(String)
+    details = Column(Text)
+    bundle = relationship("Bundle", back_populates="items")
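
A minimal sketch, not part of the commit, of wiring these models to a database. The SQLite URL and the standalone BundleItem row are assumptions for illustration, since the columns of the Bundle model itself are not shown in this diff.

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from models import Base, BundleItem

# Hypothetical local database; any SQLAlchemy URL works here.
engine = create_engine("sqlite:///bundles.db")
Base.metadata.create_all(engine)  # creates every table declared on Base

with Session(engine) as session:
    # Insert one item without linking it to a Bundle row, purely to show the mapping.
    session.add(BundleItem(title="Example Book", category="books", details="{}"))
    session.commit()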