refactoring
This commit is contained in:
parent
c5715c1062
commit
4c0b6512f3
|
|
@ -2,20 +2,32 @@
|
|||
import logging
|
||||
from bundle_parser import BundleParser
|
||||
from models import Base, Bundle, BundleVersion, BundleSalesHistory, BundleItem
|
||||
import requests
|
||||
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def main():
    """Log the detail data and items of every bundle in one category.

    Reads the bundle URLs for ``category`` from the overview page, then
    builds one ``BundleParser`` per bundle URL and logs its bundle data and
    parsed items. A failure on a single bundle is logged with its traceback
    and the loop moves on to the next URL; a failure while reading the
    overview page itself is not caught here.
    """
    overview_url = "https://www.humblebundle.com/bundles"
    # Choose between "books", "games" and "software" here:
    category = "books"

    logger.info("Extrahiere Bundle-URLs von der Übersichtsseite ...")
    overview_parser = BundleParser(overview_url, category=category)
    bundle_urls = overview_parser.get_bundle_urls()
    logger.info(f"Gefundene {len(bundle_urls)} Bundle-URLs für Kategorie '{category}'.")

    for url in bundle_urls:
        logger.info(f"Verarbeite Bundle: {url}")
        parser = BundleParser(url)
        try:
            bundle_data = parser.get_relevant_bundle_data()
            logger.info(f"Detaildaten: {bundle_data}")
            # NOTE(review): parse_items() calls get_relevant_bundle_data()
            # again, so each bundle page is requested twice per iteration.
            items = parser.parse_items()
            logger.info(f"Extrahierte Items: {items}")
        except Exception as e:
            # Top-level boundary: keep going with the remaining bundles, but
            # record the traceback so the cause is not lost.
            logger.exception(f"Fehler bei {url}: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
# bundle_parser.py
|
||||
|
||||
import requests
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
|
|
@ -13,21 +12,40 @@ class BundleParser:
|
|||
self.category = category
|
||||
|
||||
def fetch_data(self):
    """Download ``self.url`` and return the JSON payload embedded in the page.

    Two ``<script type="application/json">`` blocks are tried in order:
    ``webpack-bundle-page-data`` (bundle detail pages) first, then
    ``landingPage-json-data`` (the overview page). The first block that is
    present and non-empty is decoded and returned.

    Returns:
        The decoded JSON document.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an error status.
        ValueError: neither script block was found, or its content is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    logger.info(f"Fetching data from {self.url} ...")
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(self.url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # (script id, page kind used in the debug message), in lookup order.
    candidates = (
        ("webpack-bundle-page-data", "detail"),
        ("landingPage-json-data", "overview"),
    )
    for script_id, page_kind in candidates:
        script_tag = soup.find("script", {"id": script_id, "type": "application/json"})
        # `.string` is None for an empty tag; treat that like a missing block
        # instead of passing None to json.loads.
        if not script_tag or not script_tag.string:
            continue
        data = json.loads(script_tag.string)
        logger.debug(f"Found {script_id} ({page_kind}): {str(data)[:200]} ...")
        return data

    logger.error("Kein JSON-Datenblock gefunden!")
    raise ValueError("Kein JSON-Datenblock auf der Seite gefunden.")
|
||||
|
||||
def get_relevant_bundle_data(self):
    """Return the bundle dictionary contained in the fetched page data.

    If the payload from ``fetch_data()`` has a ``"bundleData"`` key, that
    value is returned. Otherwise, when the payload has a ``"data"`` key and
    ``self.category`` is set, the first product of the first non-empty
    ``"products"`` list in that category's ``"mosaic"`` sections is returned.

    Returns:
        The bundle/product dictionary, or ``{}`` when nothing matches.
    """
    data = self.fetch_data()
    if "bundleData" in data:
        return data["bundleData"]

    # Overview-page fallback: pick the first product listed for the category.
    # `or {}` / `or []` tolerate keys that are present but null.
    if self.category and "data" in data:
        category_data = (data["data"] or {}).get(self.category) or {}
        for section in category_data.get("mosaic") or []:
            products = section.get("products") or []
            if products:
                # Adjust here to process all products instead of the first.
                return products[0]
    return {}
|
||||
|
||||
def parse_items(self):
|
||||
bundle_data = self.get_relevant_bundle_data()
|
||||
|
|
@ -35,11 +53,29 @@ class BundleParser:
|
|||
parsed_items = []
|
||||
for item in items:
|
||||
title = item.get("title", "Unbekannt")
|
||||
category = item.get("category", self.category if self.category else "Unbekannt")
|
||||
cat = item.get("category", self.category if self.category else "Unbekannt")
|
||||
details = json.dumps(item, sort_keys=True, ensure_ascii=False)
|
||||
parsed_items.append({
|
||||
"title": title,
|
||||
"category": category,
|
||||
"category": cat,
|
||||
"details": details
|
||||
})
|
||||
return parsed_items
|
||||
|
||||
def get_bundle_urls(self):
    """Collect the product URLs listed for ``self.category`` in the fetched data.

    Walks every ``"products"`` entry of every ``"mosaic"`` section under
    ``data["data"][self.category]`` and reads its ``"product_url"``.
    Relative URLs are resolved against ``self.url``; URLs starting with
    ``http://`` or ``https://`` are kept unchanged. Each URL is returned
    once, in first-seen order.

    Returns:
        A list of absolute URLs; empty when no category is set or the data
        has no matching entries.
    """
    # Local import keeps this block self-contained; the standard-library
    # function replaces the requests.compat re-export.
    from urllib.parse import urljoin

    data = self.fetch_data()
    urls = []
    if not (self.category and "data" in data):
        return urls

    # `or {}` / `or []` tolerate keys that are present but null.
    category_data = (data["data"] or {}).get(self.category) or {}
    seen = set()
    for section in category_data.get("mosaic") or []:
        for product in section.get("products") or []:
            url = product.get("product_url", "")
            if not url:
                continue
            # Match the full scheme prefix so a relative slug that merely
            # begins with "http" is still resolved against the base URL.
            if url.startswith(("http://", "https://")):
                full_url = url
            else:
                full_url = urljoin(self.url, url)
            # A product can appear in several sections; report it once.
            if full_url not in seen:
                seen.add(full_url)
                urls.append(full_url)
    return urls
|
||||
|
|
|
|||
21
models.py
21
models.py
|
|
@ -1,5 +1,7 @@
|
|||
# models.py
|
||||
from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, ForeignKey, Text
|
||||
from sqlalchemy.orm import declarative_base, relationship
|
||||
from datetime import datetime
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
|
|
@ -20,7 +22,22 @@ class BundleVersion(Base):
|
|||
bundle_id = Column(Integer, ForeignKey('bundles.id'))
|
||||
version_hash = Column(String)
|
||||
version_data = Column(Text)
|
||||
timestamp = Column(DateTime)
|
||||
timestamp = Column(DateTime, default=datetime.utcnow)
|
||||
bundle = relationship("Bundle", back_populates="versions", foreign_keys=[bundle_id])
|
||||
|
||||
# Additional models: BundleSalesHistory and BundleItem
|
||||
class BundleSalesHistory(Base):
    """A timestamped sales figure recorded for one bundle.

    Mapped to the ``bundle_sales_history`` table; ``bundle_id`` references
    ``bundles.id``.
    """

    __tablename__ = 'bundle_sales_history'

    id = Column(Integer, primary_key=True)
    bundle_id = Column(Integer, ForeignKey('bundles.id'))
    bundles_sold = Column(Float)
    # Defaults to datetime.utcnow() at insert time when no value is given.
    timestamp = Column(DateTime, default=datetime.utcnow)

    # Counterpart of a `sales_history` relationship expected on Bundle
    # (Bundle is defined outside this view; verify it declares it).
    bundle = relationship("Bundle", back_populates="sales_history")
|
||||
|
||||
class BundleItem(Base):
    """A single item that belongs to a bundle.

    Mapped to the ``bundle_items`` table; ``bundle_id`` references
    ``bundles.id``.
    """

    __tablename__ = 'bundle_items'

    id = Column(Integer, primary_key=True)
    bundle_id = Column(Integer, ForeignKey('bundles.id'))
    title = Column(String)
    category = Column(String)
    # Free-form text; presumably the JSON dump produced by
    # BundleParser.parse_items() -- confirm against the code that writes it.
    details = Column(Text)

    # Counterpart of an `items` relationship expected on Bundle
    # (Bundle is defined outside this view; verify it declares it).
    bundle = relationship("Bundle", back_populates="items")
|
||||
|
|
|
|||
Loading…
Reference in New Issue