refactoring
This commit is contained in:
parent
c5715c1062
commit
4c0b6512f3
|
|
@ -2,20 +2,32 @@
|
||||||
import logging
|
import logging
|
||||||
from bundle_parser import BundleParser
|
from bundle_parser import BundleParser
|
||||||
from models import Base, Bundle, BundleVersion, BundleSalesHistory, BundleItem
|
from models import Base, Bundle, BundleVersion, BundleSalesHistory, BundleItem
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Konfiguriere das Logging
|
||||||
|
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def main():
    """Log the bundle data and parsed items of every bundle in one category.

    Reads the overview page, extracts the detail-page URLs for ``category``,
    then fetches each detail page and logs its bundle data and its items.
    A failure on a single bundle is logged and the loop continues with the
    next URL; a failure on the overview page is logged and ends the run.
    """
    overview_url = "https://www.humblebundle.com/bundles"
    # Choose between "books", "games" and "software" here:
    category = "books"

    logger.info("Extrahiere Bundle-URLs von der Übersichtsseite ...")
    overview_parser = BundleParser(overview_url, category=category)
    try:
        bundle_urls = overview_parser.get_bundle_urls()
    except Exception as e:
        # Without the overview there is nothing to iterate over. Log with the
        # traceback and stop instead of dying with an unhandled exception.
        logger.exception(f"Fehler bei {overview_url}: {e}")
        return
    logger.info(f"Gefundene {len(bundle_urls)} Bundle-URLs für Kategorie '{category}'.")

    for url in bundle_urls:
        logger.info(f"Verarbeite Bundle: {url}")
        parser = BundleParser(url)
        try:
            bundle_data = parser.get_relevant_bundle_data()
            logger.info(f"Detaildaten: {bundle_data}")
            items = parser.parse_items()
            logger.info(f"Extrahierte Items: {items}")
        except Exception as e:
            # logger.exception keeps the traceback, which the plain error
            # message alone would drop.
            logger.exception(f"Fehler bei {url}: {e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,4 @@
|
||||||
# bundle_parser.py
|
# bundle_parser.py
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import json
|
import json
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
@ -13,21 +12,40 @@ class BundleParser:
|
||||||
self.category = category
|
self.category = category
|
||||||
|
|
||||||
def fetch_data(self):
    """Download ``self.url`` and return the JSON payload embedded in the page.

    The detail-page script block (``webpack-bundle-page-data``) is tried
    first; if it is absent, the overview block (``landingPage-json-data``)
    is used instead.

    Returns:
        The decoded JSON content of the first matching script block.

    Raises:
        requests.HTTPError: If the response carries an error status.
        ValueError: If neither script block is present with content.
    """
    logger.info(f"Fetching data from {self.url} ...")
    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(self.url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # (script id, label used in the debug message), in lookup order.
    candidates = (
        ("webpack-bundle-page-data", "detail"),
        ("landingPage-json-data", "overview"),
    )
    for script_id, page_kind in candidates:
        script_tag = soup.find("script", {"id": script_id, "type": "application/json"})
        # ``.string`` is None when the tag has no single string child; treat
        # that like a missing block rather than passing None to json.loads.
        if script_tag is None or not script_tag.string:
            continue
        data = json.loads(script_tag.string)
        logger.debug(f"Found {script_id} ({page_kind}): {str(data)[:200]} ...")
        return data

    logger.error("Kein JSON-Datenblock gefunden!")
    raise ValueError("Kein JSON-Datenblock auf der Seite gefunden.")
|
||||||
|
|
||||||
def get_relevant_bundle_data(self):
    """Return the bundle dictionary contained in the fetched page data.

    On a detail page this is the ``bundleData`` entry. On an overview page
    (a top-level ``data`` entry) the first product of the first non-empty
    mosaic section for ``self.category`` is returned instead.

    Returns:
        The bundle/product entry, or an empty dict when nothing matches.
    """
    data = self.fetch_data()
    if "bundleData" in data:
        return data["bundleData"]

    # Overview page: pick the first entry of the matching category mosaic.
    if "data" in data and self.category:
        # The ``or`` fallbacks tolerate JSON nulls, which would otherwise
        # raise AttributeError/TypeError further down.
        category_data = (data["data"] or {}).get(self.category) or {}
        for section in category_data.get("mosaic") or []:
            products = section.get("products") or []
            if products:
                # Adjust here to process all products instead of the first.
                return products[0]
    return {}
|
||||||
|
|
||||||
def parse_items(self):
|
def parse_items(self):
|
||||||
bundle_data = self.get_relevant_bundle_data()
|
bundle_data = self.get_relevant_bundle_data()
|
||||||
|
|
@ -35,11 +53,29 @@ class BundleParser:
|
||||||
parsed_items = []
|
parsed_items = []
|
||||||
for item in items:
|
for item in items:
|
||||||
title = item.get("title", "Unbekannt")
|
title = item.get("title", "Unbekannt")
|
||||||
category = item.get("category", self.category if self.category else "Unbekannt")
|
cat = item.get("category", self.category if self.category else "Unbekannt")
|
||||||
details = json.dumps(item, sort_keys=True, ensure_ascii=False)
|
details = json.dumps(item, sort_keys=True, ensure_ascii=False)
|
||||||
parsed_items.append({
|
parsed_items.append({
|
||||||
"title": title,
|
"title": title,
|
||||||
"category": category,
|
"category": cat,
|
||||||
"details": details
|
"details": details
|
||||||
})
|
})
|
||||||
return parsed_items
|
return parsed_items
|
||||||
|
|
||||||
|
def get_bundle_urls(self):
    """Extract the detail-page URLs for ``self.category`` from an overview page.

    Every product with a non-empty ``product_url`` in the category's mosaic
    sections contributes one URL. URLs that already carry an ``http://`` or
    ``https://`` scheme are passed through; everything else is resolved
    against ``self.url``.

    Returns:
        The URLs in page order; an empty list when the fetched data has no
        ``data`` entry or no category is set.
    """
    # Local import: the standard-library function replaces the
    # ``requests.compat`` shim without touching the file's import block.
    from urllib.parse import urljoin

    data = self.fetch_data()
    if "data" not in data or not self.category:
        return []

    # The ``or`` fallbacks tolerate JSON nulls in the overview payload.
    category_data = (data["data"] or {}).get(self.category) or {}
    urls = []
    for section in category_data.get("mosaic") or []:
        for product in section.get("products") or []:
            url = product.get("product_url", "")
            if not url:
                continue
            # Test for the full scheme prefix: a bare "http" check would
            # mistake a relative slug such as "http-guide" for an absolute URL.
            if url.startswith(("http://", "https://")):
                urls.append(url)
            else:
                urls.append(urljoin(self.url, url))
    return urls
|
||||||
|
|
|
||||||
21
models.py
21
models.py
|
|
@ -1,5 +1,7 @@
|
||||||
|
# models.py
|
||||||
from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, ForeignKey, Text
|
from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, ForeignKey, Text
|
||||||
from sqlalchemy.orm import declarative_base, relationship
|
from sqlalchemy.orm import declarative_base, relationship
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
|
|
||||||
|
|
@ -20,7 +22,22 @@ class BundleVersion(Base):
|
||||||
bundle_id = Column(Integer, ForeignKey('bundles.id'))
|
bundle_id = Column(Integer, ForeignKey('bundles.id'))
|
||||||
version_hash = Column(String)
|
version_hash = Column(String)
|
||||||
version_data = Column(Text)
|
version_data = Column(Text)
|
||||||
timestamp = Column(DateTime)
|
timestamp = Column(DateTime, default=datetime.utcnow)
|
||||||
bundle = relationship("Bundle", back_populates="versions", foreign_keys=[bundle_id])
|
bundle = relationship("Bundle", back_populates="versions", foreign_keys=[bundle_id])
|
||||||
|
|
||||||
# ... weitere Modelle (BundleSalesHistory, BundleItem, ...)
|
class BundleSalesHistory(Base):
    """ORM model mapped to the ``bundle_sales_history`` table.

    Each row stores a ``bundles_sold`` value for one bundle together with
    the time the row was created.
    """

    __tablename__ = "bundle_sales_history"

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Reference to the owning row in ``bundles``.
    bundle_id = Column(Integer, ForeignKey("bundles.id"))
    # Stored as a float column.
    bundles_sold = Column(Float)
    # Filled with the naive UTC time at insert when no value is supplied.
    timestamp = Column(DateTime, default=datetime.utcnow)

    # NOTE(review): expects ``Bundle`` to declare a matching
    # ``sales_history`` relationship -- confirm in the Bundle model.
    bundle = relationship("Bundle", back_populates="sales_history")
|
||||||
|
|
||||||
|
class BundleItem(Base):
    """ORM model mapped to the ``bundle_items`` table.

    Each row holds the title, category and details text of one item and a
    reference to the bundle it belongs to.
    """

    __tablename__ = "bundle_items"

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Reference to the owning row in ``bundles``.
    bundle_id = Column(Integer, ForeignKey("bundles.id"))
    title = Column(String)
    category = Column(String)
    # Free-form text column for the item's details.
    details = Column(Text)

    # NOTE(review): expects ``Bundle`` to declare a matching ``items``
    # relationship -- confirm in the Bundle model.
    bundle = relationship("Bundle", back_populates="items")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue