rework catalogue wrapper to split entries based on space div
This commit is contained in:
@@ -1,12 +1,13 @@
|
|||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import loguru
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from src import LOG_DIR
|
||||||
from src.logic import BookData as Book
|
from src.logic import BookData as Book
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
import sys
|
|
||||||
import loguru
|
|
||||||
from src import LOG_DIR
|
|
||||||
URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
|
URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
|
||||||
BASE = "https://rds.ibs-bw.de"
|
BASE = "https://rds.ibs-bw.de"
|
||||||
|
|
||||||
@@ -20,6 +21,8 @@ log.add(
|
|||||||
rotation="1 day",
|
rotation="1 day",
|
||||||
retention="1 month",
|
retention="1 month",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class Catalogue:
|
class Catalogue:
|
||||||
def __init__(self, timeout=5):
|
def __init__(self, timeout=5):
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
@@ -57,45 +60,65 @@ class Catalogue:
|
|||||||
log.info(f"Searching for term: {searchterm}")
|
log.info(f"Searching for term: {searchterm}")
|
||||||
|
|
||||||
links = self.get_book_links(searchterm)
|
links = self.get_book_links(searchterm)
|
||||||
|
print(links)
|
||||||
for link in links:
|
for link in links:
|
||||||
result = self.search(link)
|
result = self.search(link)
|
||||||
# in result search for class col-xs-12 rds-dl RDS_LOCATION
|
# in result search for class col-xs-12 rds-dl RDS_LOCATION
|
||||||
# if found, return text of href
|
# if found, return text of href
|
||||||
soup = BeautifulSoup(result, "html.parser")
|
soup = BeautifulSoup(result, "html.parser")
|
||||||
location = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
|
|
||||||
for loc in location:
|
# Optional (unchanged): title and ppn if you need them
|
||||||
if f"1. OG Semesterapparat" in loc.text:
|
title_el = soup.find("div", class_="headline text")
|
||||||
title = (
|
title = title_el.get_text(strip=True) if title_el else None
|
||||||
soup.find("div", class_="headline text")
|
|
||||||
.text.replace("\n", "")
|
ppn_el = soup.find(
|
||||||
.strip()
|
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
|
||||||
|
)
|
||||||
|
ppn = ppn_el.get_text(strip=True) if ppn_el else None
|
||||||
|
|
||||||
|
signature = None
|
||||||
|
|
||||||
|
panel = soup.select_one("div.panel-body")
|
||||||
|
if panel:
|
||||||
|
# Collect the RDS_* blocks in order, using the 'space' divs as separators
|
||||||
|
groups = []
|
||||||
|
cur = {}
|
||||||
|
for node in panel.select(
|
||||||
|
"div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
|
||||||
|
):
|
||||||
|
classes = node.get("class", [])
|
||||||
|
# Separator between entries
|
||||||
|
if "space" in classes:
|
||||||
|
if cur:
|
||||||
|
groups.append(cur)
|
||||||
|
cur = {}
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Read the value from the corresponding panel cell
|
||||||
|
val_el = node.select_one(".rds-dl-panel")
|
||||||
|
val = (
|
||||||
|
val_el.get_text(" ", strip=True)
|
||||||
|
if val_el
|
||||||
|
else node.get_text(" ", strip=True)
|
||||||
)
|
)
|
||||||
ppn = soup.find(
|
|
||||||
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
|
if "RDS_SIGNATURE" in classes:
|
||||||
)
|
cur["signature"] = val
|
||||||
signature = soup.find(
|
elif "RDS_STATUS" in classes:
|
||||||
"div", class_="col-xs-12 rds-dl RDS_SIGNATURE"
|
cur["status"] = val
|
||||||
)
|
elif "RDS_LOCATION" in classes:
|
||||||
if signature:
|
cur["location"] = val
|
||||||
signature = (
|
|
||||||
signature.find_next("div")
|
if cur: # append the last group if not followed by a space
|
||||||
.find_next("div")
|
groups.append(cur)
|
||||||
.text.replace("\n", "")
|
|
||||||
.strip()
|
# Find the signature for the entry whose location mentions "Semesterapparat"
|
||||||
|
for g in groups:
|
||||||
|
loc = g.get("location", "").lower()
|
||||||
|
if "semesterapparat" in loc:
|
||||||
|
signature = g.get("signature")
|
||||||
|
return Book(
|
||||||
|
title=title,
|
||||||
|
ppn=ppn,
|
||||||
|
signature=signature,
|
||||||
)
|
)
|
||||||
# use ppn to find the next div and extract the text
|
|
||||||
if ppn:
|
|
||||||
ppn = ppn.find_next("div").text.replace("\n", "").strip()
|
|
||||||
else:
|
|
||||||
ppn = None
|
|
||||||
isbn = soup.find(
|
|
||||||
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN"
|
|
||||||
)
|
|
||||||
if isbn:
|
|
||||||
isbn = isbn.find_next("div").find_next("div").text
|
|
||||||
else:
|
|
||||||
isbn = None
|
|
||||||
return Book(
|
|
||||||
title=title, ppn=ppn, signature=signature, isbn=isbn, link=link
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
|
|||||||
Reference in New Issue
Block a user