add appnr, new url

WorldTeacher
2024-08-02 15:22:52 +02:00
parent 947ef75db6
commit ee8862a220
3 changed files with 57 additions and 46 deletions


@@ -13,7 +13,9 @@ logger = MyLogger(__name__)
config = OmegaConf.load("config.yaml")
API_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{}/"
PPN_URL = 'https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?lookfor="{}"+&type=AllFields&limit=10&sort=py+desc%2C+title'
PPN_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
BASE = "https://rds.ibs-bw.de"
#
TITLE = "RDS_TITLE"
SIGNATURE = "RDS_SIGNATURE"
EDITION = "RDS_EDITION"
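For orientation: the new PPN_URL points at the RDS advanced-search form and leaves every field empty except si (signature), which carries the single {} placeholder. A minimal sketch of how the template is presumably filled, using a made-up shelf mark:

signature = "XYZ 2024/07"                              # made-up shelf mark
url = PPN_URL.format(signature.replace("+", "%2B"))
# the signature lands in the si field: ...&type0%5B%5D=si&lookfor0%5B%5D=XYZ 2024/07&join=AND...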
@@ -28,11 +30,15 @@ HEADERS = {
class WebRequest:
    def __init__(self) -> None:
    def __init__(self, appnr) -> None:
        """Request data from the web, and format it depending on the mode."""
        self.apparat = appnr
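        # zero-pad single-digit apparatus numbers so they match the
        # "Semesterapparat-0X" location labels checked in get_data()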
        if int(self.apparat) < 10:
            self.apparat = f"0{self.apparat}"
        self.signature = None
        self.ppn = None
        self.data = None
        self.timeout = 5
        logger.log_info("Initialized WebRequest")

    def get_ppn(self, signature):
@@ -41,50 +47,54 @@ class WebRequest:
        signature = signature.replace("+", "%2B")
        if "doi.org" in signature:
            signature = signature.split("/")[-1]
        url = PPN_URL.format(signature)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser", from_encoding="utf-8")
        if soup.find("div", class_="media") is None:
            logger.log_error(f"No data found for {signature}")
            return self
        ppn = soup.find("div", class_="media").get("id")
        self.ppn = ppn
        self.ppn = signature
        return self

    def get_link_data(self):
        page = requests.get(PPN_URL.format(self.ppn))
        soup = BeautifulSoup(page.content, "html.parser")
        # find div that contains daia_ in the id
        # find the pre tag in that div
        # return the text
        # div = soup.find("div", id=lambda x: x and "daia_" in x)
        # pre = div.find("pre")
        return soup

    def search_book(self, searchterm: str):
        response = requests.get(PPN_URL.format(searchterm), timeout=self.timeout)
        return response.text

    def get_data(self) -> list[str] | str:
        # url = API_URL.format(self.ppn)
        if self.ppn is None:
            logger.log_error("No PPN found")
            return "error"
        page = requests.get(API_URL.format(self.ppn))
        logger.log_info(f"Requesting data from {API_URL.format(self.ppn)}")
        logger.log_info(f"Status code: {page.status_code}")
        # print(page.content)
        soup = BeautifulSoup(page.content, "html.parser")
        pre_tag = soup.find_all("pre")
        # print(pre_tag)
        return_data = []

    def get_book_links(self, searchterm: str):
        response = self.search_book(searchterm)
        soup = BeautifulSoup(response, "html.parser")
        links = soup.find_all("a", class_="title getFull")
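        # result titles link to the record detail pages; the hrefs are relative, so BASE is prepended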
        res = []
        for link in links:
            res.append(BASE + link["href"])
        return res
        if pre_tag:
            for tag in pre_tag:
                data = tag.text.strip()
                return_data.append(data)
            return return_data
        else:
            print("No <pre> tag found")
            logger.log_error("No <pre> tag found")
            return return_data

    def search(self, link: str):
        response = requests.get(link, timeout=self.timeout)
        return response.text

    def get_data(self):
        links = self.get_book_links(self.ppn)
        print(links)
        for link in links:
            result = self.search(link)
            # in the result, look for divs with class "col-xs-12 rds-dl RDS_LOCATION"
            # and, if found, read the shelving location text from the detail panel
            soup = BeautifulSoup(result, "html.parser")
            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            if locations:
                for location in locations:
                    item_location = location.find(
                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                    ).text.strip()
                    print(item_location)
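                    # keep only copies shelved in the requested Semesterapparat, e.g. "Semesterapparat-07"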
if f"Semesterapparat-{self.apparat}" in item_location:
pre_tag = soup.find_all("pre")
return_data = []
if pre_tag:
for tag in pre_tag:
data = tag.text.strip()
return_data.append(data)
return return_data
else:
logger.log_error("No <pre> tag found")
return return_data
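Taken together, the reworked flow appears to be: construct WebRequest with the apparatus number, store the signature via get_ppn, then let get_data fetch each search hit and keep only the <pre> exports of items shelved in that Semesterapparat. A minimal usage sketch with made-up inputs:

req = WebRequest(appnr=7)          # padded internally to "07"
req.get_ppn("XYZ 2024/07")         # made-up signature; stored on req.ppn
records = req.get_data()           # list of <pre> texts, or None if no location matches
print(records)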
class BibTextTransformer:
@@ -109,7 +119,7 @@ class BibTextTransformer:
        self.data = None
        # self.bookdata = BookData(**self.data)

    def get_data(self, data: list) -> str:
    def get_data(self, data: list):
        RIS_IDENT = "TY -"
        ARRAY_IDENT = "[kid]"
        COinS_IDENT = "ctx_ver"
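The three marker constants suggest that get_data tells the scraped formats apart by which identifier appears in a record (RIS export, array-style dump, or COinS metadata). A small sketch of that idea; detect_format is a hypothetical helper, not part of the module:

def detect_format(record: str) -> str:
    # hypothetical helper: classify a scraped record by its characteristic marker
    if "TY -" in record:           # RIS export
        return "ris"
    if "[kid]" in record:          # array-style dump
        return "array"
    if "ctx_ver" in record:        # COinS metadata
        return "coins"
    return "unknown"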