add files

This commit is contained in:
WorldTeacher
2024-01-26 08:28:01 +01:00
parent dd9ee24a8f
commit 0a9818940c
110 changed files with 21563 additions and 0 deletions

0
src/logic/__init__.py Normal file
View File

59
src/logic/c_sort.py Normal file
View File

@@ -0,0 +1,59 @@
from typing import List, Tuple
from natsort import natsorted
def custom_sort(unsorted: List[Tuple[str, int, int]]) -> List[Tuple[str, int, int]]:
    """Sort semesters labelled "SoSe n" and "WiSe n/n+1" into chronological order.

    Within one year n the summer term ("SoSe n") precedes the winter term
    ("WiSe n/n+1"), matching the academic calendar.

    Args:
    ----
        unsorted (list[tuple]): Tuples whose first element is a semester
            label ("SoSe n" or "WiSe n/n+1"); the remaining elements are
            carried along unchanged.  Entries with any other label are
            dropped (same filtering as before).

    Returns:
    -------
        list[tuple]: The matching entries in chronological order.
    """

    def _semester_key(entry: Tuple[str, int, int]) -> Tuple[int, int]:
        label = entry[0]
        # Year is the first number after the season tag; for "WiSe n/n+1"
        # the academic year is identified by the first of the two numbers.
        year = int(label.split()[1].split("/")[0])
        # Summer term (0) sorts before the winter term (1) of the same year.
        season = 0 if label.startswith("SoSe") else 1
        return (year, season)

    # Keep only recognised semester labels, then sort numerically.
    # BUG FIX: the old merge compared year strings lexicographically
    # ("9" > "10/11"), which put "WiSe 10/11" before "SoSe 9"; integer
    # comparison handles single-digit and differing-length years correctly.
    relevant = [e for e in unsorted if "SoSe" in e[0] or "WiSe" in e[0]]
    return sorted(relevant, key=_semester_key)
# Quick manual check when the module is run directly.
# (Removed a stray dead `pass` statement and renamed the local so it no
# longer shadows the function parameter name `unsorted`.)
if __name__ == "__main__":
    sample = [
        ("WiSe 23/24", 7, 5),
        ("SoSe 23", 5, 0),
        ("SoSe 22", 1, 0),
        ("WiSe 22/23", 1, 0),
        ("SoSe 15", 1, 0),
    ]
    print(custom_sort(sample))

221
src/logic/constants.py Normal file
View File

@@ -0,0 +1,221 @@
# Valid Semesterapparat numbers: 1..180 inclusive.
APP_NRS = list(range(1, 181))

# Accepted professor title variants; "Kein Titel" marks untitled staff.
PROF_TITLES = [
    "Dr. mult.",
    "Dr. paed.",
    "Dr. rer. pol.",
    "Dr. sc. techn.",
    "Drs.",
    "Dr. agr.",
    "Dr. habil.",
    "Dr. oec.",
    "Dr. med.",
    "Dr. e. h.",
    "Dr. oec. publ.",
    "Dr. -Ing.",
    "Dr. theol.",
    "Dr. med. vet.",
    "Dr. ing.",
    "Dr. rer. nat.",
    "Dr. des.",
    "Dr. sc. mus.",
    "Dr. h. c.",
    "Dr. pharm.",
    "Dr. med. dent.",
    "Dr. phil. nat.",
    "Dr. phil.",
    "Dr. iur.",
    "Dr.",
    "Kein Titel",
]
# Prefix shared by every Semesterapparat media account number.
SEMAP_MEDIA_ACCOUNT_PREFIX = "10080"

# Raw 4-digit media numbers keyed by apparatus number ("1".."180").
# Grouped five entries per row for readability; values are data, not code.
semaps = {
    "1": "0005", "2": "0018", "3": "0021", "4": "0034", "5": "0047",
    "6": "0050", "7": "0063", "8": "0076", "9": "0089", "10": "0092",
    "11": "0104", "12": "0117", "13": "0120", "14": "0133", "15": "0146",
    "16": "0159", "17": "0162", "18": "0175", "19": "0188", "20": "0191",
    "21": "0203", "22": "0216", "23": "0229", "24": "0232", "25": "0245",
    "26": "0258", "27": "0261", "28": "0274", "29": "0287", "30": "0290",
    "31": "0302", "32": "0315", "33": "0328", "34": "0331", "35": "0344",
    "36": "0357", "37": "0360", "38": "0373", "39": "0386", "40": "0399",
    "41": "0401", "42": "0414", "43": "0427", "44": "0430", "45": "0443",
    "46": "0456", "47": "0469", "48": "0472", "49": "0485", "50": "0498",
    "51": "0500", "52": "0513", "53": "0526", "54": "0539", "55": "0542",
    "56": "0555", "57": "0568", "58": "0571", "59": "0584", "60": "0597",
    "61": "0609", "62": "0612", "63": "0625", "64": "0638", "65": "0641",
    "66": "0654", "67": "0667", "68": "0670", "69": "0683", "70": "0696",
    "71": "0708", "72": "0711", "73": "0724", "74": "0737", "75": "0740",
    "76": "0753", "77": "0766", "78": "0779", "79": "0782", "80": "0795",
    "81": "0807", "82": "0810", "83": "0823", "84": "0836", "85": "0849",
    "86": "0852", "87": "0865", "88": "0878", "89": "0881", "90": "0894",
    "91": "0906", "92": "0919", "93": "0922", "94": "0935", "95": "0948",
    "96": "0951", "97": "0964", "98": "0977", "99": "0980", "100": "0993",
    "101": "1002", "102": "1015", "103": "1028", "104": "1031", "105": "1044",
    "106": "1057", "107": "1060", "108": "1073", "109": "1086", "110": "1099",
    "111": "1101", "112": "1114", "113": "1127", "114": "1130", "115": "1143",
    "116": "1156", "117": "1169", "118": "1172", "119": "1185", "120": "1198",
    "121": "1200", "122": "1213", "123": "1226", "124": "1239", "125": "1242",
    "126": "1255", "127": "1268", "128": "1271", "129": "1284", "130": "1297",
    "131": "1309", "132": "1312", "133": "1325", "134": "1338", "135": "1341",
    "136": "1354", "137": "1367", "138": "1370", "139": "1383", "140": "1396",
    "141": "1408", "142": "1411", "143": "1424", "144": "1437", "145": "1440",
    "146": "1453", "147": "1466", "148": "1479", "149": "1482", "150": "1495",
    "151": "1507", "152": "1510", "153": "1523", "154": "1536", "155": "1549",
    "156": "1552", "157": "1565", "158": "1578", "159": "1581", "160": "1594",
    "161": "1606", "162": "1619", "163": "1622", "164": "1635", "165": "1648",
    "166": "1651", "167": "1664", "168": "1677", "169": "1680", "170": "1693",
    "171": "1705", "172": "1718", "173": "1721", "174": "1734", "175": "1747",
    "176": "1750", "177": "1763", "178": "1776", "179": "1789", "180": "1792",
}
# take the semaps dict and add the prefix to the values
# Final account number = prefix + 4-digit number + the number's last digit
# repeated once more (presumably a check digit: the commented assertion
# below expects exactly 10 characters - TODO confirm the scheme).
# Rewriting values while iterating items() is safe here because only
# values change, never the key set.
for key, value in semaps.items():
    semaps[key] = f"{SEMAP_MEDIA_ACCOUNT_PREFIX}{value}{value[-1]}"
SEMAP_MEDIA_ACCOUNTS = semaps
# for s in SEMAP_MEDIA_ACCOUNTS:
#     assert len(SEMAP_MEDIA_ACCOUNTS[s]) == 10, f"semap {s} has wrong length"
#     print(f"{SEMAP_MEDIA_ACCOUNTS[s]}")

27
src/logic/csvparser.py Normal file
View File

@@ -0,0 +1,27 @@
import csv
import pandas as pdf
def csv_to_list(path: str) -> list[str]:
    """Read a semicolon-delimited CSV file and return its first column.

    Stray double quotes are stripped from the values; blank lines are
    skipped instead of raising an IndexError (the old docstring also
    wrongly claimed a DataFrame was returned).

    Args:
        path: Path of the CSV file to read.

    Returns:
        list[str]: Cleaned first-column value of every non-empty row.
    """
    first_column = []
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='|')
        for row in reader:
            if not row:
                # csv yields an empty list for a blank line - skip it
                continue
            first_column.append(row[0].replace('"', ""))
    return first_column
if __name__ == "__main__":
    # Ad-hoc smoke test; the path points at a file on a specific
    # workstation and only works there.
    text = csv_to_list("C:/Users/aky547/Desktop/semap/71.csv")
    # print the extracted first-column values
    print(text)

76
src/logic/dataclass.py Normal file
View File

@@ -0,0 +1,76 @@
import re
from dataclasses import dataclass, field
@dataclass
class ApparatData:
prof_title: str | None = None
profname: str | None = None
dauerapp: bool = False
appnr: int | None = None
appname: str | None = None
app_fach: str | None = None
semester: str | None = None
erstellsemester: str | None = None
prof_mail: str | None = None
prof_tel: int | None = None
deleted: int = 0
prof_adis_id: int | None = None
apparat_adis_id: int | None = None
def get_prof_details(self) -> dict:
return {
"prof_title": self.prof_title,
"profname": self.profname,
"prof_mail": self.prof_mail,
"prof_tel": self.prof_tel,
"fullname": self.profname,
}
@dataclass
class BookData:
ppn: str | None = None
title: str | None = None
signature: str | None = None
edition: str | None = None
link: str | None = None
isbn: str | list | None = field(default_factory=list)
author: str | None = None
language: str | list | None = field(default_factory=list)
publisher: str | None = None
year: str | None = None
pages: str | None = None
# avaliability: dict | None = field(default_factory=dict)
# def assign(self, field,value):
# self.__setattr__(field,value)
def from_dict(self, data: dict):
for key, value in data.items():
setattr(self, key, value)
def to_dict(self):
return self.__dict__
def from_dataclass(self, dataclass):
for key, value in dataclass.__dict__.items():
setattr(self, key, value)
def from_string(self, data: str):
if not data.startswith("BookData"):
raise ValueError("No valid BookData string")
else:
pattern = r"(\w+)='([^']*)'"
data_dict = dict(re.findall(pattern, data))
print(data_dict)
for key, value in data_dict.items():
setattr(self, key, value)
return self
@dataclass
class MailData:
    """Payload for a notification e-mail to a professor."""

    subject: str | None = None  # mail subject line
    body: str | None = None  # plain-text message body
    mailto: str | None = None  # recipient address
    prof: str | None = None  # professor the mail concerns

45
src/logic/fileparser.py Normal file
View File

@@ -0,0 +1,45 @@
import csv
import pandas as pd
from docx import Document
def csv_to_list(path: str) -> list[str]:
    """Read a semicolon-delimited CSV file and return its first column.

    Kept in sync with the identical helper in src/logic/csvparser.py:
    stray double quotes are stripped, blank lines are skipped instead of
    raising an IndexError, and the docstring no longer claims a DataFrame
    is returned.

    Args:
        path: Path of the CSV file to read.

    Returns:
        list[str]: Cleaned first-column value of every non-empty row.
    """
    first_column = []
    with open(path, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=";", quotechar="|")
        for row in reader:
            if not row:
                # csv yields an empty list for a blank line - skip it
                continue
            first_column.append(row[0].replace('"', ""))
    return first_column
def word_docx_to_csv(path, table_index: int = 2) -> pd.DataFrame:
    """Extract one table of a .docx document as a DataFrame.

    Every table in the document is converted (its first row becomes the
    column header), but only the table at *table_index* is returned.

    Args:
        path: Path of the Word document.
        table_index: Index of the table to return.  Defaults to 2 (the
            third table), matching the previously hard-coded behaviour.

    Returns:
        pd.DataFrame: The selected table with the first row as header.

    Raises:
        IndexError: If the document contains no table at *table_index*.
    """
    frames = []
    for table in Document(path).tables:
        # Flatten each table into rows of newline-free cell texts.
        rows = [
            [cell.text.replace("\n", "") for cell in row.cells]
            for row in table.rows
        ]
        df = pd.DataFrame(rows)
        df.columns = df.iloc[0]  # first row holds the column names
        frames.append(df.iloc[1:])
    return frames[table_index]

View File

@@ -0,0 +1,31 @@
from docx import Document
# Scratch/debug script: print the first cells of every table column of the
# registration form.  `data` is never filled anywhere below, so the final
# print always shows an empty dict.
data = {}
wordDoc = Document('files/Semesterapparat - Anmeldung.docx')
# NOTE(review): misleading name - Document.tables yields tables, not paragraphs.
paragraphs = wordDoc.tables
for table in paragraphs:
    for column in table.columns:
        cellcount = 0
        for cell in column.cells:
            if cellcount < 12:  # only inspect the first 12 cells per column
                cellcount += 1
                print(f'cell:{cell.text}')
# print(f'paragraphs[{i}]: {paragraphs[i]}')
# data[i] = paragraphs[i]
# for i in range(0, len(paragraphs)):
# for i in range(2, len(paragraphs)):
# data[i] = paragraphs[i]
print(data)
# for table in wordDoc.tables:
#     for row in table.rows:
#         print('---')
#         for cell in row.cells:
#             print(f'cell:{cell.text}')

View File

@@ -0,0 +1,11 @@
import tabula
file = "files/Semesterapparat - Anmeldung.pdf"


def extract_book_data(file):
    """Convert a PDF form to CSV via tabula and return the CSV text.

    Args:
        file: Path of the PDF to convert.

    Returns:
        str: Content of the generated CSV file.
    """
    tables = tabula.read_pdf(file, pages="all", encoding="utf-8", multiple_tables=True)
    # BUG FIX: str.replace requires the replacement argument; the old call
    # `file.replace(".pdf")` raised TypeError before any conversion ran.
    tabula.convert_into(file, file.replace(".pdf", ".csv"), output_format="csv", pages="all")
    # NOTE(review): reads a fixed path rather than the derived CSV name;
    # `tables` is currently unused - confirm both are intentional.
    with open("files/Semesterapparat - Anmeldung.csv", "r") as f:
        content = f.read()
    # Previously the content was read and silently discarded.
    return content

0
src/logic/mail.py Normal file
View File

30
src/logic/pdfparser.py Normal file
View File

@@ -0,0 +1,30 @@
# add depend path to system path
import os
import sys
import pandas as pd
from pdfquery import PDFQuery
def pdf_to_csv(path: str) -> str:
    """Extract all text from a PDF file.

    Despite the historical name, no CSV is produced: the concatenated
    text content of the whole document is returned as one string.
    (Return annotation corrected from pd.DataFrame - the function
    returns the extracted text.)

    Args:
        path: Path of the PDF file to read.

    Returns:
        str: All text extracted from the PDF.
    """
    file = PDFQuery(path)
    file.load()
    # get the text from the pdf file
    text_elems = file.extract([
        ('with_formatter', 'text'),
        ('all_text', '*')
    ])
    extracted_text = text_elems['all_text']
    return extracted_text
if __name__ == "__main__":
    # Ad-hoc smoke test against a local sample file.
    text = pdf_to_csv("54_pdf.pdf")
    # remove linebreaks
    text = text.replace("\n", "")
    print(text)

20
src/logic/settings.py Normal file
View File

@@ -0,0 +1,20 @@
import yaml
from dataclasses import dataclass, field
@dataclass
class Settings:
    """Settings for the app, persisted as YAML."""

    save_path: str  # directory where exports are written
    database_name: str
    database_path: str
    default_apps: bool = True  # create the default apparatus set on first run
    custom_applications: list[dict] = field(default_factory=list)

    def save_settings(self, path: str = "config.yaml"):
        """Save the settings to a YAML config file.

        Args:
            path: Target file.  Defaults to "config.yaml", matching the
                previously hard-coded destination.
        """
        with open(path, "w") as f:
            yaml.dump(self.__dict__, f)
# Open the config file and load the settings at import time - importing
# this module therefore requires a config.yaml in the working directory.
# NOTE(review): `data` is never turned into a Settings instance here;
# presumably it is consumed by code importing this module - confirm
# before removing.
with open("config.yaml", "r") as f:
    data = yaml.safe_load(f)

195
src/logic/threads.py Normal file
View File

@@ -0,0 +1,195 @@
import threading
import time
from PyQt6.QtCore import QThread, pyqtSignal
from src.backend.database import Database
from log import MyLogger
from src.transformers import RDS_AVAIL_DATA
from src.logic.webrequest import BibTextTransformer, WebRequest
import sqlite3
class BookGrabber(QThread):
    """Worker thread that resolves signatures to book records and stores them.

    For each signature in *data* the PPN is looked up online, the record
    is parsed via BibTextTransformer, written to the database, and its
    availability inside the Semesterapparat is recorded.
    """

    # Emitted after each processed entry: (items done, total items).
    updateSignal = pyqtSignal(int, int)

    def __init__(
        self,
        mode: str = None,
        data: list = None,
        app_id: int = None,
        prof_id: int = None,
        parent=None,
    ):
        """Store the work items.

        Args:
            mode: Transformer mode passed to BibTextTransformer (e.g. "ARRAY").
            data: Signatures to process.
            app_id: Id of the Semesterapparat the books belong to.
            prof_id: Id of the owning professor.
            parent: Optional Qt parent object.
        """
        super().__init__(parent)
        self.logger = MyLogger("Worker")
        self.logger.log_info("Starting worker thread")
        self.logger.log_info("Worker thread started")
        self.app_id = app_id
        self.prof_id = prof_id
        self.mode = mode
        self.data = data
        self.book_id = None  # id of the most recently inserted book
        # Serialises database writes performed from this thread.
        self.db_lock = threading.Lock()

    def run(self):
        # Database handle is created inside run() so it is owned by this
        # thread (sqlite3 objects must not cross threads).
        self.db = Database()
        item = 0
        for entry in self.data:
            signature = str(entry)
            self.logger.log_info("Processing entry: " + signature)
            webdata = WebRequest().get_ppn(entry).get_data()
            if webdata == "error":
                # lookup failed - skip this signature entirely
                continue
            bd = BibTextTransformer(self.mode).get_data(webdata).return_data()
            transformer = BibTextTransformer("RDS")
            rds = transformer.get_data(webdata).return_data("rds_availability")
            bd.signature = entry
            with self.db_lock:
                # confirm lock is acquired
                print("lock acquired, adding book to database")
                self.db.add_medium(bd, self.app_id, self.prof_id)
                # get latest book id
                self.book_id = self.db.get_latest_book_id()
                self.logger.log_info("Added book to database")
            state = 0
            for rds_item in rds.items:
                sign = rds_item.superlocation
                loc = rds_item.location
                # print(item.location)
                # NOTE(review): the `in` substring test implies app_id is a
                # str here despite the `int` hint (an int would raise
                # TypeError) - confirm with callers.
                if self.app_id in sign or self.app_id in loc:
                    state = 1
                    book_id = None
                    # for book in self.books:
                    #     if book["bookdata"].signature == entry:
                    #         book_id = book["id"]
                    #         break
                    self.logger.log_info(f"State of {signature}: {state}")
                    with self.db_lock:
                        print(
                            "lock acquired, updating availability of "
                            + str(book_id)
                            + " to "
                            + str(state)
                        )
                        try:
                            self.db.set_availability(self.book_id, state)
                        except sqlite3.OperationalError as e:
                            self.logger.log_error(f"Failed to update availability: {e}")
                    break
            # time.sleep(5)
            item += 1
            self.updateSignal.emit(item, len(self.data))
        self.logger.log_info("Worker thread finished")
        # terminate thread
        self.quit()
class AvailChecker(QThread):
    """Worker thread that re-checks the availability of known books.

    For every signature in *links* the online record is fetched and its
    RDS availability section is scanned: a book counts as available
    (state 1) when the apparatus number appears in an item's location
    fields.  The result is stored and reported via ``updateSignal``.
    """

    # Emitted per processed link: (call number, availability state 0/1).
    updateSignal = pyqtSignal(str, int)

    def __init__(
        self, links: list = None, appnumber: int = None, parent=None, books: list = None
    ):
        """Collect the work items.

        Args:
            links: Signatures to check; defaults to an empty list.
            appnumber: Number of the Semesterapparat the books belong to.
            parent: Optional Qt parent object.
            books: Known book rows, each a dict with "id" and "bookdata".
        """
        # BUG FIX: the old signature used the mutable default `links=[]`
        # (shared across calls) and `books=list[dict]`, which bound the
        # *type expression* itself as the default value - iterating it in
        # run() raised TypeError.  Both now default to fresh empty lists.
        if links is None:
            links = []
        if books is None:
            books = []
        super().__init__(parent)
        self.logger = MyLogger("AvailChecker")
        self.logger.log_info("Starting worker thread")
        self.logger.log_info(
            "Checking availability for "
            + str(links)
            + " with appnumber "
            + str(appnumber)
            + "..."
        )
        self.links = links
        self.appnumber = appnumber
        self.books = books
        # Serialises database writes performed from this thread.
        self.db_lock = threading.Lock()

    def run(self):
        # Created here so the sqlite handle is owned by the worker thread.
        self.db = Database()
        for link in self.links:
            # BUG FIX: reset per link; `state` used to be initialised once
            # before the loop, so a single available book marked every
            # following link as available too.
            state = 0
            self.logger.log_info("Processing entry: " + str(link))
            data = WebRequest().get_ppn(link).get_data()
            transformer = BibTextTransformer("RDS")
            rds = transformer.get_data(data).return_data("rds_availability")
            print(rds)
            for item in rds.items:
                sign = item.superlocation
                loc = item.location
                # NOTE(review): substring test implies appnumber is a str
                # here despite the `int` hint - confirm with callers.
                if self.appnumber in sign or self.appnumber in loc:
                    state = 1
                    book_id = None
                    for book in self.books:
                        if book["bookdata"].signature == link:
                            book_id = book["id"]
                            break
                    self.logger.log_info(f"State of {link}: " + str(state))
                    with self.db_lock:
                        print(
                            "lock acquired, updating availability of "
                            + str(book_id)
                            + " to "
                            + str(state)
                        )
                        self.db.set_availability(book_id, state)
                    break
            # NOTE(review): `item` here is the last RDS item of the loop
            # above; if rds.items is empty this raises NameError - verify.
            self.updateSignal.emit(item.callnumber, state)
        self.logger.log_info("Worker thread finished")
        # terminate thread
        self.quit()
class AutoAdder(QThread):
    """Worker thread driving a progress dialog while entries are added.

    The actual web lookup is currently commented out; the thread only
    emits progress signals with a one-second delay per entry.
    """

    # Emitted with the index of the entry about to be processed.
    updateSignal = pyqtSignal(int)
    # Emitted with the entry itself.
    # NOTE(review): declared as int but emitted with `entry`, which the
    # logging suggests is a signature string - confirm the signal type.
    setTextSignal = pyqtSignal(int)
    # Emitted with the number of entries processed so far.
    progress = pyqtSignal(int)

    def __init__(self, data=None, app_id=None, prof_id=None, parent=None):
        """Store the work items.

        Args:
            data: Signatures/entries to process.
            app_id: Id of the Semesterapparat (currently unused in run()).
            prof_id: Id of the owning professor (currently unused in run()).
            parent: Optional Qt parent object.
        """
        super().__init__(parent)
        self.logger = MyLogger("AutoAdder")
        self.data = data
        self.app_id = app_id
        self.prof_id = prof_id
        print("Launched AutoAdder")
        print(self.data, self.app_id, self.prof_id)

    def run(self):
        self.db = Database()
        # show the dialog, start the thread to gather data and dynamically
        # update progressbar and listwidget
        self.logger.log_info("Starting worker thread")
        item = 0
        for entry in self.data:
            try:
                # webdata = WebRequest().get_ppn(entry).get_data()
                # bd = BibTextTransformer("ARRAY").get_data(webdata).return_data()
                # bd.signature = entry
                self.updateSignal.emit(item)
                self.setTextSignal.emit(entry)
                # qsleep
                item += 1
                self.progress.emit(item)
                print(item, len(self.data))
                time.sleep(1)
            except Exception as e:
                # Broad catch keeps the thread alive when one entry fails.
                print(e)
                self.logger.log_exception(
                    f"The query failed with message {e} for signature {entry}"
                )
                continue
        # Only reached with a full pass (no failed entries skipped).
        if item == len(self.data):
            self.logger.log_info("Worker thread finished")
            # terminate thread
            self.finished.emit()

1974
src/logic/userInterface.py Normal file

File diff suppressed because it is too large Load Diff

176
src/logic/webrequest.py Normal file
View File

@@ -0,0 +1,176 @@
import requests
from bs4 import BeautifulSoup
from omegaconf import OmegaConf
from src.logic.dataclass import BookData
from log import MyLogger
from src.transformers import ARRAYData, BibTeXData, COinSData, RDSData, RISData
#import sleep_and_retry decorator to retry requests
from ratelimit import limits, sleep_and_retry
logger = MyLogger(__name__)
config = OmegaConf.load("config.yaml")
API_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{}/"
PPN_URL = 'https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?lookfor="{}"+&type=AllFields&limit=10&sort=py+desc%2C+title'
TITLE = "RDS_TITLE"
SIGNATURE = "RDS_SIGNATURE"
EDITION = "RDS_EDITION"
ISBN = "RDS_ISBN"
AUTHOR = "RDS_PERSON"
HEADERS = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
"Accept-Language": "en-US, en;q=0.5",
}
class WebRequest:
    def __init__(self) -> None:
        """Request data from the web, and format it depending on the mode."""
        self.signature = None  # last signature passed to get_ppn()
        self.ppn = None  # PPN resolved for that signature, if any
        self.data = None  # unused here; kept for API compatibility
        logger.log_info("Initialized WebRequest")

    def get_ppn(self, signature):
        """Resolve a shelf mark (or DOI link) to a PPN via the search page.

        Leaves ``self.ppn`` as None when the search yields no media hit.
        Returns self for call chaining.
        """
        self.signature = signature
        if "+" in signature:
            # '+' must be percent-encoded for the search URL
            signature = signature.replace("+", "%2B")
        if "doi.org" in signature:
            # for DOI links only the suffix after the last '/' is searched
            signature = signature.split("/")[-1]
        url = PPN_URL.format(signature)
        # NOTE(review): no timeout and the module's HEADERS constant is not
        # passed - requests may hang indefinitely; confirm and fix upstream.
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser", from_encoding="utf-8")
        if soup.find("div", class_="media") is None:
            logger.log_error(f"No data found for {signature}")
            return self
        # the first media div's id attribute carries the PPN
        ppn = soup.find("div", class_="media").get("id")
        self.ppn = ppn
        return self

    def get_link_data(self):
        """Fetch the search page for the stored PPN and return its soup.

        NOTE(review): queries PPN_URL (the search endpoint), not API_URL;
        the commented lines suggest a 'daia_' availability <pre> was the
        original target - confirm the intended endpoint.
        """
        page = requests.get(PPN_URL.format(self.ppn))
        soup = BeautifulSoup(page.content, "html.parser")
        # find div that contains daia_ in the id
        # find the pre tag in that div
        # return the text
        # div = soup.find("div",id=lambda x: x and "daia_" in x)
        # pre = div.find("pre")
        return soup

    def get_data(self) -> list[str] | str:
        """Download the record page for ``self.ppn`` and return its <pre> texts.

        Returns:
            list[str]: Stripped text of every <pre> tag (empty when none
                are found).
            str: The literal "error" when no PPN was resolved beforehand.
        """
        # url = API_URL.format(self.ppn)
        if self.ppn is None:
            logger.log_error("No PPN found")
            return "error"
        page = requests.get(API_URL.format(self.ppn))
        logger.log_info(f"Requesting data from {API_URL.format(self.ppn)}")
        logger.log_info(f"Status code: {page.status_code}")
        # print(page.content)
        soup = BeautifulSoup(page.content, "html.parser")
        pre_tag = soup.find_all("pre")
        # print(pre_tag)
        return_data = []
        if pre_tag:
            for tag in pre_tag:
                data = tag.text.strip()
                return_data.append(data)
            return return_data
        else:
            print("No <pre> tag found")
            logger.log_error("No <pre> tag found")
            return return_data
class BibTextTransformer:
    """Pick one bibliographic record format out of a raw record dump and
    convert it into a BookData via the matching transformer."""

    # Marker substring that identifies each supported record format.
    _IDENTIFIERS = {
        "RIS": "TY -",
        "ARRAY": "[kid]",
        "COinS": "ctx_ver",
        "BibTeX": "@book",
        "RDS": "RDS ---------------------------------- ",
    }

    def __init__(self, mode: str) -> None:
        self.mode = mode
        self.field = None
        self.data = None

    def get_data(self, data: list) -> str:
        """Store the last line of *data* matching the mode's marker.

        Unknown modes leave ``self.data`` untouched.  Returns self for
        call chaining.
        """
        marker = self._IDENTIFIERS.get(self.mode)
        if marker is not None:
            for line in data:
                if marker in line:
                    self.data = line
        return self

    def return_data(self, option=None) -> BookData:
        """Return Data to caller.

        Args:
            option (string, optional): Option for RDS as there are two filetypes. Use rds_availability or rds_data. Anything else gives a dict of both responses. Defaults to None.

        Returns:
            BookData: the parsed record, or None for an unknown mode.
        """
        converters = {
            "ARRAY": lambda: ARRAYData().transform(self.data),
            "COinS": lambda: COinSData().transform(self.data),
            "BibTeX": lambda: BibTeXData().transform(self.data),
            "RIS": lambda: RISData().transform(self.data),
            "RDS": lambda: RDSData().transform(self.data).return_data(option),
        }
        convert = converters.get(self.mode)
        return convert() if convert is not None else None
def cover(isbn):
    """Fetch the medium-size cover image for *isbn* from buchhandel.de.

    Args:
        isbn: ISBN used by buchhandel.de to address cover images.

    Returns:
        bytes: Raw image data (whatever the server returns, even for a
            missing cover - callers should validate).
    """
    test_url = f"https://www.buchhandel.de/cover/{isbn}/{isbn}-cover-m.jpg"
    # BUG FIX: the streamed response was never closed (connection leak)
    # and the URL was printed on every call; the context manager closes
    # the connection deterministically after .content consumed the body.
    with requests.get(test_url, stream=True) as data:
        return data.content
def get_content(soup, css_class):
    """Return the stripped text of the first <div> carrying *css_class*."""
    node = soup.find("div", class_=css_class)
    return node.text.strip()
if __name__ == "__main__":
    # Manual smoke test: resolve one shelf mark and dump the raw record.
    print("main")
    link = "ZE 77000 W492"
    data = WebRequest().get_ppn(link).get_data()
    print(data)
    # (kept for reference: earlier manual experiments)
    # # data.get_ppn("ME 3000 S186 (2)")
    # # print(data.ppn)
    # # desc=data.get_data()
    # # print(type(desc))
    # # print(desc)
    # txt = (
    #     BibTextTransformer("RIS")
    #     .get_data(WebRequest().get_ppn("ST 250 U42 (15)").get_data())
    #     .return_data()
    # )
    # print(txt)
    # print(data)
    # print(BibTextTransformer(data).bookdata)

26
src/logic/wordparser.py Normal file
View File

@@ -0,0 +1,26 @@
import pandas as pd
from docx import Document
def word_docx_to_csv(path, table_index: int = 2) -> pd.DataFrame:
    """Extract one table of a .docx document as a DataFrame.

    Kept in sync with the identical helper in src/logic/fileparser.py:
    every table is converted (first row becomes the header), and the
    table at *table_index* is returned.

    Args:
        path: Path of the Word document.
        table_index: Index of the table to return.  Defaults to 2 (the
            third table), matching the previously hard-coded behaviour.

    Returns:
        pd.DataFrame: The selected table with the first row as header.

    Raises:
        IndexError: If the document contains no table at *table_index*.
    """
    frames = []
    for table in Document(path).tables:
        # Flatten each table into rows of newline-free cell texts.
        rows = [
            [cell.text.replace("\n", "") for cell in row.cells]
            for row in table.rows
        ]
        df = pd.DataFrame(rows)
        df.columns = df.iloc[0]  # first row holds the column names
        frames.append(df.iloc[1:])
    return frames[table_index]