diff --git a/main.py b/main.py new file mode 100644 index 0000000..8d6e62c --- /dev/null +++ b/main.py @@ -0,0 +1,4 @@ +from src.ui.interface import launch + +if __name__ == "__main__": + launch() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d90bb08 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,28 @@ +[project] +name = "linkavailablechecker" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.13" +dependencies = [ + "beautifulsoup4>=4.12.3", + "loguru>=0.7.3", + "playwright>=1.49.1", + "pyqt6-charts>=6.8.0", + "pyqt6>=6.8.1", + "pyqtgraph>=0.13.7", + "ratelimit>=2.2.1", + "requests>=2.32.3", + "prettytable>=3.14.0", + "cloudscraper>=1.2.71", + "limit>=0.2.3", + "alive-progress>=3.2.0", + "tqdm>=4.67.1", +] + +[dependency-groups] +dev = [ + "pip>=25.0", + "selenium>=4.29.0", + "undetected-chromedriver>=3.5.5", +] diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..97d07c1 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +from .ui.interface import MainWindow diff --git a/src/database.py b/src/database.py new file mode 100644 index 0000000..c43ef1f --- /dev/null +++ b/src/database.py @@ -0,0 +1,169 @@ +import sqlite3 + + +class Database: + def __init__(self, db_name): + self.conn = sqlite3.connect(db_name) + self.cursor = self.conn.cursor() + self.create_table() + + def create_table(self): + self.cursor.execute(""" + CREATE TABLE IF NOT EXISTS response ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + ppn TEXT NOT NULL, + source_link TEXT , + destination_link TEXT, + response_code INTEGER NOT NULL DEFAULT 0 + ) + """) + self.conn.commit() + + def add_data(self, ppn, source_link, destination_link=None, response_code=0): + self.cursor.execute( + """ + INSERT INTO response (ppn, source_link, destination_link, response_code) + VALUES (?, ?, ?, ?) 
+ """, + (ppn, source_link, destination_link, response_code), + ) + self.conn.commit() + + def get_data(self, query, args): + self.cursor.execute(query, args) + result = self.cursor.fetchall() + return result + + def get_links_by_response_code(self, response_code: int): + self.cursor.execute( + "SELECT id, source_link FROM response WHERE response_code = ?", + (response_code,), + ) + return self.cursor.fetchall() + + def get_links(self): + self.cursor.execute( + "SELECT id, source_link FROM response WHERE response_code = 0" + ) + return self.cursor.fetchall() + + def update_response_code(self, id, response_code, destination_link): + self.cursor.execute( + "UPDATE response SET response_code = ?, destination_link = ? WHERE id = ?", + (response_code, destination_link, id), + ) + self.conn.commit() + + def reset_status_codes(self): + self.cursor.execute("UPDATE response SET response_code = 0") + self.cursor.execute("UPDATE response SET destination_link = NULL") + self.conn.commit() + + def get_status_codes(self): + self.cursor.execute("SELECT DISTINCT response_code FROM response") + results = self.cursor.fetchall() + result = [str(result[0]) for result in results] + result.sort() + return result + + def close(self): + self.conn.close() + + def get_status_code_counts(self): + self.cursor.execute( + "SELECT response_code, COUNT(*) FROM response GROUP BY response_code" + ) + result = self.cursor.fetchall() + res = [(x[0], x[1]) for x in result] + res.sort(key=lambda x: x[0]) + return res + + def get_publishers(self): # Return the de-duplicated host names (or the raw value for non-URL entries) of all stored source links + self.cursor.execute("SELECT source_link FROM response WHERE source_link IS NOT NULL") # source_link is a nullable column; a NULL row used to raise TypeError in the substring checks below + links = self.cursor.fetchall() + publishers = [] + for link in links: + if "http" not in link[0]: + publishers.append(link[0]) + continue + if not "//" in link[0]: + publishers.append(link[0]) + continue + publisher = link[0].split("//")[1].split("/")[0] + publishers.append(publisher) + return list(set(publishers)) + + def get_results_by_publisher(self, publisher, distinct=False): + if distinct: # NOTE(review): inverted flag - this branch runs the plain query and the fall-through runs SELECT DISTINCT; create_graph passes distinct=True when its addDuplicates box is checked, so both sides must be changed together +
self.cursor.execute( + "SELECT destination_link, response_code, source_link FROM response WHERE source_link LIKE ?", + (f"%{publisher}%",), + ) + return self.cursor.fetchall() + self.cursor.execute( + "SELECT DISTINCT destination_link, response_code, source_link FROM response WHERE source_link LIKE ?", + (f"%{publisher}%",), + ) + return self.cursor.fetchall() + + def get_results_by_publisher_and_status( + self, publisher, status_code, distinct=False + ): # Return (ppn, destination_link, source_link) tuples for one publisher and status code; distinct=True de-duplicates the rows + if distinct: + self.cursor.execute( + "SELECT DISTINCT ppn, destination_link, source_link FROM response WHERE source_link LIKE ? AND response_code = ?", + (f"%{publisher}%", status_code), + ) + result = self.cursor.fetchall() + else: # without this branch the plain query below always ran and overwrote the DISTINCT result + self.cursor.execute( + "SELECT ppn, destination_link, source_link FROM response WHERE source_link LIKE ? AND response_code = ?", + (f"%{publisher}%", status_code), + ) + result = self.cursor.fetchall() + + return [(x[0], x[1], x[2]) for x in result] + + def get_publisher_count(self, publisher): + self.cursor.execute( + "SELECT COUNT(*) FROM response WHERE source_link LIKE ?", + (f"%{publisher}%",), + ) + return self.cursor.fetchone()[0] + + def get_checked_by_publisher(self, publisher): + self.cursor.execute( + "SELECT COUNT(*) FROM response WHERE source_link LIKE ? AND response_code != 0", + (f"%{publisher}%",), + ) + return self.cursor.fetchone()[0] + + def get_num_of_links_for_status_code(self, statuscode): + data = [] + for publisher in self.get_publishers(): + self.cursor.execute( + "SELECT COUNT(*) FROM response WHERE source_link LIKE ? AND response_code = ?", + (f"%{publisher}%", statuscode), + ) + data.append((publisher, self.cursor.fetchone()[0])) + return data + + def get_num_of_links_for_status_code_and_publisher(self, publisher, statuscode): + self.cursor.execute( + "SELECT COUNT(*) FROM response WHERE source_link LIKE ?
AND response_code = ?", + (f"%{publisher}%", statuscode), + ) + return self.cursor.fetchone() + + def get_unique_count_ppns(self): + self.cursor.execute("SELECT COUNT(DISTINCT ppn) FROM response") + return self.cursor.fetchone()[0] + + def get_link_count(self): + self.cursor.execute("SELECT COUNT(*) FROM response") + return self.cursor.fetchone()[0] + + +if __name__ == "__main__": + db = Database("lfer.db") + print(db.get_unique_count_ppns(), db.get_link_count()) diff --git a/src/ui/interface.py b/src/ui/interface.py new file mode 100644 index 0000000..f7555b8 --- /dev/null +++ b/src/ui/interface.py @@ -0,0 +1,446 @@ +import os +import time +from PyQt6 import QtWidgets, QtCore +from PyQt6.QtCore import pyqtSlot +from PyQt6.QtWidgets import QVBoxLayout +from PyQt6.QtCharts import QChart, QChartView, QPieSeries, QPieSlice +from PyQt6.QtGui import QPainter +from src.ui.threads import ( + CheckThread, + WebscraperThread, + StatusCodeThread, + CheckThreadPlaywright, +) +from .sources.Ui_main_interface import Ui_MainWindow +from src.database import Database +import prettytable +import loguru +import sys +from src.ui.utils import QtqdmProgressBar + + +log = loguru.logger +log.remove() +log.add(sys.stdout, level="INFO") + + +class MainWindow(QtWidgets.QMainWindow, Ui_MainWindow): + def __init__(self): + super().__init__() + self.setupUi(self) + self.showResults.clicked.connect(self.create_graph) + # set summary to show the first tab + self.summary.setCurrentIndex(0) + self.spinTimeout.setButtonSymbols( + QtWidgets.QAbstractSpinBox.ButtonSymbols.PlusMinus + ) + self.spinTimer.setButtonSymbols( + QtWidgets.QAbstractSpinBox.ButtonSymbols.PlusMinus + ) + # self.splitter = QtWidgets.QSplitter(QtCore.Qt.Orientation.Horizontal) + # self.splitter.addWidget(self.graph_frame) + # self.splitter.addWidget(self.chart_frame) + # self.splitter.setSizes([2, 1]) + # self.widget.layout().addWidget(self.splitter) + # self.widget.layout().removeWidget(self.chart_frame) + # 
self.widget.layout().removeWidget(self.graph_frame) + + self.db = Database("lfer.db") + self.db_publishers = self.db.get_publishers() + self.db_publishers.sort() + self.publishers.addItems(self.db_publishers) + self.response_data.itemDoubleClicked.connect(self.display_detailed_overview) + + self.status_codes = self.db.get_status_codes() + # self.progressBar = None + self.progressBar_qtdm = QtqdmProgressBar(self) + self.progressBar_qtdm.setStyle(QtWidgets.QStyleFactory.create("Fusion")) # + pb_font = self.progressBar_qtdm.font() + pb_font.setBold(False) + self.progressBar_qtdm.setFont(pb_font) + self.widget.setLayout(QVBoxLayout()) + self.widget.layout().addWidget(self.progressBar_qtdm) + + # self.horizontalLayout_7.setStretch( + # self.horizontalLayout_7.indexOf(self.progressBar_qtdm), 1 + # ) + + self.progressBar_qtdm.hide() + # status code check data + self.check_status_code.addItems(self.status_codes) + self.check_code.clicked.connect(self.check_status_code_clicked) + self.total = 0 + self.total_scraper = 0 + # self.progressBar.hide() + self.check_overview.hide() + + self.statusMessage = QtWidgets.QLabel() + self.webscraperMessage = QtWidgets.QLabel() + self.statusBar().addPermanentWidget(self.statusMessage) + self.statusBar().addPermanentWidget(self.webscraperMessage) + # overview data + self.checkThreads = [] + self.update_database_stats() + self.updatedatabasestats.clicked.connect(self.update_database_stats) + + # database stuff + self.load_ppn.clicked.connect(self.load_ppns) + self.resetStatusCodes.clicked.connect(self.reset_status_codes) + + # statuscodegraph #! 
not working as intended + self.stats_scroll_area_table.itemDoubleClicked.connect(self.show_graph) + self.graph_data = [] + self.graph_data_total = 0 + self.status_code = None + self.stats_scroll_area_table.setColumnCount(3) + # set column 3 size to use the remaining space based on the table size + # set column 3 width to 50px + self.stats_scroll_area_table.setColumnWidth(2, 50) + self.stats_scroll_area_table.horizontalHeader().setSectionResizeMode( + 2, QtWidgets.QHeaderView.ResizeMode.Stretch + ) + # export + self.exportBtn.clicked.connect(self.export_data_single) + self.exportAll.clicked.connect(self.export_data_all) + self.exportAll.setEnabled(False) + + # eta + self.usePlaywright.clicked.connect(self.toggle_playwright) + + def toggle_playwright(self): + if self.usePlaywright.isChecked(): + self.groupBox_2.setEnabled(False) + else: + self.groupBox_2.setEnabled(True) + + def export_data_single(self): + self.export_data(self.detailed_overview) + + def export_data_all(self): # Export PPN and source link of every row with the currently selected status code to export.tsv and open the file + table = prettytable.PrettyTable() + table.field_names = ["PPN", "Link"] + status_code = self.status_code + data = self.db.get_data( + "SELECT ppn, source_link FROM response WHERE response_code = ?", + (status_code,), + ) + for item in data: + table.add_row([item[0], item[1]]) + with open("export.tsv", "w", encoding="utf-8", newline="") as f: # newline="" keeps the csv writer's own line endings from being translated a second time + f.write(table.get_csv_string(delimiter="\t")) # let the csv writer emit tabs; replacing every comma afterwards also rewrote commas inside links + os.system("start export.tsv") + + def export_data(self, tableData): # Export the first two columns of the given table widget to export.tsv and open the file + table = prettytable.PrettyTable() + table.field_names = ["PPN", "Link"] + for i in range(tableData.rowCount()): # use the argument; self.tableData is not assigned anywhere in the visible class + table.add_row( + [ + tableData.item(i, 0).text(), + tableData.item(i, 1).text(), + ] + ) + # export as tsv + with open("export.tsv", "w", encoding="utf-8", newline="") as f: # newline="" keeps the csv writer's own line endings from being translated a second time + f.write(table.get_csv_string(delimiter="\t")) # let the csv writer emit tabs; replacing every comma afterwards also rewrote commas inside links + os.system("start export.tsv") + + def show_graph(self, pos): + print("show graph") + self.exportAll.setEnabled(True) + status_code = self.stats_scroll_area_table.item(pos.row(), 0).text() +
self.status_code = status_code + self.graph_data = [] # start from an empty list so rows collected for a previously selected status code do not leak into this chart + scthread = StatusCodeThread(status_code=status_code) + scthread.progressSignal.connect(self.update_progress_graph) + scthread.total_entries_signal.connect(self.set_total_entries_graph) + scthread.current_data_signal.connect(self.add_data) + scthread.start() + self.checkThreads.append(scthread) + + def add_data(self, data): # Collect one data tuple from the status-code thread, skipping entries whose second element is 0 + if data[1] != 0: + self.graph_data.append(data) + + def set_total_entries_graph(self, total): + self.graph_data_total = total + + def update_progress_graph(self, progress): # Show the thread's progress and, once it reaches the announced total, draw the pie chart and fill the table + self.statusMessage.setText(f"Progress: {progress}/{self.graph_data_total}") + if progress == self.graph_data_total: + self.statusMessage.setText("") + self.statusBar().showMessage("Done", 2000) # the message times out by itself; the former time.sleep(2) here blocked the Qt event loop and froze the window + self.summary.setCurrentIndex(0) + self.graph_data.sort(key=lambda x: x[1], reverse=True) + self.create_pie_chart( + self.graph_data, "Data for Status Code {}".format(self.status_code) + ) + self.add_table_data(self.graph_data) + self.statusMessage.setText("") + + def reset_status_codes(self): + # create a warning dialog + dialog = QtWidgets.QMessageBox() + dialog.setWindowTitle("Reset Status Codes") + dialog.setText("Do you really want to reset all status codes?") + dialog.setStandardButtons( + QtWidgets.QMessageBox.StandardButton.Yes + | QtWidgets.QMessageBox.StandardButton.No + ) + dialog.setDefaultButton(QtWidgets.QMessageBox.StandardButton.No) + + if dialog.exec() == QtWidgets.QMessageBox.StandardButton.Yes: + self.db.reset_status_codes() + self.update_database_stats() + else: + pass + + def load_ppns(self): + filedialog = QtWidgets.QFileDialog() + filedialog.setFileMode(QtWidgets.QFileDialog.FileMode.ExistingFiles) + filedialog.setNameFilter("Text files (*.txt)") + if filedialog.exec(): + files = filedialog.selectedFiles() + for file in files: + scrapeThread = WebscraperThread(ppnfilePath=file) + scrapeThread.updateSignal.connect(self.update_progress_scraper) + scrapeThread.total_entries_signal.connect( +
self.set_total_entries_scraper + ) + scrapeThread.start() + self.checkThreads.append(scrapeThread) + # self.update_database_stats() + # update status codes in the dropdown + self.check_status_code.clear() + self.check_status_code.addItems(self.db.get_status_codes()) + + def update_progress_scraper(self, progress): + self.webscraperMessage.setText(f"Progress: {progress}/{self.total_scraper}") + self.update_database_stats() + + def set_total_entries_scraper(self, total): + self.total_scraper = total + + def update_database_stats(self): + stats = self.db.get_status_code_counts() + self.stats_scroll_area_table.setRowCount(0) + total_entries = sum([stat[1] for stat in stats]) + total_links = self.db.get_link_count() + ppn_count = self.db.get_unique_count_ppns() + self.databasestats.setTextFormat(QtCore.Qt.TextFormat.RichText) + self.databasestats.setText( + "Es wurden {} Links für {} Medien gefunden".format( + total_links, ppn_count + ) + ) + for stat in stats: + self.stats_scroll_area_table.insertRow( + self.stats_scroll_area_table.rowCount() + ) + self.stats_scroll_area_table.setItem( + self.stats_scroll_area_table.rowCount() - 1, + 0, + QtWidgets.QTableWidgetItem(str(stat[0])), + ) + self.stats_scroll_area_table.setItem( + self.stats_scroll_area_table.rowCount() - 1, + 1, + QtWidgets.QTableWidgetItem(str(stat[1])), + ) + self.stats_scroll_area_table.setItem( + self.stats_scroll_area_table.rowCount() - 1, + 2, + QtWidgets.QTableWidgetItem( + str(round(stat[1] / total_entries * 100, 2)) + "%" + ), + ) + + def update_eta(self, eta): + # self.eta.setText(f"ETA: {eta}") + pass + + @pyqtSlot() + @pyqtSlot() + def check_status_code_clicked(self): + self.progressBar_qtdm.setValue(0) + self.progressBar_qtdm.show() + self.check_overview.setPlainText("") + status_code = self.check_status_code.currentText() + + if self.usePlaywright.isChecked(): + log.info("Using Playwright") + checkThread = CheckThreadPlaywright(status_code=str(status_code)) + else: + log.info("Using Requests") 
+ checkThread = CheckThread(status_code=str(status_code)) + + checkThread.setTimes(self.spinTimeout.value(), self.spinTimer.value()) + self.statusBar().showMessage("Checking status code", 2000) + + # Restore these signal connections + checkThread.updateSignal.connect(self.update_progress) + checkThread.total_entries_signal.connect(self.set_total_entries) + checkThread.resultSignal.connect(self.showMessage) + checkThread.progress.connect(self.progressBar_qtdm.make_progress) + checkThread.finished.connect(self.update_done) + self.check_code.setEnabled(False) + self.check_overview.show() + checkThread.start() + self.checkThreads.append(checkThread) + + def update_done(self): + self.check_code.setEnabled(True) + self.progressBar_qtdm.hide() + self.progressBar_qtdm.setValue(0) + + self.statusBar().showMessage("Done", 2000) + + # def check_status_code_clicked(self): + # self.progressBar_qtdm.setValue(0) + # self.progressBar_qtdm.show() + # self.check_overview.setPlainText("") + # status_code = self.check_status_code.currentText() + # if self.usePlaywright.isChecked(): + # log.info("Using Playwright") + # checkThread = CheckThreadPlaywright(status_code=str(status_code)) + # else: + # log.info("Using Requests") + # checkThread = CheckThread(status_code=str(status_code)) + # checkThread.setTimes(self.spinTimeout.value(), self.spinTimer.value()) + # self.statusBar().showMessage("Checking status code", 2000) + # # checkThread.updateSignal.connect(self.update_progress) + # # checkThread.total_entries_signal.connect(self.set_total_entries) + # # checkThread.resultSignal.connect(self.showMessage) + # # checkThread.etaSignal.connect(self.progressBar_qtdm.make_progress) + # checkThread.progress.connect(self.progressBar_qtdm.make_progress) + # self.check_code.setEnabled(False) + # # self.eta.setText("ETA: Calculating...") + # self.check_overview.show() + # checkThread.start() + # self.checkThreads.append(checkThread) + + def set_total_entries(self, total): + self.total = total + 
self.progressBar_qtdm.show() + self.progressBar_qtdm.setMaximum(total) + + def showMessage(self, message): + currentText = self.check_overview.toPlainText() + lines = currentText.count("\n") + if lines > 100: + self.check_overview.setPlainText("") + self.check_overview.append(message) + + # follow last line + self.check_overview.verticalScrollBar().setValue( + self.check_overview.verticalScrollBar().maximum() + ) + + def update_progress(self): + self.updatedatabasestats.click() + + def create_graph(self): + self.summary.setCurrentIndex(0) + publisher = self.publishers.currentText() + duplicates = self.addDuplicates.isChecked() + if duplicates: + data = self.db.get_results_by_publisher(publisher, distinct=True) + else: + data = self.db.get_results_by_publisher(publisher, distinct=False) + publisher_count = self.db.get_publisher_count(publisher) + checked_count = self.db.get_checked_by_publisher(publisher) + self.total_results_publisher.setText( + "Anzahl Titel: {}, davon geprüft: {}".format(publisher_count, checked_count) + ) + graph_data = {} + for item in data: + if item[1] in graph_data: + graph_data[item[1]] += 1 + else: + graph_data[item[1]] = 1 + table_data = [] + for key, value in graph_data.items(): + table_data.append((key, value)) + self.create_pie_chart( + table_data, title="Statistik der Status Codes für {}".format(publisher) + ) + self.add_table_data(table_data) + + def add_table_data(self, data): + self.response_data.setRowCount(0) + + for item in data: + rowPosition = self.response_data.rowCount() + self.response_data.insertRow(rowPosition) + self.response_data.setItem( + rowPosition, 0, QtWidgets.QTableWidgetItem(str(item[0])) + ) + self.response_data.setItem( + rowPosition, 1, QtWidgets.QTableWidgetItem(str(item[1])) + ) + + def create_pie_chart(self, data, title="Count of Status Codes"): + # self.splitter.setSizes([80, 20]) + # self.splitter.setSizes([2, 1]) + + if self.graph_frame.layout().count() > 0: + 
self.graph_frame.layout().itemAt(0).widget().deleteLater() + series = QPieSeries() + for item in data: + series.append(str(item[0]), item[1]) + + chart = QChart() + chart.addSeries(series) + chart.setAnimationOptions(QChart.AnimationOption.SeriesAnimations) + chart.legend().setVisible(True) + chart.legend().setAlignment(QtCore.Qt.AlignmentFlag.AlignRight) + chart.setTitle(title) + + chartview = QChartView(chart) + chartview.setRenderHint(QPainter.RenderHint.Antialiasing) + + self.graph_frame.layout().addWidget(chartview) + slices = series.slices() + slices.sort(key=lambda x: x.percentage(), reverse=True) + for slice in slices: + slice.setLabel(f"{slice.label()} {round(slice.percentage() * 100, 2)}%") + # display slice data on hover + + def display_detailed_overview(self, item): + self.summary.setCurrentIndex(1) + # from self.response_data get the selected status code + status_code = self.response_data.item(item.row(), 0).text() + if status_code.isnumeric(): + publisher = self.publishers.currentText() + data = self.db.get_results_by_publisher_and_status(publisher, status_code) + else: + publisher = status_code + status_code = self.status_code + data = self.db.get_results_by_publisher_and_status(publisher, status_code) + self.detailed_overview.setRowCount(0) + for res in data: + self.detailed_overview.insertRow(self.detailed_overview.rowCount()) + self.detailed_overview.setItem( + self.detailed_overview.rowCount() - 1, + 0, + QtWidgets.QTableWidgetItem(str(res[0])), + ) + self.detailed_overview.setItem( + self.detailed_overview.rowCount() - 1, + 1, + QtWidgets.QTableWidgetItem(str(res[1])), + ) + self.detailed_overview.setItem( + self.detailed_overview.rowCount() - 1, + 2, + QtWidgets.QTableWidgetItem(str(res[2])), + ) + + +def launch(): + app = QtWidgets.QApplication([]) + # app.setStyle("Fusion") + + window = MainWindow() + window.show() + app.exec() diff --git a/src/ui/sources/Ui_main_interface.py b/src/ui/sources/Ui_main_interface.py new file mode 100644 index 
0000000..08044d1 --- /dev/null +++ b/src/ui/sources/Ui_main_interface.py @@ -0,0 +1,340 @@ +# Form implementation generated from reading ui file 'c:\Users\aky547\GitHub\LinkAvailableChecker\src\ui\sources\main_interface.ui' +# +# Created by: PyQt6 UI code generator 6.7.1 +# +# WARNING: Any manual changes made to this file will be lost when pyuic6 is +# run again. Do not edit this file unless you know what you are doing. + + +from PyQt6 import QtCore, QtGui, QtWidgets + + +class Ui_MainWindow(object): + def setupUi(self, MainWindow): + MainWindow.setObjectName("MainWindow") + MainWindow.resize(824, 740) + self.centralwidget = QtWidgets.QWidget(parent=MainWindow) + self.centralwidget.setObjectName("centralwidget") + self.verticalLayout = QtWidgets.QVBoxLayout(self.centralwidget) + self.verticalLayout.setObjectName("verticalLayout") + self.horizontalLayout_4 = QtWidgets.QHBoxLayout() + self.horizontalLayout_4.setObjectName("horizontalLayout_4") + self.database_stats = QtWidgets.QGroupBox(parent=self.centralwidget) + font = QtGui.QFont() + font.setBold(True) + self.database_stats.setFont(font) + self.database_stats.setObjectName("database_stats") + self.verticalLayout_6 = QtWidgets.QVBoxLayout(self.database_stats) + self.verticalLayout_6.setObjectName("verticalLayout_6") + self.horizontalLayout_6 = QtWidgets.QHBoxLayout() + self.horizontalLayout_6.setObjectName("horizontalLayout_6") + self.load_ppn = QtWidgets.QPushButton(parent=self.database_stats) + sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Policy.Maximum, QtWidgets.QSizePolicy.Policy.Fixed) + sizePolicy.setHorizontalStretch(0) + sizePolicy.setVerticalStretch(0) + sizePolicy.setHeightForWidth(self.load_ppn.sizePolicy().hasHeightForWidth()) + self.load_ppn.setSizePolicy(sizePolicy) + font = QtGui.QFont() + font.setBold(False) + self.load_ppn.setFont(font) + self.load_ppn.setObjectName("load_ppn") + self.horizontalLayout_6.addWidget(self.load_ppn) + self.resetStatusCodes = 
QtWidgets.QPushButton(parent=self.database_stats) + sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Policy.Maximum, QtWidgets.QSizePolicy.Policy.Fixed) + sizePolicy.setHorizontalStretch(0) + sizePolicy.setVerticalStretch(0) + sizePolicy.setHeightForWidth(self.resetStatusCodes.sizePolicy().hasHeightForWidth()) + self.resetStatusCodes.setSizePolicy(sizePolicy) + font = QtGui.QFont() + font.setBold(False) + self.resetStatusCodes.setFont(font) + self.resetStatusCodes.setObjectName("resetStatusCodes") + self.horizontalLayout_6.addWidget(self.resetStatusCodes) + self.verticalLayout_6.addLayout(self.horizontalLayout_6) + self.databasestats = QtWidgets.QLabel(parent=self.database_stats) + font = QtGui.QFont() + font.setBold(False) + self.databasestats.setFont(font) + self.databasestats.setText("") + self.databasestats.setTextFormat(QtCore.Qt.TextFormat.PlainText) + self.databasestats.setObjectName("databasestats") + self.verticalLayout_6.addWidget(self.databasestats) + self.stats_scroll_area_table = QtWidgets.QTableWidget(parent=self.database_stats) + self.stats_scroll_area_table.setMinimumSize(QtCore.QSize(0, 100)) + font = QtGui.QFont() + font.setBold(False) + self.stats_scroll_area_table.setFont(font) + self.stats_scroll_area_table.setFrameShape(QtWidgets.QFrame.Shape.NoFrame) + self.stats_scroll_area_table.setVerticalScrollBarPolicy(QtCore.Qt.ScrollBarPolicy.ScrollBarAsNeeded) + self.stats_scroll_area_table.setHorizontalScrollBarPolicy(QtCore.Qt.ScrollBarPolicy.ScrollBarAsNeeded) + self.stats_scroll_area_table.setEditTriggers(QtWidgets.QAbstractItemView.EditTrigger.NoEditTriggers) + self.stats_scroll_area_table.setAlternatingRowColors(True) + self.stats_scroll_area_table.setObjectName("stats_scroll_area_table") + self.stats_scroll_area_table.setColumnCount(3) + self.stats_scroll_area_table.setRowCount(0) + item = QtWidgets.QTableWidgetItem() + self.stats_scroll_area_table.setHorizontalHeaderItem(0, item) + item = QtWidgets.QTableWidgetItem() + 
self.stats_scroll_area_table.setHorizontalHeaderItem(1, item) + item = QtWidgets.QTableWidgetItem() + self.stats_scroll_area_table.setHorizontalHeaderItem(2, item) + self.stats_scroll_area_table.horizontalHeader().setDefaultSectionSize(90) + self.stats_scroll_area_table.horizontalHeader().setMinimumSectionSize(50) + self.stats_scroll_area_table.horizontalHeader().setStretchLastSection(True) + self.stats_scroll_area_table.verticalHeader().setDefaultSectionSize(35) + self.verticalLayout_6.addWidget(self.stats_scroll_area_table) + self.updatedatabasestats = QtWidgets.QPushButton(parent=self.database_stats) + font = QtGui.QFont() + font.setBold(False) + self.updatedatabasestats.setFont(font) + self.updatedatabasestats.setObjectName("updatedatabasestats") + self.verticalLayout_6.addWidget(self.updatedatabasestats) + self.horizontalLayout_4.addWidget(self.database_stats) + self.verticalLayout_8 = QtWidgets.QVBoxLayout() + self.verticalLayout_8.setObjectName("verticalLayout_8") + self.verticalLayout_7 = QtWidgets.QVBoxLayout() + self.verticalLayout_7.setSizeConstraint(QtWidgets.QLayout.SizeConstraint.SetDefaultConstraint) + self.verticalLayout_7.setObjectName("verticalLayout_7") + self.horizontalLayout_3 = QtWidgets.QHBoxLayout() + self.horizontalLayout_3.setObjectName("horizontalLayout_3") + self.label = QtWidgets.QLabel(parent=self.centralwidget) + self.label.setObjectName("label") + self.horizontalLayout_3.addWidget(self.label) + self.publishers = QtWidgets.QComboBox(parent=self.centralwidget) + self.publishers.setEditable(True) + self.publishers.setObjectName("publishers") + self.horizontalLayout_3.addWidget(self.publishers) + self.horizontalLayout_3.setStretch(0, 1) + self.horizontalLayout_3.setStretch(1, 5) + self.verticalLayout_7.addLayout(self.horizontalLayout_3) + self.horizontalLayout_5 = QtWidgets.QHBoxLayout() + self.horizontalLayout_5.setObjectName("horizontalLayout_5") + spacerItem = QtWidgets.QSpacerItem(40, 20, QtWidgets.QSizePolicy.Policy.Expanding, 
QtWidgets.QSizePolicy.Policy.Minimum) + self.horizontalLayout_5.addItem(spacerItem) + self.addDuplicates = QtWidgets.QCheckBox(parent=self.centralwidget) + self.addDuplicates.setLayoutDirection(QtCore.Qt.LayoutDirection.LeftToRight) + self.addDuplicates.setChecked(True) + self.addDuplicates.setTristate(False) + self.addDuplicates.setObjectName("addDuplicates") + self.horizontalLayout_5.addWidget(self.addDuplicates) + self.verticalLayout_7.addLayout(self.horizontalLayout_5) + self.showResults = QtWidgets.QPushButton(parent=self.centralwidget) + self.showResults.setObjectName("showResults") + self.verticalLayout_7.addWidget(self.showResults) + self.groupBox = QtWidgets.QGroupBox(parent=self.centralwidget) + font = QtGui.QFont() + font.setBold(True) + self.groupBox.setFont(font) + self.groupBox.setObjectName("groupBox") + self.formLayout = QtWidgets.QFormLayout(self.groupBox) + self.formLayout.setObjectName("formLayout") + self.label_3 = QtWidgets.QLabel(parent=self.groupBox) + font = QtGui.QFont() + font.setBold(False) + self.label_3.setFont(font) + self.label_3.setObjectName("label_3") + self.formLayout.setWidget(0, QtWidgets.QFormLayout.ItemRole.LabelRole, self.label_3) + self.check_status_code = QtWidgets.QComboBox(parent=self.groupBox) + font = QtGui.QFont() + font.setBold(False) + self.check_status_code.setFont(font) + self.check_status_code.setObjectName("check_status_code") + self.formLayout.setWidget(0, QtWidgets.QFormLayout.ItemRole.FieldRole, self.check_status_code) + self.usePlaywright = QtWidgets.QCheckBox(parent=self.groupBox) + self.usePlaywright.setObjectName("usePlaywright") + self.formLayout.setWidget(1, QtWidgets.QFormLayout.ItemRole.LabelRole, self.usePlaywright) + self.check_code = QtWidgets.QPushButton(parent=self.groupBox) + font = QtGui.QFont() + font.setBold(False) + self.check_code.setFont(font) + self.check_code.setObjectName("check_code") + self.formLayout.setWidget(1, QtWidgets.QFormLayout.ItemRole.FieldRole, self.check_code) + 
self.groupBox_2 = QtWidgets.QGroupBox(parent=self.groupBox) + self.groupBox_2.setObjectName("groupBox_2") + self.verticalLayout_9 = QtWidgets.QVBoxLayout(self.groupBox_2) + self.verticalLayout_9.setObjectName("verticalLayout_9") + self.label_4 = QtWidgets.QLabel(parent=self.groupBox_2) + font = QtGui.QFont() + font.setBold(False) + self.label_4.setFont(font) + self.label_4.setObjectName("label_4") + self.verticalLayout_9.addWidget(self.label_4) + self.spinTimer = QtWidgets.QSpinBox(parent=self.groupBox_2) + font = QtGui.QFont() + font.setBold(False) + self.spinTimer.setFont(font) + self.spinTimer.setButtonSymbols(QtWidgets.QAbstractSpinBox.ButtonSymbols.UpDownArrows) + self.spinTimer.setMinimum(1) + self.spinTimer.setMaximum(60) + self.spinTimer.setObjectName("spinTimer") + self.verticalLayout_9.addWidget(self.spinTimer) + self.label_5 = QtWidgets.QLabel(parent=self.groupBox_2) + font = QtGui.QFont() + font.setBold(False) + self.label_5.setFont(font) + self.label_5.setObjectName("label_5") + self.verticalLayout_9.addWidget(self.label_5) + self.spinTimeout = QtWidgets.QSpinBox(parent=self.groupBox_2) + font = QtGui.QFont() + font.setBold(False) + self.spinTimeout.setFont(font) + self.spinTimeout.setButtonSymbols(QtWidgets.QAbstractSpinBox.ButtonSymbols.UpDownArrows) + self.spinTimeout.setMinimum(5) + self.spinTimeout.setMaximum(100) + self.spinTimeout.setObjectName("spinTimeout") + self.verticalLayout_9.addWidget(self.spinTimeout) + spacerItem1 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Policy.Minimum, QtWidgets.QSizePolicy.Policy.Expanding) + self.verticalLayout_9.addItem(spacerItem1) + self.formLayout.setWidget(2, QtWidgets.QFormLayout.ItemRole.LabelRole, self.groupBox_2) + self.check_overview = QtWidgets.QTextBrowser(parent=self.groupBox) + sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Policy.Expanding, QtWidgets.QSizePolicy.Policy.Maximum) + sizePolicy.setHorizontalStretch(0) + sizePolicy.setVerticalStretch(0) + 
sizePolicy.setHeightForWidth(self.check_overview.sizePolicy().hasHeightForWidth()) + self.check_overview.setSizePolicy(sizePolicy) + self.check_overview.setMinimumSize(QtCore.QSize(0, 100)) + self.check_overview.setMaximumSize(QtCore.QSize(16777215, 130)) + font = QtGui.QFont() + font.setBold(False) + self.check_overview.setFont(font) + self.check_overview.setOpenExternalLinks(True) + self.check_overview.setObjectName("check_overview") + self.formLayout.setWidget(2, QtWidgets.QFormLayout.ItemRole.FieldRole, self.check_overview) + self.horizontalLayout_7 = QtWidgets.QHBoxLayout() + self.horizontalLayout_7.setObjectName("horizontalLayout_7") + self.widget = QtWidgets.QWidget(parent=self.groupBox) + self.widget.setMinimumSize(QtCore.QSize(0, 30)) + self.widget.setObjectName("widget") + self.horizontalLayout_7.addWidget(self.widget) + self.formLayout.setLayout(3, QtWidgets.QFormLayout.ItemRole.FieldRole, self.horizontalLayout_7) + self.verticalLayout_7.addWidget(self.groupBox) + self.verticalLayout_8.addLayout(self.verticalLayout_7) + self.horizontalLayout_4.addLayout(self.verticalLayout_8) + self.horizontalLayout_4.setStretch(0, 1) + self.horizontalLayout_4.setStretch(1, 2) + self.verticalLayout.addLayout(self.horizontalLayout_4) + self.widget1 = QtWidgets.QWidget(parent=self.centralwidget) + self.widget1.setObjectName("widget1") + self.horizontalLayout = QtWidgets.QHBoxLayout(self.widget1) + self.horizontalLayout.setObjectName("horizontalLayout") + self.graph_frame = QtWidgets.QFrame(parent=self.widget1) + self.graph_frame.setFrameShape(QtWidgets.QFrame.Shape.StyledPanel) + self.graph_frame.setFrameShadow(QtWidgets.QFrame.Shadow.Raised) + self.graph_frame.setObjectName("graph_frame") + self.verticalLayout_2 = QtWidgets.QVBoxLayout(self.graph_frame) + self.verticalLayout_2.setContentsMargins(2, 2, 2, 2) + self.verticalLayout_2.setSpacing(0) + self.verticalLayout_2.setObjectName("verticalLayout_2") + self.horizontalLayout.addWidget(self.graph_frame) + self.chart_frame 
= QtWidgets.QFrame(parent=self.widget1) + self.chart_frame.setFrameShape(QtWidgets.QFrame.Shape.StyledPanel) + self.chart_frame.setFrameShadow(QtWidgets.QFrame.Shadow.Raised) + self.chart_frame.setObjectName("chart_frame") + self.verticalLayout_3 = QtWidgets.QVBoxLayout(self.chart_frame) + self.verticalLayout_3.setContentsMargins(2, 2, 2, 2) + self.verticalLayout_3.setSpacing(0) + self.verticalLayout_3.setObjectName("verticalLayout_3") + self.summary = QtWidgets.QStackedWidget(parent=self.chart_frame) + self.summary.setObjectName("summary") + self.page = QtWidgets.QWidget() + self.page.setObjectName("page") + self.verticalLayout_5 = QtWidgets.QVBoxLayout(self.page) + self.verticalLayout_5.setObjectName("verticalLayout_5") + self.response_data = QtWidgets.QTableWidget(parent=self.page) + self.response_data.setEditTriggers(QtWidgets.QAbstractItemView.EditTrigger.NoEditTriggers) + self.response_data.setAlternatingRowColors(True) + self.response_data.setObjectName("response_data") + self.response_data.setColumnCount(2) + self.response_data.setRowCount(0) + item = QtWidgets.QTableWidgetItem() + self.response_data.setHorizontalHeaderItem(0, item) + item = QtWidgets.QTableWidgetItem() + self.response_data.setHorizontalHeaderItem(1, item) + self.verticalLayout_5.addWidget(self.response_data) + self.total_results_publisher = QtWidgets.QLabel(parent=self.page) + self.total_results_publisher.setText("") + self.total_results_publisher.setObjectName("total_results_publisher") + self.verticalLayout_5.addWidget(self.total_results_publisher) + self.exportAll = QtWidgets.QPushButton(parent=self.page) + self.exportAll.setObjectName("exportAll") + self.verticalLayout_5.addWidget(self.exportAll) + self.summary.addWidget(self.page) + self.page_2 = QtWidgets.QWidget() + self.page_2.setObjectName("page_2") + self.verticalLayout_4 = QtWidgets.QVBoxLayout(self.page_2) + self.verticalLayout_4.setObjectName("verticalLayout_4") + self.detailed_overview = 
QtWidgets.QTableWidget(parent=self.page_2) + self.detailed_overview.setObjectName("detailed_overview") + self.detailed_overview.setColumnCount(3) + self.detailed_overview.setRowCount(0) + item = QtWidgets.QTableWidgetItem() + self.detailed_overview.setHorizontalHeaderItem(0, item) + item = QtWidgets.QTableWidgetItem() + self.detailed_overview.setHorizontalHeaderItem(1, item) + item = QtWidgets.QTableWidgetItem() + self.detailed_overview.setHorizontalHeaderItem(2, item) + self.verticalLayout_4.addWidget(self.detailed_overview) + self.exportBtn = QtWidgets.QPushButton(parent=self.page_2) + self.exportBtn.setObjectName("exportBtn") + self.verticalLayout_4.addWidget(self.exportBtn) + self.summary.addWidget(self.page_2) + self.verticalLayout_3.addWidget(self.summary) + self.horizontalLayout.addWidget(self.chart_frame) + self.verticalLayout.addWidget(self.widget1) + self.verticalLayout.setStretch(0, 1) + self.verticalLayout.setStretch(1, 4) + MainWindow.setCentralWidget(self.centralwidget) + self.statusbar = QtWidgets.QStatusBar(parent=MainWindow) + self.statusbar.setObjectName("statusbar") + MainWindow.setStatusBar(self.statusbar) + self.actionPPNs_laden = QtGui.QAction(parent=MainWindow) + self.actionPPNs_laden.setObjectName("actionPPNs_laden") + self.actionStatus_Code_zur_cksetzen = QtGui.QAction(parent=MainWindow) + self.actionStatus_Code_zur_cksetzen.setObjectName("actionStatus_Code_zur_cksetzen") + + self.retranslateUi(MainWindow) + self.summary.setCurrentIndex(0) + QtCore.QMetaObject.connectSlotsByName(MainWindow) + + def retranslateUi(self, MainWindow): + _translate = QtCore.QCoreApplication.translate + MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow")) + self.database_stats.setTitle(_translate("MainWindow", "Statistik der Datenbank")) + self.load_ppn.setText(_translate("MainWindow", "PPNs laden")) + self.resetStatusCodes.setText(_translate("MainWindow", "Status Codes löschen")) + item = self.stats_scroll_area_table.horizontalHeaderItem(0) + 
item.setText(_translate("MainWindow", "Status Code")) + item = self.stats_scroll_area_table.horizontalHeaderItem(1) + item.setText(_translate("MainWindow", "Anzahl Treffer")) + item = self.stats_scroll_area_table.horizontalHeaderItem(2) + item.setText(_translate("MainWindow", "%")) + self.updatedatabasestats.setText(_translate("MainWindow", "Aktualisieren")) + self.label.setText(_translate("MainWindow", "Verlag")) + self.addDuplicates.setText(_translate("MainWindow", "Dubletten entfernen")) + self.showResults.setText(_translate("MainWindow", "Daten anzeigen")) + self.groupBox.setTitle(_translate("MainWindow", "Statuscode(s) prüfen")) + self.label_3.setText(_translate("MainWindow", "Status Code")) + self.usePlaywright.setToolTip(_translate("MainWindow", "Verwendet einen emulierten Browser, um die Seiten zu prüfen, dauert länger, kein ETA berechenbar")) + self.usePlaywright.setText(_translate("MainWindow", "Browser\n" +"verwenden")) + self.check_code.setText(_translate("MainWindow", "Prüfen")) + self.groupBox_2.setTitle(_translate("MainWindow", "Timers")) + self.label_4.setToolTip(_translate("MainWindow", "Pause zwischen den Anfragen")) + self.label_4.setText(_translate("MainWindow", "Sleep Timer")) + self.label_5.setToolTip(_translate("MainWindow", "Limit, ab wann eine Anfrage als ungültig gewertet wird")) + self.label_5.setText(_translate("MainWindow", "Timeout")) + self.response_data.setSortingEnabled(True) + item = self.response_data.horizontalHeaderItem(0) + item.setText(_translate("MainWindow", "Response Code")) + item = self.response_data.horizontalHeaderItem(1) + item.setText(_translate("MainWindow", "Anzahl Treffer")) + self.exportAll.setText(_translate("MainWindow", "Alles exportieren")) + item = self.detailed_overview.horizontalHeaderItem(0) + item.setText(_translate("MainWindow", "PPN")) + item = self.detailed_overview.horizontalHeaderItem(1) + item.setText(_translate("MainWindow", "Unser Link")) + item = self.detailed_overview.horizontalHeaderItem(2) + 
item.setText(_translate("MainWindow", "Ergebnis")) + self.exportBtn.setText(_translate("MainWindow", "Daten exportieren")) + self.actionPPNs_laden.setText(_translate("MainWindow", "PPNs laden")) + self.actionStatus_Code_zur_cksetzen.setText(_translate("MainWindow", "Status Code zurücksetzen")) diff --git a/src/ui/sources/main_interface.ui b/src/ui/sources/main_interface.ui new file mode 100644 index 0000000..416fcf3 --- /dev/null +++ b/src/ui/sources/main_interface.ui @@ -0,0 +1,573 @@ + + + MainWindow + + + + 0 + 0 + 824 + 740 + + + + MainWindow + + + + + + + + + + true + + + + Statistik der Datenbank + + + + + + + + + 0 + 0 + + + + + false + + + + PPNs laden + + + + + + + + 0 + 0 + + + + + false + + + + Status Codes löschen + + + + + + + + + + false + + + + + + + Qt::PlainText + + + + + + + + 0 + 100 + + + + + false + + + + QFrame::NoFrame + + + Qt::ScrollBarAsNeeded + + + Qt::ScrollBarAsNeeded + + + QAbstractItemView::NoEditTriggers + + + true + + + 50 + + + 90 + + + true + + + 35 + + + + Status Code + + + + + Anzahl Treffer + + + + + % + + + + + + + + + false + + + + Aktualisieren + + + + + + + + + + + + QLayout::SetDefaultConstraint + + + + + + + Verlag + + + + + + + true + + + + + + + + + + + Qt::Horizontal + + + + 40 + 20 + + + + + + + + Qt::LeftToRight + + + Dubletten entfernen + + + true + + + false + + + + + + + + + Daten anzeigen + + + + + + + + true + + + + Statuscode prüfen + + + + + + + false + + + + Status Code + + + + + + + + false + + + + + + + + Verwendet einen emulierten Browser, um die Seiten zu prüfen, dauert länger, kein ETA berechenbar + + + Browser +verwenden + + + + + + + + false + + + + Prüfen + + + + + + + Timers + + + + + + + false + + + + Pause zwischen den Anfragen + + + Sleep Timer + + + + + + + + false + + + + QAbstractSpinBox::UpDownArrows + + + 1 + + + 60 + + + + + + + + false + + + + Limit, ab wann eine Anfrage als ungültig gewertet wird + + + Timeout + + + + + + + + false + + + + QAbstractSpinBox::UpDownArrows + + + 5 + + + 100 + 
+ + + + + + Qt::Vertical + + + + 20 + 40 + + + + + + + + + + + + 0 + 0 + + + + + 0 + 100 + + + + + 16777215 + 130 + + + + + false + + + + true + + + + + + + + + + 0 + 30 + + + + + + + + + + + + + + + + + + + + + + QFrame::StyledPanel + + + QFrame::Raised + + + + 0 + + + 2 + + + 2 + + + 2 + + + 2 + + + + + + + + QFrame::StyledPanel + + + QFrame::Raised + + + + 0 + + + 2 + + + 2 + + + 2 + + + 2 + + + + + 0 + + + + + + + QAbstractItemView::NoEditTriggers + + + true + + + true + + + + Response Code + + + + + Anzahl Treffer + + + + + + + + + + + + + + + Alles exportieren + + + + + + + + + + + + PPN + + + + + Unser Link + + + + + Ergebnis + + + + + + + + Daten exportieren + + + + + + + + + + + + + + + + + + + PPNs laden + + + + + Status Code zurücksetzen + + + + + + diff --git a/src/ui/threads.py b/src/ui/threads.py new file mode 100644 index 0000000..8576273 --- /dev/null +++ b/src/ui/threads.py @@ -0,0 +1,495 @@ +from PyQt6 import QtCore, QtWidgets +from PyQt6.QtCore import QThread, pyqtSignal, QTimer +from src.database import Database +import time +import loguru +import xml.etree.ElementTree as ET +import requests +from ratelimit import limits, sleep_and_retry +from datetime import timedelta +from datetime import datetime +import asyncio +from playwright.async_api import async_playwright +import tqdm +import os + +log = loguru.logger +log.remove() +log.add("status_code.log", rotation="100 MB") + + +class Qtqdm(tqdm.std.tqdm): + """ + Override the tqdm class so we can push updates via a custom callback + """ + + def __init__( + self, + iterable=None, + desc=None, + total=None, + leave=True, + file=open(os.devnull, "w"), + ncols=None, + mininterval=0.1, + maxinterval=10.0, + miniters=None, + ascii=None, + disable=False, + unit="it", + unit_scale=False, + dynamic_ncols=False, + smoothing=0.3, + bar_format=None, + initial=0, + position=None, + postfix=None, + unit_divisor=1000, + write_bytes=None, + lock_args=None, + nrows=None, + colour=None, + delay=0, + gui=False, + 
class ETAManager(QtCore.QObject):
    """Owns a countdown worker running in its own ``QThread`` and relays its ETA.

    The manager collects per-request durations via :meth:`add_timestamp` and
    derives a remaining-time estimate from their average.
    """

    # Re-emits every ETA string produced by the countdown worker.
    etaSignal = QtCore.pyqtSignal(str)

    def __init__(self, links):
        """Create the worker thread (not started yet).

        Args:
            links: Sized collection of the links that still have to be checked.
        """
        super().__init__()
        self.remaining_time = 0
        self.links = links
        self.running = True
        self.times = []
        self.eta_thread = QtCore.QThread()
        # CountdownManagedWorker.__init__ accepts only ``total_time``; passing
        # ``links`` as a second positional argument raised TypeError.
        self.eta_worker = CountdownManagedWorker(self.remaining_time)
        # NOTE(review): the worker exposes a ``check_links`` attribute that
        # defaults to None; handing it the links here mirrors what the old
        # two-argument call appeared to intend - confirm.
        self.eta_worker.check_links = links
        self.eta_worker.moveToThread(self.eta_thread)
        self.eta_worker.etaSignal.connect(self.etaSignal.emit)
        self.eta_thread.started.connect(self.eta_worker.run)

    def start(self):
        """Start the ETA thread."""
        self.eta_thread.start()

    def calculate_average(self):
        """Return the mean of all recorded durations, or 0 when none exist."""
        if not self.times:
            return 0
        return sum(self.times) / len(self.times)

    def estimate_remaining_time(self):
        """Return the average recorded duration multiplied by the number of links."""
        # The links are stored as ``self.links``; ``self.check_links`` was never
        # defined on this class and raised AttributeError.
        return self.calculate_average() * len(self.links)

    def add_timestamp(self, timestamp):
        """Record the duration of one finished request."""
        self.times.append(timestamp)
class CheckThread(QtCore.QThread):
    """Background thread that re-checks stored links with plain HTTP requests.

    All rows of ``lfer.db`` whose response code equals ``status_code`` are
    requested again and their response code / destination link are updated.
    """

    updateSignal = QtCore.pyqtSignal()  # emitted after every processed link
    total_entries_signal = QtCore.pyqtSignal(int)  # number of links to check
    resultSignal = QtCore.pyqtSignal(str)  # "<url> : <response code>"
    etaSignal = QtCore.pyqtSignal(dict)  # {"text": "Done"} when nothing to do
    startSignal = QtCore.pyqtSignal()  # emitted right before the first request
    progress = pyqtSignal(dict)  # tqdm ``format_dict`` on every refresh

    def __init__(self, parent=None, status_code=None):
        super().__init__(parent)
        self.check_code = None
        self.status_code = status_code
        self.sleepTimer = 0
        self.timeout = 0
        self.per_request_time = sum([self.sleepTimer, self.timeout])
        self.running = True
        self.eta_worker = None
        self.eta_thread = None

    def set_status_code(self, status_code):
        """Select which stored response code should be re-checked."""
        self.status_code = status_code

    def setTimes(self, timeout, sleepTimer):
        """Set the request timeout and the pause between two requests."""
        self.timeout = timeout
        self.sleepTimer = sleepTimer
        self.per_request_time = sum([self.sleepTimer, self.timeout])

    def run(self):
        """Check every matching link and persist the result.

        The database connection is opened inside the thread and is always
        closed again, including on the "nothing to do" early return and when
        an exception escapes the loop.
        """
        self.db = Database("lfer.db")
        try:
            links = self.db.get_links_by_response_code(self.status_code)
            self.total_entries_signal.emit(len(links))

            if not links:
                self.etaSignal.emit({"text": "Done"})
                return

            # Qtqdm forwards its progress dict to ``progress`` on each refresh.
            progress_bar = Qtqdm(
                links, unit_scale=True, update_callback=self._update_callback
            )
            self.startSignal.emit()
            for link_id, url in progress_bar:
                if not self.running:
                    break
                response_code, destination_link = self.get_status_code(url)
                self.db.update_response_code(link_id, response_code, destination_link)
                self.resultSignal.emit(f"{url} : {response_code}")
                self.updateSignal.emit()
                time.sleep(self.sleepTimer)
        finally:
            self.db.close()

    def _update_callback(self, status):
        """Relay a tqdm status dict to the GUI."""
        self.progress.emit(status)

    def stop(self):
        """Stops the processing and ETA update."""
        self.running = False

    @log.catch()
    @sleep_and_retry
    def get_status_code(self, url):
        """Request ``url`` and return ``(status_code, final_url)``.

        Returns:
            ``(-2, "Site not supported")`` for URLs on the skip list,
            ``(-1, "No data found")`` for placeholder "Error" entries,
            ``(0, <error text>)`` when the request itself fails, otherwise the
            HTTP status code together with the URL after redirects.
        """
        non_support = ["d-nb.info", ".jpg", ".png", ".jpeg"]

        if any(x in url for x in non_support):
            log.error(f"URL: {url}, ERROR: Site not supported")
            return -2, "Site not supported"
        if "Error" in url:
            log.error(f"URL: {url}, ERROR: No data found")
            return -1, "No data found"
        try:
            userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
            accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
            headers = {"User-Agent": userAgent, "Accept": accept}
            # A timeout <= 0 is rejected by requests/urllib3 with ValueError, so
            # the unconfigured default of 0 would fail every single request.
            timeout = self.timeout if self.timeout and self.timeout > 0 else None
            response = requests.get(url, headers=headers, timeout=timeout)
            log.info(f"URL: {url}, Status Code: {response.status_code}")
            return response.status_code, response.url
        except Exception as e:
            # Best effort: any failure is recorded as code 0 with the reason.
            log.error(f"URL: {url}, Status Code: 0, Error: {e}")
            return 0, str(e)
class CheckThreadPlaywright(QtCore.QThread):
    """Background thread that re-checks stored links with a headless browser.

    Same job as the requests-based checker, but every link is opened through
    Playwright's Chromium. A countdown worker in a second thread publishes the
    estimated remaining time.
    """

    updateSignal = QtCore.pyqtSignal(int)  # 1-based index of the finished link
    total_entries_signal = QtCore.pyqtSignal(int)  # number of links to check
    resultSignal = QtCore.pyqtSignal(str)  # "<url> : <response code>"
    etaSignal = QtCore.pyqtSignal(str)  # ETA text, or "Done" when nothing to do

    def __init__(self, parent=None, status_code=None):
        super().__init__(parent)
        self.check_code = None
        self.status_code = status_code
        # Defaults so the attributes exist even if setTimes() is never called.
        self.timeout = 0
        self.sleepTimer = 0
        self.per_request_time = 0
        self.running = True
        self.eta_worker = None
        self.eta_thread = None
        self.browser = None  # Browser will be initialized asynchronously

    def set_status_code(self, status_code):
        """Select which stored response code should be re-checked."""
        self.status_code = status_code

    def setTimes(self, timeout, sleepTimer):
        """Store the configured timeout and pause between requests."""
        self.timeout = timeout
        self.sleepTimer = sleepTimer
        self.per_request_time = sum([self.sleepTimer, self.timeout])

    async def getBrowser(self):
        """Asynchronously launches Playwright browser"""
        self.playwright = await async_playwright().start()
        browser = await self.playwright.chromium.launch()
        return browser

    async def get_page_status(self, browser, url):
        """Open ``url`` in a fresh page and return ``(status_code, final_url)``.

        ``(-3, url)`` is returned both when navigation yields no response
        (e.g. the link opened a PDF download) and when loading raises.
        """
        page = await browser.new_page()
        try:
            response = await page.goto(url, wait_until="domcontentloaded")
            # If response is None, the request was aborted (e.g., a PDF opened)
            if response is None:
                print(f"Navigation aborted (likely due to PDF): {url}")
                return -3, url
            return response.status, response.url
        except Exception as e:
            print(f"Error loading {url}: {e}")
            return -3, url  # Custom error code
        finally:
            # Single close for every path; the page used to be closed twice
            # when the response was None.
            await page.close()

    def run(self):
        """Starts Playwright in an event loop"""
        asyncio.run(self.run_async())

    async def _time_single_request(self, browser, url):
        """Return the wall-clock seconds one ``get_page_status`` call takes."""
        started = time.monotonic()
        await self.get_page_status(browser, url)
        return time.monotonic() - started

    async def estimate_remaining_time(self, links, browser=None):
        """Estimate the total run time from one sample request.

        Args:
            links: Sequence of ``(id, url)`` rows.
            browser: Optional already-running browser to reuse. When omitted a
                temporary Playwright instance is started and torn down again.

        Returns:
            Seconds for one request to the first link times ``len(links)``.
        """
        if not links:
            return 0
        sample_url = links[0][1]
        if browser is not None:
            return len(links) * await self._time_single_request(browser, sample_url)

        playwright = await async_playwright().start()
        try:
            temporary_browser = await playwright.chromium.launch()
            try:
                sample = await self._time_single_request(temporary_browser, sample_url)
            finally:
                await temporary_browser.close()
        finally:
            await playwright.stop()
        return len(links) * sample

    async def _check_links(self, browser, links):
        """Process all links while a countdown worker publishes the ETA."""
        # Reuse the running browser instead of launching a second one.
        remaining_time = await self.estimate_remaining_time(links, browser)

        self.eta_thread = QtCore.QThread()
        self.eta_worker = CountdownWorker(remaining_time)
        self.eta_worker.moveToThread(self.eta_thread)
        self.eta_worker.etaSignal.connect(
            self.etaSignal.emit, QtCore.Qt.ConnectionType.DirectConnection
        )
        self.eta_thread.started.connect(self.eta_worker.run)
        self.eta_thread.start()

        try:
            for position, (link_id, url) in enumerate(links, start=1):
                if not self.running:
                    break
                started = time.monotonic()
                response_code, destination_link = await self.get_page_status(
                    browser, url
                )
                self.db.update_response_code(link_id, response_code, destination_link)
                self.updateSignal.emit(position)
                self.resultSignal.emit(f"{url} : {response_code}")
                # Fractional seconds matter: ``timedelta.seconds`` truncated
                # every sub-second request to 0 and the ETA never moved.
                remaining_time -= time.monotonic() - started
                self.eta_worker.update_remaining_time(remaining_time)
        finally:
            self.running = False
            # The countdown loop only ends when told to; without stop() the
            # wait() below blocked until the estimate had fully elapsed.
            self.eta_worker.stop()
            self.eta_thread.quit()
            self.eta_thread.wait()

    async def run_async(self):
        """Async version of the run method"""
        self.db = Database("lfer.db")
        try:
            links = self.db.get_links_by_response_code(self.status_code)
            self.total_entries_signal.emit(len(links))

            if not links:
                self.etaSignal.emit("Done")
                return

            playwright = await async_playwright().start()
            try:
                browser = await playwright.chromium.launch()
                try:
                    await self._check_links(browser, links)
                finally:
                    await browser.close()
            finally:
                await playwright.stop()  # Ensure Playwright stops properly
        finally:
            # Also reached on the early return and on errors.
            self.db.close()

    def stop(self):
        """Stops the processing and ETA update."""
        self.running = False
'total': 30, 'elapsed': 2.9780092239379883, 'ncols': None, 'nrows': None, + # 'prefix': 'Progress bar Name: ', 'ascii': False, 'unit': 'it', 'unit_scale': True, + # 'rate': 9.74477716265916, 'bar_format': None, 'postfix': None, 'unit_divisor': 1000, 'initial': 0, + # 'colour': None} + + initial = status.get("initial", 0) + total = status.get("total", 0) + iteration = status.get("n", 0) + unit = status.get("unit", "it") + rate = status.get("rate", 0.0) + rate = ( + 0.0 if rate is None else rate + ) # rate is None on the start of the iteration + time_elapsed = status.get("elapsed", 0.0) + time_remaining = ((total - iteration) / rate) if rate and rate > 0 else 0 + # get remaining time in DD:HH:MM:SS format + formated_time_remaining = str(datetime.timedelta(seconds=time_remaining)) + prefix = status.get("prefix", "") + text = status.get("text", "") + + self.setMinimum(initial or 0) + self.setMaximum(total or 0) + self.setValue(iteration or 0) + self.setFormat( + f"{prefix} %v of %m (%p%) at {rate:0.4f} {unit}/sec after {time_elapsed:0.2f} sec > {formated_time_remaining}" # was time_remaining:0.2f sec + ) diff --git a/status_code.py b/status_code.py new file mode 100644 index 0000000..ac437d9 --- /dev/null +++ b/status_code.py @@ -0,0 +1,104 @@ +import requests +from src.database import Database +import threading +from ratelimit import limits, sleep_and_retry +import time + +import loguru + +log = loguru.logger +log.remove() +log.add("status_code.log", rotation="100 MB") + + +THREADS = 10 +threadlist = [] +db = Database("lfer.db") + +links = db.get_links() +LINKLEN = len(links) +LINKPROGRESS = 0 +RESPONSES = [] +non_support = ["d-nb.info", ".jpg", ".png", ".jpeg"] + + +@log.catch() +def get_status_code(url): + if any(x in url for x in non_support): + log.error(f"URL: {url}, ERROR: Site not supported") + return -2, "Site not supported" + if "Error" in url: + log.error(f"URL: {url}, ERROR: No data found") + return -1, "No data found" + try: + userAgent = ( + "Automated 
def main_threaded():
    """Check all pending links with up to ``THREADS`` worker threads.

    The workers only collect results in ``RESPONSES``; the database is written
    from the calling thread after every worker has finished, so the shared
    SQLite connection is never used from a worker thread.
    """
    # Strided slices give every thread an (almost) equal share; previously the
    # whole remainder of the integer division landed on the last thread.
    chunks = [links[offset::THREADS] for offset in range(THREADS)]
    workers = [
        threading.Thread(target=worker, args=(chunk,)) for chunk in chunks if chunk
    ]

    # A Thread can be started only once. Appending to the module-level list on
    # every call made a second call re-start finished threads (RuntimeError),
    # so the list now always holds just the threads of the current run.
    threadlist.clear()
    threadlist.extend(workers)

    for thread in workers:
        thread.start()
    for thread in workers:
        thread.join()

    for link_id, response_code, destination_link in RESPONSES:
        db.update_response_code(link_id, response_code, destination_link)
    print("Done")
def process_response(response):
    """Extract the URLs (MARC datafield 856, subfield u) from an SRU response.

    Only the first ``recordData`` element of the response is inspected.

    Args:
        response: XML text of the SRU ``searchRetrieve`` response.

    Returns:
        A list of non-empty URL strings, or ``None`` when the XML cannot be
        parsed, contains no record, or the record carries no usable link.
    """
    namespace = {
        "zs": "http://www.loc.gov/zing/srw/",
        "marc": "http://www.loc.gov/MARC21/slim",
    }
    try:
        root = ET.fromstring(response)
    except ET.ParseError:
        return None

    record_data = root.find(".//zs:recordData", namespace)
    if record_data is None:
        return None

    links = []
    for datafield in record_data.findall(".//marc:datafield[@tag='856']", namespace):
        for subfield in datafield.findall("marc:subfield[@code='u']", namespace):
            # An empty element has ``text is None``; storing that as a link
            # breaks the later string checks on the stored source link.
            url = (subfield.text or "").strip()
            if url:
                links.append(url)

    return links or None