105 lines
3.0 KiB
Python
105 lines
3.0 KiB
Python
import requests
|
|
from src.database import Database
|
|
import threading
|
|
from ratelimit import limits, sleep_and_retry
|
|
import time
|
|
|
|
import loguru
|
|
|
|
# Module-wide logger: an alias for loguru's shared logger object.
log = loguru.logger
# Remove the handlers registered so far (presumably loguru's default stderr
# sink -- confirm) so that records only go to the file sink added below.
log.remove()
# Send log records to "status_code.log"; rotation="100 MB" asks loguru to
# rotate the file by size.
log.add("status_code.log", rotation="100 MB")
|
|
|
|
|
|
# Number of worker threads that main_threaded() creates.
THREADS = 10
# Thread objects created by main_threaded().
threadlist = []
# Project database object, constructed at import time with "lfer.db"
# (presumably the database file path -- confirm in src.database).
db = Database("lfer.db")

# Links to check, loaded once at import time; each entry is unpacked as an
# (id, url) pair by the functions below.
links = db.get_links()
# Number of links loaded above; used as the total in progress output.
LINKLEN = len(links)
# Count of links processed so far; incremented by worker().
LINKPROGRESS = 0
# (id, response_code, destination_link) tuples collected by worker() and
# written to the database by main_threaded().
RESPONSES = []
# URL substrings that get_status_code() refuses to request; a URL containing
# any of them is reported with code -2.
non_support = ["d-nb.info", ".jpg", ".png", ".jpeg"]
|
|
|
|
|
|
@log.catch()
def get_status_code(url):
    """Request *url* and report the outcome as a ``(code, detail)`` pair.

    Returns:
        ``(-2, "Site not supported")`` when the URL contains one of the
        ``non_support`` substrings (no request is made);
        ``(-1, "No data found")`` when the URL contains ``"Error"``
        (no request is made);
        ``(status_code, response.url)`` when the GET request completes;
        ``(0, str(exception))`` when the request raises.

    Exceptions raised outside the ``try`` block reach the ``log.catch()``
    decorator (presumably logged by loguru instead of propagating --
    confirm against the loguru documentation).
    """
    # Skip sites and file types that are known not to be checkable.
    for marker in non_support:
        if marker in url:
            log.error(f"URL: {url}, ERROR: Site not supported")
            return -2, "Site not supported"

    # Links carrying an "Error" marker have nothing to request.
    if "Error" in url:
        log.error(f"URL: {url}, ERROR: No data found")
        return -1, "No data found"

    # Identify the checker and state which content types are accepted.
    request_headers = {
        "User-Agent": "Automated LFER Status Code Checker/1.0 (alexander.kirchner@ph-freiburg.de)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    try:
        reply = requests.get(url, headers=request_headers, timeout=50)
        log.info(f"URL: {url}, Status Code: {reply.status_code}")
        return reply.status_code, reply.url
    except Exception as exc:
        # Any failure while requesting is reported as code 0, with the
        # exception text standing in for the destination link.
        log.error(f"URL: {url}, Status Code: 0")
        return 0, str(exc)
|
|
|
|
|
|
# Serialises updates to the shared RESPONSES / LINKPROGRESS state, because
# worker() is run concurrently on several threads by main_threaded().
_PROGRESS_LOCK = threading.Lock()


def worker(listpart):
    """Check every link in *listpart* and record the results.

    Args:
        listpart: iterable of ``(id, url)`` pairs to check.

    For each pair the result of ``get_status_code(url)`` is appended to the
    module-level ``RESPONSES`` list as ``(id, response_code,
    destination_link)``, the module-level ``LINKPROGRESS`` counter is
    incremented, and a progress line is printed.
    """
    global LINKPROGRESS
    for link_id, url in listpart:
        response_code, destination_link = get_status_code(url)
        # ``LINKPROGRESS += 1`` is a read-modify-write on a shared global;
        # without the lock, concurrent workers can lose increments.
        with _PROGRESS_LOCK:
            RESPONSES.append((link_id, response_code, destination_link))
            LINKPROGRESS += 1
            done = LINKPROGRESS
        print("Progress: ", done, "/", LINKLEN, end="\r")
|
|
|
|
|
|
def main_threaded():
    """Check all loaded links using ``THREADS`` worker threads.

    The module-level ``links`` list is split into contiguous chunks whose
    sizes differ by at most one, each chunk is handed to ``worker`` on its
    own thread, and once every thread has finished the collected
    ``RESPONSES`` are written to the database.
    """
    # A Thread object can only be started once; drop threads left over from
    # an earlier call so that calling this function again does not raise
    # RuntimeError.
    threadlist.clear()

    # Spread the remainder over the first ``extra`` chunks instead of giving
    # it all to the last thread.
    base, extra = divmod(LINKLEN, THREADS)
    start = 0
    for i in range(THREADS):
        end = start + base + (1 if i < extra else 0)
        if end > start:  # no thread for an empty chunk
            threadlist.append(
                threading.Thread(target=worker, args=(links[start:end],))
            )
        start = end

    for thread in threadlist:
        thread.start()
    for thread in threadlist:
        thread.join()

    # All workers are done; persist the results from this single thread.
    for link_id, response_code, destination_link in RESPONSES:
        db.update_response_code(link_id, response_code, destination_link)
    print("Done")
|
|
|
|
|
|
def main():
    """Check every loaded link sequentially and store each result.

    Walks the module-level ``links`` list, stores the result of
    ``get_status_code`` for each entry via ``db.update_response_code``,
    prints a progress line, and sleeps one second between links.
    """
    for position, (link_id, url) in enumerate(links, start=1):
        response_code, destination_link = get_status_code(url)
        db.update_response_code(link_id, response_code, destination_link)
        print("Progress: ", position, "/", LINKLEN, end="\r")
        # Pause between consecutive requests.
        time.sleep(1)
    print("Done")
|
|
|
|
|
|
def check_by_status_code(status_code):
    """Re-check the links the database returns for *status_code*.

    Args:
        status_code: response code passed to
            ``db.get_links_by_response_code`` to select the links.

    Each selected link is requested again with ``get_status_code``; progress
    is printed against the number of selected links, with a one-second
    pause between links.
    """
    selected = db.get_links_by_response_code(status_code)
    # Progress must be measured against the links being re-checked here,
    # not against the module-level count of all links.
    total = len(selected)
    for position, (link_id, url) in enumerate(selected, start=1):
        response_code, destination_link = get_status_code(url)
        # NOTE(review): the row is only updated when the new code EQUALS the
        # old one, so a link whose status changed is never recorded. This
        # looks inverted (``!=``?) -- confirm the intent before changing it.
        if response_code == status_code:
            db.update_response_code(link_id, response_code, destination_link)
        print("Progress: ", position, "/", total, end="\r")
        # Pause between consecutive requests.
        time.sleep(1)
    print("Done")
|
|
|
|
|
|
# Script entry point: runs only when this file is executed directly.
if __name__ == "__main__":
    # Sequential run over the links loaded at import time.
    # Original note: "checks all with code 0" -- whether db.get_links() is
    # limited to code 0 cannot be confirmed from this file.
    main()  # checks all with code 0
    # Alternative run, currently disabled: re-check links stored with 429.
    # check_by_status_code(429)  # checks titles with timeout
|