diff --git a/.gitignore b/.gitignore index 6f7eb31..39ed182 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ dist/ nyaapy.egg-info .vscode env/ -*.pyc \ No newline at end of file +*.pyc +test_files \ No newline at end of file diff --git a/NyaaPy/nyaa.py b/NyaaPy/nyaa.py index d916e06..7b9d4c0 100644 --- a/NyaaPy/nyaa.py +++ b/NyaaPy/nyaa.py @@ -1,21 +1,28 @@ import requests -import urllib.parse -from bs4 import BeautifulSoup from NyaaPy import utils + class Nyaa: def __init__(self): - self.URI = "http://nyaa.si" + self.SITE = utils.TorrentSite.NYAASI + self.URI = "https://nyaa.si" def last_uploads(self, number_of_results): - r = requests.get(self.URI) - soup = BeautifulSoup(r.text, 'html.parser') - rows = soup.select('table tr') + r = requests.get(self.SITE.value) - return utils.parse_nyaa(table_rows=rows, limit=number_of_results + 1) + # If anything up with nyaa servers let the user know. + r.raise_for_status() + + return utils.parse_nyaa( + request_text=r.text, + limit=number_of_results + 1, + site=self.SITE + ) def search(self, keyword, **kwargs): + url = self.SITE.value + user = kwargs.get('user', None) category = kwargs.get('category', 0) subcategory = kwargs.get('subcategory', 0) @@ -29,26 +36,32 @@ class Nyaa: if page > 0: r = requests.get("{}/{}?f={}&c={}_{}&q={}&p={}".format( - self.URI, user_uri, filters, category, subcategory, keyword, + url, user_uri, filters, category, subcategory, keyword, page)) else: r = requests.get("{}/{}?f={}&c={}_{}&q={}".format( - self.URI, user_uri, filters, category, subcategory, keyword)) + url, user_uri, filters, category, subcategory, keyword)) - soup = BeautifulSoup(r.text, 'html.parser') - rows = soup.select('table tr') + r.raise_for_status() - return utils.parse_nyaa(rows, limit=None) + return utils.parse_nyaa( + request_text=r.text, + limit=None, + site=self.SITE + ) def get(self, id): - r = requests.get("{}/view/{}".format(self.URI, id)) - soup = BeautifulSoup(r.text, 'html.parser') - content = 
soup.findAll("div", {"class": "panel", "id": None}) + r = requests.get("{}/view/{}".format(self.SITE.value, id)) + r.raise_for_status() - return utils.parse_single(content) + return utils.parse_single(request_text=r.text, site=self.SITE) def get_user(self, username): - r = requests.get("{}/user/{}".format(self.URI, username)) - soup = BeautifulSoup(r.text, 'html.parser') + r = requests.get("{}/user/{}".format(self.SITE.value, username)) + r.raise_for_status() - return utils.parse_nyaa(soup.select('table tr'), limit=None) + return utils.parse_nyaa( + request_text=r.text, + limit=None, + site=self.SITE + ) diff --git a/NyaaPy/pantsu.py b/NyaaPy/pantsu.py index f9bef3a..ef6f04f 100644 --- a/NyaaPy/pantsu.py +++ b/NyaaPy/pantsu.py @@ -1,18 +1,22 @@ import requests from NyaaPy import utils + class Pantsu: def __init__(self): self.BASE_URL = "https://nyaa.pantsu.cat/api" - + self.SITE = utils.TorrentSite.NYAANET + def last_uploads(self, number_of_results): - r = requests.get(self.URI) - soup = BeautifulSoup(r.text, 'html.parser') - rows = soup.select('table tr') - - return utils.parse_nyaa(rows, limit=number_of_results + 1) + r = requests.get(self.SITE.value) + r.raise_for_status() + return utils.parse_nyaa( + request_text=r.text, + limit=number_of_results + 1, + site=self.SITE + ) # Torrents - GET def search(self, keyword, **kwargs): @@ -23,10 +29,11 @@ class Pantsu: def view(self, item_id): request = requests.get("{}/view/{}".format(self.BASE_URL, item_id)) + request.raise_for_status() + return request.json() # Torrents - POST - def upload(self): return "Work in progress!" @@ -34,7 +41,6 @@ class Pantsu: return "Work in progress!" 
# Users - def login(self, username, password): login = requests.post("{}/login/".format( self.BASE_URL), data={'username': username, 'password': password}) diff --git a/NyaaPy/sukebei.py b/NyaaPy/sukebei.py index d0223c2..cab9f88 100644 --- a/NyaaPy/sukebei.py +++ b/NyaaPy/sukebei.py @@ -1,9 +1,14 @@ import requests -from bs4 import BeautifulSoup from NyaaPy import utils + class SukebeiNyaa: + + def __init__(self): + self.SITE = utils.TorrentSite.SUKEBEINYAASI + def search(self, keyword, **kwargs): + uri = self.SITE.value category = kwargs.get('category', 0) subcategory = kwargs.get('subcategory', 0) filters = kwargs.get('filters', 0) @@ -11,37 +16,37 @@ class SukebeiNyaa: if page > 0: r = requests.get("{}/?f={}&c={}_{}&q={}&p={}".format( - "http://sukebei.nyaa.si", filters, category, subcategory, + uri, filters, category, subcategory, keyword, page)) else: r = requests.get("{}/?f={}&c={}_{}&q={}".format( - "http://sukebei.nyaa.si", filters, category, subcategory, + uri, filters, category, subcategory, keyword)) - soup = BeautifulSoup(r.text, 'html.parser') - rows = soup.select('table tr') - - return utils.parse_nyaa(rows, limit=None) + r.raise_for_status() + return utils.parse_nyaa(r.text, limit=None, site=self.SITE) def get(self, id): - r = requests.get("http://sukebei.nyaa.si/view/{}".format(id)) - soup = BeautifulSoup(r.text, 'html.parser') - content = soup.findAll("div", {"class": "panel", "id": None}) + r = requests.get("{}/view/{}".format(self.SITE.value, id)) + r.raise_for_status() - return utils.parse_single(content) + return utils.parse_single(r.text, self.SITE) def get_user(self, username): - r = requests.get("http://sukebei.nyaa.si/user/{}".format(username)) - soup = BeautifulSoup(r.text, 'html.parser') + r = requests.get("{}/user/{}".format(self.SITE.value, username)) + r.raise_for_status() - return utils.parse_nyaa(soup.select('table tr'), limit=None) + return utils.parse_nyaa(r.text, limit=None, site=self.SITE) - def news(self, number_of_results): - 
r = requests.get("http://sukebei.nyaa.si/") - soup = BeautifulSoup(r.text, 'html.parser') - rows = soup.select('table tr') + def last_uploads(self, number_of_results): + r = requests.get(self.SITE.value) + r.raise_for_status() - return utils.parse_sukebei(rows, limit=number_of_results + 1) + return utils.parse_nyaa( + r.text, + limit=number_of_results + 1, + site=self.SITE + ) class SukebeiPantsu: diff --git a/NyaaPy/utils.py b/NyaaPy/utils.py index de46e4b..c6b0d71 100644 --- a/NyaaPy/utils.py +++ b/NyaaPy/utils.py @@ -3,9 +3,24 @@ ''' import re +from enum import Enum +from lxml import etree + + +class TorrentSite(Enum): + """ + Contains torrent sites + """ + NYAASI = "https://nyaa.si" + SUKEBEINYAASI = "https://sukebei.nyaa.si" + + # * nyaa.pantsu.cat redirects to nyaa.net + NYAANET = "https://nyaa.net" + SUKEBEINYAANET = "https://sukebei.nyaa.net" + def nyaa_categories(b): - c = b.replace('/?c=', '') + c = b.replace('?c=', '') cats = c.split('_') cat = cats[0] @@ -69,131 +84,123 @@ def nyaa_categories(b): return category_name -def parse_nyaa(table_rows, limit): - if limit == 0: - limit = len(table_rows) + +def parse_nyaa(request_text, limit, site): + parser = etree.HTMLParser() + tree = etree.fromstring(request_text, parser) + + # Put proper domain here. + uri = site.value torrents = [] - for row in table_rows[:limit]: + # Going through table rows + for tr in tree.xpath("//tbody//tr")[:limit]: block = [] - for td in row.find_all('td'): - if td.find_all('a'): - for link in td.find_all('a'): - if link.get('href')[-9:] != '#comments': - block.append(link.get('href')) - if link.text.rstrip(): - block.append(link.text) + for td in tr.xpath("./td"): + for link in td.xpath("./a"): - if td.text.rstrip(): - block.append(td.text.rstrip()) + href = link.attrib.get("href").split('/')[-1] - if row.has_attr('class'): - if row['class'][0] == 'danger': + # Only caring about non-comment pages. 
+ if href[-9:] != "#comments": + block.append(href) + + if link.text and link.text.strip(): + block.append(link.text.strip()) + + if td.text is not None and td.text.strip(): + block.append(td.text.strip()) + + # Add type of torrent based on tr class. + if tr.attrib.get("class") is not None: + if 'danger' in tr.attrib.get("class"): block.append("remake") - elif row['class'][0] == 'success': + elif 'success' in tr.attrib.get("class"): block.append("trusted") else: block.append("default") + else: + block.append("default") + # Decide category. + if site in [TorrentSite.NYAASI, TorrentSite.NYAANET]: + category = nyaa_categories(block[0]) + elif site in [TorrentSite.SUKEBEINYAASI, TorrentSite.SUKEBEINYAANET]: + category = sukebei_categories(block[0]) + else: + raise ValueError("Unknown TorrentSite received!") + + # Create torrent object try: torrent = { - 'id': block[1].replace("/view/", ""), - 'category': nyaa_categories(block[0]), - 'url': "http://nyaa.si{}".format(block[1]), + 'id': block[1], + 'category': category, + 'url': "{}/view/{}".format(uri, block[1]), 'name': block[2], - 'download_url': "http://nyaa.si{}".format(block[4]), - 'magnet': block[5], - 'size': block[6], - 'date': block[7], - 'seeders': block[8], - 'leechers': block[9], - 'completed_downloads': block[10], - 'type': block[11], + 'download_url': "{}/download/{}".format(uri, block[3]), + 'magnet': block[4], + 'size': block[5], + 'date': block[6], + 'seeders': block[7], + 'leechers': block[8], + 'completed_downloads': block[9], + 'type': block[10] } - torrents.append(torrent) - except IndexError as ie: + except IndexError: pass - return torrents -def parse_single(content): + +def parse_single(request_text, site): + parser = etree.HTMLParser() + tree = etree.fromstring(request_text, parser) + + # Put proper domain here. 
+ uri = site.value + torrent = {} data = [] torrent_files = [] - for row in content[0].find_all('div', {'class': 'row'}): - for div in row.find_all('div', {'class': 'col-md-5'}): - data.append(div.text.replace("\n", "")) + # Find basic uploader info & torrent stats + for row in tree.xpath("//div[@class='row']"): + for div_text in row.xpath("./div[@class='col-md-5']//text()"): + d = div_text.strip() + if d: + data.append(d) - files = content[2].find('div', - {'class', 'torrent-file-list'}).find_all('li') + # Find files, we need only text of the li element(s). + # Sorry about Pycodestyle aka PEP8 (E501) error + for el in tree.xpath("//div[contains(@class, 'torrent-file-list')]//li/text()"): + if el.rstrip(): + torrent_files.append(el) - for file in files: - torrent_files.append(file.text) - - torrent['title'] = re.sub('\n|\r|\t', '', content[0].find('h3', { - "class": "panel-title"}).text.replace("\n", "")) + torrent['title'] = \ + tree.xpath("//h3[@class='panel-title']/text()")[0].strip() torrent['category'] = data[0] - torrent['uploader'] = data[2] - torrent['uploader_profile'] = "https://nyaa.si/user/{}".format(data[2]) - torrent['website'] = re.sub('\t', '', data[4]) - torrent['size'] = data[6] - torrent['date'] = data[1] - torrent['seeders'] = data[3] - torrent['leechers'] = data[5] - torrent['completed'] = data[7] - torrent['hash'] = data[8] - torrent['description'] = re.sub('\t', '', content[1].find('div', { - 'id': 'torrent-description'}).text) + torrent['uploader'] = data[4] + torrent['uploader_profile'] = "{}/user/{}".format(uri, data[4]) + torrent['website'] = data[6] + torrent['size'] = data[8] + torrent['date'] = data[3] + torrent['seeders'] = data[5] + torrent['leechers'] = data[7] + torrent['completed'] = data[9] + torrent['hash'] = data[10] torrent['files'] = torrent_files + torrent['description'] = "" + for s in tree.xpath("//div[@id='torrent-description']"): + torrent['description'] += s.text or "" + return torrent -def parse_sukebei(table_rows, limit): - 
if limit == 0: - limit = len(table_rows) - - torrents = [] - - for row in table_rows[:limit]: - block = [] - - for td in row.find_all('td'): - for link in td.find_all('a'): - if link.get('href')[-9:] != '#comments': - block.append(link.get('href')) - block.append(link.text.rstrip()) - - if td.text.rstrip(): - block.append(td.text.rstrip()) - - try: - torrent = { - 'id': block[1].replace("/view/", ""), - 'category': sukebei_categories(block[0]), - 'url': "http://sukebei.nyaa.si{}".format(block[1]), - 'name': block[2], - 'download_url': "http://sukebei.nyaa.si{}".format( - block[4]), - 'magnet': block[5], - 'size': block[6], - 'date': block[7], - 'seeders': block[8], - 'leechers': block[9], - 'completed_downloads': block[10], - } - except IndexError as ie: - pass - - torrents.append(torrent) - - return torrents def sukebei_categories(b): - c = b.replace('/?c=', '') + c = b.replace('?c=', '') cats = c.split('_') cat = cats[0] @@ -227,6 +234,7 @@ def sukebei_categories(b): return category_name + # Pantsu Utils def query_builder(q, params): available_params = ["category", "page", "limit", "userID", "fromID", diff --git a/requirements.txt b/requirements.txt index ef3a347..19198a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests>=2.20.0 beautifulsoup4==4.6.0 +lxml \ No newline at end of file diff --git a/tests/test.py b/tests/test.py index bed0c60..8b9215e 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,7 +1,60 @@ -from NyaaPy import Pantsu, Nyaa +from NyaaPy import Nyaa +from pprint import pprint +from datetime import datetime +import json +import sys +import os + +# Creating a folder for test_files +# ! not included in github project. 
+if not os.path.isdir("test_files"): + os.makedirs("test_files") -pantsu = Pantsu() nyaa = Nyaa() -print(pantsu.search(keyword='koe no katachi', - lang=["es", "ja"], category=[1, 3])) \ No newline at end of file +# Get fresh torrents +dt_latest_torrents_begin = datetime.now() +latest_torrents = nyaa.last_uploads(100) +dt_latest_torrents_end = datetime.now() +with open("test_files/nyaa_latest_torrent_test.json", 'w') as f: + json.dump(latest_torrents, f) + +# Search some nasty stuff +dt_search_begin = datetime.now() +test_search = nyaa.search("kimi no na wa") +dt_search_end = datetime.now() +with open("test_files/nyaa_search_test.json", 'w') as f: + json.dump(test_search, f) + +# Get first torrent from found torrents +dt_single_torrent_begin = datetime.now() +single_torrent = nyaa.get(test_search[0]["id"]) +dt_single_torrent_end = datetime.now() +with open("test_files/nyaa_single_torrent_test.json", 'w') as f: + json.dump(single_torrent, f) + +dt_user_begin = datetime.now() +user_torrents = nyaa.get_user("HorribleSubs") +dt_user_end = datetime.now() +with open("test_files/nyaa_single_user_test.json", 'w') as f: + json.dump(user_torrents, f) + +print( + "Latest torrents time:", + (dt_latest_torrents_end - dt_latest_torrents_begin).microseconds / 1000, + "msec") +print( + "Test search time:", + (dt_search_end - dt_search_begin).microseconds / 1000, + "msec" +) +print( + "Single torrent time:", + (dt_single_torrent_end - dt_single_torrent_begin).microseconds / 1000, + "msec" +) +print( + "Single user time:", + (dt_user_end - dt_user_begin).microseconds / 1000, + "msec" +) diff --git a/tests/test_pantsu.py b/tests/test_pantsu.py new file mode 100644 index 0000000..f77e593 --- /dev/null +++ b/tests/test_pantsu.py @@ -0,0 +1,6 @@ +""" +* Pantsu need some serious work +Regular data single_torrent parser not working from other Nyaa alternatives +Needs some work +""" +print("TODO") diff --git a/tests/test_sukebei.py b/tests/test_sukebei.py new file mode 100644 index 
0000000..eebaaf6 --- /dev/null +++ b/tests/test_sukebei.py @@ -0,0 +1,58 @@ +from NyaaPy import SukebeiNyaa +from datetime import datetime +import json +import os + +# Creating a folder for test_files +# ! not included in github project. +if not os.path.isdir("test_files"): + os.makedirs("test_files") + +nyaa = SukebeiNyaa() + +# Get fresh torrents +dt_latest_torrents_begin = datetime.now() +latest_torrents = nyaa.last_uploads(100) +dt_latest_torrents_end = datetime.now() +with open("test_files/sukebei_latest_torrent_test.json", 'w') as f: + json.dump(latest_torrents, f) + +# Search some nasty stuff +dt_search_begin = datetime.now() +test_search = nyaa.search("G Senjou no maou") +dt_search_end = datetime.now() +with open("test_files/sukebei_search_test.json", 'w') as f: + json.dump(test_search, f) + +# Get first torrent from found torrents +dt_single_torrent_begin = datetime.now() +single_torrent = nyaa.get(test_search[0]["id"]) +dt_single_torrent_end = datetime.now() +with open("test_files/sukebei_single_torrent_test.json", 'w') as f: + json.dump(single_torrent, f) + +dt_user_begin = datetime.now() +user_torrents = nyaa.get_user("RUNBKK") +dt_user_end = datetime.now() +with open("test_files/sukebei_single_user_test.json", 'w') as f: + json.dump(user_torrents, f) + +print( + "Latest torrents time:", + (dt_latest_torrents_end - dt_latest_torrents_begin).microseconds / 1000, + "msec") +print( + "Test search time:", + (dt_search_end - dt_search_begin).microseconds / 1000, + "msec" +) +print( + "Single torrent time:", + (dt_single_torrent_end - dt_single_torrent_begin).microseconds / 1000, + "msec" +) +print( + "Single user time:", + (dt_user_end - dt_user_begin).microseconds / 1000, + "msec" +)