Nyaa.si & sukebei.nyaa.si LXML fully ready

Ferenc Nánási
2020-02-01 15:11:35 +01:00
parent bf01a922f0
commit fd28e65c8b
8 changed files with 190 additions and 109 deletions
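
For orientation, a minimal usage sketch of the reworked lxml-based classes (a sketch only, not part of the commit): the imports, constructors, and keyword arguments are the ones visible in the diffs below, and the dict keys come from the torrent objects built in NyaaPy/utils.py; actual results depend on what the site serves at request time.

    from NyaaPy import Nyaa, SukebeiNyaa

    nyaa = Nyaa()                       # uses utils.TorrentSite.NYAASI internally
    latest = nyaa.last_uploads(10)      # HTML parsed with lxml in utils.parse_nyaa
    results = nyaa.search("kimi no na wa", category=1, subcategory=2)
    print(results[0]["name"], results[0]["download_url"])

    sukebei = SukebeiNyaa()             # uses utils.TorrentSite.SUKEBEINYAASI
    print(sukebei.last_uploads(5)[0]["url"])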

.gitignore vendored

@@ -3,4 +3,5 @@ dist/
 nyaapy.egg-info
 .vscode
 env/
 *.pyc
+test_files

NyaaPy/nyaa.py

@@ -1,25 +1,27 @@
 import requests
+import urllib.parse
 from NyaaPy import utils
 class Nyaa:
     def __init__(self):
-        self.URI = "http://nyaa.si"
+        self.SITE = utils.TorrentSite.NYAASI
     def last_uploads(self, number_of_results):
-        r = requests.get(self.URI)
+        r = requests.get(self.SITE.value)
         # If anything up with nyaa servers let the user know.
         r.raise_for_status()
         return utils.parse_nyaa(
             request_text=r.text,
-            limit=number_of_results + 1
+            limit=number_of_results + 1,
+            site=self.SITE
         )
     def search(self, keyword, **kwargs):
+        url = self.SITE.value
         user = kwargs.get('user', None)
         category = kwargs.get('category', 0)
         subcategory = kwargs.get('subcategory', 0)
@@ -33,24 +35,32 @@ class Nyaa:
         if page > 0:
             r = requests.get("{}/{}?f={}&c={}_{}&q={}&p={}".format(
-                self.URI, user_uri, filters, category, subcategory, keyword,
+                url, user_uri, filters, category, subcategory, keyword,
                 page))
         else:
             r = requests.get("{}/{}?f={}&c={}_{}&q={}".format(
-                self.URI, user_uri, filters, category, subcategory, keyword))
+                url, user_uri, filters, category, subcategory, keyword))
         r.raise_for_status()
-        return utils.parse_nyaa(request_text=r.text, limit=None)
+        return utils.parse_nyaa(
+            request_text=r.text,
+            limit=None,
+            site=self.SITE
+        )
     def get(self, id):
-        r = requests.get("{}/view/{}".format(self.URI, id))
+        r = requests.get("{}/view/{}".format(self.SITE.value, id))
         r.raise_for_status()
-        return utils.parse_single(request_text=r.text)
+        return utils.parse_single(request_text=r.text, site=self.SITE)
     def get_user(self, username):
-        r = requests.get("{}/user/{}".format(self.URI, username))
+        r = requests.get("{}/user/{}".format(self.SITE.value, username))
         r.raise_for_status()
-        return utils.parse_nyaa(request_text=r.text, limit=None)
+        return utils.parse_nyaa(
+            request_text=r.text,
+            limit=None,
+            site=self.SITE
+        )

NyaaPy/pantsu.py

@@ -1,18 +1,24 @@
 import requests
 from NyaaPy import utils
 class Pantsu:
     def __init__(self):
         self.BASE_URL = "https://nyaa.pantsu.cat/api"
+        self.SITE = utils.TorrentSite.NYAANET
     def last_uploads(self, number_of_results):
-        r = requests.get(self.URI)
-        soup = BeautifulSoup(r.text, 'html.parser')
-        rows = soup.select('table tr')
-        return utils.parse_nyaa(rows, limit=number_of_results + 1)
+        r = requests.get(self.SITE.value)
+        r.raise_for_status()
+        with open("test.html", "w") as f:
+            f.write(r.text)
+        return utils.parse_nyaa(
+            request_text=r.text,
+            limit=number_of_results + 1,
+            site=self.SITE
+        )
     # Torrents - GET
     def search(self, keyword, **kwargs):
@@ -23,10 +29,11 @@ class Pantsu:
     def view(self, item_id):
         request = requests.get("{}/view/{}".format(self.BASE_URL, item_id))
+        request.raise_for_status()
         return request.json()
     # Torrents - POST
     def upload(self):
         return "Work in progress!"
@@ -34,7 +41,6 @@ class Pantsu:
         return "Work in progress!"
     # Users
     def login(self, username, password):
         login = requests.post("{}/login/".format(
             self.BASE_URL), data={'username': username, 'password': password})

NyaaPy/sukebei.py

@@ -1,9 +1,14 @@
 import requests
-from bs4 import BeautifulSoup
 from NyaaPy import utils
 class SukebeiNyaa:
+    def __init__(self):
+        self.SITE = utils.TorrentSite.SUKEBEINYAASI
     def search(self, keyword, **kwargs):
+        uri = self.SITE.value
         category = kwargs.get('category', 0)
         subcategory = kwargs.get('subcategory', 0)
         filters = kwargs.get('filters', 0)
@@ -11,37 +16,37 @@ class SukebeiNyaa:
         if page > 0:
             r = requests.get("{}/?f={}&c={}_{}&q={}&p={}".format(
-                "http://sukebei.nyaa.si", filters, category, subcategory,
+                uri, filters, category, subcategory,
                 keyword, page))
         else:
             r = requests.get("{}/?f={}&c={}_{}&q={}".format(
-                "http://sukebei.nyaa.si", filters, category, subcategory,
+                uri, filters, category, subcategory,
                 keyword))
-        soup = BeautifulSoup(r.text, 'html.parser')
-        rows = soup.select('table tr')
-        return utils.parse_nyaa(rows, limit=None)
+        r.raise_for_status()
+        return utils.parse_nyaa(r.text, limit=None, site=self.SITE)
     def get(self, id):
-        r = requests.get("http://sukebei.nyaa.si/view/{}".format(id))
-        soup = BeautifulSoup(r.text, 'html.parser')
-        content = soup.findAll("div", {"class": "panel", "id": None})
-        return utils.parse_single(content)
+        r = requests.get("{}/view/{}".format(self.SITE.value, id))
+        r.raise_for_status()
+        return utils.parse_single(r.text, self.SITE)
     def get_user(self, username):
-        r = requests.get("http://sukebei.nyaa.si/user/{}".format(username))
-        soup = BeautifulSoup(r.text, 'html.parser')
-        return utils.parse_nyaa(soup.select('table tr'), limit=None)
+        r = requests.get("{}/user/{}".format(self.SITE.value, username))
+        r.raise_for_status()
+        return utils.parse_nyaa(r.text, limit=None, site=self.SITE)
-    def news(self, number_of_results):
-        r = requests.get("http://sukebei.nyaa.si/")
-        soup = BeautifulSoup(r.text, 'html.parser')
-        rows = soup.select('table tr')
-        return utils.parse_sukebei(rows, limit=number_of_results + 1)
+    def last_uploads(self, number_of_results):
+        r = requests.get(self.SITE.value)
+        r.raise_for_status()
+        return utils.parse_nyaa(
+            r.text,
+            limit=number_of_results + 1,
+            site=self.SITE
+        )
 class SukebeiPantsu:

NyaaPy/utils.py

@@ -3,9 +3,22 @@
 '''
 import re
+from enum import Enum
 from lxml import etree
+class TorrentSite(Enum):
+    """
+    Contains torrent sites
+    """
+    NYAASI = "https://nyaa.si"
+    SUKEBEINYAASI = "https://sukebei.nyaa.si"
+    # * nyaa.pantsu.cat redirects to nyaa.net
+    NYAANET = "https://nyaa.net"
+    SUKEBEINYAANET = "https://sukebei.nyaa.net"
 def nyaa_categories(b):
     c = b.replace('?c=', '')
     cats = c.split('_')
@@ -72,10 +85,13 @@ def nyaa_categories(b):
     return category_name
-def parse_nyaa(request_text, limit):
+def parse_nyaa(request_text, limit, site):
     parser = etree.HTMLParser()
     tree = etree.fromstring(request_text, parser)
+    # Put proper domain here.
+    uri = site.value
     torrents = []
     # Going through table rows
@@ -94,25 +110,36 @@ def parse_nyaa(request_text, limit):
                 if link.text and link.text.strip():
                     block.append(link.text.strip())
-            if td.text and td.text.strip():
+            if td.text is not None and td.text.strip():
                 block.append(td.text.strip())
         # Add type of torrent based on tr class.
-        if 'danger' in tr.attrib.get("class"):
-            block.append("remake")
-        elif 'success' in tr.attrib.get("class"):
-            block.append("trusted")
+        if tr.attrib.get("class") is not None:
+            if 'danger' in tr.attrib.get("class"):
+                block.append("remake")
+            elif 'success' in tr.attrib.get("class"):
+                block.append("trusted")
+            else:
+                block.append("default")
         else:
             block.append("default")
+        # Decide category.
+        if site in [TorrentSite.NYAASI, TorrentSite.NYAANET]:
+            category = nyaa_categories(block[0])
+        elif site in [TorrentSite.SUKEBEINYAASI, TorrentSite.SUKEBEINYAANET]:
+            category = sukebei_categories(block[0])
+        else:
+            raise ArgumentException("Unknown TorrentSite received!")
         # Create torrent object
         try:
             torrent = {
                 'id': block[1],
-                'category': nyaa_categories(block[0]),
-                'url': "https://nyaa.si/view/{}".format(block[1]),
+                'category': category,
+                'url': "{}/view/{}".format(uri, block[1]),
                 'name': block[2],
-                'download_url': "https://nyaa.si/download/{}".format(block[3]),
+                'download_url': "{}/download/{}".format(uri, block[3]),
                 'magnet': block[4],
                 'size': block[5],
                 'date': block[6],
@@ -127,10 +154,13 @@ def parse_nyaa(request_text, limit):
     return torrents
-def parse_single(request_text):
+def parse_single(request_text, site):
     parser = etree.HTMLParser()
     tree = etree.fromstring(request_text, parser)
+    # Put proper domain here.
+    uri = site.value
     torrent = {}
     data = []
     torrent_files = []
@@ -152,7 +182,7 @@ def parse_single(request_text):
         tree.xpath("//h3[@class='panel-title']/text()")[0].strip()
     torrent['category'] = data[0]
     torrent['uploader'] = data[4]
-    torrent['uploader_profile'] = "http://nyaa.si/user/{}".format(data[4])
+    torrent['uploader_profile'] = "{}/user/{}".format(uri, data[4])
     torrent['website'] = data[6]
     torrent['size'] = data[8]
     torrent['date'] = data[3]
@@ -169,49 +199,8 @@ def parse_single(request_text):
     return torrent
-def parse_sukebei(table_rows, limit):
-    if limit == 0:
-        limit = len(table_rows)
-    torrents = []
-    for row in table_rows[:limit]:
-        block = []
-        for td in row.find_all('td'):
-            for link in td.find_all('a'):
-                if link.get('href')[-9:] != '#comments':
-                    block.append(link.get('href'))
-                    block.append(link.text.rstrip())
-            if td.text.rstrip():
-                block.append(td.text.rstrip())
-        try:
-            torrent = {
-                'id': block[1].replace("/view/", ""),
-                'category': sukebei_categories(block[0]),
-                'url': "http://sukebei.nyaa.si{}".format(block[1]),
-                'name': block[2],
-                'download_url': "http://sukebei.nyaa.si{}".format(
-                    block[4]),
-                'magnet': block[5],
-                'size': block[6],
-                'date': block[7],
-                'seeders': block[8],
-                'leechers': block[9],
-                'completed_downloads': block[10],
-            }
-        except IndexError as ie:
-            pass
-        torrents.append(torrent)
-    return torrents
 def sukebei_categories(b):
-    c = b.replace('/?c=', '')
+    c = b.replace('?c=', '')
     cats = c.split('_')
     cat = cats[0]

tests/test_nyaa.py

@@ -1,40 +1,51 @@
-from NyaaPy import Pantsu, Nyaa
+from NyaaPy import Nyaa
 from pprint import pprint
 from datetime import datetime
+import json
+import sys
+import os
+# Creating a folder for test_files
+# ! not included in github project.
+if not os.path.isdir("test_files"):
+    os.makedirs("test_files")
-# pantsu = Pantsu()
 nyaa = Nyaa()
 # Get fresh torrents
 dt_latest_torrents_begin = datetime.now()
 latest_torrents = nyaa.last_uploads(100)
 dt_latest_torrents_end = datetime.now()
+with open("test_files/nyaa_latest_torrent_test.json", 'w') as f:
+    json.dump(latest_torrents, f)
-# I'd like to watch Tenki no ko, but not uploaded yet.
+# Search some nasty stuff
 dt_search_begin = datetime.now()
-test_search = nyaa.search("Kimi no Na wa")
+test_search = nyaa.search("kimi no na wa")
 dt_search_end = datetime.now()
-# pprint(test_search)
+with open("test_files/nyaa_search_test.json", 'w') as f:
+    json.dump(test_search, f)
 # Get first torrent from found torrents
-# print("First result torrent info:")
 dt_single_torrent_begin = datetime.now()
 single_torrent = nyaa.get(test_search[0]["id"])
 dt_single_torrent_end = datetime.now()
-#pprint(single_torrent)
+with open("test_files/nyaa_single_torrent_test.json", 'w') as f:
+    json.dump(single_torrent, f)
 dt_user_begin = datetime.now()
-user_torrents = nyaa.get_user("Lilith-Raws")
+user_torrents = nyaa.get_user("HorribleSubs")
 dt_user_end = datetime.now()
-#pprint(user_torrents)
+with open("test_files/nyaa_single_user_test.json", 'w') as f:
+    json.dump(user_torrents, f)
 print(
     "Latest torrents time:",
     (dt_latest_torrents_end - dt_latest_torrents_begin).microseconds / 1000,
     "msec")
 print(
     "Test search time:",
-    (dt_search_end - dt_search_begin).microseconds/ 1000,
+    (dt_search_end - dt_search_begin).microseconds / 1000,
     "msec"
 )
 print(
@@ -44,11 +55,6 @@ print(
 )
 print(
     "Single user time:",
-    (dt_user_end - dt_user_begin ).microseconds / 1000,
+    (dt_user_end - dt_user_begin).microseconds / 1000,
     "msec"
 )
-"""
-print(pantsu.search(keyword='koe no katachi',
-      lang=["es", "ja"], category=[1, 3]))
-"""

tests/test_pantsu.py Normal file

@@ -0,0 +1,6 @@
"""
* Pantsu need some serious work
Regular data single_torrent parser not working from other Nyaa alternatives
Needs some work
"""
print("TODO")

tests/test_sukebei.py Normal file

@@ -0,0 +1,58 @@
+from NyaaPy import SukebeiNyaa
+from datetime import datetime
+import json
+import os
+# Creating a folder for test_files
+# ! not included in github project.
+if not os.path.isdir("test_files"):
+    os.makedirs("test_files")
+nyaa = SukebeiNyaa()
+# Get fresh torrents
+dt_latest_torrents_begin = datetime.now()
+latest_torrents = nyaa.last_uploads(100)
+dt_latest_torrents_end = datetime.now()
+with open("test_files/sukebei_latest_torrent_test.json", 'w') as f:
+    json.dump(latest_torrents, f)
+# Search some nasty stuff
+dt_search_begin = datetime.now()
+test_search = nyaa.search("G Senjou no maou")
+dt_search_end = datetime.now()
+with open("test_files/sukebei_search_test.json", 'w') as f:
+    json.dump(test_search, f)
+# Get first torrent from found torrents
+dt_single_torrent_begin = datetime.now()
+single_torrent = nyaa.get(test_search[0]["id"])
+dt_single_torrent_end = datetime.now()
+with open("test_files/sukebei_single_torrent_test.json", 'w') as f:
+    json.dump(single_torrent, f)
+dt_user_begin = datetime.now()
+user_torrents = nyaa.get_user("RUNBKK")
+dt_user_end = datetime.now()
+with open("test_files/sukebei_single_user_test.json", 'w') as f:
+    json.dump(user_torrents, f)
+print(
+    "Latest torrents time:",
+    (dt_latest_torrents_end - dt_latest_torrents_begin).microseconds / 1000,
+    "msec")
+print(
+    "Test search time:",
+    (dt_search_end - dt_search_begin).microseconds / 1000,
+    "msec"
+)
+print(
+    "Single torrent time:",
+    (dt_single_torrent_end - dt_single_torrent_begin).microseconds / 1000,
+    "msec"
+)
+print(
+    "Single user time:",
+    (dt_user_end - dt_user_begin).microseconds / 1000,
+    "msec"
+)