Nyaa.si & sukebei.nyaa.si LXML fully ready

This commit is contained in:
Ferenc Nánási
2020-02-01 15:11:35 +01:00
parent bf01a922f0
commit 5c93e516ba
6 changed files with 126 additions and 87 deletions

3
.gitignore vendored
View File

@@ -3,4 +3,5 @@ dist/
nyaapy.egg-info nyaapy.egg-info
.vscode .vscode
env/ env/
*.pyc *.pyc
test_files

View File

@@ -1,12 +1,11 @@
import requests import requests
import urllib.parse
from NyaaPy import utils from NyaaPy import utils
class Nyaa: class Nyaa:
def __init__(self): def __init__(self):
self.URI = "http://nyaa.si" self.URI = "https://nyaa.si"
def last_uploads(self, number_of_results): def last_uploads(self, number_of_results):
r = requests.get(self.URI) r = requests.get(self.URI)

View File

@@ -1,8 +1,12 @@
import requests import requests
from bs4 import BeautifulSoup
from NyaaPy import utils from NyaaPy import utils
class SukebeiNyaa: class SukebeiNyaa:
def __init__(self):
self.URI = "https://sukebei.nyaa.si"
def search(self, keyword, **kwargs): def search(self, keyword, **kwargs):
category = kwargs.get('category', 0) category = kwargs.get('category', 0)
subcategory = kwargs.get('subcategory', 0) subcategory = kwargs.get('subcategory', 0)
@@ -11,37 +15,37 @@ class SukebeiNyaa:
if page > 0: if page > 0:
r = requests.get("{}/?f={}&c={}_{}&q={}&p={}".format( r = requests.get("{}/?f={}&c={}_{}&q={}&p={}".format(
"http://sukebei.nyaa.si", filters, category, subcategory, self.URI, filters, category, subcategory,
keyword, page)) keyword, page))
else: else:
r = requests.get("{}/?f={}&c={}_{}&q={}".format( r = requests.get("{}/?f={}&c={}_{}&q={}".format(
"http://sukebei.nyaa.si", filters, category, subcategory, self.URI, filters, category, subcategory,
keyword)) keyword))
soup = BeautifulSoup(r.text, 'html.parser') r.raise_for_status()
rows = soup.select('table tr') return utils.parse_nyaa(r.text, limit=None, sukebei=True)
return utils.parse_nyaa(rows, limit=None)
def get(self, id): def get(self, id):
r = requests.get("http://sukebei.nyaa.si/view/{}".format(id)) r = requests.get("{}/view/{}".format(self.URI, id))
soup = BeautifulSoup(r.text, 'html.parser') r.raise_for_status()
content = soup.findAll("div", {"class": "panel", "id": None})
return utils.parse_single(content) return utils.parse_single(r.text, sukebei=True)
def get_user(self, username): def get_user(self, username):
r = requests.get("http://sukebei.nyaa.si/user/{}".format(username)) r = requests.get("{}/user/{}".format(self.URI, username))
soup = BeautifulSoup(r.text, 'html.parser') r.raise_for_status()
return utils.parse_nyaa(soup.select('table tr'), limit=None) return utils.parse_nyaa(r.text, limit=None, sukebei=True)
def news(self, number_of_results): def last_uploads(self, number_of_results):
r = requests.get("http://sukebei.nyaa.si/") r = requests.get(self.URI)
soup = BeautifulSoup(r.text, 'html.parser') r.raise_for_status()
rows = soup.select('table tr')
return utils.parse_sukebei(rows, limit=number_of_results + 1) return utils.parse_nyaa(
r.text,
limit=number_of_results + 1,
sukebei=True
)
class SukebeiPantsu: class SukebeiPantsu:

View File

@@ -72,10 +72,15 @@ def nyaa_categories(b):
return category_name return category_name
def parse_nyaa(request_text, limit): def parse_nyaa(request_text, limit, sukebei=False):
parser = etree.HTMLParser() parser = etree.HTMLParser()
tree = etree.fromstring(request_text, parser) tree = etree.fromstring(request_text, parser)
if sukebei is False:
uri = "https://nyaa.si"
else:
uri = "https://sukebei.nyaa.si"
torrents = [] torrents = []
# Going through table rows # Going through table rows
@@ -109,10 +114,10 @@ def parse_nyaa(request_text, limit):
try: try:
torrent = { torrent = {
'id': block[1], 'id': block[1],
'category': nyaa_categories(block[0]), 'category': nyaa_categories(block[0]) if sukebei is False else sukebei_categories(block[0]),
'url': "https://nyaa.si/view/{}".format(block[1]), 'url': "{}/view/{}".format(uri, block[1]),
'name': block[2], 'name': block[2],
'download_url': "https://nyaa.si/download/{}".format(block[3]), 'download_url': "{}/download/{}".format(uri, block[3]),
'magnet': block[4], 'magnet': block[4],
'size': block[5], 'size': block[5],
'date': block[6], 'date': block[6],
@@ -127,10 +132,15 @@ def parse_nyaa(request_text, limit):
return torrents return torrents
def parse_single(request_text): def parse_single(request_text, sukebei=False):
parser = etree.HTMLParser() parser = etree.HTMLParser()
tree = etree.fromstring(request_text, parser) tree = etree.fromstring(request_text, parser)
if sukebei is False:
uri = "https://nyaa.si"
else:
uri = "https://sukebei.nyaa.si"
torrent = {} torrent = {}
data = [] data = []
torrent_files = [] torrent_files = []
@@ -152,7 +162,7 @@ def parse_single(request_text):
tree.xpath("//h3[@class='panel-title']/text()")[0].strip() tree.xpath("//h3[@class='panel-title']/text()")[0].strip()
torrent['category'] = data[0] torrent['category'] = data[0]
torrent['uploader'] = data[4] torrent['uploader'] = data[4]
torrent['uploader_profile'] = "http://nyaa.si/user/{}".format(data[4]) torrent['uploader_profile'] = "{}/user/{}".format(uri, data[4])
torrent['website'] = data[6] torrent['website'] = data[6]
torrent['size'] = data[8] torrent['size'] = data[8]
torrent['date'] = data[3] torrent['date'] = data[3]
@@ -169,49 +179,8 @@ def parse_single(request_text):
return torrent return torrent
def parse_sukebei(table_rows, limit):
    """Turn sukebei listing table rows into a list of torrent dicts.

    Args:
        table_rows: Sequence of row elements exposing ``find_all``
            (presumably BeautifulSoup ``<tr>`` tags -- confirm with caller).
        limit: Maximum number of leading rows to inspect; ``0`` means
            "all rows".

    Returns:
        A list with one dict per row that carried every expected cell.
        Keys: ``id``, ``category``, ``url``, ``name``, ``download_url``,
        ``magnet``, ``size``, ``date``, ``seeders``, ``leechers`` and
        ``completed_downloads``.
    """
    if limit == 0:
        limit = len(table_rows)

    torrents = []

    for row in table_rows[:limit]:
        # Flat list of everything found in the row, in document order:
        # link hrefs, link texts and non-empty cell texts.
        block = []

        for td in row.find_all('td'):
            for link in td.find_all('a'):
                href = link.get('href')
                # Skip anchors without an href and the "#comments" links,
                # which would otherwise shift the positional indexes below.
                if href is not None and not href.endswith('#comments'):
                    block.append(href)
                    block.append(link.text.rstrip())

            cell_text = td.text.rstrip()
            if cell_text:
                block.append(cell_text)

        try:
            torrent = {
                'id': block[1].replace("/view/", ""),
                'category': sukebei_categories(block[0]),
                'url': "http://sukebei.nyaa.si{}".format(block[1]),
                'name': block[2],
                'download_url': "http://sukebei.nyaa.si{}".format(
                    block[4]),
                'magnet': block[5],
                'size': block[6],
                'date': block[7],
                'seeders': block[8],
                'leechers': block[9],
                'completed_downloads': block[10],
            }
        except IndexError:
            # Row lacks the expected cells (e.g. a header row without
            # <td>). Skip it rather than appending an undefined or stale
            # ``torrent`` from a previous iteration.
            continue

        torrents.append(torrent)

    return torrents
def sukebei_categories(b): def sukebei_categories(b):
c = b.replace('/?c=', '') c = b.replace('?c=', '')
cats = c.split('_') cats = c.split('_')
cat = cats[0] cat = cats[0]

View File

@@ -1,40 +1,51 @@
from NyaaPy import Pantsu, Nyaa from NyaaPy import Nyaa
from pprint import pprint from pprint import pprint
from datetime import datetime from datetime import datetime
import json
import sys
import os
# Creating a folder for test_files
# ! not included in github project.
if not os.path.isdir("test_files"):
os.makedirs("test_files")
# pantsu = Pantsu()
nyaa = Nyaa() nyaa = Nyaa()
# Get fresh torrents # Get fresh torrents
dt_latest_torrents_begin = datetime.now() dt_latest_torrents_begin = datetime.now()
latest_torrents = nyaa.last_uploads(100) latest_torrents = nyaa.last_uploads(100)
dt_latest_torrents_end = datetime.now() dt_latest_torrents_end = datetime.now()
with open("test_files/nyaa_latest_torrent_test.json", 'w') as f:
json.dump(latest_torrents, f)
# I'd like to watch Tenki no ko, but not uploaded yet. # Search some nasty stuff
dt_search_begin = datetime.now() dt_search_begin = datetime.now()
test_search = nyaa.search("Kimi no Na wa") test_search = nyaa.search("kimi no na wa")
dt_search_end = datetime.now() dt_search_end = datetime.now()
# pprint(test_search) with open("test_files/nyaa_search_test.json", 'w') as f:
json.dump(test_search, f)
# Get first torrent from found torrents # Get first torrent from found torrents
# print("First result torrent info:")
dt_single_torrent_begin = datetime.now() dt_single_torrent_begin = datetime.now()
single_torrent = nyaa.get(test_search[0]["id"]) single_torrent = nyaa.get(test_search[0]["id"])
dt_single_torrent_end = datetime.now() dt_single_torrent_end = datetime.now()
#pprint(single_torrent) with open("test_files/nyaa_single_torrent_test.json", 'w') as f:
json.dump(single_torrent, f)
dt_user_begin = datetime.now() dt_user_begin = datetime.now()
user_torrents = nyaa.get_user("Lilith-Raws") user_torrents = nyaa.get_user("HorribleSubs")
dt_user_end = datetime.now() dt_user_end = datetime.now()
#pprint(user_torrents) with open("test_files/nyaa_single_user_test.json", 'w') as f:
json.dump(user_torrents, f)
print( print(
"Latest torrents time:", "Latest torrents time:",
(dt_latest_torrents_end - dt_latest_torrents_begin).microseconds / 1000, (dt_latest_torrents_end - dt_latest_torrents_begin).microseconds / 1000,
"msec") "msec")
print( print(
"Test search time:", "Test search time:",
(dt_search_end - dt_search_begin).microseconds/ 1000, (dt_search_end - dt_search_begin).microseconds / 1000,
"msec" "msec"
) )
print( print(
@@ -44,11 +55,6 @@ print(
) )
print( print(
"Single user time:", "Single user time:",
(dt_user_end - dt_user_begin ).microseconds / 1000, (dt_user_end - dt_user_begin).microseconds / 1000,
"msec" "msec"
) )
"""
print(pantsu.search(keyword='koe no katachi',
lang=["es", "ja"], category=[1, 3]))
"""

60
tests/test_sukebei.py Normal file
View File

@@ -0,0 +1,60 @@
from NyaaPy import SukebeiNyaa
from pprint import pprint
from datetime import datetime
import json
import sys
import os
# Create the folder that receives the JSON dumps written below.
# NOTE: this folder is not included in the GitHub project.
os.makedirs("test_files", exist_ok=True)


def _elapsed_ms(begin, end):
    """Return the full elapsed time between two datetimes in milliseconds."""
    # timedelta.microseconds is only the sub-second component, so it
    # silently drops whole seconds; total_seconds() covers the entire span.
    return (end - begin).total_seconds() * 1000


nyaa = SukebeiNyaa()

# Get fresh torrents
dt_latest_torrents_begin = datetime.now()
latest_torrents = nyaa.last_uploads(100)
dt_latest_torrents_end = datetime.now()
with open("test_files/sukebei_latest_torrent_test.json", 'w') as f:
    json.dump(latest_torrents, f)

# Run a keyword search
dt_search_begin = datetime.now()
test_search = nyaa.search("G Senjou no maou")
dt_search_end = datetime.now()
with open("test_files/sukebei_search_test.json", 'w') as f:
    json.dump(test_search, f)

# Get first torrent from found torrents
dt_single_torrent_begin = datetime.now()
single_torrent = nyaa.get(test_search[0]["id"])
dt_single_torrent_end = datetime.now()
with open("test_files/sukebei_single_torrent_test.json", 'w') as f:
    json.dump(single_torrent, f)

# Get the torrents uploaded by a single user
dt_user_begin = datetime.now()
user_torrents = nyaa.get_user("RUNBKK")
dt_user_end = datetime.now()
with open("test_files/sukebei_single_user_test.json", 'w') as f:
    json.dump(user_torrents, f)

# Report how long each request (including parsing) took.
print(
    "Latest torrents time:",
    _elapsed_ms(dt_latest_torrents_begin, dt_latest_torrents_end),
    "msec")
print(
    "Test search time:",
    _elapsed_ms(dt_search_begin, dt_search_end),
    "msec"
)
print(
    "Single torrent time:",
    _elapsed_ms(dt_single_torrent_begin, dt_single_torrent_end),
    "msec"
)
print(
    "Single user time:",
    _elapsed_ms(dt_user_begin, dt_user_end),
    "msec"
)