diff --git a/NyaaPy/nyaa.py b/NyaaPy/nyaa.py index aaf3275..9165c37 100644 --- a/NyaaPy/nyaa.py +++ b/NyaaPy/nyaa.py @@ -1,6 +1,6 @@ import requests import urllib.parse -from NyaaPy import utils_lxml +from NyaaPy import utils class Nyaa: @@ -14,12 +14,11 @@ class Nyaa: # If anything up with nyaa servers let the user know. r.raise_for_status() - return utils_lxml.parse_nyaa( + return utils.parse_nyaa( request_text=r.text, limit=number_of_results + 1 ) -""" def search(self, keyword, **kwargs): user = kwargs.get('user', None) category = kwargs.get('category', 0) @@ -40,21 +39,20 @@ class Nyaa: r = requests.get("{}/{}?f={}&c={}_{}&q={}".format( self.URI, user_uri, filters, category, subcategory, keyword)) - soup = BeautifulSoup(r.text, 'html.parser') - rows = soup.select('table tr') + r.raise_for_status() - return utils.parse_nyaa(rows, limit=None) + return utils.parse_nyaa(request_text=r.text, limit=None) def get(self, id): r = requests.get("{}/view/{}".format(self.URI, id)) - soup = BeautifulSoup(r.text, 'html.parser') - content = soup.findAll("div", {"class": "panel", "id": None}) - - return utils.parse_single(content) + r.raise_for_status() + # ! 
Description not working TODO + # with open("test.html", "w") as f: + # f.write(r.text) + return utils.parse_single(request_text=r.text) def get_user(self, username): r = requests.get("{}/user/{}".format(self.URI, username)) soup = BeautifulSoup(r.text, 'html.parser') return utils.parse_nyaa(soup.select('table tr'), limit=None) -""" diff --git a/NyaaPy/utils.py b/NyaaPy/utils.py index de46e4b..fbbbdf0 100644 --- a/NyaaPy/utils.py +++ b/NyaaPy/utils.py @@ -3,9 +3,12 @@ ''' import re +from lxml import etree +from pprint import pprint + def nyaa_categories(b): - c = b.replace('/?c=', '') + c = b.replace('?c=', '') cats = c.split('_') cat = cats[0] @@ -69,89 +72,104 @@ def nyaa_categories(b): return category_name -def parse_nyaa(table_rows, limit): - if limit == 0: - limit = len(table_rows) + +def parse_nyaa(request_text, limit): + parser = etree.HTMLParser() + tree = etree.fromstring(request_text, parser) torrents = [] - for row in table_rows[:limit]: + # Going through table rows + for tr in tree.xpath("//tbody//tr")[:limit]: block = [] - for td in row.find_all('td'): - if td.find_all('a'): - for link in td.find_all('a'): - if link.get('href')[-9:] != '#comments': - block.append(link.get('href')) - if link.text.rstrip(): - block.append(link.text) + for td in tr.xpath("./td"): + for link in td.xpath("./a"): - if td.text.rstrip(): - block.append(td.text.rstrip()) + href = link.attrib.get("href").split('/')[-1] - if row.has_attr('class'): - if row['class'][0] == 'danger': - block.append("remake") - elif row['class'][0] == 'success': - block.append("trusted") - else: - block.append("default") + # Only caring about non-comment pages. + if href[-9:] != "#comments": + block.append(href) + if link.text and link.text.strip(): + block.append(link.text.strip()) + + if td.text and td.text.strip(): + block.append(td.text.strip()) + + # Add type of torrent based on tr class. 
+ if 'danger' in tr.attrib.get("class"): + block.append("remake") + elif 'success' in tr.attrib.get("class"): + block.append("trusted") + else: + block.append("default") + + # Create torrent object try: torrent = { - 'id': block[1].replace("/view/", ""), + 'id': block[1], 'category': nyaa_categories(block[0]), - 'url': "http://nyaa.si{}".format(block[1]), + 'url': "https://nyaa.si/view/{}".format(block[1]), 'name': block[2], - 'download_url': "http://nyaa.si{}".format(block[4]), - 'magnet': block[5], - 'size': block[6], - 'date': block[7], - 'seeders': block[8], - 'leechers': block[9], - 'completed_downloads': block[10], - 'type': block[11], + 'download_url': "https://nyaa.si/download/{}".format(block[3]), + 'magnet': block[4], + 'size': block[5], + 'date': block[6], + 'seeders': block[7], + 'leechers': block[8], + 'completed_downloads': block[9], + 'type': block[10] } - torrents.append(torrent) - except IndexError as ie: + except IndexError: pass - return torrents -def parse_single(content): + +def parse_single(request_text): + parser = etree.HTMLParser() + tree = etree.fromstring(request_text, parser) + torrent = {} data = [] torrent_files = [] - for row in content[0].find_all('div', {'class': 'row'}): - for div in row.find_all('div', {'class': 'col-md-5'}): - data.append(div.text.replace("\n", "")) + # Find basic uploader info & torrent stats + for row in tree.xpath("//div[@class='row']"): + for div_text in row.xpath("./div[@class='col-md-5']//text()"): + d = div_text.strip() + if d: + data.append(d) - files = content[2].find('div', - {'class', 'torrent-file-list'}).find_all('li') + # Find files, we need only text of the li element(s). 
+ # Sorry about Pycodestyle aka PEP8 (E501) error + for el in tree.xpath("//div[contains(@class, 'torrent-file-list')]//li/text()"): + if el.rstrip(): + torrent_files.append(el) - for file in files: - torrent_files.append(file.text) - - torrent['title'] = re.sub('\n|\r|\t', '', content[0].find('h3', { - "class": "panel-title"}).text.replace("\n", "")) + torrent['title'] = \ + tree.xpath("//h3[@class='panel-title']/text()")[0].strip() torrent['category'] = data[0] - torrent['uploader'] = data[2] - torrent['uploader_profile'] = "https://nyaa.si/user/{}".format(data[2]) - torrent['website'] = re.sub('\t', '', data[4]) - torrent['size'] = data[6] - torrent['date'] = data[1] - torrent['seeders'] = data[3] - torrent['leechers'] = data[5] - torrent['completed'] = data[7] - torrent['hash'] = data[8] - torrent['description'] = re.sub('\t', '', content[1].find('div', { - 'id': 'torrent-description'}).text) + torrent['uploader'] = data[4] + torrent['uploader_profile'] = "http://nyaa.si/user/{}".format(data[4]) + torrent['website'] = data[6] + torrent['size'] = data[8] + torrent['date'] = data[3] + torrent['seeders'] = data[5] + torrent['leechers'] = data[7] + torrent['completed'] = data[9] + torrent['hash'] = data[10] torrent['files'] = torrent_files + torrent['description'] = "" + for s in tree.xpath("//div[@id='torrent-description']"): + torrent['description'] += s.text + return torrent + def parse_sukebei(table_rows, limit): if limit == 0: limit = len(table_rows) @@ -192,6 +210,7 @@ def parse_sukebei(table_rows, limit): return torrents + def sukebei_categories(b): c = b.replace('/?c=', '') cats = c.split('_') @@ -227,6 +246,7 @@ def sukebei_categories(b): return category_name + # Pantsu Utils def query_builder(q, params): available_params = ["category", "page", "limit", "userID", "fromID", diff --git a/NyaaPy/utils_lxml.py b/NyaaPy/utils_lxml.py deleted file mode 100644 index f7ea11a..0000000 --- a/NyaaPy/utils_lxml.py +++ /dev/null @@ -1,265 +0,0 @@ -''' - Module utils 
-''' - -import re -from lxml import etree -from pprint import pprint - - -def nyaa_categories(b): - c = b.replace('?c=', '') - cats = c.split('_') - - cat = cats[0] - subcat = cats[1] - - categories = { - "1": { - "name": "Anime", - "subcats": { - "1": "Anime Music Video", - "2": "English-translated", - "3": "Non-English-translated", - "4": "Raw" - } - }, - "2": { - "name": "Audio", - "subcats": { - "1": "Lossless", - "2": "Lossy" - } - }, - "3": { - "name": "Literature", - "subcats": { - "1": "English-translated", - "2": "Non-English-translated", - "3": "Raw" - } - }, - "4": { - "name": "Live Action", - "subcats": { - "1": "English-translated", - "2": "Idol/Promotional Video", - "3": "Non-English-translated", - "4": "Raw" - } - }, - "5": { - "name": "Pictures", - "subcats": { - "1": "Graphics", - "2": "Photos" - } - }, - "6": { - "name": "Software", - "subcats": { - "1": "Applications", - "2": "Games" - } - } - } - - try: - category_name = "{} - {}".format( - categories[cat]['name'], categories[cat]['subcats'][subcat]) - except Exception: - pass - - return category_name - - -def parse_nyaa(request_text, limit): - parser = etree.HTMLParser() - tree = etree.fromstring(request_text, parser) - - torrents = [] - - # Going through table rows - for tr in tree.xpath("//tbody//tr")[:limit]: - block = [] - - # Find basic torrent data - for td in tr.xpath("./td"): - for link in td.xpath("./a"): - block.append(link.attrib.get("href").split('/')[-1]) - - if link.text and link.text.rstrip(): - block.append(link.text) - - if td.text and td.text.rstrip(): - block.append(td.text) - - # Add type of torrent based on tr class. 
- if 'danger' in tr.attrib.get("class"): - block.append("remake") - elif 'success' in tr.attrib.get("class"): - block.append("trusted") - else: - block.append("default") - - # Create torrent object - try: - torrent = { - 'id': block[1], - 'category': nyaa_categories(block[0]), - 'url': "https://nyaa.si/view/{}".format(block[1]), - 'name': block[2], - 'download_url': "https://nyaa.si/download/{}".format(block[3]), - 'magnet': block[4], - 'size': block[5], - 'date': block[6], - 'seeders': block[7], - 'leechers': block[8], - 'completed_downloads': block[9], - 'type': block[10] - } - torrents.append(torrent) - except IndexError: - pass - return torrents - - -# TODO: Parse single is not done yet. -def parse_single(content): - torrent = {} - data = [] - torrent_files = [] - - for row in content[0].find_all('div', {'class': 'row'}): - for div in row.find_all('div', {'class': 'col-md-5'}): - data.append(div.text.replace("\n", "")) - - files = content[2].find('div', - {'class', 'torrent-file-list'}).find_all('li') - - for file in files: - torrent_files.append(file.text) - - torrent['title'] = re.sub('\n|\r|\t', '', content[0].find('h3', { - "class": "panel-title"}).text.replace("\n", "")) - torrent['category'] = data[0] - torrent['uploader'] = data[2] - torrent['uploader_profile'] = "https://nyaa.si/user/{}".format(data[2]) - torrent['website'] = re.sub('\t', '', data[4]) - torrent['size'] = data[6] - torrent['date'] = data[1] - torrent['seeders'] = data[3] - torrent['leechers'] = data[5] - torrent['completed'] = data[7] - torrent['hash'] = data[8] - torrent['description'] = re.sub('\t', '', content[1].find('div', { - 'id': 'torrent-description'}).text) - torrent['files'] = torrent_files - - return torrent - - -# TODO: Not ready -def parse_sukebei(table_rows, limit): - if limit == 0: - limit = len(table_rows) - - torrents = [] - - for row in table_rows[:limit]: - block = [] - - for td in row.find_all('td'): - for link in td.find_all('a'): - if link.get('href')[-9:] != 
'#comments': - block.append(link.get('href')) - block.append(link.text.rstrip()) - - if td.text.rstrip(): - block.append(td.text.rstrip()) - - try: - torrent = { - 'id': block[1].replace("/view/", ""), - 'category': sukebei_categories(block[0]), - 'url': "http://sukebei.nyaa.si{}".format(block[1]), - 'name': block[2], - 'download_url': "http://sukebei.nyaa.si{}".format( - block[4]), - 'magnet': block[5], - 'size': block[6], - 'date': block[7], - 'seeders': block[8], - 'leechers': block[9], - 'completed_downloads': block[10], - } - except IndexError as ie: - pass - - torrents.append(torrent) - - return torrents - - -# TODO Not ready -def sukebei_categories(b): - c = b.replace('/?c=', '') - cats = c.split('_') - - cat = cats[0] - subcat = cats[1] - - categories = { - "1": { - "name": "Art", - "subcats": { - "1": "Anime", - "2": "Doujinshi", - "3": "Games", - "4": "Manga", - "5": "Pictures", - } - }, - "2": { - "name": "Real Life", - "subcats": { - "1": "Photobooks & Pictures", - "2": "Videos" - } - } - } - - try: - category_name = "{} - {}".format( - categories[cat]['name'], categories[cat]['subcats'][subcat]) - except Exception: - pass - - return category_name - - -# TODO: Not tested -# Pantsu Utils -def query_builder(q, params): - available_params = ["category", "page", "limit", "userID", "fromID", - "status", "maxage", "toDate", "fromDate", - "dateType", "minSize", "maxSize", "sizeType", - "sort", "order", "lang"] - query = "?q={}".format(q.replace(" ", "+")) - - for param, value in params.items(): - if param in available_params: - if (param != "category" and param != "status" and - param != "lang"): - query += "&{}={}".format(param, value) - elif param == "category": - query += "&c={}_{}".format(value[0], value[1]) - - elif param == "status": - query += "&s={}".format(value) - - elif param == "lang": - for lang in value: - query += "&lang={}".format(lang) - - return query diff --git a/tests/test.py b/tests/test.py index 233f817..caf5875 100644 --- a/tests/test.py 
+++ b/tests/test.py @@ -3,7 +3,20 @@ from pprint import pprint # pantsu = Pantsu() nyaa = Nyaa() -pprint(nyaa.last_uploads(5)) + +# Get fresh torrents +print("Latest torrents:") +latest_torrents = nyaa.last_uploads(5) + +# I'd like to watch Tenki no ko, but not uploaded yet. +print("Search results for Kimi no Na wa:") +test_search = nyaa.search("Kimi no Na wa") +pprint(test_search) + +# Get first torrent from found torrents +print("First result torrent info:") +single_torrent = nyaa.get(test_search[0]["id"]) +pprint(single_torrent) """ print(pantsu.search(keyword='koe no katachi',