fixed web scraping for nyaa.si

This commit is contained in:
JuanjoSalvador
2017-10-12 23:08:00 +02:00
parent 5cb2079188
commit b632bdda41
2 changed files with 78 additions and 28 deletions

@@ -11,46 +11,96 @@ __copyright__ = '2017 Juanjo Salvador'
 __license__ = 'MIT license'

 class Nyaa():
-    def search(keyword, category, subcategory, filters):
-        r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}".format(filters, category, subcategory, keyword))
+    '''
+    Return a list of dicts with the results of the query.
+    '''
+    def search(keyword, category, subcategory, filters, page):
+        if page:
+            r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}&p={}".format(filters, category, subcategory, keyword, page))
+        else:
+            r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}".format(filters, category, subcategory, keyword))

         soup = BeautifulSoup(r.text, 'html.parser')
         rows = soup.select('table tr')
         torrents = []

         for row in rows:
-            td = row.find_all('td')
-            torrent = []
-            for i in td:
-                if i.find('a'):
-                    torrent.append(i.find('a').get('href'))
-                    text = i.text.rstrip()
-                    if len(text) > 0:
-                        torrent.append(text)
-                else:
-                    text = i.text.rstrip()
-                    if len(text) > 0:
-                        torrent.append(text)
-            torrents.append(torrent)
-
-        print(torrents)
+            block = []
+
+            for td in row.find_all('td'):
+                if td.find_all('a'):
+                    for link in td.find_all('a'):
+                        if link.get('href')[-9:] != '#comments':
+                            block.append(link.get('href'))
+                        if link.text.rstrip():
+                            block.append(link.text)
+                if td.text.rstrip():
+                    block.append(td.text.rstrip())
+
+            try:
+                torrent = {
+                    'category': block[0].replace('/?c=', ''),
+                    'url': "http://nyaa.si{}".format(block[1]),
+                    'name': block[2],
+                    'download_url': "http://nyaa.si{}".format(block[4]),
+                    'magnet': block[5],
+                    'size': block[6],
+                    'date': block[7],
+                    'seeders': block[8],
+                    'leechers': block[9],
+                    'completed_downloads': block[10],
+                }
+                torrents.append(torrent)
+            except IndexError:
+                print("Error! {}".format(block))

         return torrents

     '''
-    Returns an array of OrderedDict with the n last updates of Nyaa.si
+    Returns an array of dicts with the n last updates of Nyaa.si
     '''
     def news(n):
-        nyaa_baseurl = "https://nyaa.si/?page=rss"
-
-        request = requests.get(nyaa_baseurl)
-        response = xmltodict.parse(request.text)
-        results = response['rss']['channel']['item']
-
-        return results[:n]
+        r = requests.get("http://nyaa.si/")
+        soup = BeautifulSoup(r.text, 'html.parser')
+        rows = soup.select('table tr')
+        torrents = []
+
+        for row in rows:
+            block = []
+
+            for td in row.find_all('td'):
+                if td.find_all('a'):
+                    for link in td.find_all('a'):
+                        if link.get('href')[-9:] != '#comments':
+                            block.append(link.get('href'))
+                        if link.text.rstrip():
+                            block.append(link.text)
+                if td.text.rstrip():
+                    block.append(td.text.rstrip())
+
+            try:
+                torrent = {
+                    'category': block[0].replace('/?c=', ''),
+                    'url': "http://nyaa.si{}".format(block[1]),
+                    'name': block[2],
+                    'download_url': "http://nyaa.si{}".format(block[4]),
+                    'magnet': block[5],
+                    'size': block[6],
+                    'date': block[7],
+                    'seeders': block[8],
+                    'leechers': block[9],
+                    'completed_downloads': block[10],
+                }
+                torrents.append(torrent)
+            except IndexError:
+                print("Error! {}".format(block))
+
+        return torrents[:n]

 class NyaaPantsu():
     '''
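For reference, a minimal usage sketch of the reworked scraper after this commit (not part of the diff). The keyword and the category/subcategory/filters/page values mirror the updated tests below and are only illustrative; the printed keys come from the dict built in search and news:

    from NyaaPy import Nyaa

    # Hypothetical usage sketch: positional arguments are
    # keyword, category, subcategory, filters, page (values taken from the tests).
    results = Nyaa.search('koe no katachi 1080', 1, 0, 0, 2)

    for torrent in results:
        # Each result is now a dict instead of a flat list of cell texts.
        print(torrent['name'], torrent['size'], torrent['seeders'])

    # news(n) scrapes the front page and returns the same dict shape.
    for item in Nyaa.news(5):
        print(item['date'], item['name'])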

@@ -3,16 +3,16 @@ from NyaaPy import Nyaa, NyaaPantsu

 # Nyaa.si results
 def nyaa_search():
-    nyaa_query = Nyaa.search('koe no katachi 1080')
+    nyaa_query = Nyaa.search('koe no katachi 1080', 1, 0, 0, 2)

     for nyaa in nyaa_query:
-        print(nyaa)
+        print(nyaa['date'])

 def nyaa_news():
     news = Nyaa.news(5)

     for result in news:
-        print(result['title'])
+        print(result['name'])

 # Nyaa.pantsu.cat results
 def pantsu_search():

@@ -34,7 +34,7 @@ def pantsu_news():
 nyaa_search()
 #pantsu_search()
-#nyaa_news()
+nyaa_news()
 #pantsu_news()

 '''
 r = requests.get("http://nyaa.si/")
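Since search now takes five positional parameters, call sites like the one above are easy to misread. A thin keyword-argument wrapper (hypothetical, not part of this commit) keeps them self-describing; the defaults shown are assumptions based on the values used in the test:

    from NyaaPy import Nyaa

    # Hypothetical helper, not in this commit: forwards named arguments to the
    # positional signature search(keyword, category, subcategory, filters, page).
    def nyaa_search_named(keyword, category=1, subcategory=0, filters=0, page=0):
        return Nyaa.search(keyword, category, subcategory, filters, page)

    for torrent in nyaa_search_named('koe no katachi 1080', page=2):
        print(torrent['name'], torrent['seeders'])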