fixed web scraping for nyaa.si
This commit is contained in:
@@ -11,46 +11,96 @@ __copyright__ = '2017 Juanjo Salvador'
|
|||||||
__license__ = 'MIT license'
|
__license__ = 'MIT license'
|
||||||
|
|
||||||
class Nyaa():
|
class Nyaa():
|
||||||
def search(keyword, category, subcategory, filters):
|
'''
|
||||||
r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}".format(filters, category, subcategory, keyword))
|
Return a list of dicts with the results of the query.
|
||||||
|
'''
|
||||||
|
def search(keyword, category, subcategory, filters, page):
|
||||||
|
if page:
|
||||||
|
r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}&p={}".format(filters, category, subcategory, keyword, page))
|
||||||
|
else:
|
||||||
|
r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}".format(filters, category, subcategory, keyword))
|
||||||
|
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
soup = BeautifulSoup(r.text, 'html.parser')
|
||||||
rows = soup.select('table tr')
|
rows = soup.select('table tr')
|
||||||
|
|
||||||
torrents = []
|
torrents = []
|
||||||
|
|
||||||
for row in rows:
|
for row in rows:
|
||||||
td = row.find_all('td')
|
block = []
|
||||||
torrent = []
|
|
||||||
|
|
||||||
for i in td:
|
for td in row.find_all('td'):
|
||||||
if i.find('a'):
|
if td.find_all('a'):
|
||||||
torrent.append(i.find('a').get('href'))
|
for link in td.find_all('a'):
|
||||||
text = i.text.rstrip()
|
if link.get('href')[-9:] != '#comments':
|
||||||
if len(text) > 0:
|
block.append(link.get('href'))
|
||||||
torrent.append(text)
|
if link.text.rstrip():
|
||||||
else:
|
block.append(link.text)
|
||||||
text = i.text.rstrip()
|
|
||||||
if len(text) > 0:
|
|
||||||
torrent.append(text)
|
|
||||||
|
|
||||||
torrents.append(torrent)
|
if td.text.rstrip():
|
||||||
|
block.append(td.text.rstrip())
|
||||||
|
|
||||||
print(torrents)
|
try:
|
||||||
|
torrent = {
|
||||||
|
'category': block[0].replace('/?c=', ''),
|
||||||
|
'url': "http://nyaa.si{}".format(block[1]),
|
||||||
|
'name': block[2],
|
||||||
|
'download_url': "http://nyaa.si{}".format(block[4]),
|
||||||
|
'magnet': block[5],
|
||||||
|
'size': block[6],
|
||||||
|
'date': block[7],
|
||||||
|
'seeders': block[8],
|
||||||
|
'leechers': block[9],
|
||||||
|
'completed_downloads': block[10],
|
||||||
|
}
|
||||||
|
|
||||||
|
torrents.append(torrent)
|
||||||
|
except IndexError:
|
||||||
|
print("Error! {}".format(block))
|
||||||
|
|
||||||
return torrents
|
return torrents
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Returns an array of OrderedDict with the n last updates of Nyaa.si
|
Returns an array of dicts with the n last updates of Nyaa.si
|
||||||
'''
|
'''
|
||||||
def news(n):
|
def news(n):
|
||||||
nyaa_baseurl = "https://nyaa.si/?page=rss"
|
r = requests.get("http://nyaa.si/")
|
||||||
|
soup = BeautifulSoup(r.text, 'html.parser')
|
||||||
|
rows = soup.select('table tr')
|
||||||
|
|
||||||
request = requests.get(nyaa_baseurl)
|
torrents = []
|
||||||
response = xmltodict.parse(request.text)
|
|
||||||
|
|
||||||
results = response['rss']['channel']['item']
|
for row in rows:
|
||||||
|
block = []
|
||||||
|
|
||||||
return results[:n]
|
for td in row.find_all('td'):
|
||||||
|
if td.find_all('a'):
|
||||||
|
for link in td.find_all('a'):
|
||||||
|
if link.get('href')[-9:] != '#comments':
|
||||||
|
block.append(link.get('href'))
|
||||||
|
if link.text.rstrip():
|
||||||
|
block.append(link.text)
|
||||||
|
|
||||||
|
if td.text.rstrip():
|
||||||
|
block.append(td.text.rstrip())
|
||||||
|
|
||||||
|
try:
|
||||||
|
torrent = {
|
||||||
|
'category': block[0].replace('/?c=', ''),
|
||||||
|
'url': "http://nyaa.si{}".format(block[1]),
|
||||||
|
'name': block[2],
|
||||||
|
'download_url': "http://nyaa.si{}".format(block[4]),
|
||||||
|
'magnet': block[5],
|
||||||
|
'size': block[6],
|
||||||
|
'date': block[7],
|
||||||
|
'seeders': block[8],
|
||||||
|
'leechers': block[9],
|
||||||
|
'completed_downloads': block[10],
|
||||||
|
}
|
||||||
|
|
||||||
|
torrents.append(torrent)
|
||||||
|
except IndexError:
|
||||||
|
print("Error! {}".format(block))
|
||||||
|
|
||||||
|
return torrents[:n]
|
||||||
|
|
||||||
class NyaaPantsu():
|
class NyaaPantsu():
|
||||||
'''
|
'''
|
||||||
|
|||||||
@@ -3,16 +3,16 @@ from NyaaPy import Nyaa, NyaaPantsu
|
|||||||
|
|
||||||
# Nyaa.si results
|
# Nyaa.si results
|
||||||
def nyaa_search():
|
def nyaa_search():
|
||||||
nyaa_query = Nyaa.search('koe no katachi 1080')
|
nyaa_query = Nyaa.search('koe no katachi 1080', 1, 0, 0, 2)
|
||||||
|
|
||||||
for nyaa in nyaa_query:
|
for nyaa in nyaa_query:
|
||||||
print(nyaa)
|
print(nyaa['date'])
|
||||||
|
|
||||||
def nyaa_news():
|
def nyaa_news():
|
||||||
news = Nyaa.news(5)
|
news = Nyaa.news(5)
|
||||||
|
|
||||||
for result in news:
|
for result in news:
|
||||||
print(result['title'])
|
print(result['name'])
|
||||||
|
|
||||||
# Nyaa.pantsu.cat results
|
# Nyaa.pantsu.cat results
|
||||||
def pantsu_search():
|
def pantsu_search():
|
||||||
@@ -34,7 +34,7 @@ def pantsu_news():
|
|||||||
|
|
||||||
nyaa_search()
|
nyaa_search()
|
||||||
#pantsu_search()
|
#pantsu_search()
|
||||||
#nyaa_news()
|
nyaa_news()
|
||||||
#pantsu_news()
|
#pantsu_news()
|
||||||
|
|
||||||
''' r = requests.get("http://nyaa.si/")
|
''' r = requests.get("http://nyaa.si/")
|
||||||
|
|||||||
Reference in New Issue
Block a user