fixed web scraping for nyaa.si

This commit is contained in:
JuanjoSalvador
2017-10-12 23:08:00 +02:00
parent 5cb2079188
commit b632bdda41
2 changed files with 78 additions and 28 deletions

@@ -11,46 +11,96 @@ __copyright__ = '2017 Juanjo Salvador'
 __license__ = 'MIT license'

 class Nyaa():
-    def search(keyword, category, subcategory, filters):
-        r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}".format(filters, category, subcategory, keyword))
+    '''
+    Return a list of dicts with the results of the query.
+    '''
+    def search(keyword, category, subcategory, filters, page):
+        if page:
+            r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}&p={}".format(filters, category, subcategory, keyword, page))
+        else:
+            r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}".format(filters, category, subcategory, keyword))

         soup = BeautifulSoup(r.text, 'html.parser')
         rows = soup.select('table tr')
         torrents = []

         for row in rows:
-            td = row.find_all('td')
-            torrent = []
-            for i in td:
-                if i.find('a'):
-                    torrent.append(i.find('a').get('href'))
-                    text = i.text.rstrip()
-                    if len(text) > 0:
-                        torrent.append(text)
-                else:
-                    text = i.text.rstrip()
-                    if len(text) > 0:
-                        torrent.append(text)
-            torrents.append(torrent)
-
-        print(torrents)
+            block = []
+
+            for td in row.find_all('td'):
+                if td.find_all('a'):
+                    for link in td.find_all('a'):
+                        if link.get('href')[-9:] != '#comments':
+                            block.append(link.get('href'))
+                        if link.text.rstrip():
+                            block.append(link.text)
+                if td.text.rstrip():
+                    block.append(td.text.rstrip())
+
+            try:
+                torrent = {
+                    'category': block[0].replace('/?c=', ''),
+                    'url': "http://nyaa.si{}".format(block[1]),
+                    'name': block[2],
+                    'download_url': "http://nyaa.si{}".format(block[4]),
+                    'magnet': block[5],
+                    'size': block[6],
+                    'date': block[7],
+                    'seeders': block[8],
+                    'leechers': block[9],
+                    'completed_downloads': block[10],
+                }
+                torrents.append(torrent)
+            except IndexError:
+                print("Error! {}".format(block))

         return torrents

     '''
-    Returns an array of OrderedDict with the n last updates of Nyaa.si
+    Returns an array of dicts with the n last updates of Nyaa.si
     '''
     def news(n):
-        nyaa_baseurl = "https://nyaa.si/?page=rss"
-
-        request = requests.get(nyaa_baseurl)
-        response = xmltodict.parse(request.text)
-        results = response['rss']['channel']['item']
-
-        return results[:n]
+        r = requests.get("http://nyaa.si/")
+        soup = BeautifulSoup(r.text, 'html.parser')
+        rows = soup.select('table tr')
+        torrents = []
+
+        for row in rows:
+            block = []
+
+            for td in row.find_all('td'):
+                if td.find_all('a'):
+                    for link in td.find_all('a'):
+                        if link.get('href')[-9:] != '#comments':
+                            block.append(link.get('href'))
+                        if link.text.rstrip():
+                            block.append(link.text)
+                if td.text.rstrip():
+                    block.append(td.text.rstrip())
+
+            try:
+                torrent = {
+                    'category': block[0].replace('/?c=', ''),
+                    'url': "http://nyaa.si{}".format(block[1]),
+                    'name': block[2],
+                    'download_url': "http://nyaa.si{}".format(block[4]),
+                    'magnet': block[5],
+                    'size': block[6],
+                    'date': block[7],
+                    'seeders': block[8],
+                    'leechers': block[9],
+                    'completed_downloads': block[10],
+                }
+                torrents.append(torrent)
+            except IndexError:
+                print("Error! {}".format(block))
+
+        return torrents[:n]

 class NyaaPantsu():
     '''
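For reference, a minimal usage sketch of the reworked scraper after this commit (not part of the diff). The keyword and the category/subcategory/filters/page values mirror the updated tests below and are only illustrative; the printed keys come from the dict built in search and news:

    from NyaaPy import Nyaa

    # Hypothetical usage sketch: positional arguments are
    # keyword, category, subcategory, filters, page (values taken from the tests).
    results = Nyaa.search('koe no katachi 1080', 1, 0, 0, 2)

    for torrent in results:
        # Each result is now a dict instead of a flat list of cell texts.
        print(torrent['name'], torrent['size'], torrent['seeders'])

    # news(n) scrapes the front page and returns the same dict shape.
    for item in Nyaa.news(5):
        print(item['date'], item['name'])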

@@ -3,16 +3,16 @@ from NyaaPy import Nyaa, NyaaPantsu

 # Nyaa.si results
 def nyaa_search():
-    nyaa_query = Nyaa.search('koe no katachi 1080')
+    nyaa_query = Nyaa.search('koe no katachi 1080', 1, 0, 0, 2)

     for nyaa in nyaa_query:
-        print(nyaa)
+        print(nyaa['date'])

 def nyaa_news():
     news = Nyaa.news(5)

     for result in news:
-        print(result['title'])
+        print(result['name'])

 # Nyaa.pantsu.cat results
 def pantsu_search():

@@ -34,7 +34,7 @@ def pantsu_news():
 nyaa_search()
 #pantsu_search()
-#nyaa_news()
+nyaa_news()
 #pantsu_news()

 '''
 r = requests.get("http://nyaa.si/")
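Since search now takes five positional parameters, call sites like the one above are easy to misread. A thin keyword-argument wrapper (hypothetical, not part of this commit) keeps them self-describing; the defaults shown are assumptions based on the values used in the test:

    from NyaaPy import Nyaa

    # Hypothetical helper, not in this commit: forwards named arguments to the
    # positional signature search(keyword, category, subcategory, filters, page).
    def nyaa_search_named(keyword, category=1, subcategory=0, filters=0, page=0):
        return Nyaa.search(keyword, category, subcategory, filters, page)

    for torrent in nyaa_search_named('koe no katachi 1080', page=2):
        print(torrent['name'], torrent['seeders'])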