fixed web scraping for nyaa.si

2017-10-12 23:08:00 +02:00
parent 5cb2079188
commit b632bdda41
2 changed files with 78 additions and 28 deletions
--- a/NyaaPy/init.py
+++ b/NyaaPy/init.py
@@ -11,46 +11,96 @@ __copyright__ = '2017 Juanjo Salvador'
 __license__   = 'MIT license'

 class Nyaa():
-    def search(keyword, category, subcategory, filters):
-        r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}".format(filters, category, subcategory, keyword))
+    '''
+     Return a list of dicts with the results of the query.
+    '''
+    def search(keyword, category, subcategory, filters, page):
+        if page:
+            r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}&p={}".format(filters, category, subcategory, keyword, page))
+        else:
+            r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}".format(filters, category, subcategory, keyword))
+
        soup = BeautifulSoup(r.text, 'html.parser')
        rows = soup.select('table tr')

        torrents = []

        for row in rows:
-            td = row.find_all('td')
-            torrent = []
+            block = []

-            for i in td:
-                if i.find('a'):
-                    torrent.append(i.find('a').get('href'))
-                    text = i.text.rstrip()
-                    if len(text) > 0:
-                        torrent.append(text)
-                else:
-                    text = i.text.rstrip()
-                    if len(text) > 0:
-                        torrent.append(text)
+            for td in row.find_all('td'):
+                if td.find_all('a'):
+                    for link in td.find_all('a'):
+                        if link.get('href')[-9:] != '#comments':
+                            block.append(link.get('href'))
+                            if link.text.rstrip():
+                                block.append(link.text)

-            torrents.append(torrent)
-        
-        print(torrents)
+                if td.text.rstrip():
+                    block.append(td.text.rstrip())
+
+            try:
+                torrent = {
+                    'category': block[0].replace('/?c=', ''),
+                    'url': "http://nyaa.si{}".format(block[1]),
+                    'name': block[2],
+                    'download_url': "http://nyaa.si{}".format(block[4]),
+                    'magnet': block[5],
+                    'size': block[6],
+                    'date': block[7],
+                    'seeders': block[8],
+                    'leechers': block[9],
+                    'completed_downloads': block[10],
+                }
+            
+                torrents.append(torrent)
+            except IndexError:
+                print("Error! {}".format(block))

        return torrents
-
    '''
-     Returns an array of OrderedDict with the n last updates of Nyaa.si
+     Returns a list of dicts with the last n updates of Nyaa.si
    '''
    def news(n):
-        nyaa_baseurl = "https://nyaa.si/?page=rss"
+        r = requests.get("http://nyaa.si/")
+        soup = BeautifulSoup(r.text, 'html.parser')
+        rows = soup.select('table tr')

-        request  = requests.get(nyaa_baseurl)
-        response = xmltodict.parse(request.text)
+        torrents = []

-        results = response['rss']['channel']['item']
+        for row in rows:
+            block = []

-        return results[:n]
+            for td in row.find_all('td'):
+                if td.find_all('a'):
+                    for link in td.find_all('a'):
+                        if link.get('href')[-9:] != '#comments':
+                            block.append(link.get('href'))
+                            if link.text.rstrip():
+                                block.append(link.text)
+
+                if td.text.rstrip():
+                    block.append(td.text.rstrip())
+
+            try:
+                torrent = {
+                    'category': block[0].replace('/?c=', ''),
+                    'url': "http://nyaa.si{}".format(block[1]),
+                    'name': block[2],
+                    'download_url': "http://nyaa.si{}".format(block[4]),
+                    'magnet': block[5],
+                    'size': block[6],
+                    'date': block[7],
+                    'seeders': block[8],
+                    'leechers': block[9],
+                    'completed_downloads': block[10],
+                }
+            
+                torrents.append(torrent)
+            except IndexError:
+                print("Error! {}".format(block))
+
+        return torrents[:n]

 class NyaaPantsu():
    '''