Web scraping WIP: added category, subcategory and filters parameters to Nyaa.search

2017-10-11 00:04:10 +02:00
parent c18bdb736b
commit 5cb2079188
3 changed files with 52 additions and 29 deletions
--- a/NyaaPy/init.py
+++ b/NyaaPy/init.py
@@ -1,7 +1,6 @@
 import requests
-import xmltodict
 import json
-import collections
+from bs4 import BeautifulSoup

 # Info about the module
 __version__   = '0.4'
@@ -12,29 +11,33 @@ __copyright__ = '2017 Juanjo Salvador'
 __license__   = 'MIT license'

 class Nyaa():
-    '''
-     Make a query to nyaa.si using keyword as keyword.
-     Returns an array of OrderedDict with every result of the query.
-     Returns an empty array if no results.
-    '''
-    def search(keyword):
-        nyaa_baseurl = "https://nyaa.si/?page=rss&c=1_0&f=0&q="
+    def search(keyword, category, subcategory, filters):
+        r = requests.get("http://nyaa.si/?f={}&c={}_{}&q={}".format(filters, category, subcategory, keyword))
+        soup = BeautifulSoup(r.text, 'html.parser')
+        rows = soup.select('table tr')

-        request  = requests.get(nyaa_baseurl + keyword)
-        response = xmltodict.parse(request.text)
+        torrents = []

-        results = []
+        for row in rows:
+            td = row.find_all('td')
+            torrent = []

-        try:
-            if type(response['rss']['channel']['item']) is collections.OrderedDict:
-                results.append(response['rss']['channel']['item'])
-            else:
-                results = response['rss']['channel']['item']
+            for i in td:
+                if i.find('a'):
+                    torrent.append(i.find('a').get('href'))
+                    text = i.text.rstrip()
+                    if len(text) > 0:
+                        torrent.append(text)
+                else:
+                    text = i.text.rstrip()
+                    if len(text) > 0:
+                        torrent.append(text)

-        except KeyError as ex:
-            results = []
+            torrents.append(torrent)
+        
+        print(torrents)

-        return results
+        return torrents

    '''
     Returns an array of OrderedDict with the n last updates of Nyaa.si
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages

 setup(name='nyaapy',
-      version='0.4',
+      version='0.4.1',
      url='https://github.com/juanjosalvador/nyaapy',
      download_url = 'https://github.com/juanjosalvador/nyaapy/archive/0.1.tar.gz',
      license='MIT',
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,15 +1,12 @@
-import json
+import json, requests
 from NyaaPy import Nyaa, NyaaPantsu

 # Nyaa.si results
 def nyaa_search():
    nyaa_query = Nyaa.search('koe no katachi 1080')

-    if len(nyaa_query) > 0:
-        for result in nyaa_query:
-            print(result['title'])
-    else:
-        print('Nothing here!')
+    for nyaa in nyaa_query:
+        print(nyaa)

 def nyaa_news():
    news = Nyaa.news(5)
@@ -35,7 +32,30 @@ def pantsu_news():

 # Uncomment whatever you want to test

-#nyaa_search()
+nyaa_search()
 #pantsu_search()
 #nyaa_news()
-pantsu_news()
+#pantsu_news()
+
+''' r = requests.get("http://nyaa.si/")
+soup = BeautifulSoup(r.text, 'html.parser')
+rows = soup.select('table tr')
+
+torrents = []
+
+for row in rows:
+    td = row.find_all('td')
+    torrent = []
+
+    for i in td:
+        if i.find('a'):
+            torrent.append(i.find('a').get('href'))
+            text = i.text.rstrip()
+            if len(text) > 0:
+                torrent.append(text)
+        else:
+            text = i.text.rstrip()
+            if len(text) > 0:
+                torrent.append(text)
+
+        torrents.append(torrent) '''