commit a137ef60e3
parent bd75e1af4e

euscan: respect robots.txt

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
@@ -16,7 +16,8 @@ CONFIG = {
     'brute-force-recursive': True,
     'scan-dir': True,
     'oneshot': False,
-    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)'
+    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
+    'skip-robots-txt' : False
 }
 
 output = EOutput(CONFIG['quiet'])
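The hunk above adds a trailing comma to the existing user-agent entry and introduces the skip-robots-txt option, defaulting to False so robots.txt is honoured unless the user opts out. A hypothetical caller (not part of this commit) would disable the check like so:

    # Hypothetical opt-out sketch, not from this commit: with the flag set,
    # urlallowed() (added later in this diff) short-circuits to True.
    CONFIG['skip-robots-txt'] = True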
@@ -57,6 +57,9 @@ def scan_directory_recursive(cpv, url, steps):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     results = []
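This guard (repeated in the next two hunks for the XML and JSON handlers) is needed because urlopen() now returns None, rather than a file object, when robots.txt blocks the URL; calling fp.read() on None would raise AttributeError. In sketch form:

    # Sketch of the failure mode the guard prevents:
    fp = urlopen(url)        # None when the URL is blocked by robots.txt
    if not fp:
        return []            # bail out instead of calling None.read()
    data = fp.read()         # safe: fp is a real file-like object here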
@@ -38,6 +38,9 @@ def scan(cpv, url):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     dom = xml.dom.minidom.parseString(data)
@@ -33,6 +33,9 @@ def scan(cpv, url):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
     versions = json.loads(data)
 
@@ -1,9 +1,17 @@
-import urllib2
 import os
 import re
 import pkg_resources
 import errno
 
+import urllib2
+
+try:
+    from urllib import robotparser
+    from urllib import urlparse
+except ImportError:
+    import robotparser
+    import urlparse
+
 import portage
 from portage import dep
 
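A side note on the try/except block above: it anticipates the Python 3 module reshuffle, but `from urllib import urlparse` would fail there too, since that function lives in urllib.parse. A fallback that keeps the module-style call sites (urlparse.urlparse, urlparse.urljoin) working on both interpreters might look like this (a sketch, not what the commit ships):

    try:                                  # Python 3: modules were moved
        from urllib import robotparser
        import urllib.parse as urlparse   # keeps urlparse.urlparse() working
    except ImportError:                   # Python 2
        import robotparser
        import urlparse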
@@ -200,7 +208,31 @@ class HeadRequest(urllib2.Request):
     def get_method(self):
         return "HEAD"
 
+""" RobotParser cache """
+rpcache = {}
+
+def urlallowed(url):
+    if CONFIG['skip-robots-txt']:
+        return True
+
+    protocol, domain = urlparse.urlparse(url)[:2]
+
+    baseurl = '%s://%s' % (protocol, domain)
+    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
+
+    if rpcache.has_key(baseurl):
+        rp = rpcache[baseurl]
+    else:
+        rp = robotparser.RobotFileParser()
+        rp.set_url(robotsurl)
+        rp.read()
+        rpcache[baseurl] = rp
+    return rp.can_fetch(CONFIG['user-agent'], url)
+
 def urlopen(url, timeout=None, verb="GET"):
+    if not urlallowed(url):
+        return None
+
     if not timeout:
         timeout = timeout_for_url(url)
 
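The caching pattern above (one RobotFileParser per scheme://host, filled lazily) is the standard way to avoid re-fetching robots.txt for every URL checked. For readers following along outside this Python 2 code base, here is a self-contained sketch of the same idea on modern Python; note that dict.has_key(), used above, was removed in Python 3, and `baseurl in rpcache` is the portable spelling:

    import urllib.robotparser
    from urllib.parse import urlparse, urljoin

    USER_AGENT = 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)'
    _rpcache = {}  # maps 'scheme://host' -> parsed RobotFileParser

    def urlallowed(url):
        """Return True if USER_AGENT may fetch url according to robots.txt."""
        scheme, netloc = urlparse(url)[:2]
        baseurl = '%s://%s' % (scheme, netloc)
        if baseurl not in _rpcache:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urljoin(baseurl, 'robots.txt'))
            rp.read()  # fetch and parse robots.txt once per host
            _rpcache[baseurl] = rp
        return _rpcache[baseurl].can_fetch(USER_AGENT, url)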
@@ -217,12 +249,20 @@ def urlopen(url, timeout=None, verb="GET"):
 def tryurl(fileurl, template):
     result = True
 
+    if not urlallowed(fileurl):
+        output.eerror("Url '%s' blocked by robots.txt" % fileurl)
+        return None
+
     output.ebegin("Trying: " + fileurl)
 
     try:
         basename = os.path.basename(fileurl)
 
         fp = urlopen(fileurl, verb='HEAD')
+        if not fp:
+            output.eend(errno.EPERM)
+            return None
+
         headers = fp.info()
 
         if 'Content-disposition' in headers and basename not in headers['Content-disposition']:
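The tryurl() flow above boils down to: refuse robots-blocked URLs outright, then issue a HEAD request and inspect the response headers. A minimal standalone sketch of that probe on modern Python (names here are illustrative, not from the commit):

    import os
    import urllib.request

    def head_headers(fileurl, timeout=10):
        """HEAD-request fileurl and return its headers, or None on failure."""
        req = urllib.request.Request(fileurl, method='HEAD')
        try:
            with urllib.request.urlopen(req, timeout=timeout) as fp:
                return fp.info()
        except OSError:  # covers URLError, HTTPError, socket errors
            return None

    headers = head_headers('http://example.com/dist/foo-1.0.tar.gz')
    if headers and 'Content-disposition' in headers:
        # Same spirit as the check in tryurl(): the filename the server
        # announces should match the basename that was requested.
        print(os.path.basename('foo-1.0.tar.gz') in headers['Content-disposition'])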