euscan: respect robots.txt

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
2011-09-06 16:32:29 +02:00
parent bd75e1af4e
commit a137ef60e3
5 changed files with 52 additions and 2 deletions
--- a/pym/euscan/init.py
+++ b/pym/euscan/init.py
@@ -16,7 +16,8 @@ CONFIG = {
    'brute-force-recursive': True,
    'scan-dir': True,
    'oneshot': False,
-    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)'
+    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
    'skip-robots-txt' : False
 }
 output = EOutput(CONFIG['quiet'])
--- a/pym/euscan/handlers/generic.py
+++ b/pym/euscan/handlers/generic.py
@@ -57,6 +57,9 @@ def scan_directory_recursive(cpv, url, steps):
    except IOError:
        return []
    if not fp:
        return []
    data = fp.read()
    results = []
--- a/pym/euscan/handlers/php.py
+++ b/pym/euscan/handlers/php.py
@@ -38,6 +38,9 @@ def scan(cpv, url):
    except IOError:
        return []
    if not fp:
        return []
    data = fp.read()
    dom = xml.dom.minidom.parseString(data)
--- a/pym/euscan/handlers/rubygem.py
+++ b/pym/euscan/handlers/rubygem.py
@@ -33,6 +33,9 @@ def scan(cpv, url):
    except IOError:
        return []
    if not fp:
        return []
    data = fp.read()
    versions = json.loads(data)
--- a/pym/euscan/helpers.py
+++ b/pym/euscan/helpers.py
@@ -1,9 +1,17 @@
 import urllib2
 import os
 import re
 import pkg_resources
 import errno
 import urllib2
 try:
    from urllib import robotparser
    from urllib import urlparse
 except ImportError:
    import robotparser
    import urlparse
 import portage
 from portage import dep
@@ -200,7 +208,31 @@ class HeadRequest(urllib2.Request):
    def get_method(self):
        """Override urllib2.Request.get_method to report the HTTP verb as HEAD."""
        return "HEAD"
 """ RobotParser cache """
 rpcache = {}
 def urlallowed(url):
    """Return whether robots.txt permits fetching *url*.

    Always returns True when CONFIG['skip-robots-txt'] is set. Otherwise the
    robots.txt located at the root of the URL's ``scheme://host`` is
    downloaded and parsed once, stored in the module-level ``rpcache`` dict
    keyed by that base URL, and queried with CONFIG['user-agent'].

    Exceptions raised by ``RobotFileParser.read()`` while downloading
    robots.txt are not caught here and propagate to the caller. Nothing is
    cached in that case, so the next call for the same host retries.
    """
    if CONFIG['skip-robots-txt']:
        return True

    protocol, domain = urlparse.urlparse(url)[:2]
    baseurl = '%s://%s' % (protocol, domain)

    # Use the 'in' operator rather than dict.has_key(): has_key() is
    # deprecated and does not exist on Python 3 dictionaries.
    if baseurl not in rpcache:
        rp = robotparser.RobotFileParser()
        rp.set_url(urlparse.urljoin(baseurl, 'robots.txt'))
        rp.read()
        # Store the parser only after read() returned without raising.
        rpcache[baseurl] = rp

    # NOTE(review): the stdlib parser appears to match rules against the part
    # of the user-agent string before the first '/', so with the default
    # 'Mozilla/5.0 (...euscan...)' value a "User-agent: euscan" section would
    # presumably not apply -- confirm against the robotparser documentation.
    return rpcache[baseurl].can_fetch(CONFIG['user-agent'], url)
 def urlopen(url, timeout=None, verb="GET"):
    if not urlallowed(url):
        return None
    if not timeout:
        timeout = timeout_for_url(url)
@@ -217,12 +249,20 @@ def urlopen(url, timeout=None, verb="GET"):
 def tryurl(fileurl, template):
    result = True
    if not urlallowed(fileurl):
        output.eerror("Url '%s' blocked by robots.txt" % fileurl)
        return None
    output.ebegin("Trying: " + fileurl)
    try:
        basename = os.path.basename(fileurl)
        fp = urlopen(fileurl, verb='HEAD')
        if not fp:
            output.eend(errno.EPERM)
            return None
        headers = fp.info()
        if 'Content-disposition' in headers and basename not in headers['Content-disposition']: