diff --git a/pym/euscan/__init__.py b/pym/euscan/__init__.py
index 6e5cc39..f4d1729 100644
--- a/pym/euscan/__init__.py
+++ b/pym/euscan/__init__.py
@@ -16,7 +16,8 @@ CONFIG = {
     'brute-force-recursive': True,
     'scan-dir': True,
     'oneshot': False,
-    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)'
+    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
+    'skip-robots-txt' : False
 }
 
 output = EOutput(CONFIG['quiet'])
diff --git a/pym/euscan/handlers/generic.py b/pym/euscan/handlers/generic.py
index b9bc50e..a40b050 100644
--- a/pym/euscan/handlers/generic.py
+++ b/pym/euscan/handlers/generic.py
@@ -57,6 +57,9 @@ def scan_directory_recursive(cpv, url, steps):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     results = []
diff --git a/pym/euscan/handlers/php.py b/pym/euscan/handlers/php.py
index e70c877..5ec673b 100644
--- a/pym/euscan/handlers/php.py
+++ b/pym/euscan/handlers/php.py
@@ -38,6 +38,9 @@ def scan(cpv, url):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     dom = xml.dom.minidom.parseString(data)
diff --git a/pym/euscan/handlers/rubygem.py b/pym/euscan/handlers/rubygem.py
index 19e143d..0b7bdc8 100644
--- a/pym/euscan/handlers/rubygem.py
+++ b/pym/euscan/handlers/rubygem.py
@@ -33,6 +33,9 @@ def scan(cpv, url):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     versions = json.loads(data)
diff --git a/pym/euscan/helpers.py b/pym/euscan/helpers.py
index 0d1fa18..c9bf722 100644
--- a/pym/euscan/helpers.py
+++ b/pym/euscan/helpers.py
@@ -1,9 +1,17 @@
-import urllib2
 import os
 import re
 import pkg_resources
 import errno
 
+import urllib2
+
+try:
+    from urllib import robotparser
+    from urllib import urlparse
+except ImportError:
+    import robotparser
+    import urlparse
+
 import portage
 from portage import dep
 
@@ -200,7 +208,31 @@ class HeadRequest(urllib2.Request):
     def get_method(self):
         return "HEAD"
 
+""" RobotParser cache """
+rpcache = {}
+
+def urlallowed(url):
+    if CONFIG['skip-robots-txt']:
+        return True
+
+    protocol, domain = urlparse.urlparse(url)[:2]
+
+    baseurl = '%s://%s' % (protocol, domain)
+    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
+
+    if rpcache.has_key(baseurl):
+        rp = rpcache[baseurl]
+    else:
+        rp = robotparser.RobotFileParser()
+        rp.set_url(robotsurl)
+        rp.read()
+        rpcache[baseurl] = rp
+    return rp.can_fetch(CONFIG['user-agent'], url)
+
 def urlopen(url, timeout=None, verb="GET"):
+    if not urlallowed(url):
+        return None
+
     if not timeout:
         timeout = timeout_for_url(url)
 
@@ -217,12 +249,20 @@ def urlopen(url, timeout=None, verb="GET"):
 def tryurl(fileurl, template):
     result = True
 
+    if not urlallowed(fileurl):
+        output.eerror("Url '%s' blocked by robots.txt" % fileurl)
+        return None
+
     output.ebegin("Trying: " + fileurl)
 
     try:
         basename = os.path.basename(fileurl)
 
         fp = urlopen(fileurl, verb='HEAD')
+        if not fp:
+            output.eend(errno.EPERM)
+            return None
+
         headers = fp.info()
 
         if 'Content-disposition' in headers and basename not in headers['Content-disposition']:
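
For reference, below is a minimal standalone sketch of the robots.txt gate this patch adds to pym/euscan/helpers.py. It is not part of the patch: it targets Python 3's urllib.robotparser and urllib.parse rather than the Python 2 modules used above, and the CONFIG dict, user-agent string, and example URL are stand-ins for euscan's real configuration.

# Standalone sketch (assumptions noted above): cache one parsed robots.txt
# per scheme://host and ask it whether a given URL may be fetched.
import urllib.parse
import urllib.robotparser

CONFIG = {
    'skip-robots-txt': False,   # stand-in for euscan's CONFIG['skip-robots-txt']
    'user-agent': 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
}

rpcache = {}  # baseurl -> RobotFileParser, mirrors the cache added in helpers.py

def urlallowed(url):
    """Return True if robots.txt permits fetching url with our user agent."""
    if CONFIG['skip-robots-txt']:
        return True

    scheme, netloc = urllib.parse.urlparse(url)[:2]
    baseurl = '%s://%s' % (scheme, netloc)

    rp = rpcache.get(baseurl)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(urllib.parse.urljoin(baseurl, 'robots.txt'))
        rp.read()  # fetch and parse robots.txt once per host
        rpcache[baseurl] = rp

    return rp.can_fetch(CONFIG['user-agent'], url)

if __name__ == '__main__':
    # Usage example: callers handle a disallowed URL the same way the patched
    # urlopen() does, by skipping the request entirely.
    url = 'https://www.gentoo.org/downloads/'
    print(url, 'allowed' if urlallowed(url) else 'blocked by robots.txt')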