euscan: respect robots.txt

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
Corentin Chary 2011-09-06 16:32:29 +02:00
parent bd75e1af4e
commit a137ef60e3
5 changed files with 52 additions and 2 deletions
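
In short, euscan now consults each site's robots.txt before fetching upstream pages: a RobotFileParser is cached per host, a new 'skip-robots-txt' option turns the check off, and callers treat a blocked URL as "nothing to scan". A rough standalone sketch of the same idea in Python 2 (the function name allowed(), the cache name _parsers and the example URL are illustrative, not part of the commit):

    # One cached RobotFileParser per scheme://host, queried before fetching.
    import robotparser
    import urlparse

    _parsers = {}

    def allowed(url, user_agent):
        scheme, host = urlparse.urlparse(url)[:2]
        base = '%s://%s' % (scheme, host)
        if base not in _parsers:
            rp = robotparser.RobotFileParser()
            rp.set_url(urlparse.urljoin(base, 'robots.txt'))
            rp.read()  # downloads and parses <base>/robots.txt
            _parsers[base] = rp
        return _parsers[base].can_fetch(user_agent, url)

    print allowed('http://www.example.com/distfiles/foo-1.0.tar.gz', 'euscan')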

View File

@@ -16,7 +16,8 @@ CONFIG = {
     'brute-force-recursive': True,
     'scan-dir': True,
     'oneshot': False,
-    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)'
+    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
+    'skip-robots-txt' : False
 }
 
 output = EOutput(CONFIG['quiet'])
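
The new key defaults to False, so the robots.txt check stays on unless a caller opts out. A minimal, purely illustrative way to do that from Python, assuming CONFIG is importable from the euscan package (the command-line wiring, if any, is not part of this hunk):

    # Illustrative only: disable the robots.txt check before scanning.
    from euscan import CONFIG

    CONFIG['skip-robots-txt'] = True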

View File

@@ -57,6 +57,9 @@ def scan_directory_recursive(cpv, url, steps):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     results = []

View File

@@ -38,6 +38,9 @@ def scan(cpv, url):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     dom = xml.dom.minidom.parseString(data)

View File

@@ -33,6 +33,9 @@ def scan(cpv, url):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     versions = json.loads(data)
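
All three handlers gain the same guard because helpers.urlopen() may now return None instead of a file object when robots.txt disallows the URL (see the helpers change below). The shared pattern, sketched in isolation with the handler body reduced to a stub and an assumed import path:

    # Handler skeleton: a None result from helpers.urlopen() means the URL
    # was blocked by robots.txt, so the handler reports no versions.
    from euscan import helpers

    def scan(cpv, url):
        try:
            fp = helpers.urlopen(url)
        except IOError:
            return []
        if not fp:
            return []
        data = fp.read()
        # real handlers parse `data` here (directory listing, XML, JSON)
        return []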

View File

@@ -1,9 +1,17 @@
-import urllib2
 import os
 import re
 import pkg_resources
+import errno
+import urllib2
+
+try:
+    from urllib import robotparser
+    from urllib import urlparse
+except ImportError:
+    import robotparser
+    import urlparse
 
 import portage
 from portage import dep
@@ -200,7 +208,31 @@ class HeadRequest(urllib2.Request):
     def get_method(self):
         return "HEAD"
 
+""" RobotParser cache """
+rpcache = {}
+
+def urlallowed(url):
+    if CONFIG['skip-robots-txt']:
+        return True
+
+    protocol, domain = urlparse.urlparse(url)[:2]
+
+    baseurl = '%s://%s' % (protocol, domain)
+    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
+
+    if rpcache.has_key(baseurl):
+        rp = rpcache[baseurl]
+    else:
+        rp = robotparser.RobotFileParser()
+        rp.set_url(robotsurl)
+        rp.read()
+        rpcache[baseurl] = rp
+
+    return rp.can_fetch(CONFIG['user-agent'], url)
+
 def urlopen(url, timeout=None, verb="GET"):
+    if not urlallowed(url):
+        return None
+
     if not timeout:
         timeout = timeout_for_url(url)
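
With urlallowed() in place, urlopen() quietly returns None for blocked URLs, which is what the handler guards above rely on. A hedged usage sketch, with an assumed module path and an illustrative URL:

    # Illustrative caller: check first, or accept None from urlopen().
    from euscan import helpers

    url = 'http://www.example.com/distfiles/foo-1.0.tar.gz'
    if helpers.urlallowed(url):
        fp = helpers.urlopen(url, verb='HEAD')  # may still fail for other reasons
    else:
        print "URL '%s' blocked by robots.txt" % url
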
@@ -217,12 +249,20 @@ def urlopen(url, timeout=None, verb="GET"):
 def tryurl(fileurl, template):
     result = True
 
+    if not urlallowed(fileurl):
+        output.eerror("Url '%s' blocked by robots.txt" % fileurl)
+        return None
+
     output.ebegin("Trying: " + fileurl)
 
     try:
         basename = os.path.basename(fileurl)
 
         fp = urlopen(fileurl, verb='HEAD')
+        if not fp:
+            output.eend(errno.EPERM)
+            return None
+
         headers = fp.info()
 
         if 'Content-disposition' in headers and basename not in headers['Content-disposition']: