commit a137ef60e3
parent bd75e1af4e

euscan: respect robots.txt

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
@@ -16,7 +16,8 @@ CONFIG = {
     'brute-force-recursive': True,
     'scan-dir': True,
     'oneshot': False,
-    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)'
+    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
+    'skip-robots-txt' : False
 }
 
 output = EOutput(CONFIG['quiet'])
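The hunk above adds a trailing comma to the existing user-agent entry and introduces the skip-robots-txt option, defaulting to False so robots.txt is honoured unless the user opts out. A hypothetical caller (not part of this commit) would disable the check like so:

    # Hypothetical opt-out sketch, not from this commit: with the flag set,
    # urlallowed() (added later in this diff) short-circuits to True.
    CONFIG['skip-robots-txt'] = True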
@@ -57,6 +57,9 @@ def scan_directory_recursive(cpv, url, steps):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     results = []
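This guard (repeated in the next two hunks for the XML and JSON handlers) is needed because urlopen() now returns None, rather than a file object, when robots.txt blocks the URL; calling fp.read() on None would raise AttributeError. In sketch form:

    # Sketch of the failure mode the guard prevents:
    fp = urlopen(url)        # None when the URL is blocked by robots.txt
    if not fp:
        return []            # bail out instead of calling None.read()
    data = fp.read()         # safe: fp is a real file-like object here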
@@ -38,6 +38,9 @@ def scan(cpv, url):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
 
     dom = xml.dom.minidom.parseString(data)
@@ -33,6 +33,9 @@ def scan(cpv, url):
     except IOError:
         return []
 
+    if not fp:
+        return []
+
     data = fp.read()
     versions = json.loads(data)
 
@@ -1,9 +1,17 @@
-import urllib2
 import os
 import re
 import pkg_resources
 import errno
 
+import urllib2
+
+try:
+    from urllib import robotparser
+    from urllib import urlparse
+except ImportError:
+    import robotparser
+    import urlparse
+
 import portage
 from portage import dep
 
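A side note on the try/except block above: it anticipates the Python 3 module reshuffle, but `from urllib import urlparse` would fail there too, since that function lives in urllib.parse. A fallback that keeps the module-style call sites (urlparse.urlparse, urlparse.urljoin) working on both interpreters might look like this (a sketch, not what the commit ships):

    try:                                  # Python 3: modules were moved
        from urllib import robotparser
        import urllib.parse as urlparse   # keeps urlparse.urlparse() working
    except ImportError:                   # Python 2
        import robotparser
        import urlparse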
@@ -200,7 +208,31 @@ class HeadRequest(urllib2.Request):
     def get_method(self):
         return "HEAD"
 
+""" RobotParser cache """
+rpcache = {}
+
+def urlallowed(url):
+    if CONFIG['skip-robots-txt']:
+        return True
+
+    protocol, domain = urlparse.urlparse(url)[:2]
+
+    baseurl = '%s://%s' % (protocol, domain)
+    robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
+
+    if rpcache.has_key(baseurl):
+        rp = rpcache[baseurl]
+    else:
+        rp = robotparser.RobotFileParser()
+        rp.set_url(robotsurl)
+        rp.read()
+        rpcache[baseurl] = rp
+    return rp.can_fetch(CONFIG['user-agent'], url)
+
 def urlopen(url, timeout=None, verb="GET"):
+    if not urlallowed(url):
+        return None
+
     if not timeout:
         timeout = timeout_for_url(url)
 
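The caching pattern above (one RobotFileParser per scheme://host, filled lazily) is the standard way to avoid re-fetching robots.txt for every URL checked. For readers following along outside this Python 2 code base, here is a self-contained sketch of the same idea on modern Python; note that dict.has_key(), used above, was removed in Python 3, and `baseurl in rpcache` is the portable spelling:

    import urllib.robotparser
    from urllib.parse import urlparse, urljoin

    USER_AGENT = 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)'
    _rpcache = {}  # maps 'scheme://host' -> parsed RobotFileParser

    def urlallowed(url):
        """Return True if USER_AGENT may fetch url according to robots.txt."""
        scheme, netloc = urlparse(url)[:2]
        baseurl = '%s://%s' % (scheme, netloc)
        if baseurl not in _rpcache:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urljoin(baseurl, 'robots.txt'))
            rp.read()  # fetch and parse robots.txt once per host
            _rpcache[baseurl] = rp
        return _rpcache[baseurl].can_fetch(USER_AGENT, url)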
@@ -217,12 +249,20 @@ def urlopen(url, timeout=None, verb="GET"):
 def tryurl(fileurl, template):
     result = True
 
+    if not urlallowed(fileurl):
+        output.eerror("Url '%s' blocked by robots.txt" % fileurl)
+        return None
+
     output.ebegin("Trying: " + fileurl)
 
     try:
         basename = os.path.basename(fileurl)
 
         fp = urlopen(fileurl, verb='HEAD')
+        if not fp:
+            output.eend(errno.EPERM)
+            return None
+
         headers = fp.info()
 
         if 'Content-disposition' in headers and basename not in headers['Content-disposition']:
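The tryurl() flow above boils down to: refuse robots-blocked URLs outright, then issue a HEAD request and inspect the response headers. A minimal standalone sketch of that probe on modern Python (names here are illustrative, not from the commit):

    import os
    import urllib.request

    def head_headers(fileurl, timeout=10):
        """HEAD-request fileurl and return its headers, or None on failure."""
        req = urllib.request.Request(fileurl, method='HEAD')
        try:
            with urllib.request.urlopen(req, timeout=timeout) as fp:
                return fp.info()
        except OSError:  # covers URLError, HTTPError, socket errors
            return None

    headers = head_headers('http://example.com/dist/foo-1.0.tar.gz')
    if headers and 'Content-disposition' in headers:
        # Same spirit as the check in tryurl(): the filename the server
        # announces should match the basename that was requested.
        print(os.path.basename('foo-1.0.tar.gz') in headers['Content-disposition'])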