euscan: robots.txt, timeout, user-agent, ...
- Add a blacklist for robots.txt; we *want* to scan sourceforge
- Set a user-agent that doesn't look like a browser
- Handle timeouts more carefully
- If brute force detects too many versions, avoid infinite loops
- Handle redirections more carefully

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
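For context, the robots.txt handling introduced below boils down to two ideas: skip the robots.txt check entirely for blacklisted hosts, since mirrors like sourceforge forbid crawlers in robots.txt yet are exactly what euscan needs to scan, and fetch robots.txt itself under a short socket timeout so a dead host cannot stall the whole scan. A condensed standalone sketch (Python 2, matching the code base; `url_allowed` and the two-entry blacklist are illustrative, not the exact diff):

```python
import re
import urlparse
import robotparser
from socket import getdefaulttimeout, setdefaulttimeout

# Hosts whose robots.txt is deliberately ignored (illustrative subset).
ROBOTS_TXT_BLACKLIST_DOMAINS = ['(.*)sourceforge(.*)', '(.*)berlios(.*)']

USER_AGENT = 'euscan (http://euscan.iksaif.net)'
rpcache = {}  # base URL -> parsed robots.txt, fetched at most once per host

def url_allowed(url):
    protocol, domain = urlparse.urlparse(url)[:2]

    # Blacklisted hosts are always allowed: we *want* to scan them.
    if any(re.match(p, domain) for p in ROBOTS_TXT_BLACKLIST_DOMAINS):
        return True

    baseurl = '%s://%s' % (protocol, domain)
    rp = rpcache.get(baseurl)
    if rp is None:
        saved = getdefaulttimeout()
        setdefaulttimeout(5)  # don't let a dead host stall the scan
        try:
            rp = robotparser.RobotFileParser()
            rp.set_url(baseurl + '/robots.txt')
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            return False  # unreadable robots.txt: err on the side of caution
        finally:
            setdefaulttimeout(saved)
    return rp.can_fetch(USER_AGENT, url)
```

`socket.setdefaulttimeout` is process-global, which is why the diff saves the previous value and restores it after the `read()` call.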
@@ -3,6 +3,8 @@
 # Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
 # Distributed under the terms of the GNU General Public License v2
 
 __version__ = "git"
 
+import sys
+
 from portage.output import EOutput
@@ -10,13 +12,14 @@ from portage.output import EOutput
 CONFIG = {
     'nocolor': False,
     'quiet': False,
-    'verbose': True,
+    'verbose': 1,
     'debug': False,
     'brute-force': 3,
     'brute-force-recursive': True,
+    'brute-force-false-watermark': 50,
     'scan-dir': True,
     'oneshot': False,
-    'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
+    'user-agent' : 'escan (http://euscan.iksaif.net)',
     'skip-robots-txt' : False
 }
 
@@ -41,11 +44,19 @@ SCANDIR_BLACKLIST_URLS = [
 
 BRUTEFORCE_BLACKLIST_PACKAGES = [
     'net-zope/plonepopoll' # infinite loop any http://plone.org/products/plonepopoll/releases/*/plonepopoll-2-6-1.tgz link will work
 ]
 
 BRUTEFORCE_BLACKLIST_URLS = [
     'http://(.*)dockapps.org/download.php/id/(.*)', # infinite loop
     'http://hydra.nixos.org/build/(.*)', # infinite loop
-    'http://www.rennings.net/gentoo/distfiles/(.*)' # Doesn't respect 404, infinite loop
-    'http://art.gnome.org/download/(.*)' # Doesn't respect 404, infinite loop
+    'http://www.rennings.net/gentoo/distfiles/(.*)', # Doesn't respect 404, infinite loop
+    'http://art.gnome.org/download/(.*)', # Doesn't respect 404, infinite loop
+    'http://barelysufficient.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
+    'http://olemarkus.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
 ]
+
+ROBOTS_TXT_BLACKLIST_DOMAINS = [
+    '(.*)sourceforge(.*)',
+    '(.*)github.com',
+    '(.*)berlios(.*)',
+]
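The blacklist entries above are regular expressions checked with `re.match`, which anchors at the start of the string; the leading `(.*)` is what lets a pattern also match subdomains. A quick illustration (plain Python, hypothetical hostnames):

```python
import re

ROBOTS_TXT_BLACKLIST_DOMAINS = [
    '(.*)sourceforge(.*)',
    '(.*)github.com',
    '(.*)berlios(.*)',
]

def blacklisted(domain):
    # re.match anchors at the start, so the leading (.*) is what
    # allows 'downloads.sourceforge.net' to match '(.*)sourceforge(.*)'.
    return any(re.match(p, domain) for p in ROBOTS_TXT_BLACKLIST_DOMAINS)

assert blacklisted('downloads.sourceforge.net')
assert blacklisted('github.com')
assert not blacklisted('example.org')
```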
@@ -172,6 +172,10 @@ def brute_force(cpv, url):
 
         result.append([url, version])
 
+        if len(result) > CONFIG['brute-force-false-watermark']:
+            output.einfo("Broken server detected ! Skipping brute force.")
+            return []
+
     if CONFIG["brute-force-recursive"]:
         for v in helpers.gen_versions(list(components), CONFIG["brute-force"]):
             if v not in versions and tuple(v) not in done:
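The new watermark guards against servers that answer 200 for every path: on such a host every guessed version "exists", and the recursive brute force would generate candidates forever. A minimal sketch of the guard in isolation (hypothetical names; `server_has` stands in for the real HTTP probe):

```python
BRUTE_FORCE_FALSE_WATERMARK = 50  # CONFIG['brute-force-false-watermark']

def probe_versions(candidates, server_has):
    """Collect versions the server claims to exist, bailing out once
    the hit count becomes implausible: a broken server answers 200 to
    every URL, so every guessed version would appear to exist."""
    result = []
    for version in candidates:
        if server_has(version):
            result.append(version)
        if len(result) > BRUTE_FORCE_FALSE_WATERMARK:
            return []  # broken server detected, discard everything
    return result

# A server that "has" every version trips the watermark immediately:
assert probe_versions(['1.%d' % i for i in range(100)], lambda v: True) == []
```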
@@ -15,7 +15,7 @@ except ImportError:
 import portage
 from portage import dep
 
-from euscan import CONFIG, BLACKLIST_VERSIONS, output
+from euscan import CONFIG, BLACKLIST_VERSIONS, ROBOTS_TXT_BLACKLIST_DOMAINS, output
 
 def htop_vercmp(a, b):
     def fixver(v):
@@ -217,6 +217,14 @@ def urlallowed(url):
 
     protocol, domain = urlparse.urlparse(url)[:2]
 
+    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
+        if re.match(bd, domain):
+            return True
+
+    for d in ['sourceforge', 'berlios', 'github.com']:
+        if d in domain:
+            return True
+
     if protocol == 'ftp':
         return True
 
@@ -226,14 +234,22 @@ def urlallowed(url):
     if rpcache.has_key(baseurl):
         rp = rpcache[baseurl]
     else:
+        from socket import setdefaulttimeout, getdefaulttimeout
+
+        timeout = getdefaulttimeout()
+        setdefaulttimeout(5)
+
         rp = robotparser.RobotFileParser()
         rp.set_url(robotsurl)
         try:
             rp.read()
             rpcache[baseurl] = rp
         except:
-            return True
-    return rp.can_fetch(CONFIG['user-agent'], url)
+            rp = None
+
+        setdefaulttimeout(timeout)
+
+    return rp.can_fetch(CONFIG['user-agent'], url) if rp else False
 
 def urlopen(url, timeout=None, verb="GET"):
     if not urlallowed(url):
@@ -250,7 +266,16 @@ def urlopen(url, timeout=None, verb="GET"):
         return None
 
     request.add_header('User-Agent', CONFIG['user-agent'])
-    return urllib2.urlopen(request, None, timeout)
+
+    if CONFIG['verbose']:
+        debuglevel = CONFIG['verbose'] - 1
+        handlers = [urllib2.HTTPHandler(debuglevel=debuglevel)]
+    else:
+        handlers = []
+
+    opener = urllib2.build_opener(*handlers)
+
+    return opener.open(request, None, timeout)
 
 def tryurl(fileurl, template):
     result = True
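`urllib2.HTTPHandler(debuglevel=...)` hands the level to the underlying httplib connection, which prints each raw request and response to stdout. Since `verbose` is now an integer and `debuglevel = verbose - 1`, verbosity 1 stays quiet while 2 and up trace the scanner's HTTP traffic. A self-contained sketch of the same wiring (Python 2; the URL and timeout are placeholders):

```python
import urllib2

verbose = 2  # e.g. CONFIG['verbose']; 1 stays quiet, 2+ traces HTTP

if verbose:
    # debuglevel >= 1 makes httplib print raw requests and responses.
    handlers = [urllib2.HTTPHandler(debuglevel=verbose - 1)]
else:
    handlers = []

opener = urllib2.build_opener(*handlers)

request = urllib2.Request('http://example.com/distfiles/foo-1.0.tar.gz')
request.add_header('User-Agent', 'euscan (http://euscan.iksaif.net)')
response = opener.open(request, None, 10)  # data=None, 10s timeout
```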
@@ -277,6 +302,8 @@ def tryurl(fileurl, template):
             result = None
         elif 'Content-Type' in headers and 'text/html' in headers['Content-Type']:
             result = None
+        elif 'Content-Type' in headers and 'application/x-httpd-php' in headers['Content-Type']:
+            result = None
         elif fp.geturl() != fileurl:
             regex = regex_from_template(template)
             baseregex = regex_from_template(os.path.basename(template))
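The added branch extends the `text/html` heuristic directly above it: a candidate distfile URL answered with a web-page Content-Type is almost certainly an error or index page served with status 200 rather than a tarball. The filter in isolation (plain Python; the header dicts are stand-ins for real response headers):

```python
# Content types that mark a 200 response as a web page, not a distfile.
HTML_CONTENT_TYPES = ('text/html', 'application/x-httpd-php')

def looks_like_distfile(headers):
    content_type = headers.get('Content-Type', '')
    return not any(ct in content_type for ct in HTML_CONTENT_TYPES)

assert not looks_like_distfile({'Content-Type': 'text/html; charset=utf-8'})
assert not looks_like_distfile({'Content-Type': 'application/x-httpd-php'})
assert looks_like_distfile({'Content-Type': 'application/x-bzip2'})
```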