euscan: robots.txt, timeout, user-agent, ...

- Add a blacklist for robots.txt; we *want* to scan sourceforge
- Set a user-agent that doesn't look like a browser
- Handle timeouts more carefully
- If brute force detects too many versions, avoid infinite loops
- Handle redirections more carefully

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
This commit is contained in:
Corentin Chary
2011-09-21 10:09:50 +02:00
parent 8c40a1795c
commit 14971584af
6 changed files with 95 additions and 17 deletions

View File

@ -3,6 +3,8 @@
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Distributed under the terms of the GNU General Public License v2
__version__ = "git"
import sys
from portage.output import EOutput
@ -10,13 +12,14 @@ from portage.output import EOutput
CONFIG = {
'nocolor': False,
'quiet': False,
'verbose': True,
'verbose': 1,
'debug': False,
'brute-force': 3,
'brute-force-recursive': True,
'brute-force-false-watermark': 50,
'scan-dir': True,
'oneshot': False,
'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
'user-agent' : 'escan (http://euscan.iksaif.net)',
'skip-robots-txt' : False
}
@ -41,11 +44,19 @@ SCANDIR_BLACKLIST_URLS = [
BRUTEFORCE_BLACKLIST_PACKAGES = [
'net-zope/plonepopoll' # infinite loop any http://plone.org/products/plonepopoll/releases/*/plonepopoll-2-6-1.tgz link will work
]
]
BRUTEFORCE_BLACKLIST_URLS = [
'http://(.*)dockapps.org/download.php/id/(.*)', # infinite loop
'http://hydra.nixos.org/build/(.*)', # infinite loop
'http://www.rennings.net/gentoo/distfiles/(.*)' # Doesn't respect 404, infinite loop
'http://art.gnome.org/download/(.*)' # Doesn't respect 404, infinite loop
'http://www.rennings.net/gentoo/distfiles/(.*)', # Doesn't respect 404, infinite loop
'http://art.gnome.org/download/(.*)', # Doesn't respect 404, infinite loop
'http://barelysufficient.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
'http://olemarkus.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
]
ROBOTS_TXT_BLACKLIST_DOMAINS = [
'(.*)sourceforge(.*)',
'(.*)github.com',
'(.*)berlios(.*)',
]

View File

@ -172,6 +172,10 @@ def brute_force(cpv, url):
result.append([url, version])
if len(result) > CONFIG['brute-force-false-watermark']:
output.einfo("Broken server detected ! Skipping brute force.")
return []
if CONFIG["brute-force-recursive"]:
for v in helpers.gen_versions(list(components), CONFIG["brute-force"]):
if v not in versions and tuple(v) not in done:

View File

@ -15,7 +15,7 @@ except ImportError:
import portage
from portage import dep
from euscan import CONFIG, BLACKLIST_VERSIONS, output
from euscan import CONFIG, BLACKLIST_VERSIONS, ROBOTS_TXT_BLACKLIST_DOMAINS, output
def htop_vercmp(a, b):
def fixver(v):
@ -217,6 +217,14 @@ def urlallowed(url):
protocol, domain = urlparse.urlparse(url)[:2]
for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
if re.match(bd, domain):
return True
for d in ['sourceforge', 'berlios', 'github.com']:
if d in domain:
return True
if protocol == 'ftp':
return True
@ -226,14 +234,22 @@ def urlallowed(url):
if rpcache.has_key(baseurl):
rp = rpcache[baseurl]
else:
from socket import setdefaulttimeout, getdefaulttimeout
timeout = getdefaulttimeout()
setdefaulttimeout(5)
rp = robotparser.RobotFileParser()
rp.set_url(robotsurl)
try:
rp.read()
rpcache[baseurl] = rp
except:
return True
return rp.can_fetch(CONFIG['user-agent'], url)
rp = None
setdefaulttimeout(timeout)
return rp.can_fetch(CONFIG['user-agent'], url) if rp else False
def urlopen(url, timeout=None, verb="GET"):
if not urlallowed(url):
@ -250,7 +266,16 @@ def urlopen(url, timeout=None, verb="GET"):
return None
request.add_header('User-Agent', CONFIG['user-agent'])
return urllib2.urlopen(request, None, timeout)
if CONFIG['verbose']:
debuglevel = CONFIG['verbose'] - 1
handlers = [urllib2.HTTPHandler(debuglevel=debuglevel)]
else:
handlers = []
opener = urllib2.build_opener(*handlers)
return opener.open(request, None, timeout)
def tryurl(fileurl, template):
result = True
@ -277,6 +302,8 @@ def tryurl(fileurl, template):
result = None
elif 'Content-Type' in headers and 'text/html' in headers['Content-Type']:
result = None
elif 'Content-Type' in headers and 'application/x-httpd-php' in headers['Content-Type']:
result = None
elif fp.geturl() != fileurl:
regex = regex_from_template(template)
baseregex = regex_from_template(os.path.basename(template))