From 14971584af4122cc65352bbf09c8c7b609457a67 Mon Sep 17 00:00:00 2001 From: Corentin Chary Date: Wed, 21 Sep 2011 10:09:50 +0200 Subject: [PATCH] euscan: robots.txt, timeout, user-agent, ... - Add a blacklist for robots.txt, we *want* to scan sourceforge - Set a user-agent that doesn't look like a browser - Handle timeouts more carefully - If brute force detects too many versions, avoid infinite loops - Handle redirections more carefully Signed-off-by: Corentin Chary --- TODO | 9 ++++----- bin/euscan | 9 +++++++-- pym/euscan/__init__.py | 21 +++++++++++++++----- pym/euscan/handlers/generic.py | 4 ++++ pym/euscan/helpers.py | 35 ++++++++++++++++++++++++++++++---- setup.py | 34 ++++++++++++++++++++++++++++++++- 6 files changed, 95 insertions(+), 17 deletions(-) diff --git a/TODO b/TODO index f7d4993..0c18672 100644 --- a/TODO +++ b/TODO @@ -4,16 +4,12 @@ TODO euscan ------ -- respect robots.txt (portscout) - check other distros (youri) -- clean blacklist system -- add a way to blacklist versions using standard package tokens - - =x11-drivers/xf86-video-intel-2.14.90* - - >=x11-base/xorg-server-1.10.900 Site Handlers ------------- +- sourceforge: http://sourceforge.net/api/file/index/project-name/vboxgtk/mtime/desc/limit/20/rss http://sourceforge.net/api/release/index/project-id/264534/rss - ftp.kde.org: doesn't scan the "unstable" tree - mysql: should use http://downloads.mysql.com/archives/ - mariadb: should use http://downloads.askmonty.org/MariaDB/+releases/ @@ -22,3 +18,6 @@ euscanwww --------- - add progress options for each command +- add last scan in the footer +- add json/xml for each page +- rss scan world + post ? diff --git a/bin/euscan b/bin/euscan index d41b079..4822995 100755 --- a/bin/euscan +++ b/bin/euscan @@ -12,6 +12,7 @@ __email__ = "corentin.chary@gmail.com" __version__ = "git" __productname__ = "euscan" __description__ = "A tool to detect new upstream releases." 
+__version__ = "git" """ Imports """ @@ -19,6 +20,7 @@ import os import sys import getopt import errno +import httplib from portage.output import white, yellow, turquoise, green, EOutput @@ -134,13 +136,13 @@ def parseArgs(): pp.output.nocolor() elif o in ("-q", "--quiet"): CONFIG['quiet'] = True - CONFIG['verbose'] = False + CONFIG['verbose'] = 0 elif o in ("-1", "--oneshot"): CONFIG['oneshot'] = True elif o in ("-b", "--brute-force"): CONFIG['brute-force'] = int(a) elif o in ("-v", "--verbose") and not CONFIG['quiet']: - CONFIG['verbose'] = True + CONFIG['verbose'] += 1 else: return_code = False @@ -197,6 +199,9 @@ def main(): output = EOutput(CONFIG['quiet']) ret = scan_upstream(package) + if CONFIG['verbose'] > 2: + httplib.HTTPConnection.debuglevel = 1 + print () for url, version in ret: diff --git a/pym/euscan/__init__.py b/pym/euscan/__init__.py index abc6f7d..f5ee6ee 100644 --- a/pym/euscan/__init__.py +++ b/pym/euscan/__init__.py @@ -3,6 +3,8 @@ # Copyright 2011 Corentin Chary # Distributed under the terms of the GNU General Public License v2 +__version__ = "git" + import sys from portage.output import EOutput @@ -10,13 +12,14 @@ from portage.output import EOutput CONFIG = { 'nocolor': False, 'quiet': False, - 'verbose': True, + 'verbose': 1, 'debug': False, 'brute-force': 3, 'brute-force-recursive': True, + 'brute-force-false-watermark': 50, 'scan-dir': True, 'oneshot': False, - 'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)', + 'user-agent' : 'escan (http://euscan.iksaif.net)', 'skip-robots-txt' : False } @@ -41,11 +44,19 @@ SCANDIR_BLACKLIST_URLS = [ BRUTEFORCE_BLACKLIST_PACKAGES = [ 'net-zope/plonepopoll' # infinite loop any http://plone.org/products/plonepopoll/releases/*/plonepopoll-2-6-1.tgz link will work - ] +] BRUTEFORCE_BLACKLIST_URLS = [ 'http://(.*)dockapps.org/download.php/id/(.*)', # infinite loop 'http://hydra.nixos.org/build/(.*)', # infinite loop - 'http://www.rennings.net/gentoo/distfiles/(.*)' # Doesn't 
respect 404, infinite loop - 'http://art.gnome.org/download/(.*)' # Doesn't respect 404, infinite loop + 'http://www.rennings.net/gentoo/distfiles/(.*)', # Doesn't respect 404, infinite loop + 'http://art.gnome.org/download/(.*)', # Doesn't respect 404, infinite loop + 'http://barelysufficient.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop + 'http://olemarkus.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop +] + +ROBOTS_TXT_BLACKLIST_DOMAINS = [ + '(.*)sourceforge(.*)', + '(.*)github.com', + '(.*)berlios(.*)', ] diff --git a/pym/euscan/handlers/generic.py b/pym/euscan/handlers/generic.py index a40b050..1584042 100644 --- a/pym/euscan/handlers/generic.py +++ b/pym/euscan/handlers/generic.py @@ -172,6 +172,10 @@ def brute_force(cpv, url): result.append([url, version]) + if len(result) > CONFIG['brute-force-false-watermark']: + output.einfo("Broken server detected ! Skipping brute force.") + return [] + if CONFIG["brute-force-recursive"]: for v in helpers.gen_versions(list(components), CONFIG["brute-force"]): if v not in versions and tuple(v) not in done: diff --git a/pym/euscan/helpers.py b/pym/euscan/helpers.py index 5285fde..6412220 100644 --- a/pym/euscan/helpers.py +++ b/pym/euscan/helpers.py @@ -15,7 +15,7 @@ except ImportError: import portage from portage import dep -from euscan import CONFIG, BLACKLIST_VERSIONS, output +from euscan import CONFIG, BLACKLIST_VERSIONS, ROBOTS_TXT_BLACKLIST_DOMAINS, output def htop_vercmp(a, b): def fixver(v): @@ -217,6 +217,14 @@ def urlallowed(url): protocol, domain = urlparse.urlparse(url)[:2] + for bd in ROBOTS_TXT_BLACKLIST_DOMAINS: + if re.match(bd, domain): + return True + + for d in ['sourceforge', 'berlios', 'github.com']: + if d in domain: + return True + if protocol == 'ftp': return True @@ -226,14 +234,22 @@ def urlallowed(url): if rpcache.has_key(baseurl): rp = rpcache[baseurl] else: + from socket import setdefaulttimeout, getdefaulttimeout + + timeout = getdefaulttimeout() + setdefaulttimeout(5) 
+ rp = robotparser.RobotFileParser() rp.set_url(robotsurl) try: rp.read() rpcache[baseurl] = rp except: - return True - return rp.can_fetch(CONFIG['user-agent'], url) + rp = None + + setdefaulttimeout(timeout) + + return rp.can_fetch(CONFIG['user-agent'], url) if rp else False def urlopen(url, timeout=None, verb="GET"): if not urlallowed(url): @@ -250,7 +266,16 @@ def urlopen(url, timeout=None, verb="GET"): return None request.add_header('User-Agent', CONFIG['user-agent']) - return urllib2.urlopen(request, None, timeout) + + if CONFIG['verbose']: + debuglevel = CONFIG['verbose'] - 1 + handlers = [urllib2.HTTPHandler(debuglevel=debuglevel)] + else: + handlers = [] + + opener = urllib2.build_opener(*handlers) + + return opener.open(request, None, timeout) def tryurl(fileurl, template): result = True @@ -277,6 +302,8 @@ def tryurl(fileurl, template): result = None elif 'Content-Type' in headers and 'text/html' in headers['Content-Type']: result = None + elif 'Content-Type' in headers and 'application/x-httpd-php' in headers['Content-Type']: + result = None elif fp.geturl() != fileurl: regex = regex_from_template(template) baseregex = regex_from_template(os.path.basename(template)) diff --git a/setup.py b/setup.py index 5a7899f..f64c54d 100755 --- a/setup.py +++ b/setup.py @@ -28,6 +28,35 @@ python_scripts = [os.path.join(cwd, path) for path in ( 'bin/euscan', )] +class set_version(core.Command): + """Set python __version__ to our __version__.""" + description = "hardcode scripts' version using VERSION from environment" + user_options = [] # [(long_name, short_name, desc),] + + def initialize_options (self): + pass + + def finalize_options (self): + pass + + def run(self): + ver = 'git' if __version__ == '9999' else __version__ + print("Settings version to %s" % ver) + def sub(files, pattern): + for f in files: + updated_file = [] + with io.open(f, 'r', 1, 'utf_8') as s: + for line in s: + newline = re.sub(pattern, '"%s"' % ver, line, 1) + if newline != line: + 
log.info("%s: %s" % (f, newline)) + updated_file.append(newline) + with io.open(f, 'w', 1, 'utf_8') as s: + s.writelines(updated_file) + quote = r'[\'"]{1}' + python_re = r'(?<=^__version__ = )' + quote + '[^\'"]*' + quote + sub(python_scripts, python_re) + packages = [ str('.'.join(root.split(os.sep)[1:])) for root, dirs, files in os.walk('pym/euscan') @@ -37,7 +66,7 @@ packages = [ core.setup( name='euscan', version=__version__, - description='Ebuild Upstream Scan tools.', + description='Ebuild upstream scan utility.', author='Corentin Chary', author_email='corentin.chary@gmail.com', maintainer='Corentin Chary', @@ -51,4 +80,7 @@ core.setup( data_files=( (os.path.join(EPREFIX, 'usr/share/man/man1'), glob('man/*')), ), + cmdclass={ + 'set_version': set_version, + }, )