euscan: robots.txt, timeout, user-agent, ...

- Add a blacklist for robots.txt, we *want* to scan sourceforge
- Set a user-agent that doesn't look like a browser
- Handle timeouts more carefully
- If brute force detects too many versions, avoid infinite loops
- Handle redirections more carefully

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
This commit is contained in:
Corentin Chary 2011-09-21 10:09:50 +02:00
parent 8c40a1795c
commit 14971584af
6 changed files with 95 additions and 17 deletions

9
TODO
View File

@ -4,16 +4,12 @@ TODO
euscan euscan
------ ------
- respect robots.txt (portscout)
- check other distros (youri) - check other distros (youri)
- clean blacklist system
- add a way to blacklist versions using standard package tokens
- =x11-drivers/xf86-video-intel-2.14.90*
- >=x11-base/xorg-server-1.10.900
Site Handlers Site Handlers
------------- -------------
- sourceforge: http://sourceforge.net/api/file/index/project-name/vboxgtk/mtime/desc/limit/20/rss http://sourceforge.net/api/release/index/project-id/264534/rss
- ftp.kde.org: doesn't scan the "unstable" tree - ftp.kde.org: doesn't scan the "unstable" tree
- mysql: should use http://downloads.mysql.com/archives/ - mysql: should use http://downloads.mysql.com/archives/
- mariadb: should use http://downloads.askmonty.org/MariaDB/+releases/ - mariadb: should use http://downloads.askmonty.org/MariaDB/+releases/
@ -22,3 +18,6 @@ euscanwww
--------- ---------
- add progress options for each command - add progress options for each command
- add last scan in the footer
- add json/xml for each page
- rss scan world + post ?

View File

@ -12,6 +12,7 @@ __email__ = "corentin.chary@gmail.com"
__version__ = "git" __version__ = "git"
__productname__ = "euscan" __productname__ = "euscan"
__description__ = "A tool to detect new upstream releases." __description__ = "A tool to detect new upstream releases."
__version__ = "git"
""" Imports """ """ Imports """
@ -19,6 +20,7 @@ import os
import sys import sys
import getopt import getopt
import errno import errno
import httplib
from portage.output import white, yellow, turquoise, green, EOutput from portage.output import white, yellow, turquoise, green, EOutput
@ -134,13 +136,13 @@ def parseArgs():
pp.output.nocolor() pp.output.nocolor()
elif o in ("-q", "--quiet"): elif o in ("-q", "--quiet"):
CONFIG['quiet'] = True CONFIG['quiet'] = True
CONFIG['verbose'] = False CONFIG['verbose'] = 0
elif o in ("-1", "--oneshot"): elif o in ("-1", "--oneshot"):
CONFIG['oneshot'] = True CONFIG['oneshot'] = True
elif o in ("-b", "--brute-force"): elif o in ("-b", "--brute-force"):
CONFIG['brute-force'] = int(a) CONFIG['brute-force'] = int(a)
elif o in ("-v", "--verbose") and not CONFIG['quiet']: elif o in ("-v", "--verbose") and not CONFIG['quiet']:
CONFIG['verbose'] = True CONFIG['verbose'] += 1
else: else:
return_code = False return_code = False
@ -197,6 +199,9 @@ def main():
output = EOutput(CONFIG['quiet']) output = EOutput(CONFIG['quiet'])
ret = scan_upstream(package) ret = scan_upstream(package)
if CONFIG['verbose'] > 2:
httplib.HTTPConnection.debuglevel = 1
print () print ()
for url, version in ret: for url, version in ret:

View File

@ -3,6 +3,8 @@
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com> # Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Distributed under the terms of the GNU General Public License v2 # Distributed under the terms of the GNU General Public License v2
__version__ = "git"
import sys import sys
from portage.output import EOutput from portage.output import EOutput
@ -10,13 +12,14 @@ from portage.output import EOutput
CONFIG = { CONFIG = {
'nocolor': False, 'nocolor': False,
'quiet': False, 'quiet': False,
'verbose': True, 'verbose': 1,
'debug': False, 'debug': False,
'brute-force': 3, 'brute-force': 3,
'brute-force-recursive': True, 'brute-force-recursive': True,
'brute-force-false-watermark': 50,
'scan-dir': True, 'scan-dir': True,
'oneshot': False, 'oneshot': False,
'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)', 'user-agent' : 'escan (http://euscan.iksaif.net)',
'skip-robots-txt' : False 'skip-robots-txt' : False
} }
@ -46,6 +49,14 @@ BRUTEFORCE_BLACKLIST_PACKAGES = [
BRUTEFORCE_BLACKLIST_URLS = [ BRUTEFORCE_BLACKLIST_URLS = [
'http://(.*)dockapps.org/download.php/id/(.*)', # infinite loop 'http://(.*)dockapps.org/download.php/id/(.*)', # infinite loop
'http://hydra.nixos.org/build/(.*)', # infinite loop 'http://hydra.nixos.org/build/(.*)', # infinite loop
'http://www.rennings.net/gentoo/distfiles/(.*)' # Doesn't respect 404, infinite loop 'http://www.rennings.net/gentoo/distfiles/(.*)', # Doesn't respect 404, infinite loop
'http://art.gnome.org/download/(.*)' # Doesn't respect 404, infinite loop 'http://art.gnome.org/download/(.*)', # Doesn't respect 404, infinite loop
'http://barelysufficient.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
'http://olemarkus.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
]
ROBOTS_TXT_BLACKLIST_DOMAINS = [
'(.*)sourceforge(.*)',
'(.*)github.com',
'(.*)berlios(.*)',
] ]

View File

@ -172,6 +172,10 @@ def brute_force(cpv, url):
result.append([url, version]) result.append([url, version])
if len(result) > CONFIG['brute-force-false-watermark']:
output.einfo("Broken server detected ! Skipping brute force.")
return []
if CONFIG["brute-force-recursive"]: if CONFIG["brute-force-recursive"]:
for v in helpers.gen_versions(list(components), CONFIG["brute-force"]): for v in helpers.gen_versions(list(components), CONFIG["brute-force"]):
if v not in versions and tuple(v) not in done: if v not in versions and tuple(v) not in done:

View File

@ -15,7 +15,7 @@ except ImportError:
import portage import portage
from portage import dep from portage import dep
from euscan import CONFIG, BLACKLIST_VERSIONS, output from euscan import CONFIG, BLACKLIST_VERSIONS, ROBOTS_TXT_BLACKLIST_DOMAINS, output
def htop_vercmp(a, b): def htop_vercmp(a, b):
def fixver(v): def fixver(v):
@ -217,6 +217,14 @@ def urlallowed(url):
protocol, domain = urlparse.urlparse(url)[:2] protocol, domain = urlparse.urlparse(url)[:2]
for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
if re.match(bd, domain):
return True
for d in ['sourceforge', 'berlios', 'github.com']:
if d in domain:
return True
if protocol == 'ftp': if protocol == 'ftp':
return True return True
@ -226,14 +234,22 @@ def urlallowed(url):
if rpcache.has_key(baseurl): if rpcache.has_key(baseurl):
rp = rpcache[baseurl] rp = rpcache[baseurl]
else: else:
from socket import setdefaulttimeout, getdefaulttimeout
timeout = getdefaulttimeout()
setdefaulttimeout(5)
rp = robotparser.RobotFileParser() rp = robotparser.RobotFileParser()
rp.set_url(robotsurl) rp.set_url(robotsurl)
try: try:
rp.read() rp.read()
rpcache[baseurl] = rp rpcache[baseurl] = rp
except: except:
return True rp = None
return rp.can_fetch(CONFIG['user-agent'], url)
setdefaulttimeout(timeout)
return rp.can_fetch(CONFIG['user-agent'], url) if rp else False
def urlopen(url, timeout=None, verb="GET"): def urlopen(url, timeout=None, verb="GET"):
if not urlallowed(url): if not urlallowed(url):
@ -250,7 +266,16 @@ def urlopen(url, timeout=None, verb="GET"):
return None return None
request.add_header('User-Agent', CONFIG['user-agent']) request.add_header('User-Agent', CONFIG['user-agent'])
return urllib2.urlopen(request, None, timeout)
if CONFIG['verbose']:
debuglevel = CONFIG['verbose'] - 1
handlers = [urllib2.HTTPHandler(debuglevel=debuglevel)]
else:
handlers = []
opener = urllib2.build_opener(*handlers)
return opener.open(request, None, timeout)
def tryurl(fileurl, template): def tryurl(fileurl, template):
result = True result = True
@ -277,6 +302,8 @@ def tryurl(fileurl, template):
result = None result = None
elif 'Content-Type' in headers and 'text/html' in headers['Content-Type']: elif 'Content-Type' in headers and 'text/html' in headers['Content-Type']:
result = None result = None
elif 'Content-Type' in headers and 'application/x-httpd-php' in headers['Content-Type']:
result = None
elif fp.geturl() != fileurl: elif fp.geturl() != fileurl:
regex = regex_from_template(template) regex = regex_from_template(template)
baseregex = regex_from_template(os.path.basename(template)) baseregex = regex_from_template(os.path.basename(template))

View File

@ -28,6 +28,35 @@ python_scripts = [os.path.join(cwd, path) for path in (
'bin/euscan', 'bin/euscan',
)] )]
class set_version(core.Command):
"""Set python __version__ to our __version__."""
description = "hardcode scripts' version using VERSION from environment"
user_options = [] # [(long_name, short_name, desc),]
def initialize_options (self):
pass
def finalize_options (self):
pass
def run(self):
ver = 'git' if __version__ == '9999' else __version__
print("Settings version to %s" % ver)
def sub(files, pattern):
for f in files:
updated_file = []
with io.open(f, 'r', 1, 'utf_8') as s:
for line in s:
newline = re.sub(pattern, '"%s"' % ver, line, 1)
if newline != line:
log.info("%s: %s" % (f, newline))
updated_file.append(newline)
with io.open(f, 'w', 1, 'utf_8') as s:
s.writelines(updated_file)
quote = r'[\'"]{1}'
python_re = r'(?<=^__version__ = )' + quote + '[^\'"]*' + quote
sub(python_scripts, python_re)
packages = [ packages = [
str('.'.join(root.split(os.sep)[1:])) str('.'.join(root.split(os.sep)[1:]))
for root, dirs, files in os.walk('pym/euscan') for root, dirs, files in os.walk('pym/euscan')
@ -37,7 +66,7 @@ packages = [
core.setup( core.setup(
name='euscan', name='euscan',
version=__version__, version=__version__,
description='Ebuild Upstream Scan tools.', description='Ebuild upstream scan utility.',
author='Corentin Chary', author='Corentin Chary',
author_email='corentin.chary@gmail.com', author_email='corentin.chary@gmail.com',
maintainer='Corentin Chary', maintainer='Corentin Chary',
@ -51,4 +80,7 @@ core.setup(
data_files=( data_files=(
(os.path.join(EPREFIX, 'usr/share/man/man1'), glob('man/*')), (os.path.join(EPREFIX, 'usr/share/man/man1'), glob('man/*')),
), ),
cmdclass={
'set_version': set_version,
},
) )