euscan: robots.txt, timeout, user-agent, ...

- Add a blacklist for robots.txt, we *want* to scan sourceforge
- Set a user-agent that doesn't look like a browser
- Handle timeouts more carefully
- If brute force detects too many versions, avoid infinite loops
- Handle redirections more carefully

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
This commit is contained in:
Corentin Chary 2011-09-21 10:09:50 +02:00
parent 8c40a1795c
commit 14971584af
6 changed files with 95 additions and 17 deletions

9
TODO
View File

@ -4,16 +4,12 @@ TODO
euscan
------
- respect robots.txt (portscout)
- check other distros (youri)
- clean blacklist system
- add a way to blacklist versions using standard package tokens
- =x11-drivers/xf86-video-intel-2.14.90*
- >=x11-base/xorg-server-1.10.900
Site Handlers
-------------
- sourceforge: http://sourceforge.net/api/file/index/project-name/vboxgtk/mtime/desc/limit/20/rss http://sourceforge.net/api/release/index/project-id/264534/rss
- ftp.kde.org: doesn't scan the "unstable" tree
- mysql: should use http://downloads.mysql.com/archives/
- mariadb: should use http://downloads.askmonty.org/MariaDB/+releases/
@ -22,3 +18,6 @@ euscanwww
---------
- add progress options for each command
- add last scan in the footer
- add json/xml for each page
- rss scan world + post ?

View File

@ -12,6 +12,7 @@ __email__ = "corentin.chary@gmail.com"
__version__ = "git"
__productname__ = "euscan"
__description__ = "A tool to detect new upstream releases."
__version__ = "git"
""" Imports """
@ -19,6 +20,7 @@ import os
import sys
import getopt
import errno
import httplib
from portage.output import white, yellow, turquoise, green, EOutput
@ -134,13 +136,13 @@ def parseArgs():
pp.output.nocolor()
elif o in ("-q", "--quiet"):
CONFIG['quiet'] = True
CONFIG['verbose'] = False
CONFIG['verbose'] = 0
elif o in ("-1", "--oneshot"):
CONFIG['oneshot'] = True
elif o in ("-b", "--brute-force"):
CONFIG['brute-force'] = int(a)
elif o in ("-v", "--verbose") and not CONFIG['quiet']:
CONFIG['verbose'] = True
CONFIG['verbose'] += 1
else:
return_code = False
@ -197,6 +199,9 @@ def main():
output = EOutput(CONFIG['quiet'])
ret = scan_upstream(package)
if CONFIG['verbose'] > 2:
httplib.HTTPConnection.debuglevel = 1
print ()
for url, version in ret:

View File

@ -3,6 +3,8 @@
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Distributed under the terms of the GNU General Public License v2
__version__ = "git"
import sys
from portage.output import EOutput
@ -10,13 +12,14 @@ from portage.output import EOutput
CONFIG = {
'nocolor': False,
'quiet': False,
'verbose': True,
'verbose': 1,
'debug': False,
'brute-force': 3,
'brute-force-recursive': True,
'brute-force-false-watermark': 50,
'scan-dir': True,
'oneshot': False,
'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
'user-agent' : 'escan (http://euscan.iksaif.net)',
'skip-robots-txt' : False
}
@ -41,11 +44,19 @@ SCANDIR_BLACKLIST_URLS = [
BRUTEFORCE_BLACKLIST_PACKAGES = [
'net-zope/plonepopoll' # infinite loop any http://plone.org/products/plonepopoll/releases/*/plonepopoll-2-6-1.tgz link will work
]
]
BRUTEFORCE_BLACKLIST_URLS = [
'http://(.*)dockapps.org/download.php/id/(.*)', # infinite loop
'http://hydra.nixos.org/build/(.*)', # infinite loop
'http://www.rennings.net/gentoo/distfiles/(.*)' # Doesn't respect 404, infinite loop
'http://art.gnome.org/download/(.*)' # Doesn't respect 404, infinite loop
'http://www.rennings.net/gentoo/distfiles/(.*)', # Doesn't respect 404, infinite loop
'http://art.gnome.org/download/(.*)', # Doesn't respect 404, infinite loop
'http://barelysufficient.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
'http://olemarkus.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
]
ROBOTS_TXT_BLACKLIST_DOMAINS = [
'(.*)sourceforge(.*)',
'(.*)github.com',
'(.*)berlios(.*)',
]

View File

@ -172,6 +172,10 @@ def brute_force(cpv, url):
result.append([url, version])
if len(result) > CONFIG['brute-force-false-watermark']:
output.einfo("Broken server detected ! Skipping brute force.")
return []
if CONFIG["brute-force-recursive"]:
for v in helpers.gen_versions(list(components), CONFIG["brute-force"]):
if v not in versions and tuple(v) not in done:

View File

@ -15,7 +15,7 @@ except ImportError:
import portage
from portage import dep
from euscan import CONFIG, BLACKLIST_VERSIONS, output
from euscan import CONFIG, BLACKLIST_VERSIONS, ROBOTS_TXT_BLACKLIST_DOMAINS, output
def htop_vercmp(a, b):
def fixver(v):
@ -217,6 +217,14 @@ def urlallowed(url):
protocol, domain = urlparse.urlparse(url)[:2]
for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
if re.match(bd, domain):
return True
for d in ['sourceforge', 'berlios', 'github.com']:
if d in domain:
return True
if protocol == 'ftp':
return True
@ -226,14 +234,22 @@ def urlallowed(url):
if rpcache.has_key(baseurl):
rp = rpcache[baseurl]
else:
from socket import setdefaulttimeout, getdefaulttimeout
timeout = getdefaulttimeout()
setdefaulttimeout(5)
rp = robotparser.RobotFileParser()
rp.set_url(robotsurl)
try:
rp.read()
rpcache[baseurl] = rp
except:
return True
return rp.can_fetch(CONFIG['user-agent'], url)
rp = None
setdefaulttimeout(timeout)
return rp.can_fetch(CONFIG['user-agent'], url) if rp else False
def urlopen(url, timeout=None, verb="GET"):
if not urlallowed(url):
@ -250,7 +266,16 @@ def urlopen(url, timeout=None, verb="GET"):
return None
request.add_header('User-Agent', CONFIG['user-agent'])
return urllib2.urlopen(request, None, timeout)
if CONFIG['verbose']:
debuglevel = CONFIG['verbose'] - 1
handlers = [urllib2.HTTPHandler(debuglevel=debuglevel)]
else:
handlers = []
opener = urllib2.build_opener(*handlers)
return opener.open(request, None, timeout)
def tryurl(fileurl, template):
result = True
@ -277,6 +302,8 @@ def tryurl(fileurl, template):
result = None
elif 'Content-Type' in headers and 'text/html' in headers['Content-Type']:
result = None
elif 'Content-Type' in headers and 'application/x-httpd-php' in headers['Content-Type']:
result = None
elif fp.geturl() != fileurl:
regex = regex_from_template(template)
baseregex = regex_from_template(os.path.basename(template))

View File

@ -28,6 +28,35 @@ python_scripts = [os.path.join(cwd, path) for path in (
'bin/euscan',
)]
class set_version(core.Command):
    """Rewrite the ``__version__`` assignment in the project's scripts.

    For every file listed in ``python_scripts``, the quoted value on a line
    that starts with ``__version__ = `` is replaced by this setup script's
    own ``__version__`` (or by ``git`` when that value is ``9999``).
    """
    # NOTE(review): the description mentions "VERSION from environment", but
    # how the module-level __version__ is populated is not visible in this
    # block -- confirm against the top of the setup script.
    description = "hardcode scripts' version using VERSION from environment"
    user_options = [] # [(long_name, short_name, desc),]

    def initialize_options (self):
        # The command takes no options, so there is nothing to initialise.
        pass

    def finalize_options (self):
        # The command takes no options, so there is nothing to validate.
        pass

    def run(self):
        """Substitute the version string in each file of ``python_scripts``."""
        # '9999' is mapped to the literal 'git'; any other value is used as-is.
        ver = 'git' if __version__ == '9999' else __version__
        print("Settings version to %s" % ver)

        def sub(files, pattern):
            """Replace the first match of *pattern* on each line of *files*.

            Each file is read completely, then rewritten in place with the
            substituted lines.
            """
            for f in files:
                updated_file = []
                # Positional arguments are mode, buffering (1) and encoding.
                with io.open(f, 'r', 1, 'utf_8') as s:
                    for line in s:
                        # The trailing 1 limits re.sub to one replacement.
                        newline = re.sub(pattern, '"%s"' % ver, line, 1)
                        if newline != line:
                            # Only lines that actually changed are logged.
                            log.info("%s: %s" % (f, newline))
                        # Unchanged lines are kept too, so the file is
                        # rewritten whole.
                        updated_file.append(newline)
                with io.open(f, 'w', 1, 'utf_8') as s:
                    s.writelines(updated_file)

        # Matches exactly one single or double quote character.
        quote = r'[\'"]{1}'
        # The look-behind anchors on "__version__ = " at the start of the
        # string, so only the quoted value that follows is matched.
        python_re = r'(?<=^__version__ = )' + quote + '[^\'"]*' + quote
        sub(python_scripts, python_re)
packages = [
str('.'.join(root.split(os.sep)[1:]))
for root, dirs, files in os.walk('pym/euscan')
@ -37,7 +66,7 @@ packages = [
core.setup(
name='euscan',
version=__version__,
description='Ebuild Upstream Scan tools.',
description='Ebuild upstream scan utility.',
author='Corentin Chary',
author_email='corentin.chary@gmail.com',
maintainer='Corentin Chary',
@ -51,4 +80,7 @@ core.setup(
data_files=(
(os.path.join(EPREFIX, 'usr/share/man/man1'), glob('man/*')),
),
cmdclass={
'set_version': set_version,
},
)