euscan: robots.txt, timeout, user-agent, ...
- Add a blacklist for robots.txt, we *want* to scan sourceforge - Set a user-agent that doesn't look like a browser - Handle timeouts more carefully - If brute force detects too many versions, avoid infinite loops - Handle redirections more carefully Signed-off-by: Corentin Chary <corentincj@iksaif.net>
This commit is contained in:
parent
8c40a1795c
commit
14971584af
9
TODO
9
TODO
@ -4,16 +4,12 @@ TODO
|
||||
euscan
|
||||
------
|
||||
|
||||
- respect robots.txt (portscout)
|
||||
- check other distros (youri)
|
||||
- clean blacklist system
|
||||
- add a way to blacklist versions using standard package tokens
|
||||
- =x11-drivers/xf86-video-intel-2.14.90*
|
||||
- >=x11-base/xorg-server-1.10.900
|
||||
|
||||
Site Handlers
|
||||
-------------
|
||||
|
||||
- sourceforge: http://sourceforge.net/api/file/index/project-name/vboxgtk/mtime/desc/limit/20/rss http://sourceforge.net/api/release/index/project-id/264534/rss
|
||||
- ftp.kde.org: doesn't scan the "unstable" tree
|
||||
- mysql: should use http://downloads.mysql.com/archives/
|
||||
- mariadb: should use http://downloads.askmonty.org/MariaDB/+releases/
|
||||
@ -22,3 +18,6 @@ euscanwww
|
||||
---------
|
||||
|
||||
- add progress options for each command
|
||||
- add last scan in the footer
|
||||
- add json/xml for each page
|
||||
- rss scan world + post ?
|
||||
|
@ -12,6 +12,7 @@ __email__ = "corentin.chary@gmail.com"
|
||||
__version__ = "git"
|
||||
__productname__ = "euscan"
|
||||
__description__ = "A tool to detect new upstream releases."
|
||||
__version__ = "git"
|
||||
|
||||
""" Imports """
|
||||
|
||||
@ -19,6 +20,7 @@ import os
|
||||
import sys
|
||||
import getopt
|
||||
import errno
|
||||
import httplib
|
||||
|
||||
from portage.output import white, yellow, turquoise, green, EOutput
|
||||
|
||||
@ -134,13 +136,13 @@ def parseArgs():
|
||||
pp.output.nocolor()
|
||||
elif o in ("-q", "--quiet"):
|
||||
CONFIG['quiet'] = True
|
||||
CONFIG['verbose'] = False
|
||||
CONFIG['verbose'] = 0
|
||||
elif o in ("-1", "--oneshot"):
|
||||
CONFIG['oneshot'] = True
|
||||
elif o in ("-b", "--brute-force"):
|
||||
CONFIG['brute-force'] = int(a)
|
||||
elif o in ("-v", "--verbose") and not CONFIG['quiet']:
|
||||
CONFIG['verbose'] = True
|
||||
CONFIG['verbose'] += 1
|
||||
else:
|
||||
return_code = False
|
||||
|
||||
@ -197,6 +199,9 @@ def main():
|
||||
output = EOutput(CONFIG['quiet'])
|
||||
ret = scan_upstream(package)
|
||||
|
||||
if CONFIG['verbose'] > 2:
|
||||
httplib.HTTPConnection.debuglevel = 1
|
||||
|
||||
print ()
|
||||
|
||||
for url, version in ret:
|
||||
|
@ -3,6 +3,8 @@
|
||||
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
|
||||
# Distributed under the terms of the GNU General Public License v2
|
||||
|
||||
__version__ = "git"
|
||||
|
||||
import sys
|
||||
|
||||
from portage.output import EOutput
|
||||
@ -10,13 +12,14 @@ from portage.output import EOutput
|
||||
CONFIG = {
|
||||
'nocolor': False,
|
||||
'quiet': False,
|
||||
'verbose': True,
|
||||
'verbose': 1,
|
||||
'debug': False,
|
||||
'brute-force': 3,
|
||||
'brute-force-recursive': True,
|
||||
'brute-force-false-watermark': 50,
|
||||
'scan-dir': True,
|
||||
'oneshot': False,
|
||||
'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)',
|
||||
'user-agent' : 'escan (http://euscan.iksaif.net)',
|
||||
'skip-robots-txt' : False
|
||||
}
|
||||
|
||||
@ -46,6 +49,14 @@ BRUTEFORCE_BLACKLIST_PACKAGES = [
|
||||
BRUTEFORCE_BLACKLIST_URLS = [
|
||||
'http://(.*)dockapps.org/download.php/id/(.*)', # infinite loop
|
||||
'http://hydra.nixos.org/build/(.*)', # infinite loop
|
||||
'http://www.rennings.net/gentoo/distfiles/(.*)' # Doesn't respect 404, infinite loop
|
||||
'http://art.gnome.org/download/(.*)' # Doesn't respect 404, infinite loop
|
||||
'http://www.rennings.net/gentoo/distfiles/(.*)', # Doesn't respect 404, infinite loop
|
||||
'http://art.gnome.org/download/(.*)', # Doesn't respect 404, infinite loop
|
||||
'http://barelysufficient.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
|
||||
'http://olemarkus.org/~olemarkus/(.*)', # Doesn't respect 404, infinite loop
|
||||
]
|
||||
|
||||
ROBOTS_TXT_BLACKLIST_DOMAINS = [
|
||||
'(.*)sourceforge(.*)',
|
||||
'(.*)github.com',
|
||||
'(.*)berlios(.*)',
|
||||
]
|
||||
|
@ -172,6 +172,10 @@ def brute_force(cpv, url):
|
||||
|
||||
result.append([url, version])
|
||||
|
||||
if len(result) > CONFIG['brute-force-false-watermark']:
|
||||
output.einfo("Broken server detected ! Skipping brute force.")
|
||||
return []
|
||||
|
||||
if CONFIG["brute-force-recursive"]:
|
||||
for v in helpers.gen_versions(list(components), CONFIG["brute-force"]):
|
||||
if v not in versions and tuple(v) not in done:
|
||||
|
@ -15,7 +15,7 @@ except ImportError:
|
||||
import portage
|
||||
from portage import dep
|
||||
|
||||
from euscan import CONFIG, BLACKLIST_VERSIONS, output
|
||||
from euscan import CONFIG, BLACKLIST_VERSIONS, ROBOTS_TXT_BLACKLIST_DOMAINS, output
|
||||
|
||||
def htop_vercmp(a, b):
|
||||
def fixver(v):
|
||||
@ -217,6 +217,14 @@ def urlallowed(url):
|
||||
|
||||
protocol, domain = urlparse.urlparse(url)[:2]
|
||||
|
||||
for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
|
||||
if re.match(bd, domain):
|
||||
return True
|
||||
|
||||
for d in ['sourceforge', 'berlios', 'github.com']:
|
||||
if d in domain:
|
||||
return True
|
||||
|
||||
if protocol == 'ftp':
|
||||
return True
|
||||
|
||||
@ -226,14 +234,22 @@ def urlallowed(url):
|
||||
if rpcache.has_key(baseurl):
|
||||
rp = rpcache[baseurl]
|
||||
else:
|
||||
from socket import setdefaulttimeout, getdefaulttimeout
|
||||
|
||||
timeout = getdefaulttimeout()
|
||||
setdefaulttimeout(5)
|
||||
|
||||
rp = robotparser.RobotFileParser()
|
||||
rp.set_url(robotsurl)
|
||||
try:
|
||||
rp.read()
|
||||
rpcache[baseurl] = rp
|
||||
except:
|
||||
return True
|
||||
return rp.can_fetch(CONFIG['user-agent'], url)
|
||||
rp = None
|
||||
|
||||
setdefaulttimeout(timeout)
|
||||
|
||||
return rp.can_fetch(CONFIG['user-agent'], url) if rp else False
|
||||
|
||||
def urlopen(url, timeout=None, verb="GET"):
|
||||
if not urlallowed(url):
|
||||
@ -250,7 +266,16 @@ def urlopen(url, timeout=None, verb="GET"):
|
||||
return None
|
||||
|
||||
request.add_header('User-Agent', CONFIG['user-agent'])
|
||||
return urllib2.urlopen(request, None, timeout)
|
||||
|
||||
if CONFIG['verbose']:
|
||||
debuglevel = CONFIG['verbose'] - 1
|
||||
handlers = [urllib2.HTTPHandler(debuglevel=debuglevel)]
|
||||
else:
|
||||
handlers = []
|
||||
|
||||
opener = urllib2.build_opener(*handlers)
|
||||
|
||||
return opener.open(request, None, timeout)
|
||||
|
||||
def tryurl(fileurl, template):
|
||||
result = True
|
||||
@ -277,6 +302,8 @@ def tryurl(fileurl, template):
|
||||
result = None
|
||||
elif 'Content-Type' in headers and 'text/html' in headers['Content-Type']:
|
||||
result = None
|
||||
elif 'Content-Type' in headers and 'application/x-httpd-php' in headers['Content-Type']:
|
||||
result = None
|
||||
elif fp.geturl() != fileurl:
|
||||
regex = regex_from_template(template)
|
||||
baseregex = regex_from_template(os.path.basename(template))
|
||||
|
34
setup.py
34
setup.py
@ -28,6 +28,35 @@ python_scripts = [os.path.join(cwd, path) for path in (
|
||||
'bin/euscan',
|
||||
)]
|
||||
|
||||
class set_version(core.Command):
    """Set python __version__ to our __version__.

    Custom setup command that rewrites the ``__version__ = "..."`` line of
    every file listed in the module-level ``python_scripts`` so that it
    carries this setup script's ``__version__``.
    """

    description = "hardcode scripts' version using VERSION from environment"
    user_options = []  # [(long_name, short_name, desc),]

    def initialize_options(self):
        """Nothing to initialise: the command takes no options."""
        pass

    def finalize_options(self):
        """Nothing to finalise: the command takes no options."""
        pass

    def run(self):
        """Substitute the version string in each script, in place."""
        # The placeholder version '9999' is written out as 'git'.
        ver = 'git' if __version__ == '9999' else __version__
        print("Setting version to %s" % ver)

        def sub(files, pattern):
            """Replace the first match of *pattern* on each line of *files*."""
            for f in files:
                updated_file = []
                changed = False
                with io.open(f, 'r', 1, 'utf_8') as s:
                    for line in s:
                        newline = re.sub(pattern, '"%s"' % ver, line, 1)
                        if newline != line:
                            changed = True
                            log.info("%s: %s" % (f, newline))
                        updated_file.append(newline)
                # Leave files without a matching line untouched instead of
                # rewriting them through a text-mode round trip.
                if not changed:
                    continue
                with io.open(f, 'w', 1, 'utf_8') as s:
                    s.writelines(updated_file)

        quote = r'[\'"]{1}'
        # The look-behind keeps the `__version__ = ` prefix, so only the
        # quoted value that follows it is replaced.
        python_re = r'(?<=^__version__ = )' + quote + '[^\'"]*' + quote
        sub(python_scripts, python_re)
|
||||
|
||||
packages = [
|
||||
str('.'.join(root.split(os.sep)[1:]))
|
||||
for root, dirs, files in os.walk('pym/euscan')
|
||||
@ -37,7 +66,7 @@ packages = [
|
||||
core.setup(
|
||||
name='euscan',
|
||||
version=__version__,
|
||||
description='Ebuild Upstream Scan tools.',
|
||||
description='Ebuild upstream scan utility.',
|
||||
author='Corentin Chary',
|
||||
author_email='corentin.chary@gmail.com',
|
||||
maintainer='Corentin Chary',
|
||||
@ -51,4 +80,7 @@ core.setup(
|
||||
data_files=(
|
||||
(os.path.join(EPREFIX, 'usr/share/man/man1'), glob('man/*')),
|
||||
),
|
||||
cmdclass={
|
||||
'set_version': set_version,
|
||||
},
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user