euscan: shake the code

- add custom site handlers
- use a custom user agent
- fix some bugs in management commands

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
Corentin Chary
2011-08-31 15:38:32 +02:00
parent 5634c59944
commit 752fb04425
22 changed files with 1550 additions and 842 deletions

49
pym/euscan/__init__.py Normal file

@@ -0,0 +1,49 @@
#!/usr/bin/python
#
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Distributed under the terms of the GNU General Public License v2
import sys
from portage.output import EOutput
CONFIG = {
'nocolor': False,
'quiet': False,
'verbose': True,
'debug': False,
'brute-force': 3,
'brute-force-recursive': True,
'scan-dir': True,
'oneshot': False,
'user-agent' : 'Mozilla/5.0 (compatible; euscan; +http://euscan.iksaif.net)'
}
output = EOutput(CONFIG['quiet'])
BLACKLIST_VERSIONS = [
# Compatibility package for running binaries linked against a pre gcc 3.4 libstdc++, won't be updated
'>=sys-libs/libstdc++-v3-3.4',
]
BLACKLIST_PACKAGES = [
# These kernels are almost dead
'sys-kernel/usermode-sources',
'sys-kernel/xbox-sources',
'sys-kernel/cell-sources',
]
SCANDIR_BLACKLIST_URLS = [
'mirror://rubygems/(.*)', # Not browsable
'mirror://gentoo/(.*)' # Directory too big
]
BRUTEFORCE_BLACKLIST_PACKAGES = [
'net-zope/plonepopoll' # infinite loop: any http://plone.org/products/plonepopoll/releases/*/plonepopoll-2-6-1.tgz link will work
]
BRUTEFORCE_BLACKLIST_URLS = [
'http://(.*)dockapps.org/download.php/id/(.*)', # infinite loop
'http://hydra.nixos.org/build/(.*)', # infinite loop
'http://www.rennings.net/gentoo/distfiles/(.*)' # Doesn't respect 404, infinite loop
]
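For reference, the new 'user-agent' entry is what helpers.urlopen() (added later in this commit) attaches to every outgoing request; a minimal sketch of the same pattern:

import urllib2
from euscan import CONFIG

# Every request euscan makes identifies itself with the custom UA.
request = urllib2.Request('http://euscan.iksaif.net')
request.add_header('User-Agent', CONFIG['user-agent'])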

24
pym/euscan/handlers/__init__.py Normal file

@@ -0,0 +1,24 @@
from euscan.handlers import generic
from euscan.handlers import php
from euscan.handlers import pypi
from euscan.handlers import rubygem
handlers = [ php, pypi, rubygem, generic ]
def find_best_handler(cpv, url):
for handler in handlers:
if handler.can_handle(cpv, url):
return handler
return None
def scan(cpv, url):
handler = find_best_handler(cpv, url)
if handler:
return handler.scan(cpv, url)
return []
def brute_force(cpv, url):
handler = find_best_handler(cpv, url)
if handler:
return handler.brute_force(cpv, url)
return []
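Since the commit message advertises custom site handlers: as find_best_handler() above shows, a handler is just a module exposing can_handle(), scan() and brute_force(). A minimal sketch of a new one (the site URL is made up) would live in pym/euscan/handlers/ and be appended to the handlers list:

import portage

def can_handle(cpv, url):
    # Claim URLs served by the (hypothetical) site this handler knows.
    return url.startswith('http://downloads.example.org/')

def scan(cpv, url):
    cp, ver, rev = portage.pkgsplit(cpv)
    # Query the site and return (url, version) tuples, skipping
    # anything euscan.helpers.version_filtered(cp, ver, version) rejects.
    return []

def brute_force(cpv, url):
    return []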

183
pym/euscan/handlers/generic.py Normal file

@@ -0,0 +1,183 @@
import urllib2
import re
import StringIO
from BeautifulSoup import BeautifulSoup
import portage
from euscan import CONFIG, SCANDIR_BLACKLIST_URLS, BRUTEFORCE_BLACKLIST_PACKAGES, BRUTEFORCE_BLACKLIST_URLS, output
from euscan import helpers
def scan_html(data, url, pattern):
soup = BeautifulSoup(data)
results = []
for link in soup.findAll('a'):
href = link.get("href")
if not href:
continue
if href.startswith(url):
href = href.replace(url, "", 1)
match = re.match(pattern, href, re.I)
if match:
results.append((match.group(1), match.group(0)))
return results
def scan_ftp(data, url, pattern):
buf = StringIO.StringIO(data)
results = []
for line in buf.readlines():
line = line.replace("\n", "").replace("\r", "")
match = re.search(pattern, line, re.I)
if match:
results.append((match.group(1), match.group(0)))
return results
def scan_directory_recursive(cpv, url, steps):
if not steps:
return []
cp, ver, rev = portage.pkgsplit(cpv)
url += steps[0][0]
pattern = steps[0][1]
steps = steps[1:]
output.einfo("Scanning: %s" % url)
try:
fp = helpers.urlopen(url)
except urllib2.URLError:
return []
except IOError:
return []
data = fp.read()
results = []
if re.search(r"<\s*a\s+[^>]*href", data):
results.extend(scan_html(data, url, pattern))
elif url.startswith('ftp://'):
results.extend(scan_ftp(data, url, pattern))
versions = []
for version, path in results:
if helpers.version_filtered(cp, ver, version):
continue
if not url.endswith('/') and not path.startswith('/'):
path = url + '/' + path
else:
path = url + path
versions.append((path, version))
if steps:
ret = scan_directory_recursive(cpv, path, steps)
versions.extend(ret)
return versions
def scan(cpv, url):
for bu in SCANDIR_BLACKLIST_URLS:
if re.match(bu, url):
output.einfo("%s is blacklisted by rule %s" % (url, bu))
return []
resolved_url = helpers.parse_mirror(url)
cp, ver, rev = portage.pkgsplit(cpv)
template = helpers.template_from_url(resolved_url, ver)
if '${' not in template:
output.einfo("Url doesn't seems to depend on version: %s not found in %s"
% (ver, resolved_url))
return []
else:
output.einfo("Scanning: %s" % template)
steps = helpers.generate_scan_paths(template)
return scan_directory_recursive(cpv, "", steps)
def brute_force(cpv, url):
cp, ver, rev = portage.pkgsplit(cpv)
url = helpers.parse_mirror(url)
for bp in BRUTEFORCE_BLACKLIST_PACKAGES:
if re.match(bp, cp):
output.einfo("%s is blacklisted by rule %s" % (cp, bp))
return []
for bp in BRUTEFORCE_BLACKLIST_URLS:
if re.match(bp, url):
output.einfo("%s is blacklisted by rule %s" % (cp, bp))
return []
output.einfo("Generating version from " + ver)
components = helpers.split_version(ver)
versions = helpers.gen_versions(components, CONFIG["brute-force"])
""" Remove unwanted versions """
for v in versions:
if helpers.vercmp(cp, ver, helpers.join_version(v)) >= 0:
versions.remove(v)
if not versions:
output.einfo("Can't generate new versions from " + ver)
return []
template = helpers.template_from_url(url, ver)
if '${PV}' not in template:
output.einfo("Url doesn't seems to depend on full version: %s not found in %s"
% (ver, url))
return []
else:
output.einfo("Brute forcing: %s" % template)
result = []
i = 0
done = []
while i < len(versions):
components = versions[i]
i += 1
if tuple(components) in done:
continue
done.append(tuple(components))
version = helpers.join_version(components)
if helpers.version_filtered(cp, ver, version):
continue
url = helpers.url_from_template(template, version)
infos = helpers.tryurl(url, template)
if not infos:
continue
result.append([url, version])
if CONFIG["brute-force-recursive"]:
for v in helpers.gen_versions(components, CONFIG["brute-force"]):
if v not in versions and tuple(v) not in done:
versions.append(v)
if CONFIG["oneshot"]:
break
return result
def can_handle(cpv, url):
return True
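A quick, hypothetical illustration of what scan_html() above extracts (example.com and the pattern are invented; real patterns come from helpers.regex_from_template()):

from euscan.handlers.generic import scan_html

html = ('<a href="foo-1.0.tar.gz">foo-1.0.tar.gz</a>'
        '<a href="foo-1.1.tar.gz">foo-1.1.tar.gz</a>')
# Each matching link yields a (version, href) pair:
# [(u'1.0', u'foo-1.0.tar.gz'), (u'1.1', u'foo-1.1.tar.gz')]
print(scan_html(html, "http://example.com/", r'foo-([\d.]+)\.tar\.gz'))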

65
pym/euscan/handlers/php.py Normal file

@@ -0,0 +1,65 @@
import re
import portage
import urllib2
import urlparse
import xml.dom.minidom
from euscan import helpers, output
def can_handle(cpv, url):
if url.startswith('http://pear.php.net/get/'):
return True
if url.startswith('http://pecl.php.net/get/'):
return True
return False
def guess_package_and_channel(cp, url):
    match = re.search('http://(.*)/get/(.*)-(.*).tgz', url)
    if match:
        host = match.group(1)
        pkg = match.group(2)
    else:
        # No match: fall back on the URL's host so 'host' is never unbound
        host = urlparse.urlparse(url).netloc
        cat, pkg = cp.split("/")
    return pkg, host

def scan(cpv, url):
    cp, ver, rev = portage.pkgsplit(cpv)
    pkg, channel = guess_package_and_channel(cp, url)
    orig_url = url
    url = 'http://%s/rest/r/%s/allreleases.xml' % (channel, pkg.lower())
output.einfo("Using: " + url)
try:
fp = helpers.urlopen(url)
except urllib2.URLError:
return []
except IOError:
return []
data = fp.read()
dom = xml.dom.minidom.parseString(data)
nodes = dom.getElementsByTagName("v")
ret = []
for node in nodes:
version = node.childNodes[0].data
if helpers.version_filtered(cp, ver, version):
continue
url = 'http://%s/get/%s-%s.tgz' % (channel, pkg, version)
if url == orig_url:
continue
ret.append(( url, version ))
return ret
def brute_force(cpv, url):
return []
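For context, scan() above only depends on the <v> elements of allreleases.xml; an illustrative round trip (simplified, not the full PEAR REST schema):

import xml.dom.minidom

sample = "<a><r><v>1.9.4</v></r><r><v>2.0.0</v></r></a>"
dom = xml.dom.minidom.parseString(sample)
# -> [u'1.9.4', u'2.0.0']
print([n.childNodes[0].data for n in dom.getElementsByTagName("v")])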

51
pym/euscan/handlers/pypi.py Normal file

@@ -0,0 +1,51 @@
import xmlrpclib
import pprint
import re
import portage
from euscan import helpers, output
def can_handle(cpv, url):
return url.startswith('mirror://pypi/')
def guess_package(cp, url):
match = re.search('mirror://pypi/\w+/(.*)/.*', url)
if match:
return match.group(1)
cat, pkg = cp.split("/")
return pkg
def scan(cpv, url):
'http://wiki.python.org/moin/PyPiXmlRpc'
package = guess_package(cpv, url)
output.einfo("Using PyPi XMLRPC: " + package)
client = xmlrpclib.ServerProxy('http://pypi.python.org/pypi')
versions = client.package_releases(package)
if not versions:
return versions
versions.reverse()
cp, ver, rev = portage.pkgsplit(cpv)
ret = []
for version in versions:
if helpers.version_filtered(cp, ver, version):
continue
urls = client.release_urls(package, version)
urls = " ".join([ infos['url'] for infos in urls ])
ret.append(( urls, version ))
return ret
def brute_force(cpv, url):
return []

56
pym/euscan/handlers/rubygem.py Normal file

@@ -0,0 +1,56 @@
import re
import portage
import json
import urllib2
from euscan import helpers, output
def can_handle(cpv, url):
return url.startswith('mirror://rubygems/')
def guess_gem(cpv, url):
match = re.search('mirror://rubygems/(.*).gem', url)
if match:
cpv = 'fake/%s' % match.group(1)
cp, ver, rev = portage.pkgsplit(cpv)
cat, pkg = cp.split("/")
return pkg
def scan(cpv, url):
'http://guides.rubygems.org/rubygems-org-api/#gemversion'
gem = guess_gem(cpv, url)
url = 'http://rubygems.org/api/v1/versions/%s.json' % gem
output.einfo("Using: " + url)
try:
fp = helpers.urlopen(url, timeout=5)
except urllib2.URLError:
return []
except IOError:
return []
data = fp.read()
versions = json.loads(data)
if not versions:
return []
cp, ver, rev = portage.pkgsplit(cpv)
ret = []
for version in versions:
version = version['number']
if helpers.version_filtered(cp, ver, version):
continue
url = 'http://rubygems.org/gems/%s-%s.gem' % (gem, version)
ret.append(( url, version ))
return ret
def brute_force(cpv, url):
return []
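Likewise, scan() above only needs the 'number' key of each entry returned by the rubygems.org versions API; a simplified, illustrative sample:

import json

sample = '[{"number": "0.9.2"}, {"number": "1.0.0"}]'
# -> [u'0.9.2', u'1.0.0']
print([v['number'] for v in json.loads(sample)])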

309
pym/euscan/helpers.py Normal file

@@ -0,0 +1,309 @@
import urllib2
import os
import re
import pkg_resources
import errno
import portage
from portage import dep
from euscan import CONFIG, BLACKLIST_VERSIONS, output
def htop_vercmp(a, b):
def fixver(v):
if v in ['0.11', '0.12', '0.13']:
v = '0.1.' + v[3:]
return v
return simple_vercmp(fixver(a), fixver(b))
VERSION_CMP_PACKAGE_QUIRKS = {
'sys-process/htop' : htop_vercmp
}
_v = r'((\d+)((\.\d+)*)([a-zA-Z]*?)(((-|_)(pre|p|beta|b|alpha|a|rc|r)\d*)*))'
def cast_int_components(version):
for i, obj in enumerate(version):
try:
version[i] = int(obj)
except ValueError:
pass
return version
def simple_vercmp(a, b):
if a == b:
return 0
# For sane versions
r = portage.versions.vercmp(a, b)
if r is not None:
return r
# Fallback
a = pkg_resources.parse_version(a)
b = pkg_resources.parse_version(b)
if a < b:
return -1
else:
return 1
def vercmp(package, a, b):
if package in VERSION_CMP_PACKAGE_QUIRKS:
return VERSION_CMP_PACKAGE_QUIRKS[package](a, b)
return simple_vercmp(a, b)
def version_is_nightly(a, b):
    # Try to skip nightly builds when not wanted (www-apps/moodle)
    a = pkg_resources.parse_version(a)
    b = pkg_resources.parse_version(b)
    if len(a) != len(b) and len(b) == 2 and len(b[0]) == len('yyyymmdd'):
        return True
    return False
def version_blacklisted(cp, version):
    rule = None
    cpv = '%s-%s' % (cp, version)
    # Check that the generated cpv can be used by portage
    if not portage.versions.catpkgsplit(cpv):
        return False
    for bv in BLACKLIST_VERSIONS:
        if dep.match_from_list(bv, [cpv]):
            rule = bv
            break
    if rule:
        output.einfo("%s is blacklisted by rule %s" % (cpv, rule))
    return rule is not None
def version_filtered(cp, base, version):
if vercmp(cp, base, version) >= 0:
return True
if version_blacklisted(cp, version):
return True
if version_is_nightly(base, version):
return True
return False
def generate_templates_vars(version):
ret = []
part = split_version(version)
for i in range(2, len(part)):
ver = []
var = []
for j in range(i):
ver.append(str(part[j]))
var.append('${%d}' % j)
ret.append((".".join(ver), ".".join(var)))
ret.append((version, '${PV}'))
ret.reverse()
return ret
def template_from_url(url, version):
prefix, chunks = url.split('://')
chunks = chunks.split('/')
for i in range(len(chunks)):
chunk = chunks[i]
subs = generate_templates_vars(version)
for sub in subs:
chunk = chunk.replace(sub[0], sub[1])
chunks[i] = chunk
return prefix + "://" + "/".join(chunks)
def url_from_template(url, version):
components = split_version(version)
url = url.replace('${PV}', version)
for i in range(len(components)):
url = url.replace('${%d}' % i, str(components[i]))
return url
# Stolen from distutils.LooseVersion
# Used for brute force to increment the version
def split_version(version):
component_re = re.compile(r'(\d+ | [a-z]+ | \.)', re.VERBOSE)
components = filter(lambda x: x and x != '.', component_re.split(version))
for i in range(len(components)):
try:
components[i] = int(components[i])
except ValueError:
pass
return components
def join_version(components):
version = ""
for i in range(len(components)):
version += str(components[i])
if i >= len(components) - 1:
break
if type(components[i]) != str and type(components[i + 1]) != str:
version += "."
return version
def increment_version(components, level):
n = len(components)
if level > n - 1 or level < 0:
raise Exception
for i in range(n, level + 1, -1):
if type(components[i - 1]) == int:
components[i - 1] = 0
if type(components[level]) == int:
components[level] += 1
return components
def gen_versions(components, level):
n = len(components)
depth = level
level = min(level, n)
if not n:
return []
versions = []
for i in range(n, n - level, -1):
increment_version(components, i - 1)
for j in range(depth):
versions.append(list(components))
increment_version(components, i - 1)
return versions
def urlopen(url, timeout=None):
if not timeout:
if 'sourceforge' in url:
timeout = 15
else:
timeout = 5
request = urllib2.Request(url)
request.add_header('User-Agent', CONFIG['user-agent'])
return urllib2.urlopen(request, None, timeout)
def tryurl(fileurl, template):
result = True
output.ebegin("Trying: " + fileurl)
try:
basename = os.path.basename(fileurl)
fp = urlopen(fileurl)
headers = fp.info()
if 'Content-disposition' in headers and basename not in headers['Content-disposition']:
result = None
elif 'Content-Length' in headers and headers['Content-Length'] == '0':
result = None
elif 'text/html' in headers.get('Content-Type', ''):
result = None
elif fp.geturl() != fileurl:
regex = regex_from_template(template)
baseregex = regex_from_template(os.path.basename(template))
basename2 = os.path.basename(fp.geturl())
# Redirect to another (earlier?) version
if basename != basename2 and (re.match(regex, fp.geturl()) or re.match(baseregex, basename2)):
result = None
if result:
result = (fp.geturl(), fp.info())
except urllib2.URLError:
result = None
except IOError:
result = None
output.eend(errno.ENOENT if not result else 0)
return result
def regex_from_template(template):
template = re.escape(template)
template = template.replace('\$\{', '${')
template = template.replace('\}', '}')
template = template.replace('}\.$', '}.$')
template = template.replace('${1}', r'([\d]+?)')
template = re.sub(r'(\$\{\d+\}\.?)+', r'([\w]+?)', template)
#template = re.sub(r'(\$\{\d+\}\.?)+', r'([\w]+?)', template)
#template = re.sub(r'(\$\{\d+\}\.+)+', '(.+?)\.', template)
#template = re.sub(r'(\$\{\d+\})+', '(.+?)', template)
template = template.replace('${PV}', _v)
template = template + r'/?$'
return template
def basedir_from_template(template):
idx = template.find('${')
if idx == -1:
return template
idx = template[0:idx].rfind('/')
if idx == -1:
return ""
return template[0:idx]
def generate_scan_paths(url):
prefix, chunks = url.split('://')
chunks = chunks.split('/')
steps = []
path = prefix + ":/"
for chunk in chunks:
if '${' in chunk:
steps.append((path, regex_from_template(chunk)))
path = ""
else:
path += "/"
path += chunk
return steps
def parse_mirror(uri):
from random import shuffle
mirrors = portage.settings.thirdpartymirrors()
if not uri.startswith("mirror://"):
return uri
eidx = uri.find("/", 9)
if eidx == -1:
output.einfo("Invalid mirror definition in SRC_URI:\n")
output.einfo(" %s\n" % (uri))
return None
mirrorname = uri[9:eidx]
path = uri[eidx+1:]
if mirrorname in mirrors:
mirrors = mirrors[mirrorname]
shuffle(mirrors)
uri = mirrors[0].strip("/") + "/" + path
else:
output.einfo("No known mirror by the name: %s\n" % (mirrorname))
return None
return uri
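A worked example of the template machinery above, using a made-up URL: template_from_url() replaces version substrings with ${PV}/${N} placeholders, and url_from_template() substitutes them back for a candidate version:

from euscan import helpers

url = "http://example.com/dist/foo-1.2.3.tar.gz"
template = helpers.template_from_url(url, "1.2.3")
# -> "http://example.com/dist/foo-${PV}.tar.gz"
print(template)
# -> "http://example.com/dist/foo-1.2.4.tar.gz"
print(helpers.url_from_template(template, "1.2.4"))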

130
pym/euscan/scan.py Normal file

@@ -0,0 +1,130 @@
from __future__ import print_function
import os
import sys
import re
import time
import getopt
import random
import urllib2
import StringIO
import pkg_resources
import portage
import portage.versions
from portage import dep
from portage.exception import InvalidDependString
from portage.dbapi import porttree
from portage.output import white, yellow, turquoise, green, teal, red, EOutput
import gentoolkit.pprinter as pp
from gentoolkit import errors
from gentoolkit.query import Query
from gentoolkit.eclean.search import (port_settings)
from euscan import CONFIG, BLACKLIST_PACKAGES, output
from euscan import handlers
from euscan import helpers
def filter_versions(cp, versions):
filtered = {}
for url, version in versions:
# Try to keep the most specific urls (determined by the length)
if version in filtered and len(url) < len(filtered[version]):
continue
# Remove blacklisted versions
if helpers.version_blacklisted(cp, version):
continue
filtered[version] = url
return [ (filtered[version], version) for version in filtered ]
def scan_upstream_urls(cpv, urls):
versions = []
for filename in urls:
for url in urls[filename]:
print ()
output.einfo("SRC_URI is '%s'" % url)
if '://' not in url:
output.einfo("Invalid url '%s'" % url)
continue
# Try a normal scan first
if CONFIG["scan-dir"]:
versions.extend(handlers.scan(cpv, url))
if versions and CONFIG['oneshot']:
break
# Fall back on brute force
if CONFIG["brute-force"] > 0:
versions.extend(handlers.brute_force(cpv, url))
if versions and CONFIG['oneshot']:
break
cp, ver, rev = portage.pkgsplit(cpv)
return filter_versions(cp, versions)
def scan_upstream(query):
matches = Query(query).find(
include_masked=True,
in_installed=False
)
if not matches:
sys.stderr.write(pp.warn("No package matching '%s'" % pp.pkgquery(query)))
return []
matches = sorted(matches)
pkg = matches.pop()
if '9999' in pkg.version:
if len(matches) == 0:
sys.stderr.write(pp.warn("Package '%s' only have a dev version (9999)" % pp.pkgquery(pkg.cp)))
return []
else:
pkg = matches.pop()
if pkg.cp in BLACKLIST_PACKAGES:
sys.stderr.write(pp.warn("Package '%s' is blacklisted" % pp.pkgquery(pkg.cp)))
return []
pp.uprint(" * %s [%s]" % (pp.cpv(pkg.cpv), pp.section(pkg.repo_name())))
pp.uprint()
ebuild_path = pkg.ebuild_path()
if ebuild_path:
pp.uprint('Ebuild: ' + pp.path(os.path.normpath(ebuild_path)))
pp.uprint('Repository: ' + pkg.repo_name())
pp.uprint('Homepage: ' + pkg.environment("HOMEPAGE"))
pp.uprint('Description: ' + pkg.environment("DESCRIPTION"))
cpv = pkg.cpv
metadata = {
"EAPI" : port_settings["EAPI"],
"SRC_URI" : pkg.environment("SRC_URI", False),
}
use = frozenset(port_settings["PORTAGE_USE"].split())
try:
alist = porttree._parse_uri_map(cpv, metadata, use=use)
aalist = porttree._parse_uri_map(cpv, metadata)
except InvalidDependString as e:
sys.stderr.write(pp.warn("%s\n" % str(e)))
sys.stderr.write(pp.warn("Invalid SRC_URI for '%s'" % pp.pkgquery(cpv)))
return []
if "mirror" in portage.settings.features:
urls = aalist
else:
urls = alist
return scan_upstream_urls(pkg.cpv, urls)
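A hypothetical caller (e.g. one of the management commands mentioned in the commit message) would use scan_upstream() like this:

from euscan.scan import scan_upstream

# scan_upstream() returns a list of (url, version) tuples.
for url, version in scan_upstream('app-portage/euscan'):
    print("Upstream version %s: %s" % (version, url))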