euscan: Redesigning the handlers layout

Signed-off-by: volpino <fox91@anche.no>
This commit is contained in:
volpino
2012-07-26 10:44:10 +02:00
parent 36e1aa6d12
commit 12bf1fc75b
13 changed files with 168 additions and 153 deletions

View File

@ -0,0 +1,19 @@
"""
URL-wide handlers for scanning upstream
"""
import pkgutil
# Every module found in this package directory is imported and registered as
# a handler.
handlers = []
for loader, module_name, is_pkg in pkgutil.walk_packages(__path__):
    module = loader.find_module(module_name).load_module(module_name)
    handlers.append(module)

# Order the handlers in place by their PRIORITY attribute, highest first.
handlers.sort(key=lambda handler: handler.PRIORITY, reverse=True)

View File

@ -0,0 +1,133 @@
import re
import portage
import urllib2
import json
from euscan import helpers, output
# Handler name placed in every result tuple built by scan() below.
HANDLER_NAME = "cpan"
# Confidence value placed in every result tuple built by scan() below.
CONFIDENCE = 100.0
# Sort key read by the handlers package, which orders handler modules by
# PRIORITY in descending order.
PRIORITY = 90
# Group 1 is the text after the last "/" of a mirror://cpan/authors/ URL,
# cut at the first "." (the group cannot contain "/" or ".").
_cpan_package_name_re = re.compile("mirror://cpan/authors/.*/([^/.]*).*")
def can_handle(pkg, url):
    """Return True when *url* starts with ``mirror://cpan/``.

    *pkg* is accepted for interface parity with other handlers and is not
    consulted.
    """
    cpan_prefix = 'mirror://cpan/'
    return url.startswith(cpan_prefix)
def guess_package(cp, url):
    """Guess the upstream CPAN distribution name for a package.

    If *url* matches the module-level cpan URL pattern, the captured file
    name stem is passed to ``portage.pkgsplit`` under a fake category
    (presumably to strip a trailing version -- verify against portage) and
    the resulting category/package string replaces *cp*. When the pattern
    does not match, or the split fails, *cp* is used unchanged.

    :param cp: "category/package" string used as the fallback.
    :param url: source URL of the package.
    :returns: the part of the category/package string after the "/".
    :raises ValueError: if the string does not split into exactly two parts.
    """
    match = _cpan_package_name_re.search(url)
    if match:
        try:
            cp, ver, rev = portage.pkgsplit('fake/' + match.group(1))
        except Exception:
            # Best effort: keep the caller-supplied cp when pkgsplit does not
            # yield a (cp, ver, rev) triple. Unlike the former bare
            # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
            pass

    cat, pkg = cp.split("/")
    return pkg
def gentoo_mangle_version(up_pv):
    """Regroup a single-dot upstream version, then apply the generic mangler.

    When *up_pv* contains exactly one ".", a "." is inserted after every run
    of three digits that is followed by another character (e.g. "1.2345"
    becomes "1.234.5"). Any other input is passed through untouched. The
    result is handed to ``helpers.gentoo_mangle_version``.
    """
    if up_pv.count('.') != 1:
        return helpers.gentoo_mangle_version(up_pv)

    pieces = []
    run = 0  # digits seen since the last "." (original or inserted)
    for char in up_pv:
        if run == 3:
            pieces.append(".")
            run = 0
        pieces.append(char)
        if char == '.':
            run = 0
        elif char.isdigit():
            run += 1

    return helpers.gentoo_mangle_version("".join(pieces))
def cpan_trim_version(pv):
    """Strip all leading ASCII letters and one trailing ASCII letter from *pv*."""
    without_prefix = re.sub('^[a-zA-Z]+', '', pv)
    return re.sub('[a-zA-Z]$', '', without_prefix)
def cpan_mangle_version(pv):
    """Collapse *pv* to a single-dot version and trim surrounding letters.

    Versions without a "." are returned unchanged. Otherwise every "." is
    removed, one "." is re-inserted at the position of the first original
    dot, and the result is passed through ``cpan_trim_version``.
    """
    first_dot = pv.find('.')
    if first_dot < 0:
        return pv

    dotless = pv.replace('.', '')
    rejoined = dotless[:first_dot] + '.' + dotless[first_dot:]
    return cpan_trim_version(rejoined)
def cpan_vercmp(cp, a, b):
    """Compare two CPAN version strings.

    The versions are compared as floats when both convert cleanly; the
    result is then their difference. Otherwise the raw values are compared
    and -1, 0 or 1 is returned.

    :param cp: unused; present so the signature matches the comparison
        callback passed to ``helpers.version_filtered``.
    :param a: first version.
    :param b: second version.
    :returns: a negative number if ``a < b``, zero if equal, a positive
        number if ``a > b``.
    """
    try:
        return float(a) - float(b)
    except (TypeError, ValueError):
        # Non-numeric versions: fall back to plain ordering. Equal values
        # now compare as 0 (the previous fallback reported 1 for them).
        return (a > b) - (a < b)
def scan(pkg, url):
    """Look up releases of a CPAN distribution through the search.cpan.org API.

    :param pkg: package object; only ``pkg.cpv`` is read.
    :param url: the package's current ``mirror://cpan/`` source URL.
    :returns: list of ``(url, version, HANDLER_NAME, CONFIDENCE)`` tuples for
        every release that passes ``helpers.version_filtered`` and whose URL
        differs from *url*; an empty list when the API cannot be reached or
        the reply has no ``releases`` key.
    """
    cp, ver, rev = portage.pkgsplit(pkg.cpv)
    dist = guess_package(cp, url)

    api_url = 'http://search.cpan.org/api/dist/%s' % dist
    output.einfo("Using: " + api_url)

    try:
        fp = helpers.urlopen(api_url)
    except (urllib2.URLError, IOError):
        return []
    if not fp:
        return []

    data = json.loads(fp.read())
    if 'releases' not in data:
        return []

    # The installed version does not change between releases: mangle it once.
    up_ver = cpan_mangle_version(ver)

    ret = []
    for release in data['releases']:
        # NOTE: filtering on release['status'] == 'testing' is currently
        # disabled.
        up_pv = cpan_trim_version(release['version'])
        pv = gentoo_mangle_version(up_pv)

        if helpers.version_filtered(cp, up_ver, up_pv, cpan_vercmp):
            continue

        author = release['cpanid']
        # CPAN author directories are laid out as authors/id/A/AU/AUTHOR/,
        # so the second component is the first *two* letters of the id.
        candidate = 'mirror://cpan/authors/id/%s/%s/%s/%s' % (
            author[0],
            author[0:2],
            author,
            release['archive']
        )

        if candidate == url:
            continue

        ret.append((candidate, pv, HANDLER_NAME, CONFIDENCE))

    return ret

View File

@ -0,0 +1,240 @@
from urlparse import urljoin
import urllib2
import re
import StringIO
try:
from BeautifulSoup import BeautifulSoup
except ImportError:
from bs4 import BeautifulSoup
import portage
from euscan import CONFIG, SCANDIR_BLACKLIST_URLS, \
BRUTEFORCE_BLACKLIST_PACKAGES, BRUTEFORCE_BLACKLIST_URLS, output, helpers
# Name and confidence placed in results found by directory scanning.
HANDLER_NAME = "generic"
CONFIDENCE = 50.0
# Sort key read by the handlers package, which orders handler modules by
# PRIORITY in descending order.
PRIORITY = 0
# Name and confidence placed in results found by brute_force().
BRUTEFORCE_HANDLER_NAME = "brute_force"
BRUTEFORCE_CONFIDENCE = 30.0
def scan_html(data, url, pattern):
    """Collect links in an HTML page whose href matches *pattern*.

    A leading *url* is removed from each href before matching, so hrefs
    written as absolute URLs are matched on their remainder. Matching is
    case-insensitive and anchored at the start of the href.

    :returns: list of ``(version, matched_text)`` tuples, where ``version``
        is the non-None regex groups joined with ".".
    """
    results = []
    for anchor in BeautifulSoup(data).findAll('a'):
        target = anchor.get("href")
        if not target:
            continue

        if target.startswith(url):
            target = target.replace(url, "", 1)

        found = re.match(pattern, target, re.I)
        if found is None:
            continue

        version = ".".join(g for g in found.groups() if g is not None)
        results.append((version, found.group(0)))

    return results
def scan_ftp(data, url, pattern):
    """Collect lines of an FTP listing that contain a match for *pattern*.

    Each line is searched case-insensitively after CR and LF characters are
    removed. *url* is accepted for signature parity with ``scan_html`` and is
    not used.

    :returns: list of ``(version, matched_text)`` tuples, where ``version``
        is the non-None regex groups joined with ".".
    """
    results = []
    for raw_line in StringIO.StringIO(data).readlines():
        entry = raw_line.replace("\n", "").replace("\r", "")
        found = re.search(pattern, entry, re.I)
        if found is not None:
            version = ".".join(g for g in found.groups() if g is not None)
            results.append((version, found.group(0)))
    return results
def scan_directory_recursive(cp, ver, rev, url, steps, orig_url):
    """Walk a chain of remote directories looking for newer versions.

    Each entry of *steps* is indexed as ``(path_fragment, pattern)``: the
    fragment is appended to *url*, the resulting page is fetched, and its
    entries are matched against the pattern. Matches that are not rejected
    by ``helpers.version_filtered`` are followed with the remaining steps;
    on the last step they are recorded unless the resolved path is a
    substring of *orig_url*.

    :returns: list of ``(path, version, HANDLER_NAME, CONFIDENCE)`` tuples;
        empty when *steps* is empty or the page cannot be fetched.
    """
    if not steps:
        return []

    current = steps[0]
    url += current[0]
    pattern = current[1]
    remaining = steps[1:]

    output.einfo("Scanning: %s" % url)

    try:
        fp = helpers.urlopen(url)
    except (urllib2.URLError, IOError):
        return []
    if not fp:
        return []

    data = fp.read()

    # Anything containing an <a href> tag is treated as HTML, even when
    # served over FTP; otherwise only ftp:// URLs are parsed as listings.
    if re.search("<\s*a\s+[^>]*href", data, re.I):
        found = scan_html(data, url, pattern)
    elif url.startswith('ftp://'):
        found = scan_ftp(data, url, pattern)
    else:
        found = []

    # urljoin needs a trailing slash to treat the URL as a directory.
    base = url if url.endswith("/") else url + "/"

    versions = []
    for up_pv, entry in found:
        pv = helpers.gentoo_mangle_version(up_pv)
        if helpers.version_filtered(cp, ver, pv):
            continue

        path = urljoin(base, entry)
        if remaining:
            versions.extend(
                scan_directory_recursive(cp, ver, rev, path, remaining,
                                         orig_url)
            )
        elif path not in orig_url:
            versions.append((path, pv, HANDLER_NAME, CONFIDENCE))

    return versions
def scan(pkg, url):
    """Scan the directories behind *url* for newer versions of *pkg*.

    When ``CONFIG["scan-dir"]`` is set, the URL is checked against
    ``SCANDIR_BLACKLIST_URLS``, resolved with ``helpers.parse_mirror``,
    turned into a version template and walked with
    ``scan_directory_recursive``. If that yields nothing, the result of
    ``brute_force`` is returned instead.

    NOTE(review): the indentation of this function was not recoverable from
    the reviewed text; the scan-dir guard is assumed to cover the whole
    directory scan, not just the blacklist loop -- confirm against the
    repository.

    :param pkg: package object; only ``pkg.cpv`` is read.
    :param url: source URL of the package (may be a ``mirror://`` URL).
    :returns: list of result entries; empty when the URL is blacklisted,
        cannot be resolved, does not contain the version, or nothing is
        found.
    """
    # Bound up front so the brute-force fallback below works even when
    # directory scanning is disabled.
    ret = []

    if CONFIG["scan-dir"]:
        for bu in SCANDIR_BLACKLIST_URLS:
            if re.match(bu, url):
                output.einfo("%s is blacklisted by rule %s" % (url, bu))
                return []

        resolved_url = helpers.parse_mirror(url)
        if not resolved_url:
            return []

        cp, ver, rev = portage.pkgsplit(pkg.cpv)

        # 'Hack' for _beta/_rc versions where _ is used instead of -
        if ver not in resolved_url:
            newver = helpers.version_change_end_sep(ver)
            if newver and newver in resolved_url:
                output.einfo(
                    "Version: using %s instead of %s" % (newver, ver)
                )
                ver = newver

        template = helpers.template_from_url(resolved_url, ver)
        if '${' not in template:
            output.einfo(
                "Url doesn't seems to depend on version: %s not found in %s" %
                (ver, resolved_url)
            )
            return []
        output.einfo("Scanning: %s" % template)

        steps = helpers.generate_scan_paths(template)
        ret = scan_directory_recursive(cp, ver, rev, "", steps, url)

    if not ret:
        # The brute-force result used to be computed and then thrown away.
        ret = brute_force(pkg, url)

    return ret
def brute_force(pkg, url):
    """Guess newer versions of *pkg* and probe whether their URLs exist.

    Candidate versions are generated from the current version with
    ``helpers.gen_versions``, substituted into a template derived from
    *url*, and probed with ``helpers.tryurl``.

    :param pkg: package object; only ``pkg.cpv`` is read.
    :param url: source URL of the package (may be a ``mirror://`` URL).
    :returns: list of ``[url, version, BRUTEFORCE_HANDLER_NAME,
        BRUTEFORCE_CONFIDENCE]`` entries. Empty when brute forcing is
        disabled (``CONFIG["brute-force"] == 0``), the package or URL is
        blacklisted, the URL does not contain the full version, or more
        hits than ``CONFIG['brute-force-false-watermark']`` are found (the
        server is then assumed to answer positively for anything).
    """
    if CONFIG["brute-force"] == 0:
        return []

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    url = helpers.parse_mirror(url)
    if not url:
        return []

    for bp in BRUTEFORCE_BLACKLIST_PACKAGES:
        if re.match(bp, cp):
            output.einfo("%s is blacklisted by rule %s" % (cp, bp))
            return []

    for bp in BRUTEFORCE_BLACKLIST_URLS:
        if re.match(bp, url):
            # Report the URL that matched (this used to print cp).
            output.einfo("%s is blacklisted by rule %s" % (url, bp))
            return []

    output.einfo("Generating version from " + ver)

    components = helpers.split_version(ver)
    versions = helpers.gen_versions(components, CONFIG["brute-force"])

    # Remove unwanted versions. A new list is built because removing from
    # the list being iterated skips the element after each removal.
    versions = [
        v for v in versions
        if helpers.vercmp(cp, ver, helpers.join_version(v)) < 0
    ]

    if not versions:
        output.einfo("Can't generate new versions from " + ver)
        return []

    template = helpers.template_from_url(url, ver)

    if '${PV}' not in template:
        output.einfo(
            "Url doesn't seems to depend on full version: %s not found in %s" %
            (ver, url))
        return []
    output.einfo("Brute forcing: %s" % template)

    result = []
    done = []  # tuples of components already probed

    # Index-based loop: `versions` may grow while it is being walked.
    i = 0
    while i < len(versions):
        components = versions[i]
        i += 1

        # Compare as a tuple: `done` stores tuples, so a list would never
        # be found in it.
        key = tuple(components)
        if key in done:
            continue
        done.append(key)

        version = helpers.join_version(components)

        if helpers.version_filtered(cp, ver, version):
            continue

        candidate = helpers.url_from_template(template, version)
        infos = helpers.tryurl(candidate, template)

        if not infos:
            continue

        result.append([candidate, version, BRUTEFORCE_HANDLER_NAME,
                       BRUTEFORCE_CONFIDENCE])

        if len(result) > CONFIG['brute-force-false-watermark']:
            output.einfo(
                "Broken server detected ! Skipping brute force."
            )
            return []

        if CONFIG["brute-force-recursive"]:
            for v in helpers.gen_versions(list(components),
                                          CONFIG["brute-force"]):
                if v not in versions and tuple(v) not in done:
                    versions.append(v)

        if CONFIG["oneshot"]:
            break

    return result
def can_handle(pkg, url):
    """Accept every package/URL pair; neither argument is inspected."""
    return True

View File

@ -0,0 +1,54 @@
import json
import urllib2
import re
import portage
from euscan import helpers, output
# Handler name placed in every result tuple yielded by scan() below.
HANDLER_NAME = "github"
# Confidence value placed in every result tuple yielded by scan() below.
CONFIDENCE = 100.0
# Sort key read by the handlers package, which orders handler modules by
# PRIORITY in descending order.
PRIORITY = 90
def can_handle(pkg, url):
    """Return True when *url* starts with ``mirror://github/``.

    *pkg* is not consulted.
    """
    github_prefix = 'mirror://github/'
    return url.startswith(github_prefix)
def guess_package(cp, url):
    """Split a ``mirror://github/<user>/<project>/<filename>`` URL.

    :param cp: unused; kept for signature parity with the other handlers.
    :param url: the ``mirror://github/`` URL to split.
    :returns: ``(user, project, filename)``.
    :raises ValueError: if *url* does not have the expected shape. This
        replaces an ``assert``, which is skipped when Python runs with
        ``-O``.
    """
    match = re.search('^mirror://github/(.*?)/(.*?)/(.*)$', url)
    if match is None:
        raise ValueError("not a mirror://github/user/project/file url: %s"
                         % url)
    return (match.group(1), match.group(2), match.group(3))
def scan(pkg, url):
    """Yield newer versions listed by the GitHub downloads API.

    API reference: http://developer.github.com/v3/repos/downloads/

    The current file name is turned into a regular expression in which each
    occurrence of the package version becomes a capture group; every
    download whose name matches is mangled and filtered through
    ``helpers.version_filtered``.

    This is a generator: it yields ``(html_url, version, HANDLER_NAME,
    CONFIDENCE)`` tuples and yields nothing when the package version does
    not appear in the file name.

    :param pkg: package object; only ``pkg.cpv`` is read.
    :param url: ``mirror://github/<user>/<project>/<filename>`` URL.
    """
    user, project, filename = guess_package(pkg.cpv, url)

    # find out where version is expected to be found
    cp, ver, rev = portage.pkgsplit(pkg.cpv)
    if ver not in filename:
        return

    # now create a filename-matching regexp
    # XXX: supposedly replace first with (?P<foo>...)
    # and remaining ones with (?P=foo)
    fnre = re.compile('^%s$' %
                      re.escape(filename).replace(re.escape(ver), '(.*?)'))

    # Join the three URL parts; joining the filename string itself would put
    # a "/" between each of its characters.
    output.einfo("Using github API for: " +
                 '/'.join((user, project, filename)))

    dlreq = urllib2.urlopen('https://api.github.com/repos/%s/%s/downloads' %
                            (user, project))
    try:
        dls = json.load(dlreq)
    finally:
        dlreq.close()

    for dl in dls:
        m = fnre.match(dl['name'])
        if not m:
            continue

        pv = helpers.gentoo_mangle_version(m.group(1))
        if helpers.version_filtered(cp, ver, pv):
            continue
        yield (dl['html_url'], pv, HANDLER_NAME, CONFIDENCE)

View File

@ -0,0 +1,39 @@
from euscan.handlers.url import generic
# Sort key read by the handlers package, which orders handler modules by
# PRIORITY in descending order.
PRIORITY = 90
# Handler name written into every result by clean_results() below.
HANDLER_NAME = "kde"
def can_handle(pkg, url):
    """Return True when *url* starts with ``mirror://kde/``.

    *pkg* is not consulted.
    """
    return url.startswith('mirror://kde/')
def clean_results(results):
    """Relabel generic results as KDE results and drop bogus entries.

    Entries whose version is ``'5SUMS'`` are discarded; every other entry is
    rebuilt as ``(path, version, HANDLER_NAME, confidence)``, replacing the
    handler name it came with.
    """
    return [
        (path, version, HANDLER_NAME, confidence)
        for path, version, _, confidence in results
        if version != '5SUMS'
    ]
def scan(pkg, url):
    """Scan a KDE mirror URL, also trying ``stable`` for ``unstable`` URLs.

    The generic handler is run on *url* and, when *url* lives under
    ``mirror://kde/unstable/``, on the same path under
    ``mirror://kde/stable/``. If neither scan finds anything, both URLs are
    brute forced. Results are passed through ``clean_results``.

    :param pkg: package object, handed to the generic handler unchanged
        (its functions read ``pkg.cpv`` themselves, so passing the cpv
        string would fail).
    :param url: ``mirror://kde/`` source URL.
    :returns: list of ``(path, version, HANDLER_NAME, confidence)`` tuples.
    """
    unstable_prefix = 'mirror://kde/unstable/'

    # Keep both URLs around; rewriting `url` in place made the later
    # "unstable" check for brute forcing unreachable.
    candidates = [url]
    if url.startswith(unstable_prefix):
        candidates.append(url.replace(unstable_prefix, 'mirror://kde/stable/'))

    results = []
    for candidate in candidates:
        results += generic.scan(pkg, candidate)

    if not results:  # if nothing was found go brute forcing
        for candidate in candidates:
            results += generic.brute_force(pkg, candidate)

    return clean_results(results)

View File

@ -0,0 +1,72 @@
import re
import portage
import urllib2
import xml.dom.minidom
from euscan import helpers, output
# Handler name placed in every result tuple built by scan() below.
HANDLER_NAME = "php"
# Confidence value placed in every result tuple built by scan() below.
CONFIDENCE = 100.0
# Sort key read by the handlers package, which orders handler modules by
# PRIORITY in descending order.
PRIORITY = 90
def can_handle(pkg, url):
    """Return True for PEAR and PECL download URLs.

    *pkg* is not consulted.
    """
    return url.startswith((
        'http://pear.php.net/get/',
        'http://pecl.php.net/get/',
    ))
def guess_package_and_channel(cp, url):
    """Extract the package name and channel host from a PEAR/PECL URL.

    :param cp: "category/package" string; its package part is the fallback
        name when *url* is not of the form ``http://<host>/get/<name>-<ver>.tgz``.
    :param url: download URL.
    :returns: ``(package, host)``. In the fallback case the host is taken
        from the start of *url*, or is ``None`` when *url* is not an
        ``http://`` URL (previously this case raised a NameError because the
        host was never assigned).
    """
    match = re.search('http://(.*)/get/(.*)-(.*).tgz', url)
    if match:
        return match.group(2), match.group(1)

    cat, pkg = cp.split("/")
    host_match = re.match('http://([^/]+)/', url)
    host = host_match.group(1) if host_match else None
    return pkg, host
def scan(pkg, url):
    """List releases of a PEAR/PECL package from the channel's REST index.

    Fetches ``http://<channel>/rest/r/<package>/allreleases.xml`` and reads
    the text of every ``<v>`` element as an upstream version.

    :param pkg: package object; only ``pkg.cpv`` is read.
    :param url: the package's current download URL.
    :returns: list of ``(url, version, HANDLER_NAME, CONFIDENCE)`` tuples for
        versions that pass ``helpers.version_filtered`` and whose download
        URL differs from *url*; empty when the index cannot be fetched.
    """
    cp, ver, rev = portage.pkgsplit(pkg.cpv)
    package, channel = guess_package_and_channel(cp, url)

    index_url = 'http://%s/rest/r/%s/allreleases.xml' % (channel,
                                                        package.lower())
    output.einfo("Using: " + index_url)

    try:
        fp = helpers.urlopen(index_url)
    except (urllib2.URLError, IOError):
        return []
    if not fp:
        return []

    dom = xml.dom.minidom.parseString(fp.read())

    ret = []
    for node in dom.getElementsByTagName("v"):
        up_pv = node.childNodes[0].data
        pv = helpers.gentoo_mangle_version(up_pv)
        if helpers.version_filtered(cp, ver, pv):
            continue

        candidate = 'http://%s/get/%s-%s.tgz' % (channel, package, up_pv)
        if candidate != url:
            ret.append((candidate, pv, HANDLER_NAME, CONFIDENCE))

    return ret

View File

@ -0,0 +1,54 @@
import xmlrpclib
import re
import portage
from euscan import helpers, output
# Handler name placed in every result tuple built by scan() below.
HANDLER_NAME = "pypi"
# Confidence value placed in every result tuple built by scan() below.
CONFIDENCE = 100.0
# Sort key read by the handlers package, which orders handler modules by
# PRIORITY in descending order.
PRIORITY = 90
def can_handle(pkg, url):
    """Return True when *url* starts with ``mirror://pypi/``.

    *pkg* is not consulted.
    """
    pypi_prefix = 'mirror://pypi/'
    return url.startswith(pypi_prefix)
def guess_package(cp, url):
    """Return the PyPI project name for a package.

    The name is the directory component that follows the one-word index
    directory in a ``mirror://pypi/<x>/<name>/<file>`` URL. For any other
    URL the package part of *cp* ("category/package") is returned.
    """
    found = re.search('mirror://pypi/\w+/(.*)/.*', url)
    if found is None:
        category, name = cp.split("/")
        return name
    return found.group(1)
def scan(pkg, url):
    """List releases of a PyPI project through the XML-RPC interface.

    API reference: http://wiki.python.org/moin/PyPiXmlRpc

    :param pkg: package object; only ``pkg.cpv`` is read.
    :param url: ``mirror://pypi/`` source URL.
    :returns: list of ``(urls, version, HANDLER_NAME, CONFIDENCE)`` tuples,
        where ``urls`` is the space-separated list of download URLs of the
        release; empty when the index reports no releases. Releases are
        processed in the reverse of the order returned by the index.
    """
    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    # guess_package() splits its first argument on "/" and returns the second
    # part, so it must be given cp: with the full cpv the fallback name would
    # still carry the version.
    package = guess_package(cp, url)

    output.einfo("Using PyPi XMLRPC: " + package)

    client = xmlrpclib.ServerProxy('http://pypi.python.org/pypi')
    versions = client.package_releases(package)
    if not versions:
        return []

    ret = []
    for up_pv in reversed(versions):
        pv = helpers.gentoo_mangle_version(up_pv)
        if helpers.version_filtered(cp, ver, pv):
            continue
        release_files = client.release_urls(package, up_pv)
        urls = " ".join([infos['url'] for infos in release_files])
        ret.append((urls, pv, HANDLER_NAME, CONFIDENCE))

    return ret

View File

@ -0,0 +1,73 @@
import re
import portage
import json
import urllib2
from euscan import helpers, output
# Handler name placed in every result tuple built by scan() below.
HANDLER_NAME = "rubygem"
# Confidence value placed in every result tuple built by scan() below.
CONFIDENCE = 100.0
# Sort key read by the handlers package, which orders handler modules by
# PRIORITY in descending order.
PRIORITY = 90
def can_handle(pkg, url):
    """Return True when *url* starts with ``mirror://rubygems/``.

    *pkg* is not consulted.
    """
    rubygems_prefix = 'mirror://rubygems/'
    return url.startswith(rubygems_prefix)
def guess_gem(cpv, url):
    """Guess the gem name from the source URL, falling back to *cpv*.

    For a ``mirror://rubygems/<name>.gem`` URL the file name stem is split
    with ``portage.pkgsplit`` under a fake category; otherwise *cpv* itself
    is split.

    :returns: the package part of the split result, or ``None`` when
        ``portage.pkgsplit`` returns a false value.
    """
    found = re.search('mirror://rubygems/(.*).gem', url)
    candidate = ('fake/%s' % found.group(1)) if found else cpv

    parts = portage.pkgsplit(candidate)
    if not parts:
        return None

    cp, ver, rev = parts
    cat, pkg = cp.split("/")
    return pkg
def scan(pkg, url):
    """List versions of a gem through the rubygems.org versions API.

    API reference: http://guides.rubygems.org/rubygems-org-api/#gemversion

    :param pkg: package object; only ``pkg.cpv`` is read.
    :param url: ``mirror://rubygems/`` source URL.
    :returns: list of ``(url, version, HANDLER_NAME, CONFIDENCE)`` tuples for
        versions that pass ``helpers.version_filtered``; empty when the gem
        name cannot be guessed, the API cannot be reached, or it reports no
        versions.
    """
    gem = guess_gem(pkg.cpv, url)
    if not gem:
        output.eerror("Can't guess gem name using %s and %s" %
                      (pkg.cpv, url))
        return []

    api_url = 'http://rubygems.org/api/v1/versions/%s.json' % gem
    output.einfo("Using: " + api_url)

    try:
        fp = helpers.urlopen(api_url)
    except (urllib2.URLError, IOError):
        return []
    if not fp:
        return []

    versions = json.loads(fp.read())
    if not versions:
        return []

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    ret = []
    for entry in versions:
        up_pv = entry['number']
        pv = helpers.gentoo_mangle_version(up_pv)
        if not helpers.version_filtered(cp, ver, pv):
            gem_url = 'http://rubygems.org/gems/%s-%s.gem' % (gem, up_pv)
            ret.append((gem_url, pv, HANDLER_NAME, CONFIDENCE))

    return ret