Change source layout

* In preparation for PEP517 transition.

Signed-off-by: Alfred Wingate <parona@protonmail.com>
Author: Alfred Wingate
Date: 2023-11-14 19:58:44 +02:00
parent ec7399752c
commit c873e1520d
25 changed files with 3 additions and 3 deletions
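For context, a PEP 517 build replaces direct setup.py invocation with a declarative build backend that discovers packages under the new src/ layout. A minimal illustrative sketch of that kind of configuration (not part of this commit; backend choice and version bounds are assumptions):

# pyproject.toml (illustrative sketch only)
[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["src"]  # the src/ layout this commit prepares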

src/euscan/handlers/__init__.py Normal file

@@ -0,0 +1,216 @@
import os
import sys
import pkgutil

from euscan import CONFIG, output

from gentoolkit.metadata import MetaData

handlers = {'package': [], 'url': [], 'all': {}}

# autoimport all modules in this directory and append them to handlers list
for loader, module_name, is_pkg in pkgutil.walk_packages(__path__):
    module = loader.find_module(module_name).load_module(module_name)
    if not hasattr(module, 'HANDLER_NAME'):
        continue
    if hasattr(module, 'scan_url'):
        handlers['url'].append(module)
    if hasattr(module, 'scan_pkg'):
        handlers['package'].append(module)
    handlers['all'][module.HANDLER_NAME] = module


# sort handlers by priority
def sort_handlers(handlers):
    return sorted(
        handlers,
        key=lambda handler: handler.PRIORITY,
        reverse=True
    )

handlers['package'] = sort_handlers(handlers['package'])
handlers['url'] = sort_handlers(handlers['url'])


def find_best_handler(kind, pkg, *args):
    """
    Find the best handler for the given package.
    """
    for handler in handlers[kind]:
        if (handler.HANDLER_NAME not in CONFIG["handlers-exclude"] and
                handler.can_handle(pkg, *args)):
            return handler
    return None


def find_handlers(kind, names):
    ret = []

    for name in names:
        # Does this handler exist, and does it handle this kind of thing?
        # (pkg / url)
        if name in handlers['all'] and handlers['all'][name] in handlers[kind]:
            ret.append(handlers['all'][name])

    return ret


def get_metadata(pkg):
    metadata = {}

    pkg_metadata = None

    meta_override = os.path.join('metadata', pkg.category, pkg.name,
                                 'metadata.xml')

    try:
        if os.path.exists(meta_override):
            pkg_metadata = MetaData(meta_override)
            output.einfo('Using custom metadata: %s' % meta_override)
        if not pkg_metadata:
            pkg_metadata = pkg.metadata
    except Exception as e:
        output.ewarn('Error when fetching metadata: %s' % str(e))

    if not pkg_metadata:
        return {}

    # Support multiple remote-id and multiple watch
    for upstream in pkg_metadata._xml_tree.findall("upstream"):
        for node in upstream.findall("watch"):
            options = dict(node.attrib)
            options['data'] = node.text

            if "type" in options:
                handler = options['type']
            else:
                handler = "url"
                options['type'] = "url"

            for key in ["versionmangle", "downloadurlmangle"]:
                value = options.get(key, None)
                if value:
                    options[key] = value.split(";")

            if handler not in metadata:
                metadata[handler] = []
            metadata[handler].append(options)

    for upstream in pkg_metadata._xml_tree.findall("upstream"):
        for node in upstream.findall("remote-id"):
            handler = node.attrib.get("type")
            if not handler:
                continue
            if handler in metadata:
                for i in range(len(metadata[handler])):
                    if not metadata[handler][i]['data']:
                        metadata[handler][i]['data'] = node.text
            else:
                metadata[handler] = [{'type': handler, 'data': node.text}]

    return metadata


def scan_pkg(pkg_handler, pkg, options, on_progress=None):
    versions = []

    if on_progress:
        on_progress(increment=35)

    for o in options:
        versions += pkg_handler.scan_pkg(pkg, o)

    if on_progress:
        on_progress(increment=35)

    return versions


def scan_url(pkg, urls, options, on_progress=None):
    versions = []

    if on_progress:
        progress_available = 70
        num_urls = sum([len(urls[fn]) for fn in urls])
        if num_urls > 0:
            progress_increment = progress_available / num_urls
        else:
            progress_increment = 0

    for filename in urls:
        for url in urls[filename]:
            if on_progress and progress_available > 0:
                on_progress(increment=progress_increment)
                progress_available -= progress_increment

            output.einfo("SRC_URI is '%s'" % url)

            if '://' not in url:
                output.einfo("Invalid url '%s'" % url)
                continue

            try:
                url_handler = find_best_handler('url', pkg, url)
                if url_handler:
                    for o in options:
                        versions += url_handler.scan_url(pkg, url, o)
                else:
                    output.eerror("Can't find a suitable handler!")
            except Exception as e:
                output.ewarn(
                    "Handler failed: [%s] %s" %
                    (e.__class__.__name__, str(e))
                )

            if versions and CONFIG['oneshot']:
                break

    if on_progress and progress_available > 0:
        on_progress(increment=progress_available)

    return versions


def scan(pkg, urls, on_progress=None):
    """
    Scan upstream for the given package.
    Try package-wide handlers first, then fall back to URL handling.
    """
    if not CONFIG['quiet'] and not CONFIG['format']:
        sys.stdout.write('\n')

    metadata = get_metadata(pkg)
    versions = []

    pkg_handlers = find_handlers('package', list(metadata.keys()))

    if not pkg_handlers:
        pkg_handler = find_best_handler('package', pkg)
        if pkg_handler:
            pkg_handlers = [pkg_handler]

    for pkg_handler in pkg_handlers:
        options = metadata.get(pkg_handler.HANDLER_NAME, [{}])
        versions += scan_pkg(pkg_handler, pkg, options, on_progress)

    if not pkg_handlers:
        versions += scan_url(pkg, urls, [{}], on_progress)

    return versions


def mangle(kind, name, string):
    if name not in handlers['all']:
        return None
    handler = handlers['all'][name]
    if not hasattr(handler, 'mangle_%s' % kind):
        return None
    return getattr(handler, 'mangle_%s' % kind)(string)


def mangle_url(name, string):
    return mangle('url', name, string)


def mangle_version(name, string):
    return mangle('version', name, string)
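
For orientation, a minimal sketch of how this registry is meant to be driven (illustrative only, not part of this commit; assumes a configured portage tree and that gentoolkit's Query is available):

# hypothetical driver code
from gentoolkit.query import Query
from euscan import handlers

pkg = Query('app-editors/vim').find_best()  # a gentoolkit Package
urls = {}  # filename -> list of SRC_URI urls, as euscan collects them
for found_url, version, handler_name, confidence in handlers.scan(pkg, urls):
    print(version, handler_name, confidence, found_url)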

src/euscan/handlers/berlios.py Normal file

@@ -0,0 +1,59 @@
import re
import urllib.request, urllib.parse, urllib.error

import portage

from euscan.helpers import regex_from_template
from euscan.handlers.url import process_scan as url_scan
from euscan import output

HANDLER_NAME = "berlios"
CONFIDENCE = 90
PRIORITY = 90

berlios_regex = r"mirror://berlios/([^/]+)/([^/]+)"


def can_handle(pkg, url=None):
    if not url:
        return False

    cp, ver, rev = portage.pkgsplit(pkg.cpv)
    if ver not in url:
        return False

    return re.search(berlios_regex, url)


def scan_url(pkg, url, options):
    output.einfo("Using BerliOS handler")

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    project, filename = re.search(berlios_regex, url).groups()

    project_page = "http://developer.berlios.de/projects/%s" % project
    # urlopen returns bytes; decode before regex matching
    content = urllib.request.urlopen(project_page).read().decode()

    project_id = re.search(
        r"/project/filelist.php\?group_id=(\d+)",
        content
    ).group(1)

    base_url = (
        "http://developer.berlios.de/project/filelist.php?group_id=%s" %
        project_id
    )

    file_pattern = regex_from_template(
        filename.replace(ver, "${PV}")
    )

    result = url_scan(pkg, base_url, file_pattern)

    ret = []
    for found_url, pv, _, _ in result:
        found_url = found_url.replace("prdownload", "download")
        ret.append((found_url, pv, HANDLER_NAME, CONFIDENCE))
    return ret

src/euscan/handlers/cpan.py Normal file

@@ -0,0 +1,161 @@
import re
import portage
import urllib.request, urllib.error, urllib.parse
import json

from euscan import helpers, output, mangling

HANDLER_NAME = "cpan"
CONFIDENCE = 100
PRIORITY = 90

_cpan_package_name_re = re.compile("mirror://cpan/authors/.*/([^/.]*).*")


def can_handle(pkg, url=None):
    return url and url.startswith('mirror://cpan/')


def guess_package(cp, url):
    match = _cpan_package_name_re.search(url)

    pkg = None
    if match:
        pkg = match.group(1)
        try:
            cp, ver, rev = portage.pkgsplit('fake/' + pkg)
        except:
            pass

    cat, pkg = cp.split("/")

    return pkg


def mangle_version(up_pv):
    if up_pv.startswith('v'):
        return up_pv[1:]

    # clean
    up_pv = up_pv.replace("._", "_")  # e.g.: 0.999._002 -> 0.999_002
    up_pv = up_pv.replace("_0.", "_")  # e.g.: 0.30_0.1 -> 0.30_1

    # Detect _rc versions
    rc_part = ""
    if up_pv.count("_") == 1:
        up_pv, rc_part = up_pv.split("_")

    # Gentoo creates groups of 3 digits, except for the first digit,
    # or when the last digit is 0.  e.g.: 4.11 -> 4.110.0
    splitted = up_pv.split(".")
    if len(splitted) == 2:  # split the second part into sub-groups
        part = splitted.pop()
        for i in range(0, len(part), 3):
            splitted.append(part[i:i + 3])

    if len(splitted) == 2:  # add the last group if it's missing
        splitted.append("0")

    groups = [splitted[0]]
    for part in splitted[1:-1]:
        groups.append(part.ljust(3, "0"))
    if splitted[-1] == "0":
        groups.append(splitted[-1])
    else:
        groups.append(splitted[-1].ljust(3, "0"))

    # if a group has leading zeros, strip them.  e.g.: 002 -> 2
    groups = [g.lstrip("0") if g != "0" else g for g in groups]

    pv = ".".join(groups)

    if rc_part:
        pv = "%s_rc%s" % (pv, rc_part)

    return pv


def cpan_mangle_version(pv):
    pos = pv.find('.')
    if pos <= 0:
        return pv
    up_pv = pv.replace('.', '')
    up_pv = up_pv[0:pos] + '.' + up_pv[pos:]
    return up_pv


def cpan_vercmp(cp, a, b):
    try:
        return float(a) - float(b)
    except:
        return helpers.simple_vercmp(a, b)


def scan_url(pkg, url, options):
    cp, ver, rev = portage.pkgsplit(pkg.cpv)
    remote_pkg = guess_package(cp, url)

    output.einfo("Using CPAN API: %s" % remote_pkg)

    return scan_pkg(pkg, {'data': remote_pkg})


def scan_pkg(pkg, options):
    remote_pkg = options['data']

    # Defaults to CPAN mangling rules
    if 'versionmangle' not in options:
        options['versionmangle'] = ['cpan', 'gentoo']

    url = 'http://search.cpan.org/api/dist/%s' % remote_pkg
    cp, ver, rev = pkg.cp, pkg.version, pkg.revision
    m_ver = cpan_mangle_version(ver)

    output.einfo("Using CPAN API: " + url)

    try:
        fp = helpers.urlopen(url)
    except urllib.error.URLError:
        return []
    except IOError:
        return []

    if not fp:
        return []

    data = fp.read()
    data = json.loads(data)

    if 'releases' not in data:
        return []

    ret = []
    for version in data['releases']:
        # if version['status'] == 'testing':
        #     continue

        up_pv = version['version']
        pv = mangling.mangle_version(up_pv, options)

        if up_pv.startswith('v'):
            if helpers.version_filtered(cp, ver, pv):
                continue
        else:
            m_pv = cpan_mangle_version(up_pv)
            if helpers.version_filtered(cp, m_ver, m_pv, cpan_vercmp):
                continue

        # CPAN author paths are <first letter>/<first two letters>/<id>
        url = 'mirror://cpan/authors/id/%s/%s/%s/%s' % (
            version['cpanid'][0],
            version['cpanid'][0:2],
            version['cpanid'],
            version['archive']
        )

        url = mangling.mangle_url(url, options)
        ret.append((url, pv, HANDLER_NAME, CONFIDENCE))
    return ret
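
To make the mangling rules above concrete, a few illustrative expected outputs (assuming this module is importable as euscan.handlers.cpan):

# illustrative only
from euscan.handlers.cpan import mangle_version, cpan_mangle_version

assert mangle_version("v1.2.3") == "1.2.3"      # leading 'v' is stripped
assert mangle_version("4.11") == "4.110.0"      # 3-digit groups, zero tail
assert cpan_mangle_version("1.23.45") == "1.2345"  # back to CPAN float form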

src/euscan/handlers/deb.py Normal file

@@ -0,0 +1,53 @@
import urllib.request, urllib.parse, urllib.error
import re
import bz2
import zlib

import portage

from euscan import mangling, helpers, output

HANDLER_NAME = "deb"
CONFIDENCE = 100
PRIORITY = 90


def can_handle(pkg, url=None):
    return False


def scan_pkg(pkg, options):
    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    packages_url, package_name = options['data'].strip().split(" ", 1)

    output.einfo("Using Debian Packages: " + packages_url)

    fp = urllib.request.urlopen(packages_url)
    content = fp.read()

    # Support for .gz and .bz2 Packages file
    if packages_url.endswith(".bz2"):
        content = bz2.decompress(content)
    if packages_url.endswith(".gz"):
        content = zlib.decompress(content, 16 + zlib.MAX_WBITS)

    # urlopen/decompress yield bytes; decode before text processing
    content = content.decode("utf-8", errors="replace")

    content = content.split("\n\n")

    result = []
    for package_info in content:
        package_line = re.search(r"^Package: (.*)$", package_info, re.M)
        version_line = re.search(r"^Version: (.*)$", package_info, re.M)
        if package_line and package_line.group(1) == package_name:
            if version_line:
                result.append(version_line.group(1))

    ret = []
    for up_pv in result:
        url = ""  # TODO: How to find the url?
        pv = mangling.mangle_version(up_pv, options)
        if helpers.version_filtered(cp, ver, pv):
            continue
        ret.append((url, pv, HANDLER_NAME, CONFIDENCE))

    return ret

src/euscan/handlers/freecode.py Normal file

@@ -0,0 +1,48 @@
import urllib.request, urllib.parse, urllib.error
import re

import portage

from euscan import mangling, helpers, output

HANDLER_NAME = "freecode"
CONFIDENCE = 100
PRIORITY = 90


def can_handle(pkg, url=None):
    return False


def scan_pkg(pkg, options):
    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    package = options['data'].strip()

    output.einfo("Using FreeCode handler: " + package)

    fp = urllib.request.urlopen("http://freecode.com/projects/%s/releases" %
                                package)
    content = fp.read().decode()  # urlopen returns bytes

    result = re.findall(
        r'<a href="/projects/%s/releases/(\d+)">([^<]+)</a>' % package,
        content
    )

    ret = []
    for release_id, up_pv in result:
        pv = mangling.mangle_version(up_pv, options)
        if helpers.version_filtered(cp, ver, pv):
            continue
        fp = urllib.request.urlopen(
            "http://freecode.com/projects/%s/releases/%s" %
            (package, release_id))
        content = fp.read().decode()
        download_page = re.findall(r'<a href="(/urls/[^"]+)"', content)[0]
        fp = urllib.request.urlopen("http://freecode.com%s" % download_page)
        content = fp.read().decode()
        url = re.findall(
            r'In case it doesn\'t, click here: <a href="([^"]+)"',
            content
        )[0]
        ret.append((url, pv, HANDLER_NAME, CONFIDENCE))

    return ret

src/euscan/handlers/generic.py Normal file

@@ -0,0 +1,276 @@
from urllib.parse import urljoin, urlparse
import urllib.request, urllib.error, urllib.parse
import re
import io
import difflib

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

import portage

from euscan import output, helpers, mangling, CONFIG, SCANDIR_BLACKLIST_URLS, \
    BRUTEFORCE_BLACKLIST_PACKAGES, BRUTEFORCE_BLACKLIST_URLS

HANDLER_NAME = "generic"
CONFIDENCE = 45
PRIORITY = 0

BRUTEFORCE_HANDLER_NAME = "brute_force"
BRUTEFORCE_CONFIDENCE = 30


def confidence_score(found, original, minimum=CONFIDENCE):
    found_p = urlparse(found)
    original_p = urlparse(original)

    # check if the base url is the same
    if found_p.netloc != original_p.netloc:
        return minimum

    # check if the directory depth is the same
    if len(found_p.path.split("/")) != len(original_p.path.split("/")):
        return minimum

    # strip numbers
    found_path = re.sub(r"[\d+\.]?", "", found_p.path)
    original_path = re.sub(r"[\d+\.]?", "", original_p.path)

    # strip the leading part of the path that both urls share
    i = 0
    max_i = len(found_path)
    while i < max_i and found_path[i] == original_path[i]:
        i += 1
    found_path = found_path[i:]
    original_path = original_path[i:]

    # calculate difference ratio
    diff = difflib.SequenceMatcher(None, found_path, original_path).ratio()
    return int(minimum + minimum * diff)  # maximum score is minimum * 2


def scan_html(data, url, pattern):
    soup = BeautifulSoup(data, features="lxml")
    results = []

    for link in soup.findAll('a'):
        href = link.get("href")
        if not href:
            continue

        if href.startswith(url):
            href = href.replace(url, "", 1)

        match = re.search(pattern, href, re.I)
        if match:
            results.append(
                (".".join([x for x in match.groups() if x is not None]),
                 match.group(0))
            )

    return results


def scan_ftp(data, url, pattern):
    # FTP listings arrive as bytes; decode before line matching
    if isinstance(data, bytes):
        data = data.decode("utf-8", errors="replace")

    buf = io.StringIO(data)
    results = []

    for line in buf.readlines():
        line = line.replace("\n", "").replace("\r", "")
        match = re.search(pattern, line, re.I)
        if match:
            results.append(
                (".".join([x for x in match.groups() if x is not None]),
                 match.group(0))
            )

    return results


def scan_directory_recursive(cp, ver, rev, url, steps, orig_url, options):
    if not steps:
        return []

    url += steps[0][0]
    pattern = steps[0][1]
    steps = steps[1:]

    output.einfo("Scanning: %s" % url)

    try:
        fp = helpers.urlopen(url)
    except urllib.error.URLError:
        return []
    except IOError:
        return []

    if not fp:
        return []

    data = fp.read()

    results = []
    if re.search(rb"<\s*a\s+[^>]*href", data, re.I):
        results.extend(scan_html(data, url, pattern))
    elif url.startswith('ftp://'):
        results.extend(scan_ftp(data, url, pattern))

    versions = []
    for up_pv, path in results:
        pv = mangling.mangle_version(up_pv, options)
        if helpers.version_filtered(cp, ver, pv):
            continue
        if not url.endswith("/"):
            url = url + "/"
        path = urljoin(url, path)

        if not steps and path not in orig_url:
            confidence = confidence_score(path, orig_url)
            path = mangling.mangle_url(path, options)
            versions.append((path, pv, HANDLER_NAME, confidence))

        if steps:
            ret = scan_directory_recursive(cp, ver, rev, path, steps,
                                           orig_url, options)
            versions.extend(ret)

    return versions


def scan_url(pkg, url, options):
    ret = []

    if CONFIG["scan-dir"]:
        for bu in SCANDIR_BLACKLIST_URLS:
            if re.match(bu, url):
                output.einfo("%s is blacklisted by rule %s" % (url, bu))
                return []

        resolved_url = helpers.parse_mirror(url)
        if not resolved_url:
            return []

        cp, ver, rev = portage.pkgsplit(pkg.cpv)

        # 'Hack' for _beta/_rc versions where _ is used instead of -
        if ver not in resolved_url:
            newver = helpers.version_change_end_sep(ver)
            if newver and newver in resolved_url:
                output.einfo(
                    "Version: using %s instead of %s" % (newver, ver)
                )
                ver = newver

        template = helpers.template_from_url(resolved_url, ver)
        if '${' not in template:
            output.einfo(
                "URL doesn't seem to depend on version: %s not found in %s" %
                (ver, resolved_url)
            )
            return []
        else:
            output.einfo("Scanning: %s" % template)

        steps = helpers.generate_scan_paths(template)
        ret = scan_directory_recursive(cp, ver, rev, "", steps, url, options)

    if not ret:
        ret = brute_force(pkg, url)

    return ret


def brute_force(pkg, url):
    if CONFIG["brute-force"] == 0:
        return []

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    url = helpers.parse_mirror(url)
    if not url:
        return []

    for bp in BRUTEFORCE_BLACKLIST_PACKAGES:
        if re.match(bp, cp):
            output.einfo("%s is blacklisted by rule %s" % (cp, bp))
            return []

    for bp in BRUTEFORCE_BLACKLIST_URLS:
        if re.match(bp, url):
            output.einfo("%s is blacklisted by rule %s" % (cp, bp))
            return []

    output.einfo("Generating version from " + ver)

    components = helpers.split_version(ver)
    versions = helpers.gen_versions(components, CONFIG["brute-force"])

    # Keep only candidates newer than the current version (filtering a
    # copy avoids mutating the list while iterating over it)
    versions = [v for v in versions
                if helpers.vercmp(cp, ver, helpers.join_version(v)) < 0]

    if not versions:
        output.einfo("Can't generate new versions from " + ver)
        return []

    template = helpers.template_from_url(url, ver)

    if '${PV}' not in template:
        output.einfo(
            "URL doesn't seem to depend on full version: %s not found in %s" %
            (ver, url))
        return []
    else:
        output.einfo("Brute forcing: %s" % template)

    result = []

    i = 0
    done = []

    while i < len(versions):
        components = versions[i]
        i += 1

        # `done` stores tuples, so compare tuples
        if tuple(components) in done:
            continue
        done.append(tuple(components))

        version = helpers.join_version(components)

        if helpers.version_filtered(cp, ver, version):
            continue

        try_url = helpers.url_from_template(template, version)
        infos = helpers.tryurl(try_url, template)

        if not infos:
            continue

        confidence = confidence_score(try_url, url,
                                      minimum=BRUTEFORCE_CONFIDENCE)
        result.append([try_url, version, BRUTEFORCE_HANDLER_NAME, confidence])

        if len(result) > CONFIG['brute-force-false-watermark']:
            output.einfo(
                "Broken server detected! Skipping brute force."
            )
            return []

        if CONFIG["brute-force-recursive"]:
            for v in helpers.gen_versions(list(components),
                                          CONFIG["brute-force"]):
                if v not in versions and tuple(v) not in done:
                    versions.append(v)

        if CONFIG["oneshot"]:
            break

    return result


def can_handle(pkg, url):
    return True
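
For intuition about confidence_score: with hypothetical URLs on the same host and at the same depth, stripping digits makes the two paths identical, the difflib ratio is 1.0, and the score tops out at minimum * 2 (45 -> 90). A different host falls straight back to the minimum. Illustrative only:

# illustrative only; URLs are made up
from euscan.handlers.generic import confidence_score

found = "http://example.org/pub/foo-1.3.tar.gz"
orig = "http://example.org/pub/foo-1.2.tar.gz"
assert confidence_score(found, orig) == 90

assert confidence_score("http://mirror.example.net/foo-1.3.tar.gz", orig) == 45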

src/euscan/handlers/github.py Normal file

@@ -0,0 +1,59 @@
import json
import urllib.request, urllib.error, urllib.parse
import re

import portage

from euscan import helpers, output, mangling

HANDLER_NAME = "github"
CONFIDENCE = 100
PRIORITY = 90


def can_handle(pkg, url=None):
    return url and url.startswith('mirror://github/')


def guess_package(cp, url):
    match = re.search(r'^mirror://github/(.*?)/(.*?)/(.*)$', url)
    assert match
    return (match.group(1), match.group(2), match.group(3))


def scan_url(pkg, url, options):
    'http://developer.github.com/v3/repos/downloads/'

    user, project, filename = guess_package(pkg.cpv, url)

    # find out where the version is expected to be found
    cp, ver, rev = portage.pkgsplit(pkg.cpv)
    if ver not in filename:
        return []  # callers expect a list of results

    # now create a filename-matching regexp
    # XXX: supposedly replace first with (?P<foo>...)
    #      and remaining ones with (?P=foo)
    fnre = re.compile('^%s$' %
                      re.escape(filename).replace(re.escape(ver), '(.*?)'))

    output.einfo("Using github API for: project=%s user=%s filename=%s" %
                 (project, user, filename))

    dlreq = urllib.request.urlopen(
        'https://api.github.com/repos/%s/%s/downloads' % (user, project))
    dls = json.load(dlreq)

    ret = []
    for dl in dls:
        m = fnre.match(dl['name'])
        if m:
            pv = mangling.mangle_version(m.group(1), options)
            if helpers.version_filtered(cp, ver, pv):
                continue

            url = mangling.mangle_url(dl['html_url'], options)
            ret.append((url, pv, HANDLER_NAME, CONFIDENCE))
    return ret

src/euscan/handlers/gnome.py Normal file

@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-

import re
import urllib.request, urllib.error, urllib.parse

try:
    import simplejson as json
except ImportError:
    import json

import portage

from euscan import mangling, helpers, output

HANDLER_NAME = "gnome"
CONFIDENCE = 100
PRIORITY = 90

GNOME_URL_SOURCE = 'http://ftp.gnome.org/pub/GNOME/sources'


def can_handle(_pkg, url=None):
    return url and url.startswith('mirror://gnome/')


def guess_package(cp, url):
    match = re.search('mirror://gnome/sources/([^/]+)/.*', url)
    if match:
        return match.group(1)

    _cat, pkg = cp.split("/")
    return pkg


def scan_url(pkg, url, options):
    'http://ftp.gnome.org/pub/GNOME/sources/'
    package = {
        'data': guess_package(pkg.cpv, url),
        'type': 'gnome',
    }
    return scan_pkg(pkg, package)


def scan_pkg(pkg, options):
    package = options['data']

    output.einfo("Using Gnome json cache: " + package)

    fp = urllib.request.urlopen('/'.join([GNOME_URL_SOURCE, package,
                                          'cache.json']))
    content = fp.read()
    fp.close()

    # json.loads() accepts bytes; the old encoding= keyword is gone in py3
    cache = json.loads(content)

    if cache[0] != 4:
        output.eerror('Unknown cache format detected')
        return []

    versions = cache[2][package]
    if not versions:
        return []

    versions.reverse()

    cp, ver, _rev = portage.pkgsplit(pkg.cpv)

    ret = []
    for up_pv in versions:
        pv = mangling.mangle_version(up_pv, options)
        if helpers.version_filtered(cp, ver, pv):
            continue
        up_files = cache[1][package][up_pv]
        for tarball_comp in ('tar.xz', 'tar.bz2', 'tar.gz'):
            if tarball_comp in up_files:
                url = '/'.join([GNOME_URL_SOURCE, package,
                                up_files[tarball_comp]])
                break
        else:
            output.ewarn('No tarball for release %s' % up_pv)
            continue  # don't re-append the previous release's url
        ret.append((url, pv, HANDLER_NAME, CONFIDENCE))

    return ret

src/euscan/handlers/google-code.py Normal file

@@ -0,0 +1,43 @@
import re

import portage

from euscan import output
from euscan.helpers import regex_from_template
from euscan.handlers.url import process_scan as url_scan

HANDLER_NAME = "google-code"
CONFIDENCE = 90
PRIORITY = 90

package_name_regex = r"http://(.+).googlecode.com/files/.+"


def can_handle(pkg, url=None):
    if not url:
        return False

    cp, ver, rev = portage.pkgsplit(pkg.cpv)
    if ver not in url:
        return False

    return re.match(package_name_regex, url)


def scan_url(pkg, url, options):
    output.einfo("Using Google Code handler")

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    package_name = re.match(package_name_regex, url).group(1)
    base_url = "http://code.google.com/p/%s/downloads/list" % package_name

    file_pattern = regex_from_template(
        url.split("/")[-1].replace(ver, "${PV}")
    )

    result = url_scan(pkg, base_url, file_pattern)

    ret = []
    for url, pv, _, _ in result:
        ret.append((url, pv, HANDLER_NAME, CONFIDENCE))
    return ret

src/euscan/handlers/kde.py Normal file

@@ -0,0 +1,38 @@
from euscan.handlers import generic

PRIORITY = 90

HANDLER_NAME = "kde"


def can_handle(pkg, url):
    return url and url.startswith('mirror://kde/')


def clean_results(results):
    ret = []

    for path, version, _, confidence in results:
        if version == '5SUMS':  # skip artifacts (e.g. from MD5SUMS files)
            continue
        ret.append((path, version, HANDLER_NAME, confidence))

    return ret


def scan_url(pkg, url, options):
    # delegate to the generic handler; also rescan the stable branch
    # when the SRC_URI points at unstable
    results = generic.scan_url(pkg, url, options)

    if url.startswith('mirror://kde/unstable/'):
        url = url.replace('mirror://kde/unstable/', 'mirror://kde/stable/')
        results += generic.scan_url(pkg, url, options)

    if not results:  # if nothing was found go brute forcing
        results = generic.brute_force(pkg, url)

        if url.startswith('mirror://kde/unstable/'):
            url = url.replace('mirror://kde/unstable/',
                              'mirror://kde/stable/')
            results += generic.brute_force(pkg, url)

    return clean_results(results)

src/euscan/handlers/pear.py Normal file

@@ -0,0 +1,12 @@
from euscan.handlers import php

HANDLER_NAME = "pear"
CONFIDENCE = 100
PRIORITY = 90


def can_handle(pkg, url=None):
    return url and url.startswith('http://%s.php.net/get/' % HANDLER_NAME)

scan_url = php.scan_url
scan_pkg = php.scan_pkg

src/euscan/handlers/pecl.py Normal file

@@ -0,0 +1,11 @@
from euscan.handlers import php

HANDLER_NAME = "pecl"
CONFIDENCE = 100
PRIORITY = 90


def can_handle(pkg, url=None):
    return url and url.startswith('http://%s.php.net/get/' % HANDLER_NAME)

scan_url = php.scan_url
scan_pkg = php.scan_pkg

src/euscan/handlers/php.py Normal file

@@ -0,0 +1,69 @@
import re
import portage
import urllib.request, urllib.error, urllib.parse
import xml.dom.minidom

from euscan import helpers, output, mangling

HANDLER_NAME = "php"
CONFIDENCE = 100
PRIORITY = 90


def can_handle(pkg, url=None):
    return False


def guess_package_and_channel(cp, url):
    match = re.search(r'http://(.*)\.php\.net/get/(.*)-(.*)\.tgz', url)

    if match:
        host = match.group(1)
        pkg = match.group(2)
    else:
        # pear/pecl urls always match above; keep a defined fallback anyway
        host = None
        cat, pkg = cp.split("/")

    return pkg, host


def scan_url(pkg, url, options):
    package, channel = guess_package_and_channel(pkg.cp, url)
    return scan_pkg(pkg, {'type': channel, 'data': package})


def scan_pkg(pkg, options):
    cp, ver, rev = pkg.cp, pkg.version, pkg.revision

    package = options['data']
    channel = options['type']

    url = 'http://%s.php.net/rest/r/%s/allreleases.xml' % (channel,
                                                           package.lower())

    output.einfo("Using: " + url)

    try:
        fp = helpers.urlopen(url)
    except urllib.error.URLError:
        return []
    except IOError:
        return []

    if not fp:
        return []

    data = fp.read()

    dom = xml.dom.minidom.parseString(data)
    nodes = dom.getElementsByTagName("v")

    ret = []
    for node in nodes:
        up_pv = node.childNodes[0].data
        pv = mangling.mangle_version(up_pv, options)
        if helpers.version_filtered(cp, ver, pv):
            continue

        url = 'http://%s.php.net/get/%s-%s.tgz' % (channel, package, up_pv)
        url = mangling.mangle_url(url, options)

        ret.append((url, pv, HANDLER_NAME, CONFIDENCE))

    return ret
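
A quick illustrative check of the URL decomposition above (hypothetical pear package; assumes the module imports as euscan.handlers.php):

# illustrative only
from euscan.handlers.php import guess_package_and_channel

pkg_name, channel = guess_package_and_channel(
    'dev-php/PEAR-Mail', 'http://pear.php.net/get/Mail-1.2.0.tgz')
assert (pkg_name, channel) == ('Mail', 'pear')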

src/euscan/handlers/pypi.py Normal file

@@ -0,0 +1,58 @@
import xmlrpc.client
import re

import portage

from euscan import mangling, helpers, output

HANDLER_NAME = "pypi"
CONFIDENCE = 100
PRIORITY = 90


def can_handle(pkg, url=None):
    return url and url.startswith('mirror://pypi/')


def guess_package(cp, url):
    match = re.search(r'mirror://pypi/\w+/(.*)/.*', url)
    if match:
        return match.group(1)

    cat, pkg = cp.split("/")
    return pkg


def scan_url(pkg, url, options):
    'http://wiki.python.org/moin/PyPiXmlRpc'

    package = guess_package(pkg.cpv, url)
    return scan_pkg(pkg, {'data': package})


def scan_pkg(pkg, options):
    package = options['data']

    output.einfo("Using PyPi XMLRPC: " + package)

    client = xmlrpc.client.ServerProxy('https://pypi.python.org/pypi')
    versions = client.package_releases(package)

    if not versions:
        return versions

    versions.reverse()

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    ret = []
    for up_pv in versions:
        pv = mangling.mangle_version(up_pv, options)
        if helpers.version_filtered(cp, ver, pv):
            continue
        urls = client.release_urls(package, up_pv)
        urls = " ".join([mangling.mangle_url(infos['url'], options)
                         for infos in urls])
        ret.append((urls, pv, HANDLER_NAME, CONFIDENCE))

    return ret

src/euscan/handlers/rubygems.py Normal file

@@ -0,0 +1,75 @@
import re
import portage
import json
import urllib.request, urllib.error, urllib.parse

from euscan import helpers, output, mangling

HANDLER_NAME = "rubygems"
CONFIDENCE = 100
PRIORITY = 90


def can_handle(pkg, url=None):
    return url and url.startswith('mirror://rubygems/')


def guess_gem(cpv, url):
    match = re.search(r'mirror://rubygems/(.*)\.gem', url)
    if match:
        cpv = 'fake/%s' % match.group(1)

    ret = portage.pkgsplit(cpv)
    if not ret:
        return None

    cp, ver, rev = ret
    cat, pkg = cp.split("/")

    return pkg


def scan_url(pkg, url, options):
    'http://guides.rubygems.org/rubygems-org-api/#gemversion'

    gem = guess_gem(pkg.cpv, url)
    if not gem:
        output.eerror("Can't guess gem name using %s and %s" %
                      (pkg.cpv, url))
        return []

    output.einfo("Using RubyGem API: %s" % gem)

    return scan_pkg(pkg, {'data': gem})


def scan_pkg(pkg, options):
    gem = options['data']
    url = 'http://rubygems.org/api/v1/versions/%s.json' % gem

    try:
        fp = helpers.urlopen(url)
    except urllib.error.URLError:
        return []
    except IOError:
        return []

    if not fp:
        return []

    data = fp.read()
    versions = json.loads(data)

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    ret = []
    for version in versions:
        up_pv = version['number']
        pv = mangling.mangle_version(up_pv, options)
        if helpers.version_filtered(cp, ver, pv):
            continue
        url = 'http://rubygems.org/gems/%s-%s.gem' % (gem, up_pv)
        url = mangling.mangle_url(url, options)
        ret.append((url, pv, HANDLER_NAME, CONFIDENCE))

    return ret

src/euscan/handlers/sourceforge.py Normal file

@@ -0,0 +1,45 @@
import re

import portage

from euscan.helpers import regex_from_template
from euscan.handlers.url import process_scan as url_scan
from euscan import output

HANDLER_NAME = "sourceforge"
CONFIDENCE = 90
PRIORITY = 90


def can_handle(pkg, url=None):
    if not url:
        return False

    cp, ver, rev = portage.pkgsplit(pkg.cpv)
    if ver not in url:
        return False

    return "mirror://sourceforge/" in url


def scan_url(pkg, url, options):
    output.einfo("Using SourceForge handler")

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    project, filename = re.search(
        r"mirror://sourceforge/([^/]+)/(?:.*/)?([^/]+)",
        url
    ).groups()

    base_url = "http://qa.debian.org/watch/sf.php/%s" % project
    file_pattern = regex_from_template(
        filename.replace(ver, "${PV}")
    )

    result = url_scan(pkg, base_url, file_pattern)

    ret = []
    for url, pv, _, _ in result:
        ret.append((url, pv, HANDLER_NAME, CONFIDENCE))
    return ret

src/euscan/handlers/url.py Normal file

@@ -0,0 +1,104 @@
import re
import urllib.request, urllib.error, urllib.parse

from euscan.handlers import generic
from euscan import output, helpers

PRIORITY = 100

HANDLER_NAME = "url"
CONFIDENCE = 100.0

is_pattern = r"\([^\/]+\)"


def can_handle(*args):
    return False


def handle_directory_patterns(base, file_pattern):
    r"""
    Directory pattern matching
    e.g.: base: ftp://ftp.nessus.org/pub/nessus/nessus-([\d\.]+)/src/
          file_pattern: nessus-core-([\d\.]+)\.tar\.gz
    """
    splitted = base.split("/")
    i = 0
    basedir = []
    for elem in splitted:
        if re.search(is_pattern, elem):
            break
        basedir.append(elem)
        i += 1
    basedir = "/".join(basedir)
    directory_pattern = splitted[i]
    final = "/".join(splitted[i + 1:])

    try:
        fp = helpers.urlopen(basedir)
    except urllib.error.URLError:
        return []
    except IOError:
        return []

    if not fp:
        return []

    data = fp.read()

    if basedir.startswith("ftp://"):
        scan_data = generic.scan_ftp(data, basedir, directory_pattern)
    else:
        scan_data = generic.scan_html(data, basedir, directory_pattern)

    return [("/".join((basedir, path, final)), file_pattern)
            for _, path in scan_data]


def read_options(options):
    try:
        base, file_pattern = options['data'].split(" ")[:2]
    except ValueError:
        base, file_pattern = options['data'], None

    # the file pattern can be in the base url
    pattern_regex = r"/([^/]*\([^/]*\)[^/]*)$"
    match = re.search(pattern_regex, base)
    if match:
        file_pattern = match.group(1)
        base = base.replace(file_pattern, "")

    # handle sf.net specially
    base = base.replace(
        "http://sf.net/", "http://qa.debian.org/watch/sf.php/"
    )

    return base, file_pattern


def process_scan(pkg, base, file_pattern, options=None):
    if options is None:
        options = {}

    cp, ver, rev = pkg.cp, pkg.version, pkg.revision

    results = []
    if not re.search(is_pattern, base):
        steps = [(base, file_pattern)]
        results = generic.scan_directory_recursive(
            cp, ver, rev, "", steps, base, options
        )
    else:
        for step in handle_directory_patterns(base, file_pattern):
            results += generic.scan_directory_recursive(
                cp, ver, rev, "", [step], base, options
            )

    return results


def scan_pkg(pkg, options):
    output.einfo("Using watch data")
    base, file_pattern = read_options(options)
    return process_scan(pkg, base, file_pattern, options)