# Copyright 2011 Corentin Chary
# Copyright 2020-2023 src_prepare group
# Distributed under the terms of the GNU General Public License v2

import difflib
import io
import re
import urllib.error
import warnings
from urllib.parse import urljoin, urlparse

import portage
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning

from euscan import (
    BRUTEFORCE_BLACKLIST_PACKAGES,
    BRUTEFORCE_BLACKLIST_URLS,
    CONFIG,
    SCANDIR_BLACKLIST_URLS,
    helpers,
    mangling,
    output,
)

HANDLER_NAME = "generic"
CONFIDENCE = 45
PRIORITY = 0

BRUTEFORCE_HANDLER_NAME = "brute_force"
BRUTEFORCE_CONFIDENCE = 30


def confidence_score(found, original, minimum=CONFIDENCE):
    found_p = urlparse(found)
    original_p = urlparse(original)

    # check if the base url is the same
    if found_p.netloc != original_p.netloc:
        return minimum

    # check if the directory depth is the same
    if len(found_p.path.split("/")) != len(original_p.path.split("/")):
        return minimum

    # strip version-like characters (digits, '+' and '.')
    found_path = re.sub(r"[\d+.]", "", found_p.path)
    original_path = re.sub(r"[\d+.]", "", original_p.path)

    # strip the common prefix of both paths (bounded by the shorter one,
    # otherwise indexing the shorter path could raise IndexError)
    i = 0
    max_i = min(len(found_path), len(original_path))
    while i < max_i and found_path[i] == original_path[i]:
        i += 1

    found_path = found_path[i:]
    original_path = original_path[i:]

    # calculate difference ratio
    diff = difflib.SequenceMatcher(None, found_path, original_path).ratio()
    return int(minimum + minimum * diff)  # maximum score is minimum * 2
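
# Illustrative example (a sketch, not part of the original module): two URLs
# that differ only in their version digits strip down to identical paths, so
# the difference ratio is 1.0 and the score reaches its maximum of
# 2 * CONFIDENCE:
#
#     >>> confidence_score("http://example.com/foo-1.2.tar.gz",
#     ...                  "http://example.com/foo-1.0.tar.gz")
#     90
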

def scan_html(data, url, pattern):
    warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
    soup = BeautifulSoup(data, features="lxml")
    results = []

    for link in soup.find_all("a"):
        href = link.get("href")
        if not href:
            continue

        if href.startswith(url):
            href = href.replace(url, "", 1)

        match = re.search(pattern, href, re.I)
        if match:
            results.append(
                (".".join([x for x in match.groups() if x is not None]), match.group(0))
            )

    return results


def scan_ftp(data, url, pattern):
    buf = io.StringIO(data)
    results = []

    for line in buf.readlines():
        line = line.replace("\n", "").replace("\r", "")
        match = re.search(pattern, line, re.I)
        if match:
            results.append(
                (".".join([x for x in match.groups() if x is not None]), match.group(0))
            )

    return results


def scan_directory_recursive(cp, ver, rev, url, steps, orig_url, options):
    if not steps:
        return []

    url += steps[0][0]
    pattern = steps[0][1]
    steps = steps[1:]

    output.einfo("Scanning: %s" % url)

    try:
        fp = helpers.urlopen(url)
    except OSError:  # urllib.error.URLError is a subclass of OSError
        return []

    if not fp:
        return []

    data = fp.read()
    results = []

    if re.search(rb"<\s*a\s+[^>]*href", data, re.I):
        results.extend(scan_html(data, url, pattern))
    elif url.startswith("ftp://"):
        # scan_ftp works on text, so decode the raw listing first
        results.extend(scan_ftp(data.decode("utf-8", errors="replace"), url, pattern))

    versions = []

    for up_pv, path in results:
        pv = mangling.mangle_version(up_pv, options)

        if helpers.version_filtered(cp, ver, pv):
            continue
        if not url.endswith("/"):
            url = url + "/"
        path = urljoin(url, path)

        if not steps and path not in orig_url:
            confidence = confidence_score(path, orig_url)
            path = mangling.mangle_url(path, options)
            versions.append((path, pv, HANDLER_NAME, confidence))

        if steps:
            ret = scan_directory_recursive(cp, ver, rev, path, steps, orig_url, options)
            versions.extend(ret)

    return versions


def scan_url(pkg, url, options):
    ret = []

    if CONFIG["scan-dir"]:
        for bu in SCANDIR_BLACKLIST_URLS:
            if re.match(bu, url):
                output.einfo("%s is blacklisted by rule %s" % (url, bu))
                return []

        resolved_url = helpers.parse_mirror(url)
        if not resolved_url:
            return []

        cp, ver, rev = portage.pkgsplit(pkg.cpv)

        # 'Hack' for _beta/_rc versions where _ is used instead of -
        if ver not in resolved_url:
            newver = helpers.version_change_end_sep(ver)
            if newver and newver in resolved_url:
                output.einfo("Version: using %s instead of %s" % (newver, ver))
                ver = newver

        template = helpers.template_from_url(resolved_url, ver)
        if "${" not in template:
            output.einfo(
                "Url doesn't seem to depend on version: %s not found in %s"
                % (ver, resolved_url)
            )
            return []
        else:
            output.einfo("Scanning: %s" % template)

        steps = helpers.generate_scan_paths(template)
        ret = scan_directory_recursive(cp, ver, rev, "", steps, url, options)

    if not ret:
        ret = brute_force(pkg, url)

    return ret


def brute_force(pkg, url):
    if CONFIG["brute-force"] == 0:
        return []

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    url = helpers.parse_mirror(url)
    if not url:
        return []

    for bp in BRUTEFORCE_BLACKLIST_PACKAGES:
        if re.match(bp, cp):
            output.einfo("%s is blacklisted by rule %s" % (cp, bp))
            return []

    for bp in BRUTEFORCE_BLACKLIST_URLS:
        if re.match(bp, url):
            output.einfo("%s is blacklisted by rule %s" % (url, bp))
            return []

    output.einfo("Generating versions from " + ver)

    components = helpers.split_version(ver)
    versions = helpers.gen_versions(components, CONFIG["brute-force"])

    # Keep only versions strictly newer than the current one. A list
    # comprehension avoids mutating the list while iterating over it,
    # which would skip elements.
    versions = [
        v for v in versions if helpers.vercmp(cp, ver, helpers.join_version(v)) < 0
    ]

    if not versions:
        output.einfo("Can't generate new versions from " + ver)
        return []

    template = helpers.template_from_url(url, ver)

    if "${PV}" not in template:
        output.einfo(
            "Url doesn't seem to depend on full version: %s not found in %s"
            % (ver, url)
        )
        return []
    else:
        output.einfo("Brute forcing: %s" % template)

    result = []

    i = 0
    done = []

    while i < len(versions):
        components = versions[i]
        i += 1

        # 'done' holds tuples, so convert before the membership test
        if tuple(components) in done:
            continue
        done.append(tuple(components))

        version = helpers.join_version(components)

        if helpers.version_filtered(cp, ver, version):
            continue

        try_url = helpers.url_from_template(template, version)
        infos = helpers.tryurl(try_url, template)

        if not infos:
            continue

        confidence = confidence_score(try_url, url, minimum=BRUTEFORCE_CONFIDENCE)
        result.append([try_url, version, BRUTEFORCE_HANDLER_NAME, confidence])

        if len(result) > CONFIG["brute-force-false-watermark"]:
            output.einfo("Broken server detected! Skipping brute force.")
            return []

        if CONFIG["brute-force-recursive"]:
            for v in helpers.gen_versions(list(components), CONFIG["brute-force"]):
                if v not in versions and tuple(v) not in done:
                    versions.append(v)

        if CONFIG["oneshot"]:
            break

    return result


def can_handle(pkg, url):
    return True
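
# A minimal sketch of the two scanners on synthetic data (hypothetical hrefs
# and pattern; scan_html additionally needs bs4 with the lxml parser
# installed):
#
#     >>> scan_html('<a href="foo-1.2.tar.gz">foo</a>',
#     ...           "http://example.com/foo/", r"foo-(\d+)\.(\d+)\.tar\.gz")
#     [('1.2', 'foo-1.2.tar.gz')]
#     >>> scan_ftp("-rw-r--r-- 1 ftp ftp 4096 foo-1.3.tar.gz\r\n",
#     ...          "ftp://example.com/foo/", r"foo-(\d+)\.(\d+)\.tar\.gz")
#     [('1.3', 'foo-1.3.tar.gz')]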