From 326658acb918a083fff3a8e25c24a3d5eff1f7c0 Mon Sep 17 00:00:00 2001
From: volpino
Date: Fri, 27 Jul 2012 11:26:54 +0200
Subject: [PATCH] euscan: confidence tweak in generic handler

Signed-off-by: volpino
---
 pym/euscan/handlers/url/cpan.py     |  2 +-
 pym/euscan/handlers/url/generic.py  | 55 ++++++++++++++++++++++++-----
 pym/euscan/handlers/url/github.py   |  2 +-
 pym/euscan/handlers/url/php.py      |  2 +-
 pym/euscan/handlers/url/pypi.py     |  2 +-
 pym/euscan/handlers/url/rubygems.py |  2 +-
 6 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/pym/euscan/handlers/url/cpan.py b/pym/euscan/handlers/url/cpan.py
index 0721324..5513e0d 100644
--- a/pym/euscan/handlers/url/cpan.py
+++ b/pym/euscan/handlers/url/cpan.py
@@ -6,7 +6,7 @@ import json
 from euscan import helpers, output
 
 HANDLER_NAME = "cpan"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
 PRIORITY = 90
 
 _cpan_package_name_re = re.compile("mirror://cpan/authors/.*/([^/.]*).*")
diff --git a/pym/euscan/handlers/url/generic.py b/pym/euscan/handlers/url/generic.py
index 1f43a7a..3ba7ac0 100644
--- a/pym/euscan/handlers/url/generic.py
+++ b/pym/euscan/handlers/url/generic.py
@@ -1,7 +1,8 @@
-from urlparse import urljoin
+from urlparse import urljoin, urlparse
 import urllib2
 import re
 import StringIO
+import difflib
 
 try:
     from BeautifulSoup import BeautifulSoup
@@ -14,11 +15,46 @@ from euscan import CONFIG, SCANDIR_BLACKLIST_URLS, \
     BRUTEFORCE_BLACKLIST_PACKAGES, BRUTEFORCE_BLACKLIST_URLS, output, helpers
 
 HANDLER_NAME = "generic"
-CONFIDENCE = 50.0
+CONFIDENCE = 45
 PRIORITY = 0
 
 BRUTEFORCE_HANDLER_NAME = "brute_force"
-BRUTEFORCE_CONFIDENCE = 30.0
+BRUTEFORCE_CONFIDENCE = 30
+
+
+def confidence_score(found, original, minimum=CONFIDENCE):
+    """Rate how closely the `found` url resembles the `original` one.
+
+    Returns `minimum` when the hosts or the path depths differ,
+    otherwise an int between `minimum` and `minimum * 2` that grows
+    with the similarity of the two paths once digits and dots are removed.
+    """
+    found_p = urlparse(found)
+    original_p = urlparse(original)
+
+    # check if the base url is the same
+    if found_p.netloc != original_p.netloc:
+        return minimum
+
+    # check if the directory depth is the same
+    if len(found_p.path.split("/")) != len(original_p.path.split("/")):
+        return minimum
+
+    # strip version numbers (runs of digits and dots, nothing else)
+    found_path = re.sub(r"[\d.]+", "", found_p.path)
+    original_path = re.sub(r"[\d.]+", "", original_p.path)
+
+    # strip the common prefix, never indexing past the shorter path
+    i = 0
+    max_i = min(len(found_path), len(original_path))
+    while i < max_i and found_path[i] == original_path[i]:
+        i += 1
+    found_path = found_path[i:]
+    original_path = original_path[i:]
+
+    # calculate difference ratio
+    diff = difflib.SequenceMatcher(None, found_path, original_path).ratio()
+    return int(minimum + minimum * diff)  # maximum score is minimum * 2
 
 
 def scan_html(data, url, pattern):
@@ -98,7 +134,8 @@ def scan_directory_recursive(cp, ver, rev, url, steps, orig_url):
         path = urljoin(url, path)
 
         if not steps and path not in orig_url:
-            versions.append((path, pv, HANDLER_NAME, CONFIDENCE))
+            confidence = confidence_score(path, orig_url)
+            versions.append((path, pv, HANDLER_NAME, confidence))
 
         if steps:
             ret = scan_directory_recursive(cp, ver, rev, path, steps, orig_url)
@@ -209,14 +246,14 @@ def brute_force(pkg, url):
         if helpers.version_filtered(cp, ver, version):
             continue
 
-        url = helpers.url_from_template(template, version)
-        infos = helpers.tryurl(url, template)
+        try_url = helpers.url_from_template(template, version)
+        infos = helpers.tryurl(try_url, template)
 
         if not infos:
             continue
-
-        result.append([url, version, BRUTEFORCE_HANDLER_NAME,
-                       BRUTEFORCE_CONFIDENCE])
+        confidence = confidence_score(try_url, url,
+                                      minimum=BRUTEFORCE_CONFIDENCE)
+        result.append([try_url, version, BRUTEFORCE_HANDLER_NAME, confidence])
 
         if len(result) > CONFIG['brute-force-false-watermark']:
             output.einfo(
diff --git a/pym/euscan/handlers/url/github.py b/pym/euscan/handlers/url/github.py
index e4ebe10..dc5dd16 100644
--- a/pym/euscan/handlers/url/github.py
+++ b/pym/euscan/handlers/url/github.py
@@ -7,7 +7,7 @@ import portage
 from euscan import helpers, output
 
 HANDLER_NAME = "github"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
 PRIORITY = 90
 
 
diff --git a/pym/euscan/handlers/url/php.py b/pym/euscan/handlers/url/php.py
index 853059a..d0fef71 100644
--- a/pym/euscan/handlers/url/php.py
+++ b/pym/euscan/handlers/url/php.py
@@ -6,7 +6,7 @@ import xml.dom.minidom
 from euscan import helpers, output
 
 HANDLER_NAME = "php"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
 PRIORITY = 90
 
 
diff --git a/pym/euscan/handlers/url/pypi.py b/pym/euscan/handlers/url/pypi.py
index 82251e6..02428ee 100644
--- a/pym/euscan/handlers/url/pypi.py
+++ b/pym/euscan/handlers/url/pypi.py
@@ -6,7 +6,7 @@ import portage
 from euscan import helpers, output
 
 HANDLER_NAME = "pypi"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
 PRIORITY = 90
 
 
diff --git a/pym/euscan/handlers/url/rubygems.py b/pym/euscan/handlers/url/rubygems.py
index a3021f0..3b4facd 100644
--- a/pym/euscan/handlers/url/rubygems.py
+++ b/pym/euscan/handlers/url/rubygems.py
@@ -6,7 +6,7 @@ import urllib2
 from euscan import helpers, output
 
 HANDLER_NAME = "rubygems"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
 PRIORITY = 90
 
 