"""Version-comparison, URL-probing and template helpers for euscan.

This module bundles the utilities euscan uses to compare upstream
versions, build/expand ``${PV}``-style URL templates, query remote
files politely (robots.txt aware) and serialize scan results to XML.
"""

import os
import re
import errno
import urllib.request, urllib.error, urllib.parse
from xml.dom.minidom import Document

import portage
from portage import dep

# Python 2/3 compatibility shim: on Python 3 the second import
# (`from urllib import urlparse`) raises ImportError, so we fall back
# to the Python 3 submodule imports.
try:
    from urllib import robotparser
    from urllib import urlparse
except ImportError:
    import urllib.robotparser
    import urllib.parse

import euscan
from euscan import CONFIG, BLACKLIST_VERSIONS, ROBOTS_TXT_BLACKLIST_DOMAINS
from euscan.version import parse_version


def htop_vercmp(a, b):
    """Package-specific comparison for sys-process/htop.

    htop jumped from 0.x to 0.1.x style numbering; rewrite the old
    '0.11'/'0.12'/'0.13' tags so they sort below the newer scheme.
    """
    def fixver(v):
        if v in ['0.11', '0.12', '0.13']:
            v = '0.1.' + v[3:]
        return v

    return simple_vercmp(fixver(a), fixver(b))


# Packages whose upstream versioning needs a custom comparison function.
VERSION_CMP_PACKAGE_QUIRKS = {
    'sys-process/htop': htop_vercmp
}

# Regex fragments for version suffixes ("-rc1", "_beta2", ...) and a
# full version number (digits, dotted groups, optional letter, suffixes).
_v_end = r'(?:(?:-|_)(?:pre|p|beta|b|alpha|a|rc|r)\d*)'
_v = r'((?:\d+)(?:(?:\.\d+)*)(?:[a-zA-Z]*?)(?:' + _v_end + '*))'


def cast_int_components(version):
    """Convert numeric components of a split version to ints, in place.

    Non-numeric components are left untouched. Returns the same list.
    """
    for i, obj in enumerate(version):
        try:
            version[i] = int(obj)
        except ValueError:
            pass
    return version


def simple_vercmp(a, b):
    """Compare two version strings; return -1, 0 or 1.

    Tries portage's vercmp first; falls back to comparing parsed
    version tuples when portage cannot handle the strings.
    """
    if a == b:
        return 0

    # For sane versions
    r = portage.versions.vercmp(a, b)
    if r is not None:
        return r

    # Fallback
    a = parse_version(a)
    b = parse_version(b)
    if a < b:
        return -1
    else:
        return 1


def vercmp(package, a, b):
    """Compare versions *a* and *b*, honoring per-package quirks."""
    if package in VERSION_CMP_PACKAGE_QUIRKS:
        return VERSION_CMP_PACKAGE_QUIRKS[package](a, b)
    return simple_vercmp(a, b)


def version_is_nightly(a, b):
    """Heuristically detect whether *b* looks like a nightly build.

    Try to skip nightly builds when not wanted (www-apps/moodle):
    a version that parses to exactly two components whose first part
    is 8 digits long (yyyymmdd) is treated as a date stamp.
    """
    a = parse_version(a)
    b = parse_version(b)

    if len(a) != len(b) and len(b) == 2 and len(b[0]) == len('yyyymmdd'):
        if b[0][:4] != '0000':
            return True
    return False


def version_blacklisted(cp, version):
    """Return True if cp-version matches a BLACKLIST_VERSIONS rule."""
    rule = None
    cpv = '%s-%s' % (cp, version)

    # Check that the generated cpv can be used by portage
    if not portage.versions.catpkgsplit(cpv):
        return False

    for bv in BLACKLIST_VERSIONS:
        if dep.match_from_list(bv, [cpv]):
            # Remember the last matching rule for the info message.
            # (The original code had a stray no-op `None` statement here.)
            rule = bv

    if rule:
        euscan.output.einfo("%s is blacklisted by rule %s" % (cpv, rule))
    return rule is not None


def version_change_end_sep(version):
    """Toggle the separator of the version's suffix ('_' <-> '-').

    Returns the rewritten version string, or None when *version* has no
    recognized suffix or an unexpected separator.
    """
    match = re.match(r".*(%s)" % _v_end, version)
    if not match:
        return None
    end = match.group(1)
    if end[0] == '_':
        newend = end.replace('_', '-')
    elif end[0] == '-':
        newend = end.replace('-', '_')
    else:
        return None
    return version.replace(end, newend)


def version_filtered(cp, base, version, vercmp=vercmp):
    """Return True when *version* should be skipped for package *cp*.

    A version is filtered out when it is not newer than *base*, is
    blacklisted, or looks like an unwanted nightly build.
    """
    if vercmp(cp, base, version) >= 0:
        return True

    if version_blacklisted(cp, version):
        return True

    if version_is_nightly(base, version):
        return True

    return False


def generate_templates_vars(version):
    """Build (version-prefix, template-var) substitution pairs.

    E.g. for "1.2.3": [("1.2.3", "${PV}"), ("1.2", "${0}.${1}"), ...],
    longest substitution first so greedy replacement works.
    """
    ret = []

    part = split_version(version)
    for i in range(2, len(part)):
        ver = []
        var = []
        for j in range(i):
            ver.append(str(part[j]))
            var.append('${%d}' % j)

        ret.append((".".join(ver), ".".join(var)))
    ret.append((version, '${PV}'))
    ret.reverse()
    return ret


def template_from_url(url, version):
    """Replace occurrences of *version* in *url* with template vars."""
    prefix, chunks = url.split('://')
    chunks = chunks.split('/')

    for i in range(len(chunks)):
        chunk = chunks[i]

        subs = generate_templates_vars(version)
        for sub in subs:
            chunk = chunk.replace(sub[0], sub[1])

        chunks[i] = chunk

    return prefix + "://" + "/".join(chunks)


def url_from_template(url, version):
    """Expand ${PV} and ${N} template variables in *url* for *version*."""
    components = split_version(version)

    url = url.replace('${PV}', version)
    for i in range(len(components)):
        url = url.replace('${%d}' % i, str(components[i]))

    return url


# Stolen from distutils.LooseVersion
# Used for brute force to increment the version
def split_version(version):
    """Split a version string into int and str components."""
    component_re = re.compile(r'(\d+ | [a-z]+ | \.)', re.VERBOSE)
    components = [x for x in component_re.split(version) if x and x != '.']
    for i in range(len(components)):
        try:
            components[i] = int(components[i])
        except ValueError:
            pass
    return components


def join_version(components):
    """Re-join split_version() components into a version string.

    A '.' is inserted only between two adjacent numeric components;
    letter components attach directly (e.g. [1, 2, 'a'] -> "1.2a").
    """
    version = ""
    for i in range(len(components)):
        version += str(components[i])
        if i >= len(components) - 1:
            break
        if not isinstance(components[i], str) \
                and not isinstance(components[i + 1], str):
            version += "."
    return version


def increment_version(components, level):
    """Increment the numeric component at *level*, zeroing deeper ones.

    Mutates and returns *components*. Raises Exception when *level* is
    out of range (kept as plain Exception for caller compatibility).
    """
    n = len(components)

    if level > n - 1 or level < 0:
        raise Exception("invalid increment level %d for %r" %
                        (level, components))

    for i in range(n, level + 1, -1):
        if isinstance(components[i - 1], int):
            components[i - 1] = 0

    if isinstance(components[level], int):
        components[level] += 1

    return components


def gen_versions(components, level):
    """Brute-force candidate version lists by bumping components.

    For each of the last *level* positions, produce *level* successive
    increments. Mutates *components*; returns a list of snapshots.
    """
    n = len(components)
    depth = level
    level = min(level, n)

    if not n:
        return []

    versions = []

    for i in range(n, n - level, -1):
        increment_version(components, i - 1)
        for j in range(depth):
            versions.append(list(components))
            increment_version(components, i - 1)

    return versions


def timeout_for_url(url):
    """Pick a network timeout: sourceforge is slow, give it longer."""
    if 'sourceforge' in url:
        timeout = 15
    else:
        timeout = 5
    return timeout


class HeadRequest(urllib.request.Request):
    """A urllib Request that issues HTTP HEAD instead of GET."""
    def get_method(self):
        return "HEAD"


# RobotParser cache
rpcache = {}


def urlallowed(url):
    """Check robots.txt (cached per site) to see if *url* may be fetched.

    Whitelisted domains, ftp URLs, and sites whose robots.txt cannot be
    read are always allowed.
    """
    if CONFIG['skip-robots-txt']:
        return True

    protocol, domain = urllib.parse.urlparse(url)[:2]

    for bd in ROBOTS_TXT_BLACKLIST_DOMAINS:
        if re.match(bd, domain):
            return True

    for d in ['sourceforge', 'berlios', 'github.com']:
        if d in domain:
            return True

    if protocol == 'ftp':
        return True

    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urllib.parse.urljoin(baseurl, 'robots.txt')

    if baseurl in rpcache:
        rp = rpcache[baseurl]
    else:
        from socket import setdefaulttimeout, getdefaulttimeout

        timeout = getdefaulttimeout()
        setdefaulttimeout(5)

        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            # Best-effort: unreadable robots.txt means "allowed".
            # (Narrowed from a bare except that also caught SystemExit
            # and KeyboardInterrupt.)
            rp = None
        setdefaulttimeout(timeout)

    return rp.can_fetch(CONFIG['user-agent'], url) if rp else True


def urlopen(url, timeout=None, verb="GET"):
    """Open *url* respecting robots.txt; returns a file-like or None.

    *verb* may be 'GET' or 'HEAD'; any other value returns None.
    """
    if not urlallowed(url):
        euscan.output.einfo("Url '%s' blocked by robots.txt" % url)
        return None

    if not timeout:
        timeout = timeout_for_url(url)

    if verb == 'GET':
        request = urllib.request.Request(url)
    elif verb == 'HEAD':
        request = HeadRequest(url)
    else:
        return None

    request.add_header('User-Agent', CONFIG['user-agent'])

    handlers = []

    if CONFIG['cache']:
        # Project-local caching handler (py2-style import kept as-is).
        from cache import CacheHandler
        handlers.append(CacheHandler(CONFIG['cache']))

    if CONFIG['verbose']:
        debuglevel = CONFIG['verbose'] - 1
        handlers.append(urllib.request.HTTPHandler(debuglevel=debuglevel))

    opener = urllib.request.build_opener(*handlers)

    return opener.open(request, None, timeout)


def tryurl(fileurl, template):
    """Probe *fileurl* with a HEAD request.

    Returns (final_url, headers) when the URL looks like a real file,
    or None when blocked, unreachable, empty, HTML/PHP content, or a
    redirect back to another version matching *template*.
    """
    result = True

    if not urlallowed(fileurl):
        euscan.output.einfo("Url '%s' blocked by robots.txt" % fileurl)
        return None

    euscan.output.ebegin("Trying: " + fileurl)

    try:
        basename = os.path.basename(fileurl)

        fp = urlopen(fileurl, verb='HEAD')
        if not fp:
            euscan.output.eend(errno.EPERM)
            return None

        headers = fp.info()

        # Some URLs return Content-disposition with different filename
        # Disable check for now (I have no seen false positives)
        #if 'Content-disposition' in headers and \
        #   basename not in headers['Content-disposition']:
        #    result = None
        if 'Content-Length' in headers and headers['Content-Length'] == '0':
            result = None
        elif 'Content-Type' in headers and \
                'text/html' in headers['Content-Type']:
            result = None
        elif 'Content-Type' in headers and \
                'application/x-httpd-php' in headers['Content-Type']:
            result = None
        elif fp.geturl() != fileurl:
            regex = regex_from_template(template)
            baseregex = regex_from_template(os.path.basename(template))
            basename2 = os.path.basename(fp.geturl())

            # Redirect to another (earlier?) version
            if basename != basename2 and (re.match(regex, fp.geturl()) or
                                          re.match(baseregex, basename2)):
                result = None

        if result:
            result = (fp.geturl(), fp.info())

    except urllib.error.URLError:
        result = None
    except IOError:
        result = None

    euscan.output.eend(errno.ENOENT if not result else 0)

    return result


def regex_from_template(template):
    """Turn a ${N}/${PV} URL template into a matching regular expression."""
    # Escape
    regexp = re.escape(template)

    # Unescape specific stuff.  Raw strings here: the originals were
    # plain strings with invalid escape sequences ('\$\{', '\}'),
    # which are a DeprecationWarning today and a SyntaxError later.
    regexp = regexp.replace(r'\$\{', '${')
    regexp = regexp.replace(r'\}', '}')
    regexp = regexp.replace(r'}\.$', '}.$')

    # Replace ${\d+}
    regexp = re.sub(r'(\$\{\d+\}(\.?))+', r'([\\w\.]+?)', regexp)

    # Full version
    regexp = regexp.replace('${PV}', _v)

    # End
    regexp = regexp + r'/?$'

    return regexp


def basedir_from_template(template):
    """Return the part of *template* before its first template variable."""
    idx = template.find('${')
    if idx == -1:
        return template

    idx = template[0:idx].rfind('/')
    if idx == -1:
        return ""

    return template[0:idx]


def generate_scan_paths(url):
    """Split a templated URL into (base-path, chunk-regex) scan steps."""
    prefix, chunks = url.split('://')
    chunks = chunks.split('/')
    steps = []

    path = prefix + ":/"
    for chunk in chunks:
        if '${' in chunk:
            steps.append((path, '^(?:|.*/)' + regex_from_template(chunk)))
            path = ""
        else:
            path += "/"
            path += chunk

    return steps


def parse_mirror(uri):
    """Resolve a mirror:// URI to a concrete URL via portage mirrors.

    Non-mirror URIs pass through unchanged; unknown or malformed mirror
    names return None. A random mirror is picked from the candidates.
    """
    from random import shuffle

    mirrors = portage.settings.thirdpartymirrors()

    if not uri.startswith("mirror://"):
        return uri

    eidx = uri.find("/", 9)
    if eidx == -1:
        euscan.output.einfo("Invalid mirror definition in SRC_URI:\n")
        euscan.output.einfo("  %s\n" % (uri))
        return None

    mirrorname = uri[9:eidx]
    path = uri[eidx + 1:]

    if mirrorname in mirrors:
        mirrors = mirrors[mirrorname]
        shuffle(mirrors)
        uri = mirrors[0].strip("/") + "/" + path
    else:
        euscan.output.einfo("No known mirror by the name: %s" % (mirrorname))
        return None

    return uri


def dict_to_xml(data, indent):
    """Serialize a {package-name: value} dict to pretty-printed XML.

    Dict values become nested elements, lists become <value> elements,
    anything else becomes text. *indent* is the number of spaces per
    indentation level.
    """
    doc = Document()
    root = doc.createElement("euscan")
    doc.appendChild(root)

    def _set_value(parent, value):
        # Recursively attach *value* under *parent*.
        if isinstance(value, dict):
            for k, v in list(value.items()):
                node = doc.createElement(k)
                _set_value(node, v)
                parent.appendChild(node)
        elif isinstance(value, list):
            for item in value:
                node = doc.createElement("value")
                text = doc.createTextNode(item)
                node.appendChild(text)
                parent.appendChild(node)
        else:
            text = doc.createTextNode(str(value))
            parent.appendChild(text)

    for key, value in list(data.items()):
        node = doc.createElement("package")
        node.setAttribute("name", key)
        _set_value(node, value)
        root.appendChild(node)

    return doc.toprettyxml(indent=" " * indent)