From e42ba7dfd05ff8326981275f5031db5b52564d68 Mon Sep 17 00:00:00 2001 From: Corentin Chary Date: Wed, 12 Dec 2012 22:54:50 +0100 Subject: [PATCH] djeuscan: rework the scan process, don't use alive anymore Signed-off-by: Corentin Chary --- euscanwww/djeuscan/admin.py | 2 +- .../management/commands/scan_portage.py | 7 - euscanwww/djeuscan/models.py | 1 - .../djeuscan/processing/scan/scan_portage.py | 229 ++++++++++-------- .../djeuscan/processing/scan/scan_upstream.py | 27 ++- euscanwww/djeuscan/tasks.py | 6 +- euscanwww/euscanwww/__init__.py | 1 + pym/euscan/__init__.py | 1 + 8 files changed, 152 insertions(+), 122 deletions(-) diff --git a/euscanwww/djeuscan/admin.py b/euscanwww/djeuscan/admin.py index 5ecc512..bdf7f6b 100644 --- a/euscanwww/djeuscan/admin.py +++ b/euscanwww/djeuscan/admin.py @@ -27,7 +27,7 @@ class PackageAdmin(admin.ModelAdmin): class VersionAdmin(admin.ModelAdmin): search_fields = ('package__name', 'package__category') - list_filter = ('overlay', 'packaged', 'alive') + list_filter = ('overlay', 'packaged') class ProblemReportAdmin(admin.ModelAdmin): diff --git a/euscanwww/djeuscan/management/commands/scan_portage.py b/euscanwww/djeuscan/management/commands/scan_portage.py index e70affa..f1d9485 100644 --- a/euscanwww/djeuscan/management/commands/scan_portage.py +++ b/euscanwww/djeuscan/management/commands/scan_portage.py @@ -44,12 +44,6 @@ class Command(BaseCommand): dest='no-log', default=False, help='Don\'t store logs'), - make_option('--prefetch', - action='store_true', - dest='prefetch', - default=False, - help=('Prefetch all versions and packages from DB to ' - 'speedup full scan process.')), ) args = '[package package ...]' help = 'Scans portage tree and fills database' @@ -70,7 +64,6 @@ class Command(BaseCommand): no_log=options["no-log"], purge_packages=options["purge-packages"], purge_versions=options["purge-versions"], - prefetch=options["prefetch"], upstream=options["upstream"], logger=logger, ) diff --git a/euscanwww/djeuscan/models.py b/euscanwww/djeuscan/models.py index 5a0991c..1b75495 100644 --- a/euscanwww/djeuscan/models.py +++ b/euscanwww/djeuscan/models.py @@ -144,7 +144,6 @@ class Version(models.Model): overlay = models.CharField(max_length=128, default='gentoo', db_index=True, validators=[validate_name], blank=True) urls = models.TextField(blank=True) - alive = models.BooleanField(default=True, db_index=True) vtype = models.CharField(max_length=128, blank=True) handler = models.CharField(max_length=128, blank=True, db_index=True) diff --git a/euscanwww/djeuscan/processing/scan/scan_portage.py b/euscanwww/djeuscan/processing/scan/scan_portage.py index 1d6c866..79aba67 100644 --- a/euscanwww/djeuscan/processing/scan/scan_portage.py +++ b/euscanwww/djeuscan/processing/scan/scan_portage.py @@ -8,6 +8,7 @@ import portage from xml.etree.ElementTree import iterparse, ParseError from django.db.transaction import commit_on_success +from django.db import models from django.core.management.color import color_style from euscan.version import get_version_type @@ -29,42 +30,45 @@ class ScanPortage(object): self._cache = {'packages': {}, 'versions': {}} self._overlays = None - self._updated_packages = set() + self._packages_updated = set() + self._versions = set() + self._versions_seen = set() - def updated_packages(self): - return list(self._updated_packages) + def packages_updated(self): + return list(self._packages_updated) - def cache_hash_package(self, category, name): + def hash_package(self, category, name): return '%s/%s' % (category, name) def cache_store_package(self, package): - key = self.cache_hash_package(package.category, package.name) + key = self.hash_package(package.category, package.name) self._cache['packages'][key] = package def cache_get_package(self, category, name): return self._cache['packages'].get( - self.cache_hash_package(category, name) + self.hash_package(category, name) ) - def cache_hash_version(self, category, name, version, revision, slot, + def hash_version(self, category, name, version, revision, overlay): - key = '%s/%s-%s-r%s %s %s' % (category, name, - version, revision, - slot, overlay) + key = '%s/%s-%s-r%s %s' % (category, name, + version, revision, + overlay) return key - def cache_get_version(self, category, name, version, revision, slot, + def cache_get_version(self, category, name, version, revision, overlay): - key = self.cache_hash_version(category, name, version, revision, slot, + key = self.hash_version(category, name, version, revision, overlay) return self._cache['versions'].get(key) def cache_store_version(self, version): - key = self.cache_hash_version( + key = self.hash_version( version.package.category, version.package.name, version.version, - version.revision, version.slot, version.overlay + version.revision, version.overlay ) self._cache['versions'][key] = version + self._versions.add(version) def scan_gentoopm(self, query, category=None): import gentoopm @@ -160,60 +164,28 @@ class ScanPortage(object): category = "" elem.clear() - def prepare_purge_versions(self, packages, query=None, category=None): - if not self.purge_versions: - return - - # Set all versions dead, then set found versions alive and - # delete old versions - if not query: - # Optimisation for --all or --category - self.logger.info('Killing existing versions...') - qs = Version.objects.filter(packaged=True) - if category: - qs = qs.filter(package__category=category) - qs.update(alive=False) - self.logger.info('done') - else: - for package in packages: - Version.objects.filter(package=package, packaged=True).\ - update(alive=False) - def scan(self, query=None, category=None): - if not query: - current_packages = Package.objects.all() - elif '/' in query: - cat, pkg = portage.catsplit(query) - current_packages = Package.objects.filter(category=cat, name=pkg) - else: - current_packages = Package.objects.filter(name=query) - if category: - current_packages = current_packages.filter(category=category) - - self.prepare_purge_versions(current_packages, query, category) - - packages_alive = set() - for data in self.scan_eix_xml(query, category): #for data in self.scan_gentoopm(query, category): cat, pkg = data['category'], data['package'] package = self.store_package( cat, pkg, data['homepage'], data['description'] ) - packages_alive.add("%s/%s" % (cat, pkg)) + new_version = False for cpv, slot, overlay, overlay_path in data['versions']: obj, created = self.store_version( package, cpv, slot, overlay, overlay_path ) + self._versions_seen.add(obj) new_version = created or new_version # If the package has at least one new version scan upstream for it if new_version: - self._updated_packages.add(package) + self._packages_updated.add(package) - self.purge_old_packages(current_packages, packages_alive) self.purge_old_versions() + self.purge_old_packages() def store_package(self, cat, pkg, homepage, description): created = False @@ -239,7 +211,7 @@ class ScanPortage(object): created = False obj = self.cache_get_version( - package.category, package.name, ver, rev, slot, overlay + package.category, package.name, ver, rev, overlay ) overlay_path = overlay_path or portage.settings["PORTDIR"] @@ -249,11 +221,12 @@ class ScanPortage(object): if not obj: obj, created = Version.objects.get_or_create( - package=package, slot=slot, - revision=rev, version=ver, + package=package, + revision=rev, + version=ver, overlay=overlay, defaults={ - "alive": True, + "slot": slot, "packaged": True, "vtype": get_version_type(ver), "confidence": 100, @@ -263,9 +236,10 @@ class ScanPortage(object): } ) if not created: # Created objects have defaults values - obj.alive = True - obj.packaged = True - obj.save() + if obj.slot != slot or obj.package != True: + obj.slot = slot + obj.packaged = True + obj.save() if created: self.cache_store_version(obj) @@ -298,22 +272,51 @@ class ScanPortage(object): return obj, created - def purge_old_packages(self, packages, alive): + def purge_old_packages(self): if not self.purge_packages: return + packages = ( + Package.objects.values("id") + .annotate(version_count=models.Count("version")) + .filter(version_count=0) + ) + packages = ( + Package.objects.filter(id__in=[package['id'] for package in packages]) + ) + for package in packages: - cp = "%s/%s" % (package.category, package.name) - if cp not in alive: - self.logger.info('- [p] %s' % (package)) - package.delete() + self.logger.info('- [p] %s' % (package)) + package.delete() + + def version_hack(self, version): + try: + if version.package.last_version_gentoo: + version.package.last_version_gentoo.pk + if version.package.last_version_overlay: + version.package.last_version_overlay.pk + if version.package.last_version_upstream: + version.package.last_version_upstream.pk + except Version.DoesNotExist: + version.package.last_version_gentoo = None + version.package.last_version_overlay = None + version.package.last_version_upstream = None def purge_old_versions(self): if not self.purge_versions: return - versions = Version.objects.filter(packaged=True, alive=False) + versions = self._versions.difference(self._versions_seen) + for version in versions: + self.logger.info('- [v] %s' % (version)) + + if version.packaged == False: + continue # Not our job + + # Fix last_version_ stuff that is sometime broken + self.version_hack(version) + if version.overlay == 'gentoo': version.package.n_packaged -= 1 else: @@ -321,8 +324,6 @@ class ScanPortage(object): version.package.n_versions -= 1 version.package.save() - self.logger.info('- [v] %s' % (version)) - if self.no_log: continue @@ -335,20 +336,55 @@ class ScanPortage(object): overlay=version.overlay, vtype=version.vtype, ) + # remove from last version ? + version.delete() - versions.delete() + def prefetch(self, packages, category): + self.logger.info('Prefetching current objects...') + + ppackages = Package.objects.all() + pversions = Version.objects.filter(packaged=True).select_related('package').all() + + if category: + ppackages = ppackages.filter(category=category) + pversions = pversions.filter(package__category=category) + if packages: + ids = [ package.id for package in packages ] + ppackages = ppackages.filter(pk__in=ids) + pversions = pversions.filter(package__pk__in=ids) + + for package in ppackages: + self.cache_store_package(package) + for version in pversions: + self.cache_store_version(version) + + self.logger.info('done') + +def populate_categories(logger): + # Populate Category and Overlay + # TODO: - use portage.settings.categories() + # - read metadata.xml to add description + for cat in Package.objects.values('category').distinct(): + obj, created = Category.objects.get_or_create(name=cat["category"]) + if created: + logger.info("+ [c] %s", cat["category"]) + +def populate_overlays(logger): + # TODO: - get informations from layman and portage (path, url) + for overlay in Version.objects.values('overlay').distinct(): + if not overlay["overlay"]: + continue + obj, created = Overlay.objects.get_or_create(name=overlay["overlay"]) + if created: + logger.info("+ [o] %s", overlay["overlay"]) @commit_on_success def scan_portage(packages=None, category=None, no_log=False, upstream=False, - purge_packages=False, purge_versions=False, prefetch=False, - logger=None): + purge_packages=False, purge_versions=False, logger=None): logger = logger or FakeLogger() - if packages is None: - prefetch = True - scan_handler = ScanPortage( logger=logger, no_log=no_log, @@ -358,18 +394,28 @@ def scan_portage(packages=None, category=None, no_log=False, upstream=False, logger.info('Scanning portage tree...') - if prefetch: - logger.info('Prefetching objects...') - ppackages = Package.objects.all() - pversions = Version.objects.select_related('package').all() + if not packages: + qs = Package.objects.all() if category: - ppackages = ppackages.filter(category=category) - pversions = pversions.filter(package__category=category) - for package in ppackages: - scan_handler.cache_store_package(package) - for version in pversions: - scan_handler.cache_store_version(version) - logger.info('done') + qs = qs.filter(category=category) + prefetch_packages = qs + else: + results = [] + for package in packages: + if isinstance(package, Package): + results.append(package) + else: + if '/' in package: + cat, pkg = portage.catsplit(package) + qs = Package.objects.filter(category=cat, name=pkg) + else: + qs = Package.objects.filter(name=package) + for package in qs: + results.append(package) + prefetch_packages = results + + + scan_handler.prefetch(prefetch_packages, category) if not packages and category: scan_handler.scan(category=category) @@ -382,21 +428,8 @@ def scan_portage(packages=None, category=None, no_log=False, upstream=False, else: scan_handler.scan(pkg) - # Populate Category and Overlay - # TODO: - use portage.settings.categories() - # - read metadata.xml to add description - for cat in Package.objects.values('category').distinct(): - obj, created = Category.objects.get_or_create(name=cat["category"]) - if created: - logger.info("+ [c] %s", cat["category"]) - - # TODO: - get informations from layman and portage (path, url) - for overlay in Version.objects.values('overlay').distinct(): - if not overlay["overlay"]: - continue - obj, created = Overlay.objects.get_or_create(name=overlay["overlay"]) - if created: - logger.info("+ [o] %s", overlay["overlay"]) + populate_categories(logger) + populate_overlays(logger) logger.info('Done.') - return scan_handler.updated_packages() + return scan_handler.packages_updated() diff --git a/euscanwww/djeuscan/processing/scan/scan_upstream.py b/euscanwww/djeuscan/processing/scan/scan_upstream.py index bc9159b..da384e5 100644 --- a/euscanwww/djeuscan/processing/scan/scan_upstream.py +++ b/euscanwww/djeuscan/processing/scan/scan_upstream.py @@ -14,6 +14,8 @@ class ScanUpstream(object): def __init__(self, logger=None, purge_versions=False): self.logger = logger or FakeLogger() self.purge_versions = purge_versions + self._versions = set() + self._versions_seen = set() def scan(self, package): CONFIG["format"] = "dict" @@ -74,12 +76,11 @@ class ScanUpstream(object): if created: self.logger.info('+ [p] %s/%s' % (cat, pkg)) - # Set all versions dead, then set found versions alive and - # delete old versions - if self.purge_versions: - Version.objects.filter( - package=obj, packaged=False - ).update(alive=False) + versions = Version.objects.filter( + package=obj, packaged=False + ) + for version in versions: + self._versions.add(version) return obj @@ -87,20 +88,22 @@ class ScanUpstream(object): confidence): obj, created = Version.objects.get_or_create( package=package, - slot='', revision='r0', version=ver, overlay='', - defaults={"alive": True, "urls": url, "packaged": False, + defaults={"slot" : '', "urls": url, "packaged": False, "vtype": version_type, "handler": handler, "confidence": confidence} ) + if not created: - obj.alive = True + obj.slot = '' obj.urls = url obj.packaged = False obj.save() + self._versions_seen.add(obj) + # If it's not a new version, just update the object and continue if not created: return @@ -124,8 +127,10 @@ class ScanUpstream(object): if not self.purge_versions: return - versions = Version.objects.filter(packaged=False, alive=False) + versions = self._versions.difference(self._versions_seen) for version in versions: + if version.packaged == True: + continue # Not our job VersionLog.objects.create( package=version.package, action=VersionLog.VERSION_REMOVED, @@ -141,7 +146,7 @@ class ScanUpstream(object): self.logger.info('- [u] %s %s' % (version, version.urls)) - versions.delete() + version.delete() @commit_on_success diff --git a/euscanwww/djeuscan/tasks.py b/euscanwww/djeuscan/tasks.py index 44e4873..f58a544 100644 --- a/euscanwww/djeuscan/tasks.py +++ b/euscanwww/djeuscan/tasks.py @@ -115,7 +115,7 @@ def scan_metadata(packages=[], category=None, populate=False): @task def scan_portage(packages=None, category=None, no_log=False, purge_packages=False, - purge_versions=False, prefetch=False): + purge_versions=False): """ Scans portage for the given set of packages """ @@ -136,7 +136,6 @@ def scan_portage(packages=None, category=None, no_log=no_log, purge_packages=purge_packages, purge_versions=purge_versions, - prefetch=prefetch, logger=logger, ) @@ -182,7 +181,6 @@ def update_portage(packages=None): packages=None, purge_packages=True, purge_versions=True, - prefetch=True ) scan_metadata(packages=None, populate=True) @@ -204,7 +202,7 @@ def update_portage(packages=None): ( group_one(scan_portage, categories, attr_name="category", purge_packages=True, - purge_versions=True, prefetch=True) | + purge_versions=True) | group_one(scan_metadata, categories, attr_name="category") | update_counters.si(fast=True) diff --git a/euscanwww/euscanwww/__init__.py b/euscanwww/euscanwww/__init__.py index e69de29..8b13789 100644 --- a/euscanwww/euscanwww/__init__.py +++ b/euscanwww/euscanwww/__init__.py @@ -0,0 +1 @@ + diff --git a/pym/euscan/__init__.py b/pym/euscan/__init__.py index 49241dd..2e9deac 100644 --- a/pym/euscan/__init__.py +++ b/pym/euscan/__init__.py @@ -85,6 +85,7 @@ ROBOTS_TXT_BLACKLIST_DOMAINS = [ '(.*)download\.mono-project\.com(.*)', '(.*)fedorahosted\.org(.*)', '(.*)download\.tuxfamily\.org(.*)', + '(.*)festvox\.org(.*)', ] from out import EuscanOutput