djeuscan: rework the scan process; stop using the Version.alive flag

Signed-off-by: Corentin Chary <corentin.chary@gmail.com>
This commit is contained in:
Corentin Chary 2012-12-12 22:54:50 +01:00
parent dfb7a7b986
commit e42ba7dfd0
8 changed files with 152 additions and 122 deletions

View File

@ -27,7 +27,7 @@ class PackageAdmin(admin.ModelAdmin):
class VersionAdmin(admin.ModelAdmin): class VersionAdmin(admin.ModelAdmin):
search_fields = ('package__name', 'package__category') search_fields = ('package__name', 'package__category')
list_filter = ('overlay', 'packaged', 'alive') list_filter = ('overlay', 'packaged')
class ProblemReportAdmin(admin.ModelAdmin): class ProblemReportAdmin(admin.ModelAdmin):

View File

@ -44,12 +44,6 @@ class Command(BaseCommand):
dest='no-log', dest='no-log',
default=False, default=False,
help='Don\'t store logs'), help='Don\'t store logs'),
make_option('--prefetch',
action='store_true',
dest='prefetch',
default=False,
help=('Prefetch all versions and packages from DB to '
'speedup full scan process.')),
) )
args = '[package package ...]' args = '[package package ...]'
help = 'Scans portage tree and fills database' help = 'Scans portage tree and fills database'
@ -70,7 +64,6 @@ class Command(BaseCommand):
no_log=options["no-log"], no_log=options["no-log"],
purge_packages=options["purge-packages"], purge_packages=options["purge-packages"],
purge_versions=options["purge-versions"], purge_versions=options["purge-versions"],
prefetch=options["prefetch"],
upstream=options["upstream"], upstream=options["upstream"],
logger=logger, logger=logger,
) )

View File

@ -144,7 +144,6 @@ class Version(models.Model):
overlay = models.CharField(max_length=128, default='gentoo', db_index=True, overlay = models.CharField(max_length=128, default='gentoo', db_index=True,
validators=[validate_name], blank=True) validators=[validate_name], blank=True)
urls = models.TextField(blank=True) urls = models.TextField(blank=True)
alive = models.BooleanField(default=True, db_index=True)
vtype = models.CharField(max_length=128, blank=True) vtype = models.CharField(max_length=128, blank=True)
handler = models.CharField(max_length=128, blank=True, db_index=True) handler = models.CharField(max_length=128, blank=True, db_index=True)

View File

@ -8,6 +8,7 @@ import portage
from xml.etree.ElementTree import iterparse, ParseError from xml.etree.ElementTree import iterparse, ParseError
from django.db.transaction import commit_on_success from django.db.transaction import commit_on_success
from django.db import models
from django.core.management.color import color_style from django.core.management.color import color_style
from euscan.version import get_version_type from euscan.version import get_version_type
@ -29,42 +30,45 @@ class ScanPortage(object):
self._cache = {'packages': {}, 'versions': {}} self._cache = {'packages': {}, 'versions': {}}
self._overlays = None self._overlays = None
self._updated_packages = set() self._packages_updated = set()
self._versions = set()
self._versions_seen = set()
def updated_packages(self): def packages_updated(self):
return list(self._updated_packages) return list(self._packages_updated)
def cache_hash_package(self, category, name): def hash_package(self, category, name):
return '%s/%s' % (category, name) return '%s/%s' % (category, name)
def cache_store_package(self, package): def cache_store_package(self, package):
key = self.cache_hash_package(package.category, package.name) key = self.hash_package(package.category, package.name)
self._cache['packages'][key] = package self._cache['packages'][key] = package
def cache_get_package(self, category, name): def cache_get_package(self, category, name):
return self._cache['packages'].get( return self._cache['packages'].get(
self.cache_hash_package(category, name) self.hash_package(category, name)
) )
def cache_hash_version(self, category, name, version, revision, slot, def hash_version(self, category, name, version, revision,
overlay): overlay):
key = '%s/%s-%s-r%s %s %s' % (category, name, key = '%s/%s-%s-r%s %s' % (category, name,
version, revision, version, revision,
slot, overlay) overlay)
return key return key
def cache_get_version(self, category, name, version, revision, slot, def cache_get_version(self, category, name, version, revision,
overlay): overlay):
key = self.cache_hash_version(category, name, version, revision, slot, key = self.hash_version(category, name, version, revision,
overlay) overlay)
return self._cache['versions'].get(key) return self._cache['versions'].get(key)
def cache_store_version(self, version): def cache_store_version(self, version):
key = self.cache_hash_version( key = self.hash_version(
version.package.category, version.package.name, version.version, version.package.category, version.package.name, version.version,
version.revision, version.slot, version.overlay version.revision, version.overlay
) )
self._cache['versions'][key] = version self._cache['versions'][key] = version
self._versions.add(version)
def scan_gentoopm(self, query, category=None): def scan_gentoopm(self, query, category=None):
import gentoopm import gentoopm
@ -160,60 +164,28 @@ class ScanPortage(object):
category = "" category = ""
elem.clear() elem.clear()
def prepare_purge_versions(self, packages, query=None, category=None):
if not self.purge_versions:
return
# Set all versions dead, then set found versions alive and
# delete old versions
if not query:
# Optimisation for --all or --category
self.logger.info('Killing existing versions...')
qs = Version.objects.filter(packaged=True)
if category:
qs = qs.filter(package__category=category)
qs.update(alive=False)
self.logger.info('done')
else:
for package in packages:
Version.objects.filter(package=package, packaged=True).\
update(alive=False)
def scan(self, query=None, category=None): def scan(self, query=None, category=None):
if not query:
current_packages = Package.objects.all()
elif '/' in query:
cat, pkg = portage.catsplit(query)
current_packages = Package.objects.filter(category=cat, name=pkg)
else:
current_packages = Package.objects.filter(name=query)
if category:
current_packages = current_packages.filter(category=category)
self.prepare_purge_versions(current_packages, query, category)
packages_alive = set()
for data in self.scan_eix_xml(query, category): for data in self.scan_eix_xml(query, category):
#for data in self.scan_gentoopm(query, category): #for data in self.scan_gentoopm(query, category):
cat, pkg = data['category'], data['package'] cat, pkg = data['category'], data['package']
package = self.store_package( package = self.store_package(
cat, pkg, data['homepage'], data['description'] cat, pkg, data['homepage'], data['description']
) )
packages_alive.add("%s/%s" % (cat, pkg))
new_version = False new_version = False
for cpv, slot, overlay, overlay_path in data['versions']: for cpv, slot, overlay, overlay_path in data['versions']:
obj, created = self.store_version( obj, created = self.store_version(
package, cpv, slot, overlay, overlay_path package, cpv, slot, overlay, overlay_path
) )
self._versions_seen.add(obj)
new_version = created or new_version new_version = created or new_version
# If the package has at least one new version scan upstream for it # If the package has at least one new version scan upstream for it
if new_version: if new_version:
self._updated_packages.add(package) self._packages_updated.add(package)
self.purge_old_packages(current_packages, packages_alive)
self.purge_old_versions() self.purge_old_versions()
self.purge_old_packages()
def store_package(self, cat, pkg, homepage, description): def store_package(self, cat, pkg, homepage, description):
created = False created = False
@ -239,7 +211,7 @@ class ScanPortage(object):
created = False created = False
obj = self.cache_get_version( obj = self.cache_get_version(
package.category, package.name, ver, rev, slot, overlay package.category, package.name, ver, rev, overlay
) )
overlay_path = overlay_path or portage.settings["PORTDIR"] overlay_path = overlay_path or portage.settings["PORTDIR"]
@ -249,11 +221,12 @@ class ScanPortage(object):
if not obj: if not obj:
obj, created = Version.objects.get_or_create( obj, created = Version.objects.get_or_create(
package=package, slot=slot, package=package,
revision=rev, version=ver, revision=rev,
version=ver,
overlay=overlay, overlay=overlay,
defaults={ defaults={
"alive": True, "slot": slot,
"packaged": True, "packaged": True,
"vtype": get_version_type(ver), "vtype": get_version_type(ver),
"confidence": 100, "confidence": 100,
@ -263,7 +236,8 @@ class ScanPortage(object):
} }
) )
if not created: # Created objects have defaults values if not created: # Created objects have defaults values
obj.alive = True if obj.slot != slot or obj.package != True:
obj.slot = slot
obj.packaged = True obj.packaged = True
obj.save() obj.save()
@ -298,22 +272,51 @@ class ScanPortage(object):
return obj, created return obj, created
def purge_old_packages(self, packages, alive): def purge_old_packages(self):
if not self.purge_packages: if not self.purge_packages:
return return
packages = (
Package.objects.values("id")
.annotate(version_count=models.Count("version"))
.filter(version_count=0)
)
packages = (
Package.objects.filter(id__in=[package['id'] for package in packages])
)
for package in packages: for package in packages:
cp = "%s/%s" % (package.category, package.name)
if cp not in alive:
self.logger.info('- [p] %s' % (package)) self.logger.info('- [p] %s' % (package))
package.delete() package.delete()
def version_hack(self, version):
try:
if version.package.last_version_gentoo:
version.package.last_version_gentoo.pk
if version.package.last_version_overlay:
version.package.last_version_overlay.pk
if version.package.last_version_upstream:
version.package.last_version_upstream.pk
except Version.DoesNotExist:
version.package.last_version_gentoo = None
version.package.last_version_overlay = None
version.package.last_version_upstream = None
def purge_old_versions(self): def purge_old_versions(self):
if not self.purge_versions: if not self.purge_versions:
return return
versions = Version.objects.filter(packaged=True, alive=False) versions = self._versions.difference(self._versions_seen)
for version in versions: for version in versions:
self.logger.info('- [v] %s' % (version))
if version.packaged == False:
continue # Not our job
# Fix last_version_* references that are sometimes broken
self.version_hack(version)
if version.overlay == 'gentoo': if version.overlay == 'gentoo':
version.package.n_packaged -= 1 version.package.n_packaged -= 1
else: else:
@ -321,8 +324,6 @@ class ScanPortage(object):
version.package.n_versions -= 1 version.package.n_versions -= 1
version.package.save() version.package.save()
self.logger.info('- [v] %s' % (version))
if self.no_log: if self.no_log:
continue continue
@ -335,20 +336,55 @@ class ScanPortage(object):
overlay=version.overlay, overlay=version.overlay,
vtype=version.vtype, vtype=version.vtype,
) )
# remove from last version ?
version.delete()
versions.delete() def prefetch(self, packages, category):
self.logger.info('Prefetching current objects...')
ppackages = Package.objects.all()
pversions = Version.objects.filter(packaged=True).select_related('package').all()
if category:
ppackages = ppackages.filter(category=category)
pversions = pversions.filter(package__category=category)
if packages:
ids = [ package.id for package in packages ]
ppackages = ppackages.filter(pk__in=ids)
pversions = pversions.filter(package__pk__in=ids)
for package in ppackages:
self.cache_store_package(package)
for version in pversions:
self.cache_store_version(version)
self.logger.info('done')
def populate_categories(logger):
# Populate Category and Overlay
# TODO: - use portage.settings.categories()
# - read metadata.xml to add description
for cat in Package.objects.values('category').distinct():
obj, created = Category.objects.get_or_create(name=cat["category"])
if created:
logger.info("+ [c] %s", cat["category"])
def populate_overlays(logger):
# TODO: - get information from layman and portage (path, url)
for overlay in Version.objects.values('overlay').distinct():
if not overlay["overlay"]:
continue
obj, created = Overlay.objects.get_or_create(name=overlay["overlay"])
if created:
logger.info("+ [o] %s", overlay["overlay"])
@commit_on_success @commit_on_success
def scan_portage(packages=None, category=None, no_log=False, upstream=False, def scan_portage(packages=None, category=None, no_log=False, upstream=False,
purge_packages=False, purge_versions=False, prefetch=False, purge_packages=False, purge_versions=False, logger=None):
logger=None):
logger = logger or FakeLogger() logger = logger or FakeLogger()
if packages is None:
prefetch = True
scan_handler = ScanPortage( scan_handler = ScanPortage(
logger=logger, logger=logger,
no_log=no_log, no_log=no_log,
@ -358,18 +394,28 @@ def scan_portage(packages=None, category=None, no_log=False, upstream=False,
logger.info('Scanning portage tree...') logger.info('Scanning portage tree...')
if prefetch: if not packages:
logger.info('Prefetching objects...') qs = Package.objects.all()
ppackages = Package.objects.all()
pversions = Version.objects.select_related('package').all()
if category: if category:
ppackages = ppackages.filter(category=category) qs = qs.filter(category=category)
pversions = pversions.filter(package__category=category) prefetch_packages = qs
for package in ppackages: else:
scan_handler.cache_store_package(package) results = []
for version in pversions: for package in packages:
scan_handler.cache_store_version(version) if isinstance(package, Package):
logger.info('done') results.append(package)
else:
if '/' in package:
cat, pkg = portage.catsplit(package)
qs = Package.objects.filter(category=cat, name=pkg)
else:
qs = Package.objects.filter(name=package)
for package in qs:
results.append(package)
prefetch_packages = results
scan_handler.prefetch(prefetch_packages, category)
if not packages and category: if not packages and category:
scan_handler.scan(category=category) scan_handler.scan(category=category)
@ -382,21 +428,8 @@ def scan_portage(packages=None, category=None, no_log=False, upstream=False,
else: else:
scan_handler.scan(pkg) scan_handler.scan(pkg)
# Populate Category and Overlay populate_categories(logger)
# TODO: - use portage.settings.categories() populate_overlays(logger)
# - read metadata.xml to add description
for cat in Package.objects.values('category').distinct():
obj, created = Category.objects.get_or_create(name=cat["category"])
if created:
logger.info("+ [c] %s", cat["category"])
# TODO: - get information from layman and portage (path, url)
for overlay in Version.objects.values('overlay').distinct():
if not overlay["overlay"]:
continue
obj, created = Overlay.objects.get_or_create(name=overlay["overlay"])
if created:
logger.info("+ [o] %s", overlay["overlay"])
logger.info('Done.') logger.info('Done.')
return scan_handler.updated_packages() return scan_handler.packages_updated()

View File

@ -14,6 +14,8 @@ class ScanUpstream(object):
def __init__(self, logger=None, purge_versions=False): def __init__(self, logger=None, purge_versions=False):
self.logger = logger or FakeLogger() self.logger = logger or FakeLogger()
self.purge_versions = purge_versions self.purge_versions = purge_versions
self._versions = set()
self._versions_seen = set()
def scan(self, package): def scan(self, package):
CONFIG["format"] = "dict" CONFIG["format"] = "dict"
@ -74,12 +76,11 @@ class ScanUpstream(object):
if created: if created:
self.logger.info('+ [p] %s/%s' % (cat, pkg)) self.logger.info('+ [p] %s/%s' % (cat, pkg))
# Set all versions dead, then set found versions alive and versions = Version.objects.filter(
# delete old versions
if self.purge_versions:
Version.objects.filter(
package=obj, packaged=False package=obj, packaged=False
).update(alive=False) )
for version in versions:
self._versions.add(version)
return obj return obj
@ -87,20 +88,22 @@ class ScanUpstream(object):
confidence): confidence):
obj, created = Version.objects.get_or_create( obj, created = Version.objects.get_or_create(
package=package, package=package,
slot='',
revision='r0', revision='r0',
version=ver, version=ver,
overlay='', overlay='',
defaults={"alive": True, "urls": url, "packaged": False, defaults={"slot" : '', "urls": url, "packaged": False,
"vtype": version_type, "handler": handler, "vtype": version_type, "handler": handler,
"confidence": confidence} "confidence": confidence}
) )
if not created: if not created:
obj.alive = True obj.slot = ''
obj.urls = url obj.urls = url
obj.packaged = False obj.packaged = False
obj.save() obj.save()
self._versions_seen.add(obj)
# If it's not a new version, just update the object and continue # If it's not a new version, just update the object and continue
if not created: if not created:
return return
@ -124,8 +127,10 @@ class ScanUpstream(object):
if not self.purge_versions: if not self.purge_versions:
return return
versions = Version.objects.filter(packaged=False, alive=False) versions = self._versions.difference(self._versions_seen)
for version in versions: for version in versions:
if version.packaged == True:
continue # Not our job
VersionLog.objects.create( VersionLog.objects.create(
package=version.package, package=version.package,
action=VersionLog.VERSION_REMOVED, action=VersionLog.VERSION_REMOVED,
@ -141,7 +146,7 @@ class ScanUpstream(object):
self.logger.info('- [u] %s %s' % (version, version.urls)) self.logger.info('- [u] %s %s' % (version, version.urls))
versions.delete() version.delete()
@commit_on_success @commit_on_success

View File

@ -115,7 +115,7 @@ def scan_metadata(packages=[], category=None, populate=False):
@task @task
def scan_portage(packages=None, category=None, def scan_portage(packages=None, category=None,
no_log=False, purge_packages=False, no_log=False, purge_packages=False,
purge_versions=False, prefetch=False): purge_versions=False):
""" """
Scans portage for the given set of packages Scans portage for the given set of packages
""" """
@ -136,7 +136,6 @@ def scan_portage(packages=None, category=None,
no_log=no_log, no_log=no_log,
purge_packages=purge_packages, purge_packages=purge_packages,
purge_versions=purge_versions, purge_versions=purge_versions,
prefetch=prefetch,
logger=logger, logger=logger,
) )
@ -182,7 +181,6 @@ def update_portage(packages=None):
packages=None, packages=None,
purge_packages=True, purge_packages=True,
purge_versions=True, purge_versions=True,
prefetch=True
) )
scan_metadata(packages=None, populate=True) scan_metadata(packages=None, populate=True)
@ -204,7 +202,7 @@ def update_portage(packages=None):
( (
group_one(scan_portage, categories, group_one(scan_portage, categories,
attr_name="category", purge_packages=True, attr_name="category", purge_packages=True,
purge_versions=True, prefetch=True) | purge_versions=True) |
group_one(scan_metadata, categories, group_one(scan_metadata, categories,
attr_name="category") | attr_name="category") |
update_counters.si(fast=True) update_counters.si(fast=True)

View File

@ -0,0 +1 @@

View File

@ -85,6 +85,7 @@ ROBOTS_TXT_BLACKLIST_DOMAINS = [
'(.*)download\.mono-project\.com(.*)', '(.*)download\.mono-project\.com(.*)',
'(.*)fedorahosted\.org(.*)', '(.*)fedorahosted\.org(.*)',
'(.*)download\.tuxfamily\.org(.*)', '(.*)download\.tuxfamily\.org(.*)',
'(.*)festvox\.org(.*)',
] ]
from out import EuscanOutput from out import EuscanOutput