djeuscan: rework the scan process, don't use alive anymore

Signed-off-by: Corentin Chary <corentin.chary@gmail.com>
This commit is contained in:
Corentin Chary 2012-12-12 22:54:50 +01:00
parent dfb7a7b986
commit e42ba7dfd0
8 changed files with 152 additions and 122 deletions

View File

@ -27,7 +27,7 @@ class PackageAdmin(admin.ModelAdmin):
class VersionAdmin(admin.ModelAdmin):
search_fields = ('package__name', 'package__category')
list_filter = ('overlay', 'packaged', 'alive')
list_filter = ('overlay', 'packaged')
class ProblemReportAdmin(admin.ModelAdmin):

View File

@ -44,12 +44,6 @@ class Command(BaseCommand):
dest='no-log',
default=False,
help='Don\'t store logs'),
make_option('--prefetch',
action='store_true',
dest='prefetch',
default=False,
help=('Prefetch all versions and packages from DB to '
'speedup full scan process.')),
)
args = '[package package ...]'
help = 'Scans portage tree and fills database'
@ -70,7 +64,6 @@ class Command(BaseCommand):
no_log=options["no-log"],
purge_packages=options["purge-packages"],
purge_versions=options["purge-versions"],
prefetch=options["prefetch"],
upstream=options["upstream"],
logger=logger,
)

View File

@ -144,7 +144,6 @@ class Version(models.Model):
overlay = models.CharField(max_length=128, default='gentoo', db_index=True,
validators=[validate_name], blank=True)
urls = models.TextField(blank=True)
alive = models.BooleanField(default=True, db_index=True)
vtype = models.CharField(max_length=128, blank=True)
handler = models.CharField(max_length=128, blank=True, db_index=True)

View File

@ -8,6 +8,7 @@ import portage
from xml.etree.ElementTree import iterparse, ParseError
from django.db.transaction import commit_on_success
from django.db import models
from django.core.management.color import color_style
from euscan.version import get_version_type
@ -29,42 +30,45 @@ class ScanPortage(object):
self._cache = {'packages': {}, 'versions': {}}
self._overlays = None
self._updated_packages = set()
self._packages_updated = set()
self._versions = set()
self._versions_seen = set()
def updated_packages(self):
return list(self._updated_packages)
def packages_updated(self):
return list(self._packages_updated)
def cache_hash_package(self, category, name):
def hash_package(self, category, name):
return '%s/%s' % (category, name)
def cache_store_package(self, package):
key = self.cache_hash_package(package.category, package.name)
key = self.hash_package(package.category, package.name)
self._cache['packages'][key] = package
def cache_get_package(self, category, name):
return self._cache['packages'].get(
self.cache_hash_package(category, name)
self.hash_package(category, name)
)
def cache_hash_version(self, category, name, version, revision, slot,
def hash_version(self, category, name, version, revision,
overlay):
key = '%s/%s-%s-r%s %s %s' % (category, name,
key = '%s/%s-%s-r%s %s' % (category, name,
version, revision,
slot, overlay)
overlay)
return key
def cache_get_version(self, category, name, version, revision, slot,
def cache_get_version(self, category, name, version, revision,
overlay):
key = self.cache_hash_version(category, name, version, revision, slot,
key = self.hash_version(category, name, version, revision,
overlay)
return self._cache['versions'].get(key)
def cache_store_version(self, version):
key = self.cache_hash_version(
key = self.hash_version(
version.package.category, version.package.name, version.version,
version.revision, version.slot, version.overlay
version.revision, version.overlay
)
self._cache['versions'][key] = version
self._versions.add(version)
def scan_gentoopm(self, query, category=None):
import gentoopm
@ -160,60 +164,28 @@ class ScanPortage(object):
category = ""
elem.clear()
def prepare_purge_versions(self, packages, query=None, category=None):
if not self.purge_versions:
return
# Set all versions dead, then set found versions alive and
# delete old versions
if not query:
# Optimisation for --all or --category
self.logger.info('Killing existing versions...')
qs = Version.objects.filter(packaged=True)
if category:
qs = qs.filter(package__category=category)
qs.update(alive=False)
self.logger.info('done')
else:
for package in packages:
Version.objects.filter(package=package, packaged=True).\
update(alive=False)
def scan(self, query=None, category=None):
if not query:
current_packages = Package.objects.all()
elif '/' in query:
cat, pkg = portage.catsplit(query)
current_packages = Package.objects.filter(category=cat, name=pkg)
else:
current_packages = Package.objects.filter(name=query)
if category:
current_packages = current_packages.filter(category=category)
self.prepare_purge_versions(current_packages, query, category)
packages_alive = set()
for data in self.scan_eix_xml(query, category):
#for data in self.scan_gentoopm(query, category):
cat, pkg = data['category'], data['package']
package = self.store_package(
cat, pkg, data['homepage'], data['description']
)
packages_alive.add("%s/%s" % (cat, pkg))
new_version = False
for cpv, slot, overlay, overlay_path in data['versions']:
obj, created = self.store_version(
package, cpv, slot, overlay, overlay_path
)
self._versions_seen.add(obj)
new_version = created or new_version
# If the package has at least one new version scan upstream for it
if new_version:
self._updated_packages.add(package)
self._packages_updated.add(package)
self.purge_old_packages(current_packages, packages_alive)
self.purge_old_versions()
self.purge_old_packages()
def store_package(self, cat, pkg, homepage, description):
created = False
@ -239,7 +211,7 @@ class ScanPortage(object):
created = False
obj = self.cache_get_version(
package.category, package.name, ver, rev, slot, overlay
package.category, package.name, ver, rev, overlay
)
overlay_path = overlay_path or portage.settings["PORTDIR"]
@ -249,11 +221,12 @@ class ScanPortage(object):
if not obj:
obj, created = Version.objects.get_or_create(
package=package, slot=slot,
revision=rev, version=ver,
package=package,
revision=rev,
version=ver,
overlay=overlay,
defaults={
"alive": True,
"slot": slot,
"packaged": True,
"vtype": get_version_type(ver),
"confidence": 100,
@ -263,7 +236,8 @@ class ScanPortage(object):
}
)
if not created: # Created objects have defaults values
obj.alive = True
if obj.slot != slot or obj.package != True:
obj.slot = slot
obj.packaged = True
obj.save()
@ -298,22 +272,51 @@ class ScanPortage(object):
return obj, created
def purge_old_packages(self, packages, alive):
def purge_old_packages(self):
if not self.purge_packages:
return
packages = (
Package.objects.values("id")
.annotate(version_count=models.Count("version"))
.filter(version_count=0)
)
packages = (
Package.objects.filter(id__in=[package['id'] for package in packages])
)
for package in packages:
cp = "%s/%s" % (package.category, package.name)
if cp not in alive:
self.logger.info('- [p] %s' % (package))
package.delete()
def version_hack(self, version):
try:
if version.package.last_version_gentoo:
version.package.last_version_gentoo.pk
if version.package.last_version_overlay:
version.package.last_version_overlay.pk
if version.package.last_version_upstream:
version.package.last_version_upstream.pk
except Version.DoesNotExist:
version.package.last_version_gentoo = None
version.package.last_version_overlay = None
version.package.last_version_upstream = None
def purge_old_versions(self):
if not self.purge_versions:
return
versions = Version.objects.filter(packaged=True, alive=False)
versions = self._versions.difference(self._versions_seen)
for version in versions:
self.logger.info('- [v] %s' % (version))
if version.packaged == False:
continue # Not our job
# Fix last_version_ stuff that is sometime broken
self.version_hack(version)
if version.overlay == 'gentoo':
version.package.n_packaged -= 1
else:
@ -321,8 +324,6 @@ class ScanPortage(object):
version.package.n_versions -= 1
version.package.save()
self.logger.info('- [v] %s' % (version))
if self.no_log:
continue
@ -335,20 +336,55 @@ class ScanPortage(object):
overlay=version.overlay,
vtype=version.vtype,
)
# remove from last version ?
version.delete()
versions.delete()
def prefetch(self, packages, category):
self.logger.info('Prefetching current objects...')
ppackages = Package.objects.all()
pversions = Version.objects.filter(packaged=True).select_related('package').all()
if category:
ppackages = ppackages.filter(category=category)
pversions = pversions.filter(package__category=category)
if packages:
ids = [ package.id for package in packages ]
ppackages = ppackages.filter(pk__in=ids)
pversions = pversions.filter(package__pk__in=ids)
for package in ppackages:
self.cache_store_package(package)
for version in pversions:
self.cache_store_version(version)
self.logger.info('done')
def populate_categories(logger):
# Populate Category and Overlay
# TODO: - use portage.settings.categories()
# - read metadata.xml to add description
for cat in Package.objects.values('category').distinct():
obj, created = Category.objects.get_or_create(name=cat["category"])
if created:
logger.info("+ [c] %s", cat["category"])
def populate_overlays(logger):
# TODO: - get informations from layman and portage (path, url)
for overlay in Version.objects.values('overlay').distinct():
if not overlay["overlay"]:
continue
obj, created = Overlay.objects.get_or_create(name=overlay["overlay"])
if created:
logger.info("+ [o] %s", overlay["overlay"])
@commit_on_success
def scan_portage(packages=None, category=None, no_log=False, upstream=False,
purge_packages=False, purge_versions=False, prefetch=False,
logger=None):
purge_packages=False, purge_versions=False, logger=None):
logger = logger or FakeLogger()
if packages is None:
prefetch = True
scan_handler = ScanPortage(
logger=logger,
no_log=no_log,
@ -358,18 +394,28 @@ def scan_portage(packages=None, category=None, no_log=False, upstream=False,
logger.info('Scanning portage tree...')
if prefetch:
logger.info('Prefetching objects...')
ppackages = Package.objects.all()
pversions = Version.objects.select_related('package').all()
if not packages:
qs = Package.objects.all()
if category:
ppackages = ppackages.filter(category=category)
pversions = pversions.filter(package__category=category)
for package in ppackages:
scan_handler.cache_store_package(package)
for version in pversions:
scan_handler.cache_store_version(version)
logger.info('done')
qs = qs.filter(category=category)
prefetch_packages = qs
else:
results = []
for package in packages:
if isinstance(package, Package):
results.append(package)
else:
if '/' in package:
cat, pkg = portage.catsplit(package)
qs = Package.objects.filter(category=cat, name=pkg)
else:
qs = Package.objects.filter(name=package)
for package in qs:
results.append(package)
prefetch_packages = results
scan_handler.prefetch(prefetch_packages, category)
if not packages and category:
scan_handler.scan(category=category)
@ -382,21 +428,8 @@ def scan_portage(packages=None, category=None, no_log=False, upstream=False,
else:
scan_handler.scan(pkg)
# Populate Category and Overlay
# TODO: - use portage.settings.categories()
# - read metadata.xml to add description
for cat in Package.objects.values('category').distinct():
obj, created = Category.objects.get_or_create(name=cat["category"])
if created:
logger.info("+ [c] %s", cat["category"])
# TODO: - get informations from layman and portage (path, url)
for overlay in Version.objects.values('overlay').distinct():
if not overlay["overlay"]:
continue
obj, created = Overlay.objects.get_or_create(name=overlay["overlay"])
if created:
logger.info("+ [o] %s", overlay["overlay"])
populate_categories(logger)
populate_overlays(logger)
logger.info('Done.')
return scan_handler.updated_packages()
return scan_handler.packages_updated()

View File

@ -14,6 +14,8 @@ class ScanUpstream(object):
def __init__(self, logger=None, purge_versions=False):
self.logger = logger or FakeLogger()
self.purge_versions = purge_versions
self._versions = set()
self._versions_seen = set()
def scan(self, package):
CONFIG["format"] = "dict"
@ -74,12 +76,11 @@ class ScanUpstream(object):
if created:
self.logger.info('+ [p] %s/%s' % (cat, pkg))
# Set all versions dead, then set found versions alive and
# delete old versions
if self.purge_versions:
Version.objects.filter(
versions = Version.objects.filter(
package=obj, packaged=False
).update(alive=False)
)
for version in versions:
self._versions.add(version)
return obj
@ -87,20 +88,22 @@ class ScanUpstream(object):
confidence):
obj, created = Version.objects.get_or_create(
package=package,
slot='',
revision='r0',
version=ver,
overlay='',
defaults={"alive": True, "urls": url, "packaged": False,
defaults={"slot" : '', "urls": url, "packaged": False,
"vtype": version_type, "handler": handler,
"confidence": confidence}
)
if not created:
obj.alive = True
obj.slot = ''
obj.urls = url
obj.packaged = False
obj.save()
self._versions_seen.add(obj)
# If it's not a new version, just update the object and continue
if not created:
return
@ -124,8 +127,10 @@ class ScanUpstream(object):
if not self.purge_versions:
return
versions = Version.objects.filter(packaged=False, alive=False)
versions = self._versions.difference(self._versions_seen)
for version in versions:
if version.packaged == True:
continue # Not our job
VersionLog.objects.create(
package=version.package,
action=VersionLog.VERSION_REMOVED,
@ -141,7 +146,7 @@ class ScanUpstream(object):
self.logger.info('- [u] %s %s' % (version, version.urls))
versions.delete()
version.delete()
@commit_on_success

View File

@ -115,7 +115,7 @@ def scan_metadata(packages=[], category=None, populate=False):
@task
def scan_portage(packages=None, category=None,
no_log=False, purge_packages=False,
purge_versions=False, prefetch=False):
purge_versions=False):
"""
Scans portage for the given set of packages
"""
@ -136,7 +136,6 @@ def scan_portage(packages=None, category=None,
no_log=no_log,
purge_packages=purge_packages,
purge_versions=purge_versions,
prefetch=prefetch,
logger=logger,
)
@ -182,7 +181,6 @@ def update_portage(packages=None):
packages=None,
purge_packages=True,
purge_versions=True,
prefetch=True
)
scan_metadata(packages=None, populate=True)
@ -204,7 +202,7 @@ def update_portage(packages=None):
(
group_one(scan_portage, categories,
attr_name="category", purge_packages=True,
purge_versions=True, prefetch=True) |
purge_versions=True) |
group_one(scan_metadata, categories,
attr_name="category") |
update_counters.si(fast=True)

View File

@ -0,0 +1 @@

View File

@ -85,6 +85,7 @@ ROBOTS_TXT_BLACKLIST_DOMAINS = [
'(.*)download\.mono-project\.com(.*)',
'(.*)fedorahosted\.org(.*)',
'(.*)download\.tuxfamily\.org(.*)',
'(.*)festvox\.org(.*)',
]
from out import EuscanOutput