16 Commits

Author SHA1 Message Date
5b9d44fee1 TODO: weird docs handling
Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 20:58:58 +02:00
d8d1767766 TODO: debian remote-id exists
Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 20:58:58 +02:00
fbd7a4e139 handlers/github: remove
* Mirror removed and the API has very strict rate limits, making it
  impractical to use.
* https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=f119d00dab0c3bd087faab36f1a44734772a9d75

Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 20:58:58 +02:00
a7ff66ae04 handlers/pypi: stop using mirrors
Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 20:58:58 +02:00
5da26b0719 handlers/rubygems: stop using mirrors
* https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=65850a10f84e1b7a2cdf55392fa1d1f0717193c1

Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 20:58:58 +02:00
656f8e155e handlers/google_code: dead
* https://bugs.gentoo.org/544092

Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 20:58:57 +02:00
294dcc2a9c handlers/freecode: never should've been used in ebuilds
* https://bugs.gentoo.org/637970

Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 20:58:57 +02:00
c628edc26b handlers/berlios: obsolete
* mirror removed in https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=2b72b0462bea5b34bbe4d767ccc44866df81515e
* Rest of the berlios urls use sourceforge now.

Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 19:36:20 +02:00
61cbb8e3f9 pre-commit: autoupdate versions
Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 19:11:43 +02:00
b2cd013b09 Workaround hard to parse $'' strings
Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 00:56:41 +02:00
e9fd94e1a5 Blacklist urls that don't make sense to scan
Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 00:56:41 +02:00
e13a62af84 Remove encoding keyword from json()
* Removed in Python 3.9

Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 00:56:41 +02:00
d93c3154ac Update GNOME_URL_SOURCE
* It gets redirected either way.

Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 00:56:41 +02:00
9809d9a805 Add Gitea(+ Forgejo) handler
Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-03 00:56:36 +02:00
d217c839a9 Add GitLab handler
Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-02 22:13:44 +02:00
aad99f71fe Use JSON api for PyPi
* "The XML-RPC API will be deprecated in the future. Use of this API is
  not recommended, and existing consumers of the API should migrate to
  the RSS and/or JSON APIs instead."
* "As a result, this API has a very restrictive rate limit and it may be
  necessary to pause between successive requests." As such this also
  gets around this issue for euscan.

https://warehouse.pypa.io/api-reference/xml-rpc.html

Signed-off-by: Alfred Wingate <parona@protonmail.com>
2024-03-02 16:18:34 +02:00
14 changed files with 203 additions and 256 deletions

View File

@ -1,15 +1,15 @@
repos:
- repo: https://github.com/psf/black
rev: 23.11.0
rev: 24.2.0
hooks:
- id: black
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.4
rev: v0.3.0
hooks:
- id: ruff

10
TODO
View File

@ -42,11 +42,7 @@ euscan
- remote-id type deb repository:
-- find out how to get download url (not sure it's possible)
### remote-id
- Propose new remote-id: deb
e.g.: <remote-id type="deb">
http://mysite.com/deb/dists/stable/main/binary-i386/Packages
</remote-id>
- Propose new remote-id: freecode
e.g.: <remote-id type="freecode">projectname</remote-id>
### bugs or unwanted behavior
- Parsing docs and accepting 404's
-- net-analyzer/sensu

View File

@ -16,7 +16,8 @@ description = "Ebuild upstream scan utility."
license = {text = "GPL-2.0"}
dependencies = [
"portage",
"beautifulsoup4>=4.8.2"
"beautifulsoup4>=4.8.2",
"packaging"
]
dynamic = ["version"]

View File

@ -51,8 +51,13 @@ BLACKLIST_PACKAGES = [
]
SCANDIR_BLACKLIST_URLS = [
"mirror://rubygems/(.*)", # Not browsable
"https://rubygems.org/(.*)", # Not browsable
"mirror://gentoo/(.*)", # Directory too big
"https://dev.gentoo.org/(.*)", # There shouldn't be releases here
# Waste of time to go through
"https://crates.io/(.*)",
"https://api.nuget.org/(.*)",
"https://myget.org/(.*)",
]
BRUTEFORCE_BLACKLIST_PACKAGES = [

View File

@ -1,59 +0,0 @@
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Copyright 2020-2023 src_prepare group
# Distributed under the terms of the GNU General Public License v2
import re
import urllib.error
import urllib.parse
import urllib.request
import portage
from euscan import output
from euscan.handlers.url import process_scan as url_scan
from euscan.helpers import regex_from_template
# Handler identifier; placed in every result tuple returned by scan_url().
HANDLER_NAME = "berlios"
# Confidence value placed in every result tuple returned by scan_url().
CONFIDENCE = 90
# NOTE(review): not read in this module; presumably used by the handler
# loader to order handlers — confirm.
PRIORITY = 90
# Captures (project, filename) from a mirror://berlios/ URL.
berlios_regex = r"mirror://berlios/([^/]+)/([^/]+)"
def can_handle(pkg, url=None):
    """Tell whether *url* is a BerliOS mirror URL containing the package version.

    Returns ``False`` when no URL is given or when the version taken from
    ``pkg.cpv`` does not occur in it; otherwise returns the result of
    searching *url* for ``berlios_regex`` (a match object or ``None``).
    """
    if url:
        _, version, _ = portage.pkgsplit(pkg.cpv)
        if version in url:
            return re.search(berlios_regex, url)
    return False
def scan_url(pkg, url, options):
    """Scan the BerliOS file list of the project behind *url* for versions.

    Args:
        pkg: package object; ``pkg.cpv`` is split with ``portage.pkgsplit``.
        url: a ``mirror://berlios/<project>/<filename>`` URL.
        options: unused here; kept for the common handler signature.

    Returns:
        A list of ``(url, version, HANDLER_NAME, CONFIDENCE)`` tuples; empty
        when the project page exposes no file-list link.
    """
    output.einfo("Using BerliOS handler")

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    project, filename = re.search(berlios_regex, url).groups()

    project_page = "http://developer.berlios.de/projects/%s" % project
    # urlopen() yields bytes: decode before applying a str pattern, and let
    # the context manager close the connection.
    with urllib.request.urlopen(project_page) as response:
        content = response.read().decode("utf-8", errors="replace")

    id_match = re.search(r"/project/filelist.php\?group_id=(\d+)", content)
    if id_match is None:
        # No file-list link on the project page: nothing to scan.
        return []
    project_id = id_match.group(1)

    base_url = (
        "http://developer.berlios.de/project/filelist.php?group_id=%s" % project_id
    )

    file_pattern = regex_from_template(filename.replace(ver, "${PV}"))

    result = url_scan(pkg, base_url, file_pattern)

    # Rewrite "prdownload" links to "download" and re-tag the results with
    # this handler's name and confidence.
    return [
        (found_url.replace("prdownload", "download"), pv, HANDLER_NAME, CONFIDENCE)
        for found_url, pv, _, _ in result
    ]

View File

@ -1,53 +0,0 @@
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Copyright 2020-2023 src_prepare group
# Distributed under the terms of the GNU General Public License v2
import re
import urllib.error
import urllib.parse
import urllib.request
import portage
from euscan import helpers, mangling, output
# Handler identifier; placed in every result tuple returned by scan_pkg().
HANDLER_NAME = "freecode"
# Confidence value placed in every result tuple returned by scan_pkg().
CONFIDENCE = 100
# NOTE(review): not read in this module; presumably used by the handler
# loader to order handlers — confirm.
PRIORITY = 90
def can_handle(pkg, url=None):
    """Always decline: this handler never claims a URL."""
    return False
def _fetch_text(url):
    """Download *url* and return the response body decoded as text."""
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", errors="replace")


def scan_pkg(pkg, options):
    """Scan the FreeCode release list of a project for versions.

    Args:
        pkg: package object; ``pkg.cpv`` is split with ``portage.pkgsplit``.
        options: mapping whose ``"data"`` entry is the FreeCode project name;
            also passed on to ``mangling.mangle_version``.

    Returns:
        A list of ``(url, version, HANDLER_NAME, CONFIDENCE)`` tuples.
        Releases whose pages lack the expected download links are skipped.
    """
    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    package = options["data"].strip()

    output.einfo("Using FreeCode handler: " + package)

    # Pages are decoded to text; str(bytes) would give the b'...' repr with
    # escaped quotes, which the patterns below cannot match reliably.
    content = _fetch_text("http://freecode.com/projects/%s/releases" % package)

    # Escape the project name so regex metacharacters in it match literally.
    result = re.findall(
        r'<a href="/projects/%s/releases/(\d+)">([^<]+)</a>' % re.escape(package),
        content,
    )

    ret = []
    for release_id, up_pv in result:
        pv = mangling.mangle_version(up_pv, options)
        if helpers.version_filtered(cp, ver, pv):
            continue

        content = _fetch_text(
            f"http://freecode.com/projects/{package}/releases/{release_id}"
        )
        download_pages = re.findall(r'<a href="(/urls/[^"]+)"', content)
        if not download_pages:
            # Release page without a download link: nothing to report.
            continue

        content = _fetch_text("http://freecode.com%s" % download_pages[0])
        urls = re.findall(
            r'In case it doesn\'t, click here: <a href="([^"]+)"', content
        )
        if not urls:
            continue

        ret.append((urls[0], pv, HANDLER_NAME, CONFIDENCE))
    return ret

View File

@ -0,0 +1,70 @@
# Copyright 2020-2024 src_prepare group
# Distributed under the terms of the GNU General Public License v2
import json
import re
import portage
from euscan import helpers, mangling, output
# Handler identifier; placed in every result tuple returned by scan_url().
HANDLER_NAME = "gitea"
# Confidence value placed in every result tuple returned by scan_url().
CONFIDENCE = 100
PRIORITY = 90
# Forgejo strives to be compatible with Gitea API
# https://forgejo.org/2024-02-forking-forward/
_gitea_instances = [
    "codeberg.org",
    "git.osgeo.org",
    "gitea.com",
    "gitea.ladish.org",
    "gitea.osmocom.org",
    "gitea.treehouse.systems",
]
# One compiled pattern per instance, capturing the host as "domain" and the
# "<owner>/<name>" path as "repository". The host is escaped so that its dots
# only match literal dots.
gitea_patterns = [
    re.compile(
        rf"https://(?P<domain>{re.escape(domain)})/(?P<repository>[^/]+/[^/]+)"
    )
    for domain in _gitea_instances
]
def can_handle(pkg, url=None):
    """Return True when *url* matches one of the known Gitea/Forgejo instances.

    Returns ``False`` for a missing or empty URL. *pkg* is unused.
    """
    if not url:
        return False
    # A generator lets any() stop at the first matching instance.
    return any(pattern.search(url) for pattern in gitea_patterns)
def scan_url(pkg, url, options):
    """Scan the releases of the Gitea repository behind *url* for versions.

    API reference:
    https://docs.gitea.com/api/1.20/#tag/repository/operation/repoListReleases

    Args:
        pkg: package object; ``pkg.cpv`` is split with ``portage.pkgsplit``.
        url: source URL hosted on one of the known Gitea instances.
        options: passed on to ``mangling.mangle_version`` / ``mangle_url``.

    Returns:
        A list of ``(tarball_url, version, HANDLER_NAME, CONFIDENCE)`` tuples,
        one per release that passes ``helpers.version_filtered``; empty when
        *url* matches no known instance or no response is available.
    """
    # Search each pattern once and keep the first hit.
    match = next(
        (m for m in (pattern.search(url) for pattern in gitea_patterns) if m),
        None,
    )
    if match is None:
        return []

    domain = match.group("domain")
    repository = match.group("repository")

    output.einfo(f"Using Gitea API in {domain}: {repository}")

    request = helpers.urlopen(f"https://{domain}/api/v1/repos/{repository}/releases")
    # NOTE(review): assumes helpers.urlopen may return a falsy value on
    # failure — confirm against its implementation.
    if not request:
        return []

    data = json.load(request)

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    ret = []
    # Walk the releases directly rather than rescanning the whole list for
    # every tag name.
    for release in data:
        pv = mangling.mangle_version(release["tag_name"], options)
        if helpers.version_filtered(cp, ver, pv):
            continue
        tarball = mangling.mangle_url(release["tarball_url"], options)
        ret.append((tarball, pv, HANDLER_NAME, CONFIDENCE))
    return ret

View File

@ -1,66 +0,0 @@
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Copyright 2020-2023 src_prepare group
# Distributed under the terms of the GNU General Public License v2
import json
import re
import urllib.error
import urllib.parse
import urllib.request
import portage
from euscan import helpers, mangling, output
# Handler identifier; placed in every result tuple returned by scan_url().
HANDLER_NAME = "github"
# Confidence value placed in every result tuple returned by scan_url().
CONFIDENCE = 100
# NOTE(review): not read in this module; presumably used by the handler
# loader to order handlers — confirm.
PRIORITY = 90
def can_handle(pkg, url=None):
    """Tell whether *url* uses the ``mirror://github/`` scheme.

    A falsy *url* is handed back unchanged; otherwise the boolean result of
    the prefix test is returned. *pkg* is unused.
    """
    if not url:
        return url
    return url.startswith("mirror://github/")
def guess_package(cp, url):
    """Split a ``mirror://github/`` URL into ``(user, project, filename)``.

    *cp* is unused. An ``AssertionError`` is raised when *url* does not have
    the expected shape.
    """
    match = re.search("^mirror://github/(.*?)/(.*?)/(.*)$", url)
    assert match
    user, project, filename = match.groups()
    return (user, project, filename)
def scan_url(pkg, url, options):
    """Scan the GitHub downloads of the project behind *url* for versions.

    API reference: http://developer.github.com/v3/repos/downloads/

    Args:
        pkg: package object; ``pkg.cpv`` is split with ``portage.pkgsplit``.
        url: a ``mirror://github/<user>/<project>/<filename>`` URL.
        options: passed on to ``mangling.mangle_version`` / ``mangle_url``.

    Returns:
        A list of ``(url, version, HANDLER_NAME, CONFIDENCE)`` tuples; empty
        when the package version does not occur in the file name.
    """
    user, project, filename = guess_package(pkg.cpv, url)

    # find out where version is expected to be found
    cp, ver, rev = portage.pkgsplit(pkg.cpv)
    if ver not in filename:
        # Return an empty list, like every other path, rather than None.
        return []

    # now create a filename-matching regexp
    # XXX: supposedly replace first with (?P<foo>...)
    # and remaining ones with (?P=foo)
    fnre = re.compile("^%s$" % re.escape(filename).replace(re.escape(ver), "(.*?)"))

    output.einfo(
        f"Using github API for: project={project} user={user} filename={filename}"
    )

    # The context manager closes the connection once the JSON is parsed.
    with urllib.request.urlopen(
        f"https://api.github.com/repos/{user}/{project}/downloads"
    ) as dlreq:
        dls = json.load(dlreq)

    ret = []
    for dl in dls:
        m = fnre.match(dl["name"])
        if not m:
            continue
        pv = mangling.mangle_version(m.group(1), options)
        if helpers.version_filtered(cp, ver, pv):
            continue
        found_url = mangling.mangle_url(dl["html_url"], options)
        ret.append((found_url, pv, HANDLER_NAME, CONFIDENCE))
    return ret

View File

@ -0,0 +1,82 @@
# Copyright 2020-2024 src_prepare group
# Distributed under the terms of the GNU General Public License v2
import json
import re
import portage
from euscan import helpers, mangling, output
# Handler identifier; placed in every result tuple returned by scan_url().
HANDLER_NAME = "gitlab"
# Confidence value placed in every result tuple returned by scan_url().
CONFIDENCE = 100
PRIORITY = 90
# Bare host names only: a trailing slash here would make the pattern demand
# "//" after the host and never match.
_gitlab_instances = [
    "gitlab.com",
    "gitlab.freedesktop.org",
    "invent.kde.org",
    "gitlab.gnome.org",
    "gitlab.kitware.com",
    "gitlab.xfce.org",
    "code.videolan.org",
    "gitlab.xiph.org",
]
gitlab_patterns = [
    # Regular expression adapted from pkgcheck
    # https://docs.gitlab.com/ee/user/reserved_names.html
    # The host is escaped so that its dots only match literal dots.
    re.compile(
        rf"https://(?P<domain>{re.escape(domain)})/(?P<repository>((?!api/)\w[^/]*/)+(?!raw/)\w[^/]*)"
    )
    for domain in _gitlab_instances
]
def can_handle(pkg, url=None):
    """Return True when *url* matches one of the known GitLab instances.

    Returns ``False`` for a missing or empty URL. *pkg* is unused.
    """
    if not url:
        return False
    # A generator lets any() stop at the first matching instance.
    return any(pattern.search(url) for pattern in gitlab_patterns)
def scan_url(pkg, url, options):
    """Scan the releases of the GitLab project behind *url* for versions.

    API reference: https://docs.gitlab.com/ee/api/releases/index.html

    Args:
        pkg: package object; ``pkg.cpv`` is split with ``portage.pkgsplit``.
        url: source URL hosted on one of the known GitLab instances.
        options: passed on to ``mangling.mangle_version`` / ``mangle_url``.

    Returns:
        A list of ``(urls, version, HANDLER_NAME, CONFIDENCE)`` tuples, one
        per release that passes ``helpers.version_filtered``. ``urls`` is the
        space-joined list of that release's tar.bz2 source archives. The list
        is empty when *url* matches no known instance or no response is
        available.
    """
    # Search each pattern once and keep the first hit.
    match = next(
        (m for m in (pattern.search(url) for pattern in gitlab_patterns) if m),
        None,
    )
    if match is None:
        return []

    domain = match.group("domain")
    repository = match.group("repository")

    output.einfo(f"Using GitLab REST API in {domain}: {repository}")

    # The project path is passed as a single URL-encoded path segment.
    request = helpers.urlopen(
        f"https://{domain}/api/v4/projects/{repository.replace('/', '%2F')}/releases"
    )
    # NOTE(review): assumes helpers.urlopen may return a falsy value on
    # failure — confirm against its implementation.
    if not request:
        return []

    data = json.load(request)

    cp, ver, rev = portage.pkgsplit(pkg.cpv)

    ret = []
    # Walk the releases directly rather than rescanning the whole list for
    # every tag name.
    for release in data:
        pv = mangling.mangle_version(release["tag_name"], options)
        if helpers.version_filtered(cp, ver, pv):
            continue
        urls = " ".join(
            mangling.mangle_url(source["url"], options)
            for source in release["assets"]["sources"]
            # Only the tar.bz2 source archives are reported.
            if source["format"] == "tar.bz2"
        )
        ret.append((urls, pv, HANDLER_NAME, CONFIDENCE))
    return ret

View File

@ -20,7 +20,7 @@ HANDLER_NAME = "gnome"
CONFIDENCE = 100
PRIORITY = 90
GNOME_URL_SOURCE = "http://ftp.gnome.org/pub/GNOME/sources"
GNOME_URL_SOURCE = "https://download.gnome.org/sources"
def can_handle(_pkg, url=None):
@ -38,7 +38,7 @@ def guess_package(cp, url):
def scan_url(pkg, url, options):
"http://ftp.gnome.org/pub/GNOME/sources/"
"https://download.gnome.org/sources/"
package = {
"data": guess_package(pkg.cpv, url),
"type": "gnome",
@ -55,7 +55,7 @@ def scan_pkg(pkg, options):
content = fp.read()
fp.close()
cache = json.loads(content, encoding="ascii")
cache = json.loads(content)
if cache[0] != 4:
output.eerror("Unknow cache format detected")

View File

@ -1,47 +0,0 @@
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Copyright 2020-2023 src_prepare group
# Distributed under the terms of the GNU General Public License v2
import re
import portage
from euscan import output
from euscan.handlers.url import process_scan as url_scan
from euscan.helpers import regex_from_template
# Handler identifier; placed in every result tuple returned by scan_url().
HANDLER_NAME = "google-code"
# Confidence value placed in every result tuple returned by scan_url().
CONFIDENCE = 90
PRIORITY = 90
# Captures the project name from a googlecode.com download URL. The dots are
# escaped so they only match literal dots in the host name.
package_name_regex = r"http://(.+)\.googlecode\.com/files/.+"
def can_handle(pkg, url=None):
    """Tell whether *url* is a Google Code download URL containing the version.

    Returns ``False`` when no URL is given or when the version taken from
    ``pkg.cpv`` does not occur in it; otherwise returns the result of matching
    *url* against ``package_name_regex`` (a match object or ``None``).
    """
    if url:
        _, version, _ = portage.pkgsplit(pkg.cpv)
        if version in url:
            return re.match(package_name_regex, url)
    return False
def scan_url(pkg, url, options):
    """Scan the Google Code download list of the project behind *url*.

    The project name is taken from *url*, its download list page is scanned
    through ``url_scan`` with a file pattern built from the last path
    component of *url*, and each hit is re-tagged with this handler's name
    and confidence. *options* is unused.
    """
    output.einfo("Using Google Code handler")

    _, version, _ = portage.pkgsplit(pkg.cpv)

    project = re.match(package_name_regex, url).group(1)
    listing_url = "http://code.google.com/p/%s/downloads/list" % project

    basename = url.split("/")[-1]
    file_pattern = regex_from_template(basename.replace(version, "${PV}"))

    hits = url_scan(pkg, listing_url, file_pattern)

    return [
        (found_url, pv, HANDLER_NAME, CONFIDENCE) for found_url, pv, _, _ in hits
    ]

View File

@ -1,11 +1,13 @@
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Copyright 2020-2023 src_prepare group
# Copyright 2020-2024 src_prepare group
# Distributed under the terms of the GNU General Public License v2
import json
import re
import xmlrpc.client
import urllib.error
import portage
from packaging.version import parse
from euscan import helpers, mangling, output
@ -15,11 +17,11 @@ PRIORITY = 90
def can_handle(pkg, url=None):
return url and url.startswith("mirror://pypi/")
return url and url.startswith("https://files.pythonhosted.org/packages/source/p/")
def guess_package(cp, url):
match = re.search(r"mirror://pypi/\w+/(.*)/.*", url)
match = re.search(r"https://files.pythonhosted.org/packages/source/p/(.*)/.*", url)
if match:
return match.group(1)
@ -29,7 +31,7 @@ def guess_package(cp, url):
def scan_url(pkg, url, options):
"http://wiki.python.org/moin/PyPiXmlRpc"
"https://peps.python.org/pep-0691/"
package = guess_package(pkg.cpv, url)
return scan_pkg(pkg, {"data": package})
@ -38,15 +40,23 @@ def scan_url(pkg, url, options):
def scan_pkg(pkg, options):
package = options["data"]
output.einfo("Using PyPi XMLRPC: " + package)
output.einfo("Using PyPi JSON API: " + package)
client = xmlrpc.client.ServerProxy("https://pypi.python.org/pypi")
versions = client.package_releases(package)
try:
fp = helpers.urlopen(f"https://pypi.org/pypi/{package}/json/")
except urllib.error.URLError:
return []
except OSError:
return []
if not versions:
return versions
if not fp:
return []
versions.reverse()
data = json.loads(fp.read())
versions = list(data["releases"].keys())
versions.sort(key=parse, reverse=True)
cp, ver, rev = portage.pkgsplit(pkg.cpv)
@ -55,7 +65,12 @@ def scan_pkg(pkg, options):
pv = mangling.mangle_version(up_pv, options)
if helpers.version_filtered(cp, ver, pv):
continue
urls = client.release_urls(package, up_pv)
urls = " ".join([mangling.mangle_url(infos["url"], options) for infos in urls])
urls = " ".join(
[
mangling.mangle_url(file["url"], options)
for file in data["releases"][up_pv]
if file["packagetype"] == "sdist"
]
)
ret.append((urls, pv, HANDLER_NAME, CONFIDENCE))
return ret

View File

@ -1,5 +1,5 @@
# Copyright 2011 Corentin Chary <corentin.chary@gmail.com>
# Copyright 2020-2023 src_prepare group
# Copyright 2020-2024 src_prepare group
# Distributed under the terms of the GNU General Public License v2
import json
@ -18,11 +18,11 @@ PRIORITY = 90
def can_handle(pkg, url=None):
return url and url.startswith("mirror://rubygems/")
return url and url.startswith("https://rubygems.org/")
def guess_gem(cpv, url):
match = re.search("mirror://rubygems/(.*).gem", url)
match = re.search("https://rubygems.org/gems/(.*).gem", url)
if match:
cpv = "fake/%s" % match.group(1)

View File

@ -153,6 +153,9 @@ def scan_upstream(query, on_progress=None):
else:
uris = pkg.environment("SRC_URI")
# Roundabout way to handle $'' strings
uris = uris.encode("raw_unicode_escape").decode("unicode_escape")
cpv = pkg.cpv
uris = parse_src_uri(uris)