Skip to content

Commit

Permalink
Merge pull request #112 from nexB/purl2sym-github-packages
Browse files Browse the repository at this point in the history
Add metadata support for packages hosted on GitHub
  • Loading branch information
keshav-space authored Mar 26, 2024
2 parents b3b2052 + 210eec0 commit 16b267d
Show file tree
Hide file tree
Showing 192 changed files with 80,834 additions and 311 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ install_requires =
packageurl-python
requests
python-dateutil
python-dotenv


[options.packages.find]
Expand Down
89 changes: 33 additions & 56 deletions src/fetchcode/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,16 @@
from urllib.parse import urljoin

import htmllistparse
import requests
from packageurl import PackageURL
from packageurl.contrib.route import NoRouteAvailable
from packageurl.contrib.route import Router

from fetchcode.ipkg_release_info import IPKG_RELEASES
from fetchcode.package_util import GITHUB_SOURCE_BY_PACKAGE
from fetchcode.package_util import IPKG_RELEASES
from fetchcode.package_util import GitHubSource
from fetchcode.package_util import MiniupnpPackagesGitHubSource
from fetchcode.packagedcode_models import Package
from fetchcode.utils import get_response

router = Router()

Expand All @@ -44,17 +47,6 @@ def info(url):
return


def get_response(url):
    """
    Return the decoded JSON body of a GET request to the `url` string.

    Raise an Exception when the response status code is not 200.
    """
    # NOTE(review): no timeout is passed, so a stalled server blocks this
    # call for as long as the connection stays open.
    resp = requests.get(url)
    if resp.status_code != 200:
        raise Exception(f"Failed to fetch: {url}")

    return resp.json()


def get_pypi_bugtracker_url(project_urls):
bug_tracking_url = project_urls.get("Tracker")
if not (bug_tracking_url):
Expand Down Expand Up @@ -216,53 +208,38 @@ def get_pypi_data_from_purl(purl):
@router.route("pkg:github/.*")
def get_github_data_from_purl(purl):
"""
Generate `Package` object from the `purl` string of github type
Yield `Package` object from the `purl` string of github type
"""
purl = PackageURL.from_string(purl)
name = purl.name
namespace = purl.namespace
base_path = "https://api.github.com/repos"
api_url = f"{base_path}/{namespace}/{name}"
response = get_response(api_url)
homepage_url = response.get("homepage")
vcs_url = response.get("git_url")
github_url = "https://github.com"
bug_tracking_url = f"{github_url}/{namespace}/{name}/issues"
code_view_url = f"{github_url}/{namespace}/{name}"
license_data = response.get("license") or {}
declared_license = license_data.get("spdx_id")
primary_language = response.get("language")
yield Package(
homepage_url=homepage_url,
vcs_url=vcs_url,
api_url=api_url,
bug_tracking_url=bug_tracking_url,
code_view_url=code_view_url,
declared_license=declared_license,
primary_language=primary_language,
**purl.to_dict(),

gh_package = f"{namespace}/{name}"
gh_source_class = GITHUB_SOURCE_BY_PACKAGE.get(gh_package, GitHubSource)

return gh_source_class.get_package_info(purl)


@router.route(
    "pkg:generic/miniupnpc.*",
    "pkg:generic/miniupnpd.*",
    "pkg:generic/minissdpd.*",
)
def get_github_data_for_miniupnp(purl):
    """
    Return an iterable of `Package` objects for the generic miniupnp
    packages routed here, using the miniupnp/miniupnp GitHub repository
    as the data source.
    """
    requested = PackageURL.from_string(purl)

    # All routed packages are looked up in the same GitHub repository; only
    # the version of the incoming purl is carried over to the GitHub purl.
    repo_purl = PackageURL(
        type="github",
        namespace="miniupnp",
        name="miniupnp",
        version=requested.version,
    )

    return MiniupnpPackagesGitHubSource.get_package_info(
        gh_purl=repo_purl,
        package_name=requested.name,
    )
release_url = f"{api_url}/releases"
releases = get_response(release_url)
for release in releases:
version = release.get("name")
version_purl = PackageURL(
type=purl.type, namespace=namespace, name=name, version=version
)
download_url = release.get("tarball_url")
code_view_url = f"{github_url}/{namespace}/{name}/tree/{version}"
version_vcs_url = f"{vcs_url}@{version}"
yield Package(
homepage_url=homepage_url,
vcs_url=version_vcs_url,
api_url=api_url,
bug_tracking_url=bug_tracking_url,
code_view_url=code_view_url,
declared_license=declared_license,
primary_language=primary_language,
download_url=download_url,
**version_purl.to_dict(),
)


@router.route("pkg:bitbucket/.*")
Expand Down Expand Up @@ -408,7 +385,7 @@ def get_package_info(cls, package_url):
)

else:
for version, data in archives.items():
for version, data in IPKG_RELEASES.items():
purl = PackageURL(type="generic", name="ipkg", version=version)
yield Package(
homepage_url=cls.source_url,
Expand Down
223 changes: 223 additions & 0 deletions src/fetchcode/ipkg_release_info.py → src/fetchcode/package_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,229 @@

# Since there will be no new releases of ipkg, it's better to
# store them in a dictionary rather than fetching them every time.

import dataclasses
import re

import attr

from fetchcode import utils
from fetchcode.packagedcode_models import Package


def package_from_dict(package_data):
    """
    Return a Package built from a `package_data` mapping.
    Ignore unknown and unsupported fields.
    """
    # The loop variable is deliberately not called `attr`: that name is the
    # imported module whose `fields()` is called in the same expression.
    supported = {field.name for field in attr.fields(Package)}
    cleaned_package_data = {
        key: value for key, value in package_data.items() if key in supported
    }
    return Package(**cleaned_package_data)


@dataclasses.dataclass
class GitHubSource:
    """
    Package data source backed by the tags of a GitHub repository.

    Subclasses override the two class-level patterns below to control which
    tags are skipped and how a version is extracted from a tag.
    """

    version_regex: re.Pattern = dataclasses.field(
        default=None,
        metadata={
            "help_text": "Regular expression pattern to match and extract version from tag."
        },
    )
    ignored_tag_regex: re.Pattern = dataclasses.field(
        default=None,
        metadata={"help_text": "Regex to ignore tag."},
    )

    @classmethod
    def get_default_package(cls, purl):
        """
        Return a Package for the repository of `purl`, populated with the
        repository-level data fetched from the GitHub API.
        """
        name = purl.name
        namespace = purl.namespace

        base_path = "https://api.github.com/repos"
        github_url = "https://github.com"
        api_url = f"{base_path}/{namespace}/{name}"

        response = utils.get_response(api_url)
        # The "license" entry may be absent or null; fall back to an empty
        # mapping so the SPDX id lookup below yields None.
        license_data = response.get("license") or {}

        return Package(
            homepage_url=response.get("homepage"),
            vcs_url=response.get("git_url"),
            api_url=api_url,
            bug_tracking_url=f"{github_url}/{namespace}/{name}/issues",
            code_view_url=f"{github_url}/{namespace}/{name}",
            declared_license=license_data.get("spdx_id"),
            primary_language=response.get("language"),
            **purl.to_dict(),
        )

    @classmethod
    def get_package_info(cls, package_url):
        """
        Yield Package objects for `package_url`, applying the tag patterns
        declared on this class.
        """
        yield from get_github_packages(
            purl=package_url,
            version_regex=cls.version_regex,
            ignored_tag_regex=cls.ignored_tag_regex,
            default_package=cls.get_default_package(package_url),
        )


def get_github_packages(purl, version_regex, ignored_tag_regex, default_package):
    """
    Yield Package objects built from the GitHub tags of the `purl`
    repository.

    When `purl` has a version, yield only the first package whose version
    equals it and stop. Otherwise yield every package.

    `version_regex`, `ignored_tag_regex` and `default_package` are passed
    unchanged to `_get_github_packages`, which does the tag processing.
    """
    for package in _get_github_packages(
        purl, version_regex, ignored_tag_regex, default_package
    ):
        # Don't yield all packages when a specific version is requested.
        if purl.version and package.version != purl.version:
            continue

        yield package

        # If a version is specified in purl and we have found a matching
        # package, we don't need to continue searching.
        if purl.version:
            break


def _get_github_packages(purl, version_regex, ignored_tag_regex, default_package):
    """
    Yield a Package for each usable tag of the GitHub repository `purl`.

    Tags come from `utils.fetch_github_tags_gql(purl)` as (tag, date) pairs.
    A tag is skipped when it matches `ignored_tag_regex`, when it does not
    match `version_regex`, or when its version is empty after clean-up.
    Both patterns are optional and are applied with `match()`.
    `version_regex` must define a named group "version"; without a
    `version_regex` the tag itself is the version.

    Each yielded Package carries the data of `default_package` plus the
    "download_url", "release_date" and "version" of its tag.
    """
    archive_download_url = (
        "https://github.com/{org}/{name}/archive/refs/tags/{tag_name}.tar.gz"
    )

    base_package_data = default_package.to_dict()
    for tag, date in utils.fetch_github_tags_gql(purl):
        if ignored_tag_regex and ignored_tag_regex.match(tag):
            continue

        if version_regex:
            match = version_regex.match(tag)
            if not match:
                continue
            version = match.group("version")
        else:
            version = tag

        # Remove only a leading "v"/"V" marker: str.strip("Vv") would also
        # eat a trailing "v" and turn e.g. "1.0-dev" into "1.0-de".
        version = version.lstrip("Vv").strip()
        if not version:
            continue

        # The archive URL is built from the original tag, not the cleaned
        # version.
        download_url = archive_download_url.format(
            org=purl.namespace, name=purl.name, tag_name=tag
        )

        # Build a fresh mapping per tag instead of updating the shared one.
        package_data = {
            **base_package_data,
            "download_url": download_url,
            "release_date": date.strftime("%Y-%m-%dT%H:%M:%S"),
            "version": version,
        }

        yield package_from_dict(package_data)


class UBootGitHubSource(GitHubSource):
    """
    Tag rules registered for "u-boot/u-boot".

    A tag is accepted when it starts with "v", four digits, a dot and two
    digits (shaped like "v2024.01"). The negative lookahead rejects tags
    where that prefix is directly followed by a word character, "." or "-".
    """

    # No tag is skipped up front; `version_regex` does all the filtering.
    ignored_tag_regex = None
    version_regex = re.compile(r"(?P<version>v\d{4}\.\d{2})(?![\w.-])")


class Genext2fsGitHubSource(GitHubSource):
    """
    Tag rules registered for "bestouff/genext2fs".

    There is no version pattern, so the tag itself is the version candidate.
    Tags shaped like "debian_version...upstream_version..." are skipped.
    """

    ignored_tag_regex = re.compile(r"debian_version\S+upstream_version\S+")
    # No extraction pattern: the tag text is used as the version.
    version_regex = None


class SquashfsToolsGitHubSource(GitHubSource):
    """
    Tag rules registered for "plougher/squashfs-tools".

    The version is a dotted number of two or three components, optionally
    preceded by "v" or "V" (shaped like "v1.2" or "1.2.3").
    """

    # No tag is skipped up front; `version_regex` does all the filtering.
    ignored_tag_regex = None
    version_regex = re.compile(r"\b[vV]?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class PupnpGitHubSource(GitHubSource):
    """
    Tag rules registered for "pupnp/pupnp".

    The version is a dotted number of two or three components that follows
    a "release" prefix and an optional hyphen (shaped like "release-1.2.3").
    """

    # No tag is skipped up front; `version_regex` does all the filtering.
    ignored_tag_regex = None
    version_regex = re.compile(r"\brelease-?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class BrotliGitHubSource(GitHubSource):
    """
    Tag rules registered for "google/brotli".

    The version is a dotted number of two or three components, optionally
    preceded by "v" or "V" (shaped like "v1.2" or "1.2.3").
    """

    # No tag is skipped up front; `version_regex` does all the filtering.
    ignored_tag_regex = None
    version_regex = re.compile(r"\b[vV]?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class BpftoolGitHubSource(GitHubSource):
    """
    Tag rules registered for "libbpf/bpftool".

    The version is a dotted number of two or three components, optionally
    preceded by "v" or "V" (shaped like "v1.2" or "1.2.3").
    """

    # No tag is skipped up front; `version_regex` does all the filtering.
    ignored_tag_regex = None
    version_regex = re.compile(r"\b[vV]?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class SqliteGitHubSource(GitHubSource):
    """
    Tag rules registered for "sqlite/sqlite".

    The version is a dotted number of two or three components that follows
    a "version" prefix and an optional hyphen (shaped like "version-1.2.3").
    """

    # No tag is skipped up front; `version_regex` does all the filtering.
    ignored_tag_regex = None
    version_regex = re.compile(r"\bversion-?(?P<version>(?:\d+(\.\d+){1,2}))\b")


class LlvmGitHubSource(GitHubSource):
    """
    Tag rules registered for "llvm/llvm-project".

    Only tags starting with "llvmorg-" are accepted; everything after that
    prefix is the version.
    """

    # No tag is skipped up front; `version_regex` does all the filtering.
    ignored_tag_regex = None
    version_regex = re.compile(r"llvmorg-(?P<version>.+)")


class RpmGitHubSource(GitHubSource):
    """
    Tag rules registered for "rpm-software-management/rpm".

    Only tags starting with "rpm-" are accepted. The version is the text
    after that prefix up to the first hyphen, plus any following
    "-<suffix>" unless that suffix starts with "release". A tag shaped like
    "rpm-1.2.3-release" therefore gives "1.2.3".
    """

    # No tag is skipped up front; `version_regex` does all the filtering.
    ignored_tag_regex = None
    version_regex = re.compile(r"rpm-(?P<version>[^-]+(?:-(?!release).*)?|-release)")


# Map a "namespace/name" GitHub repository (keys are all lowercase) to the
# GitHubSource class holding the tag rules for that repository. Repositories
# mapped to the plain GitHubSource use its default rules.
GITHUB_SOURCE_BY_PACKAGE = {
"avahi/avahi": GitHubSource,
"bestouff/genext2fs": Genext2fsGitHubSource,
"dosfstools/dosfstools": GitHubSource,
"google/brotli": BrotliGitHubSource,
"hewlettpackard/wireless-tools": GitHubSource,
"inotify-tools/inotify-tools": GitHubSource,
"libbpf/bpftool": BpftoolGitHubSource,
"llvm/llvm-project": LlvmGitHubSource,
"nixos/nix": GitHubSource,
"plougher/squashfs-tools": SquashfsToolsGitHubSource,
"pupnp/pupnp": PupnpGitHubSource,
"python/cpython": GitHubSource,
"rpm-software-management/rpm": RpmGitHubSource,
"shadow-maint/shadow": GitHubSource,
"sqlite/sqlite": SqliteGitHubSource,
"u-boot/u-boot": UBootGitHubSource,
}


class MiniupnpPackagesGitHubSource(GitHubSource):
    """
    GitHub source for packages released as tags of a shared repository,
    where a tag is "<package_name>_<version>".

    The packages are reported as "generic" packages without a namespace, and
    underscores in the extracted version are replaced by dots.
    """

    version_regex = None
    ignored_tag_regex = None
    # Filled with the escaped package name to build the per-package pattern.
    version_regex_template = r"{}_(?P<version>.+)"

    @classmethod
    def get_package_info(cls, gh_purl, package_name):
        """
        Yield generic Package objects named `package_name`, built from the
        tags of the `gh_purl` GitHub repository that start with
        "<package_name>_".

        When `gh_purl` has a version, yield only the first package whose
        version equals it and stop. The requested version may use either
        dots or underscores as separators.
        """
        # Keep the pattern local: it depends on `package_name`, so storing
        # it on the class would change shared state on every call.
        version_regex = re.compile(
            cls.version_regex_template.format(re.escape(package_name))
        )

        # Compare versions after normalization. Filtering on the raw tag
        # version would never match a dotted request against a tag version
        # written with underscores.
        requested_version = gh_purl.version
        if requested_version:
            requested_version = requested_version.replace("_", ".")

        packages = _get_github_packages(
            gh_purl,
            version_regex,
            cls.ignored_tag_regex,
            cls.get_default_package(gh_purl),
        )

        for package in packages:
            package_dict = package.to_dict()
            version = package_dict["version"].replace("_", ".")

            # Don't yield all packages when a specific version is requested.
            if requested_version and version != requested_version:
                continue

            package_dict["namespace"] = None
            package_dict["name"] = package_name
            package_dict["type"] = "generic"
            package_dict["version"] = version

            yield package_from_dict(package_dict)

            # The requested version was found; no need to look further.
            if requested_version:
                break


IPKG_RELEASES = {
"0.99.88": {
"url": "https://web.archive.org/web/20090326020239/http:/handhelds.org/download/packages/ipkg/ipkg-0.99.88.tar.gz",
Expand Down
Loading

0 comments on commit 16b267d

Please sign in to comment.