From 44630086b7a74b41b2aed6a082d062dfc6014033 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dylan=20A=C3=AFssi?= <dylan.aissi@collabora.com>
Date: Fri, 28 Jun 2024 13:36:56 +0000
Subject: [PATCH] Add new report displaying the packaging delta size

---
 .gitlab-ci.yml              |  52 +++++
 bin/classes.py              |   1 +
 bin/dashboard               |   9 +
 bin/packaging-check-delta   | 413 ++++++++++++++++++++++++++++++++++++
 data/whitelists.yaml        |   2 +
 templates/index.html.jinja2 |  12 ++
 6 files changed, 489 insertions(+)
 create mode 100755 bin/packaging-check-delta

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cb0caf3..e547ac4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -381,6 +381,57 @@ packaging-check-settings:
       when: never
     - if: $CI_PIPELINE_SOURCE != "merge_request_event"
 
+packaging-check-delta:
+  resource_group: gitlab
+  stage: check
+  timeout: 4h
+  tags:
+    - lightweight
+  before_script:
+    - apt update && apt install -y --no-install-recommends
+        git
+        python3-debian
+        python3-gitlab
+        python3-yaml
+        wget
+  script:
+    - CACHE_DELTA_ARGS=""
+    - ARTIFACT_URL=${ARTIFACT_URL:-$CI_API_V4_URL/projects/$CI_PROJECT_ID/jobs/artifacts/$CI_DEFAULT_BRANCH/raw/packaging-cache.json?job=pages}
+    - |
+      if [ "$ARTIFACT_URL" != none ] && [ "$DISABLE_CACHE" = "no" ]
+      then
+        wget --header "JOB-TOKEN: $CI_JOB_TOKEN" "$ARTIFACT_URL" -O cache.json || true  # a failed download is not fatal
+      fi
+    - |
+      if [ -s cache.json ] && [ "$DISABLE_CACHE" = "no" ]
+      then
+        echo Load cacheable data from cache.json
+        CACHE_DELTA_ARGS="--cache cache.json"
+      fi
+    - ./bin/json-merge
+        --input packaging-data-downstream.json
+        --input packaging-data-sources-upstream.json
+        --input packaging-data-sources-published.json
+        --input packaging-data-binaries-published.json
+        --input packaging-data-obs.json
+        --output packaging-data.json
+    - ./bin/packaging-check-delta
+        --projects packaging-data.json
+        --whitelists data/whitelists.yaml
+        --json packaging-check-delta.json
+        ${CACHE_DELTA_ARGS}
+        ${DEBUG:+--debug}
+        ${LOG_TO_FILE:+--log-to-file $LOG_TO_FILE}
+  artifacts:
+    when: always  # upload the report and log even when the job fails
+    paths:
+      - packaging-check-delta.json
+      - ${LOG_TO_FILE}
+  rules:
+    - if: $TRIGGER_FROM_JOB
+      when: never
+    - if: $CI_PIPELINE_SOURCE != "merge_request_event"
+
 packaging-check-invariants:
   stage: check
   tags:
@@ -491,6 +542,7 @@ packaging-updates-upstream-linux:
     - ./bin/json-merge
         --input packaging-data.json
         --input packaging-checks.json
+        --input packaging-check-delta.json
         --input packaging-check-settings.json
         --input packaging-updates.json
         --input packaging-updates-upstream-linux.json
diff --git a/bin/classes.py b/bin/classes.py
index 7b6bd72..4736b06 100644
--- a/bin/classes.py
+++ b/bin/classes.py
@@ -85,6 +85,7 @@ class Report(enum.Enum):
     LICENSING_GLOBAL_DEFAULT_UNFRIENDLY = enum.auto()
     LICENSING_GLOBAL_DEFAULT_DUAL_UNFRIENDLY = enum.auto()
     LICENSING_GLOBAL_WHITELIST = enum.auto()
+    DELTA_AVAILABLE = enum.auto()
 
 
 @dataclasses.dataclass
diff --git a/bin/dashboard b/bin/dashboard
index 043ba79..8784eba 100755
--- a/bin/dashboard
+++ b/bin/dashboard
@@ -59,12 +59,21 @@ def preprocess_packaging_data(data):
             )
             for p in packages
         ),
+        "delta_errors_count": sum(
+            count_reports(
+                p, lambda r: r["domain"] == "delta" and r["severity"] == "error"
+            )
+            for p in packages
+        ),
         "total_updates_count": sum(
             count_reports(p, lambda r: r["domain"] == "update") for p in packages
         ),
         "total_licensing_count": sum(
             count_reports(p, lambda r: r["domain"] == "licensing") for p in packages
         ),
+        "total_packaging_delta": sum(
+            count_reports(p, lambda r: r["domain"] == "delta") for p in packages
+        ),
     }
     data["summary"] = summary
 
diff --git a/bin/packaging-check-delta b/bin/packaging-check-delta
new file mode 100755
index 0000000..03cf594
--- /dev/null
+++ b/bin/packaging-check-delta
@@ -0,0 +1,413 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import shlex
+import shutil
+import subprocess
+import types
+
+import debian
+import yaml
+from classes import Report
+
+temp_folder = "temp-gitclone4delta/"
+
+
+def version_to_tag(version):
+    """Map a Debian version to its DEP-14 tag name (':' -> '%', '~' -> '_'); see https://dep-team.pages.debian.net/deps/dep14/"""
+    tag = version.replace(":", "%")
+    tag = tag.replace("~", "_")
+    return tag
+
+
+def tag_to_version(tag):
+    """Map a DEP-14 tag name back to a Debian version ('%' -> ':', '_' -> '~'); see https://dep-team.pages.debian.net/deps/dep14/"""
+    version = tag.replace("%", ":")
+    version = version.replace("_", "~")
+    return version
+
+
+def run_git_shallow_clone(url_git):
+    """Shallow-clone url_git into temp_folder, replacing any previous clone.
+
+    Returns True when git exits with status 0, False otherwise.
+    """
+    if os.path.isdir(temp_folder):
+        logging.debug(f"Deleting previous {temp_folder}")
+        shutil.rmtree(temp_folder)
+
+    git_shallow_clone = ["git", "clone", "--depth", "1", url_git, temp_folder]
+    logging.debug(f"Executing {shlex.join(git_shallow_clone)}")
+    # stdout is captured but not used
+    p = subprocess.run(git_shallow_clone, stdout=subprocess.PIPE)
+    if p.returncode != 0:
+        # report the failure instead of unconditionally claiming success
+        logging.warning(f"git clone of {url_git} failed with code {p.returncode}")
+
+    return p.returncode == 0
+
+
+def run_git_fetch_tags(pkg_upstream_tag, pkg_latest_tag):
+    """Fetch the two given tags from origin into the clone in temp_folder.
+
+    Tags have to be fetched before they can be compared. The exit status of
+    git is not checked and nothing is returned.
+    """
+    for wanted_tag in (pkg_upstream_tag, pkg_latest_tag):
+        # "tag <name>" names the single tag to fetch
+        fetch_cmd = ["git", "fetch", "origin", "tag", wanted_tag]
+        # --no-tags turns off git's automatic tag following
+        fetch_cmd.append("--no-tags")
+        logging.debug(f"Executing {shlex.join(fetch_cmd)}")
+        subprocess.run(fetch_cmd, stdout=subprocess.PIPE, cwd=temp_folder)
+
+    return
+
+
+def run_git_shortdiff(debian_tag, apertis_tag):
+    """Return the `git diff --shortstat` summary between two tags.
+
+    debian/apertis and debian/changelog are left out of the comparison. The
+    command runs inside temp_folder and its exit status is not checked, so an
+    empty string only means that git printed nothing on stdout.
+    """
+    # ':!' pathspecs exclude the named paths from the diff
+    excluded_paths = [":!/debian/apertis", ":!/debian/changelog"]
+    diff_cmd = ["git", "diff", "--shortstat", debian_tag, apertis_tag, "--"]
+    diff_cmd.extend(excluded_paths)
+
+    logging.debug(f"Executing {shlex.join(diff_cmd)}")
+    completed = subprocess.run(diff_cmd, stdout=subprocess.PIPE, cwd=temp_folder)
+    # stdout is bytes: trim surrounding whitespace, then decode to str
+    summary = completed.stdout.strip().decode()
+
+    return summary
+
+
+def compute_delta(data, cache=None):
+    """Check packaging delta for upstreaming to Debian.
+
+    For each package branch that is an active, non-ignorelisted channel, diff
+    the packaged tag against the matching upstream tag, reusing cache entries
+    whose commit id is unchanged. Returns a dict keyed by "packages".
+    """
+    ret = {"packages": {}}
+
+    def add_entry(package, report, severity, **kwargs):
+        logfunc = getattr(logging, severity)
+        logfunc(
+            "%s: %s: %s: %s",
+            package,
+            report.domain,
+            report.kind,
+            ", ".join(f"{k}={v}" for k, v in kwargs.items()),
+        )
+        packages = ret["packages"]
+        reports = packages.setdefault(package, {}).setdefault("reports", [])
+        reports.append(
+            dict(domain=report.domain, kind=report.kind, severity=severity, **kwargs)
+        )
+
+    def error(package, report, **kwargs):
+        add_entry(package, report, "error", **kwargs)
+
+    def warning(package, report, **kwargs):
+        add_entry(package, report, "warning", **kwargs)
+
+    def info(package, report, **kwargs):
+        add_entry(package, report, "info", **kwargs)
+
+    def add_delta_entry(package, pkg_delta):
+        packages = ret["packages"]
+        delta = packages.setdefault(package, {}).setdefault("delta", [])
+        delta.append(dict(pkg_delta))
+
+    def channel_is_active(channel):
+        return channel.get("status") == "active"
+
+    def get_channel_base(channel):
+        return channel.get("base")
+
+    channels = data["channels"]
+    channels_keys = set(channels.keys())
+    base_channels = {get_channel_base(channels[name]) for name in channels_keys}
+    active_channels = {
+        name for name in base_channels if channel_is_active(channels[name])
+    }
+    # Exclude whitelisted channels
+    active_channels = {
+        channel
+        for channel in active_channels
+        if channel not in DELTA_CHANNEL_IGNORELIST
+    }
+
+    def get_highest_matching_version(version, distribution, release=None):
+        pkg_sources_list = list(data["sources"].keys())
+
+        if release is not None:
+            pkg_source_list_release = [i for i in pkg_sources_list if release in i]
+            list_versions = []
+            list_sources = []
+            for source in pkg_source_list_release:
+                if source not in git.branches:
+                    continue
+                if git.branches[source]["version"] in version:
+                    list_versions.append(
+                        debian.debian_support.Version(git.branches[source]["version"])
+                    )
+                    list_sources.append(source)
+        else:
+            list_versions = []
+            list_sources = []
+            for source in pkg_sources_list:
+                if source not in git.branches:
+                    continue
+                for tag in git.branches[source]["tags"]:
+                    tag = tag.replace(distribution + "/", "")
+                    tag = tag_to_version(tag)
+                    if tag in version:
+                        list_versions.append(debian.debian_support.Version(tag))
+                        list_sources.append(source)
+
+        if len(list_versions) > 0:
+            version_upstream = str(max(list_versions))
+            version_source = list_sources[list_versions.index(max(list_versions))]
+        else:
+            version_upstream = None
+            version_source = None
+        return version_upstream, version_source
+
+    for package in data["packages"].values():
+        if "git" not in package:
+            continue
+        if package["git"]["default_branch"] not in active_channels:
+            continue
+
+        # make dict entries accessible with dot notation
+        package = types.SimpleNamespace(**package)
+        git = types.SimpleNamespace(**package.git)
+
+        git_cloned = False
+
+        for branch in git.branches:
+            if branch not in active_channels:
+                continue
+
+            pkg_commit_id = git.branches[branch]["commit_id"]
+            pkg_latest_version = git.branches[branch]["version"]
+
+            # Check for cached data
+            if cache:
+                if package.name in cache and "delta" in cache[package.name]:
+                    cached_list_branch = [
+                        str(*branch.keys()) for branch in cache[package.name]["delta"]
+                    ]
+                    if (
+                        branch in cache[package.name]["git"]["branches"]
+                        and branch in cached_list_branch
+                    ):
+                        if (
+                            pkg_commit_id
+                            == cache[package.name]["git"]["branches"][branch][
+                                "commit_id"
+                            ]
+                        ):
+                            logging.debug(
+                                f"Diff already computed for {package.name} in {branch}, reusing cache data"
+                            )
+                            delta_cached = next(
+                                iter(
+                                    [
+                                        cached_branch.values()
+                                        for cached_branch in cache[package.name][
+                                            "delta"
+                                        ]
+                                        if str(*cached_branch.keys()) == branch
+                                    ][0]
+                                )
+                            )
+                            if delta_cached["diff_shortstat"] is None:
+                                logging.debug(
+                                    f"Cache: {package.name} {pkg_latest_version} does not have a delta"
+                                )
+                            else:
+                                logging.debug(
+                                    f"Cache: {package.name} {pkg_latest_version} does have a delta"
+                                )
+                                info(
+                                    package.name,
+                                    Report.DELTA_AVAILABLE,
+                                    diff_url=delta_cached["diff_url"],
+                                    diff_shortstat=delta_cached["diff_shortstat"],
+                                    pkg_version=pkg_latest_version,
+                                    branch=branch,
+                                )
+                            # Keep the {branch: data} shape the cache lookup expects
+                            add_delta_entry(package.name, {branch: delta_cached})
+                            continue
+                        else:
+                            logging.debug(
+                                f"Diff not available for {package.name} in cache for this commit id"
+                            )
+                    else:
+                        logging.debug(
+                            f"Branch {branch} not found in cache for {package.name}"
+                        )
+
+            pkg_source_distribution = data["channels"][branch]["source"]["distribution"]
+            pkg_source_release = data["channels"][branch]["source"]["release"]
+            pkg_source_downstream_distribution = channels[branch]["distribution"]
+
+            pkg_upstream_version, pkg_upstream_source = get_highest_matching_version(
+                pkg_latest_version, pkg_source_distribution, pkg_source_release
+            )
+
+            # If empty then check for all sources (can be from debian/sid or debian/testing)
+            if pkg_upstream_version is None:
+                logging.debug(
+                    f"{package.name} the corresponding upstream version was not found in the usual source, trying a fresher source"
+                )
+                pkg_upstream_version, pkg_upstream_source = get_highest_matching_version(
+                    pkg_latest_version, pkg_source_distribution
+                )
+
+            if pkg_upstream_version is None:
+                logging.debug(
+                    f"{package.name} the corresponding upstream version was not found in git for {pkg_latest_version}"
+                )
+                continue
+
+            package_url = package.git["web_url"]
+            pkg_upstream_tag = (
+                pkg_source_distribution + "/" + version_to_tag(pkg_upstream_version)
+            )
+            pkg_latest_tag = (
+                pkg_source_downstream_distribution
+                + "/"
+                + version_to_tag(pkg_latest_version)
+            )
+
+            if not git_cloned:
+                git_cloned = run_git_shallow_clone(package_url)
+
+            run_git_fetch_tags(pkg_upstream_tag, pkg_latest_tag)
+            package_shortdiff = run_git_shortdiff(pkg_upstream_tag, pkg_latest_tag)
+
+            diff_url = (
+                package_url + "/-/compare/" + pkg_upstream_tag + "..." + pkg_latest_tag
+            )
+
+            if package_shortdiff == "":
+                logging.debug(
+                    f"{package.name} {pkg_latest_version} does not have a delta"
+                )
+                pkg_delta = {
+                    branch: {
+                        "diff_shortstat": None,
+                    }
+                }
+            else:
+                info(
+                    package.name,
+                    Report.DELTA_AVAILABLE,
+                    diff_url=diff_url,
+                    diff_shortstat=package_shortdiff,
+                    pkg_version=pkg_latest_version,
+                    branch=branch,
+                )
+                pkg_delta = {
+                    branch: {
+                        "diff_shortstat": package_shortdiff,
+                        "diff_url": diff_url,
+                    }
+                }
+
+            add_delta_entry(package.name, pkg_delta)
+
+    return ret
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse arguments, load inputs, dump JSON results
+    parser = argparse.ArgumentParser(description="Compute packaging deltas")
+    parser.add_argument(
+        "--debug",
+        action="store_const",
+        dest="loglevel",
+        const=logging.DEBUG,
+        help="print debug information",
+    )
+    parser.add_argument(
+        "--quiet",
+        action="store_const",
+        dest="loglevel",
+        const=logging.WARNING,
+        help="do not print informational output",
+    )
+    parser.add_argument(
+        "--log-to-file",
+        action="store",
+        type=str,
+        dest="log_to_file",
+        help="Log all output to file",
+    )
+    parser.add_argument("--job-id", help="the ID of the calling CI job")
+    parser.add_argument(
+        "--projects",
+        required=True,
+        type=argparse.FileType("r"),
+        help="input file in JSON format",
+    )
+    parser.add_argument(
+        "--json",
+        required=True,
+        type=argparse.FileType("w"),
+        help="file to store results in JSON format",
+    )
+    parser.add_argument(
+        "--cache",
+        required=False,
+        type=argparse.FileType("r"),
+        help="cache file in JSON format",
+    )
+    parser.add_argument(
+        "--whitelists",
+        type=argparse.FileType("r"),
+        help="input file containing white lists in YAML format",
+    )
+    args = parser.parse_args()
+
+    # filename=None makes basicConfig log to stderr, as without --log-to-file
+    logging.basicConfig(level=args.loglevel or logging.INFO, filename=args.log_to_file)
+
+    # Without a usable cache every delta is simply recomputed
+    cache = None
+    if args.cache:
+        logging.info(f"Loading cacheable data from {args.cache.name}")
+        json_import = json.load(args.cache)
+        try:
+            cache = json_import["packages"]
+        except KeyError:
+            logging.warning("Malformed cache file (missing 'packages' top-level key)")
+
+    # A missing or empty (None) ignorelist section means no channel is ignored
+    if args.whitelists:
+        whitelists = yaml.load(args.whitelists, Loader=yaml.CSafeLoader).get(
+            "whitelists"
+        )
+        DELTA_CHANNEL_IGNORELIST = whitelists.get("DELTA_CHANNEL_IGNORELIST") or {}
+    else:
+        DELTA_CHANNEL_IGNORELIST = {}
+
+    data = json.load(args.projects)
+    results = compute_delta(data, cache)
+    # Record the calling CI job, when given, alongside the results
+    if args.job_id:
+        results.setdefault("meta", {})["updates_job_id"] = args.job_id
+
+    json.dump(results, args.json)
diff --git a/data/whitelists.yaml b/data/whitelists.yaml
index 1e5df08..8543e82 100644
--- a/data/whitelists.yaml
+++ b/data/whitelists.yaml
@@ -853,3 +853,5 @@ whitelists:
   GIT_BRANCH_MISMATCH_UPSTREAM_IGNORELIST:
     v2024:
 #      my-pkg: "1.0.0-1+apertis1~v2023"
+  DELTA_CHANNEL_IGNORELIST:
+    apertis/v2023: "Debian oldstable based Apertis release, delta are not relevant anymore"
diff --git a/templates/index.html.jinja2 b/templates/index.html.jinja2
index aa5b2af..ee25c30 100644
--- a/templates/index.html.jinja2
+++ b/templates/index.html.jinja2
@@ -125,6 +125,9 @@
     <li class="list-inline-item">
       {{ summary.total_licensing_count }} license issues
     </li>
+    <li class="list-inline-item">
+      {{ summary.total_packaging_delta }} packaging deltas
+    </li>
     <li class="list-inline-item">
       <a class="text-muted" href="tsv/index.html">🗒️ per-release indices</a>
     </li>
@@ -221,6 +224,9 @@
     <li class="list-inline-item">
       {{ summary.update_errors_count }} update
     </li>
+    <li class="list-inline-item">
+      {{ summary.delta_errors_count }} delta
+    </li>
   </ul>
 {% endblock error %}
 
@@ -418,6 +424,12 @@
             {%- else -%}
               Unknown report: {{ report }}
             {%- endif %}
+          {%- elif report.domain == "delta" -%}
+            {%- if report.kind == "available" -%}
+            Packaging delta for <i>{{ report.pkg_version }}</i>: <a href="{{ report.diff_url }}">{{ report.diff_shortstat }}</a>
+            {%- else -%}
+              Unknown report: {{ report }}
+            {%- endif %}
           {%- else -%}
             Unknown report: {{ report }}
           {%- endif %}
-- 
GitLab