From a51aa6dec031adcc991752bd51b4acfabc7fa3e1 Mon Sep 17 00:00:00 2001
From: Ariel D'Alessandro <ariel.dalessandro@collabora.com>
Date: Fri, 4 Jun 2021 15:21:17 -0300
Subject: [PATCH] ci-license-scan: Manually check whitelisted files

Files whitelisted in debian/apertis/copyright.whitelist instruct the
scanner process not to raise an error, which allows the pipeline to
succeed and the change to be merged. However, as a side effect, the
entries in debian/apertis/copyright with offending licenses are
removed, dropping important information.

The BOM generator for Apertis binary packages needs the complete
information. The scan-copyright scanner must therefore be called without
a whitelist, so that the pipeline keeps license information for all the
files present in the source package.

This MR adapts the ci-license-scan script to generate license
information for all files, without failing on whitelisted offending
ones.

Link: https://phabricator.apertis.org/T7878

Signed-off-by: Ariel D'Alessandro <ariel.dalessandro@collabora.com>
---
 .../overlay/usr/bin/ci-license-scan           | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/package-source-builder/overlay/usr/bin/ci-license-scan b/package-source-builder/overlay/usr/bin/ci-license-scan
index 8aefd7c..dd705c1 100755
--- a/package-source-builder/overlay/usr/bin/ci-license-scan
+++ b/package-source-builder/overlay/usr/bin/ci-license-scan
@@ -17,6 +17,7 @@ import yaml
 import textwrap
 import unicodedata
 import fossology
+import re
 
 # this is necessary to eliminate references in the generated YAML
 # Perl tools use YAML::Tiny which doesn’t support references.
@@ -90,7 +91,6 @@ except:
     import warnings
     warnings.warn("pathspec not available, copyright ignore patterns will not match exactly")
     import fnmatch
-    import re
 
     NOTDOUBLESTAR = re.compile(r'\.\*(?!\.\*)')
 
@@ -198,12 +198,19 @@ def gitignore2pat(s):
 def parse_whitelist(filename: str):
     """
     Attempt to open and parse a whilelist, do nothing if it’s not there
+    Note that a whitelist can only contain git ignore patterns.
     """
     with Path(filename) as f:
         if f.is_file():
-            return gitignore2pat(f.read_text())
+            return gitignore2pat(f.read_text())['ignore']['pattern']
         else:
-            return {}
+            return []
+
+def is_whitelisted(whitelist_patterns, filename):
+    for whitelist_pattern in whitelist_patterns:
+        if re.match(whitelist_pattern, filename):
+            return True
+    return False
 
 def detect_license_for(path, c=None):
     if c is None:
@@ -252,7 +259,7 @@ def reindent_multiline(s: str) -> str:
     lines = s.split('\n')
     return '\n '.join([l.lstrip() for l in lines])
 
-def configure_scanner(whitelists):
+def configure_scanner():
     """
     Merge our metadata with that of the Debian package
     """
@@ -265,8 +272,6 @@ def configure_scanner(whitelists):
                 pass
 
         extra_patterns = [gitignore2pat(builtin_ignores)]
-        for whitelist in whitelists:
-            extra_patterns += [parse_whitelist(whitelist)]
         new_yaml = yaml.safe_dump(merge_scan_patterns(debian_patterns, *extra_patterns), default_style='|')
         scan_patterns.write_text(new_yaml)
     copyright_overrides = {}
@@ -348,11 +353,13 @@ def main():
     if disallowlist:
         print(f"Disallowed licenses: {disallowlist}")
     disallowlist = [d.lower() for d in disallowlist]
-    print(f"Using whitelists: {args.whitelists}")
-    configure_scanner(args.whitelists)
+    configure_scanner()
     Path('debian/apertis/copyright').touch(exist_ok=True)
     sys.stdout.flush()
 
+    print(f"Using whitelists: {args.whitelists}")
+    whitelist_patterns = [re.compile(p) for whitelist in args.whitelists for p in parse_whitelist(whitelist)]
+
     with open('debian/apertis/copyright.new', 'wt', buffering=1) as f:
         print('Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n', file=f)
         scan_copyrights(_out=f)
@@ -395,12 +402,15 @@ def main():
                         copyright, license = detect_license_for(f, debian_copyrights)
                         if copyright is not None and license is not None:
                             if license.synopsis.rstrip('+').lower() in disallowlist:
-                                bad_licenses.add(license.synopsis)
+                                if not is_whitelisted(whitelist_patterns, f):
+                                    bad_licenses.add(license.synopsis)
                             fixups.setdefault(copyright, {}).setdefault(license, []).append(p)
                         else:
                             unknown_licensed = True
                 elif p.license.synopsis.rstrip('+').lower() in disallowlist:
-                    bad_licenses.add(p.license.synopsis)
+                    for f in p.files:
+                        if not is_whitelisted(whitelist_patterns, f):
+                            bad_licenses.add(p.license.synopsis)
             if p.copyright and is_gibberish(p.copyright):
                 p.copyright = 'no-info-found'
         if fixups:
-- 
GitLab