Detlev Casanova · 3ba61fb2 · ef7e20fb · 3ba61fb2
--- a/scripts/generate_bom.py

+ 94

− 55
+++ b/scripts/generate_bom.py

+ 94

− 55
 @@ -26,6 +26,7 @@ VERBOSE_SOURCE = 3
 NO_LICENSE_REPORT_FOUND = 'NoLicenseReportFound'
 NO_COPYRIGHT_REPORT_FOUND = 'NoCopyrightReportFound'
 NO_LICENSE_INFO_FOUND = 'NoLicenseInfoFound'
+AMBIGUOUS_LICENSE_INFO_FOUND = 'AmbiguousLicenseInfoFound'
 NO_COPYRIGHT_INFO_FOUND = 'NoCopyrightInfoFound'
 NO_SOURCE_INFO_FOUND = 'NoSourceInfoFound'

 @@ -41,15 +42,18 @@ re_bin2sources = re.compile(
    '(?P<package>.+)_bin2sources(?:_(?P<arch>[a-zA-Z0-9]+))?\\.json(\\.gz)?$'
 )

+
 def get_base_package_name(dirpath):
    return basename(dirpath)

+
 def open_potentially_gzipped(path, *args, **kw):
    if str(path).endswith('.gz'):
        return gzip.open(path, *args, **kw)
    else:
        return open(path, *args, **kw)

+
 def parse_copyright(copyright):
    """Parse copyrights to ensure each item contains only one copyright and
       not a string of several copyrights
 @@ -153,11 +157,11 @@ class PathDepth():
    def get_paths(self, pd):
        return self.path_depth[pd]

+
 class BomGenerator():
-    def __init__(self, bom_dir, relaxed, dpkg_status, verbose, pretty, comments, copyright, copyright_limit):
+    def __init__(self, bom_dir, dpkg_status, verbose, pretty, comments, copyright, copyright_limit):
        self.bom_dir = bom_dir
-        self.relaxed = relaxed
-        self.dpkg_status =  dpkg_status
+        self.dpkg_status = dpkg_status
        self.verbose = verbose
        self.pretty = pretty
        self.comments = comments
 @@ -165,14 +169,14 @@ class BomGenerator():
        self.copyright_limit = int(copyright_limit)

    def get_license_str(self, paragraph):
-        license =  paragraph.license[0]
+        license = paragraph.license[0]
        if self.comments:
            license += ' ' + str(paragraph.comment)

        return license

    def get_copyright_str(self, paragraph):
-        copyright =  paragraph.copyright
+        copyright = paragraph.copyright

        if len(copyright) > self.copyright_limit:
            return copyright[:self.copyright_limit]
 @@ -182,10 +186,66 @@ class BomGenerator():

        return copyright

+    def find_files_paragraphs(self, copyright_info, f_src):
+        result = []
+        for p in copyright_info.all_files_paragraphs():
+            if p.matches(f_src):
+                result.append(p)
+        return result
+
    # Get the license for file by searching the exact file path
    def get_license(self, copyright_info, f_src):
-        p = copyright_info.find_files_paragraph(f_src)
-        if p != None:
+        # p = copyright_info.find_files_paragraph(f_src)
+        # The call above only returns the last matching paragraph, because entries in the copyright file are
+        # exceptions to the previous ones:
+        # - "*": GPL-2
+        # - "src/*": GPL-2+
+        # - "src/feature/*": GPL-3
+        # a file named 'src/feature/lists.c' will be GPL-3 (last entry)
+        # a file named 'lists.c' will be GPL-2 (first entry)
+        # So, if dwarf2sources cannot retrieve the correct path to the file
+        # (gives 'lists.c' instead of 'src/feature/lists.c'),
+        # it is impossible to know if that is an issue or not because there might be a lists.c file at the root of the
+        # pkg
+        #
+        # To raise a flag here, we would be raising a flag for any package that ships files at the root and the
+        # copyright has a paragraph for subfolders with wildcards (paragraphs always have different licenses).
+        #
+        # That gets trickier if only a part of the file path is present
+        #
+        # No package that has this issue has been found.
+        #
+        p = self.find_files_paragraphs(copyright_info, f_src)
+
+        # Check for ambiguities if there are multiple licenses and f_src has no path associated
+        if len(p) > 1 and '/' not in f_src:
+            found_pkg = False
+            found_sub_wc = False
+
+            # Check if the file is specified by full name
+            for par in p:
+                if f_src in par.files:
+                    found_pkg = True
+
+            # Check if we have paragraphs for subfolders with wildcards
+            for par in p:
+                for f in par.files:
+                    if "/*" in f:
+                        found_sub_wc = True
+                        break
+                if found_sub_wc:
+                    break
+
+            # Show matching paragraphs
+            if not found_pkg and found_sub_wc:
+                for par in p:
+                    print(f"{f_src} --> {par.license} ({par.files})", file=sys.stderr)
+                return AMBIGUOUS_LICENSE_INFO_FOUND
+
+        # fake previous behaviour
+        p = p[-1] if p else None
+
+        if p:
            return self.get_license_str(p)

        return NO_LICENSE_INFO_FOUND
 @@ -193,30 +253,11 @@ class BomGenerator():
    # Get the copyright for file by searching the exact file path
    def get_copyright(self, copyright_info, f_src):
        p = copyright_info.find_files_paragraph(f_src)
-        if p != None:
+        if p:
            return self.get_copyright_str(p)

        return NO_COPYRIGHT_INFO_FOUND

-    # Get the license for file by searching a match with the last part
-    # of the report generated by FOSSolgy as we can't rely on the folder
-    # used to upload the source code
-    def get_license_relaxed(self, copyright_info, f_src):
-        for p in copyright_info.all_files_paragraphs():
-            for f in p.files:
-                if f.endswith(f_src):
-                    return self.get_license_str(p)
-
-        return NO_LICENSE_INFO_FOUND
-
-    def get_copyright_relaxed(self, copyright_info, f_src):
-        for p in copyright_info.all_files_paragraphs():
-            for f in p.files:
-                if f.endswith(f_src):
-                    return self.get_copyright_str(p)
-
-        return NO_COPYRIGHT_INFO_FOUND
-
    def get_copyright_file(self, filenames):
        if COPYRIGHT_REPORT in filenames:
            return COPYRIGHT_REPORT
 @@ -260,7 +301,7 @@ class BomGenerator():
                for p in path_depth.get_paths(pd_key):
                    path_prefix_len = unit_file_path.find('/' + p + '/')
                    if path_prefix_len != -1:
-                        return unit_file_path[:path_prefix_len+1]
+                        return unit_file_path[:path_prefix_len + 1]
        return ''

    def scan_rust_unit(self, f_dir, f_src, copyright_info, out_licenses, out_copyright):
 @@ -296,18 +337,12 @@ class BomGenerator():
            out_copyright.add(NO_COPYRIGHT_INFO_FOUND)

    def scan_regular_unit(self, unit_file_path, copyright_info, out_licenses, out_copyright):
-            if not self.relaxed:
-                license = self.get_license(copyright_info, unit_file_path)
-            else:
-                license = self.get_license_relaxed(copyright_info, unit_file_path)
-            out_licenses.add(license)
+        license = self.get_license(copyright_info, unit_file_path)
+        out_licenses.add(license)

-            if self.copyright:
-                if not self.relaxed:
-                    copyright = self.get_copyright(copyright_info, unit_file_path)
-                else:
-                    copyright = self.get_copyright_relaxed(copyright_info, unit_file_path)
-                out_copyright.add(copyright)
+        if self.copyright:
+            copyright = self.get_copyright(copyright_info, unit_file_path)
+            out_copyright.add(copyright)

    def scan_units(self, binary_name, package_copyright_files, units):
        binary_licenses = set()
 @@ -316,7 +351,8 @@ class BomGenerator():

        if not package_copyright_files.primary_copyright:
            if self.copyright:
-                return {'binary_name': binary_name, 'binary_licenses': [NO_LICENSE_REPORT_FOUND], 'binary_copyright': [NO_COPYRIGHT_REPORT_FOUND], 'sources': sources}
+                return {'binary_name': binary_name, 'binary_licenses': [NO_LICENSE_REPORT_FOUND],
+                        'binary_copyright': [NO_COPYRIGHT_REPORT_FOUND], 'sources': sources}
            else:
                return {'binary_name': binary_name, 'binary_licenses': [NO_LICENSE_REPORT_FOUND], 'sources': sources}

 @@ -343,7 +379,7 @@
            if unit_file_path.startswith('/usr/share/cargo/registry'):
                self.scan_rust_unit(f_dir, f_src, file_copyright_info, file_licenses, file_copyright)
            else:
                unit_file_path = unit_file_path.removeprefix(path_prefix)
                self.scan_regular_unit(unit_file_path, file_copyright_info, file_licenses, file_copyright)

            if self.verbose > VERBOSE_BINARY:
 @@ -355,7 +391,8 @@

        if self.copyright:
            list_binary_copyright = parse_copyright(binary_copyright)
-            return {'binary_name': binary_name, 'binary_licenses': list(binary_licenses), 'binary_copyright': list_binary_copyright, 'sources': sources}
+            return {'binary_name': binary_name, 'binary_licenses': list(binary_licenses),
+                    'binary_copyright': list_binary_copyright, 'sources': sources}
        else:
            return {'binary_name': binary_name, 'binary_licenses': list(binary_licenses), 'sources': sources}

 @@ -374,7 +411,8 @@ class BomGenerator():

        if self.copyright:
            list_package_copyright = parse_copyright(package_copyright)
-            return {'package_name': package_name, 'package_licenses': list(package_licenses), 'package_copyright': list_package_copyright, 'binaries': binaries}
+            return {'package_name': package_name, 'package_licenses': list(package_licenses),
+                    'package_copyright': list_package_copyright, 'binaries': binaries}
        else:
            return {'package_name': package_name, 'package_licenses': list(package_licenses), 'binaries': binaries}

 @@ -447,25 +485,26 @@ class BomGenerator():
            kwargs["indent"] = 4
        print(json.dumps(image, **kwargs))

-def main(argv):
-    global RELAXED_SEARCH

+def main(argv):
    parser = argparse.ArgumentParser()
-    parser.add_argument ("-c","--comments", action='store_true', help="include license comments")
-    parser.add_argument ("-C","--copyright", action='store_true', help="include copyright information")
-    parser.add_argument ("-l","--copyright_limit", default=COPYRIGHT_LENGTH, help="limit maximum number of characters in copyright information")
-    parser.add_argument ("-d","--dir", default=DEFAULT_BOM_DIR, help="directory to search for information")
-    parser.add_argument ("-p","--pretty", action='store_true', help="indent the JSON output")
-    parser.add_argument ("-r","--relaxed", action='store_true', help="use relaxed search")
-    parser.add_argument ("-s","--dpkg-status", default=DEFAULT_DPKG_STATUS, help="dpkg status file")
-    parser.add_argument ("-v","--verbose", type=int, default=VERBOSE_IMAGE,
-        help="verbose use in output 0: image, 1: package, 2: binary, 3: source")
+    parser.add_argument("-c", "--comments", action='store_true', help="include license comments")
+    parser.add_argument("-C", "--copyright", action='store_true', help="include copyright information")
+    parser.add_argument("-l", "--copyright_limit", default=COPYRIGHT_LENGTH,
+                        help="limit maximum number of characters in copyright information")
+    parser.add_argument("-d", "--dir", default=DEFAULT_BOM_DIR, help="directory to search for information")
+    parser.add_argument("-p", "--pretty", action='store_true', help="indent the JSON output")
+    parser.add_argument("-s", "--dpkg-status", default=DEFAULT_DPKG_STATUS, help="dpkg status file")
+    parser.add_argument("-v", "--verbose", type=int, default=VERBOSE_IMAGE,
+                        help="verbose use in output 0: image, 1: package, 2: binary, 3: source")

-    args = parser.parse_args ()
+    args = parser.parse_args()

-    bom_generator = BomGenerator(args.dir, args.relaxed, args.dpkg_status, args.verbose, args.pretty, args.comments, args.copyright, args.copyright_limit)
+    bom_generator = BomGenerator(args.dir, args.dpkg_status, args.verbose, args.pretty, args.comments,
+                                 args.copyright, args.copyright_limit)

    bom_generator.check_packages_copyright()

+
 if __name__ == '__main__':
    main(sys.argv[1:])