Skip to content
Snippets Groups Projects

Draft: scripts/generate_bom.py: Report ambiguous copyright

Closed Detlev Casanova requested to merge wip/detlev/at8032-improv-bom-generator into apertis/v2023dev3
10 unresolved threads
1 file
+ 94
55
Compare changes
  • Side-by-side
  • Inline
+ 94
55
@@ -26,6 +26,7 @@ VERBOSE_SOURCE = 3
NO_LICENSE_REPORT_FOUND = 'NoLicenseReportFound'
NO_COPYRIGHT_REPORT_FOUND = 'NoCopyrightReportFound'
NO_LICENSE_INFO_FOUND = 'NoLicenseInfoFound'
AMBIGUOUS_LICENSE_INFO_FOUND = 'AmbiguousLicenseInfoFound'
Please register or sign in to reply
NO_COPYRIGHT_INFO_FOUND = 'NoCopyrightInfoFound'
NO_SOURCE_INFO_FOUND = 'NoSourceInfoFound'
@@ -41,15 +42,18 @@ re_bin2sources = re.compile(
'(?P<package>.+)_bin2sources(?:_(?P<arch>[a-zA-Z0-9]+))?\\.json(\\.gz)?$'
)
def get_base_package_name(dirpath):
    """Return the last path component of *dirpath*.

    The value is whatever ``basename`` yields for the given path
    (presumably ``os.path.basename``; the import is outside this view).
    """
    package_name = basename(dirpath)
    return package_name
def open_potentially_gzipped(path, *args, **kw):
    """Open *path*, going through gzip when its name ends in ``.gz``.

    All extra positional and keyword arguments are forwarded unchanged to
    ``gzip.open`` (for ``.gz`` names) or to the builtin ``open`` (otherwise).
    """
    opener = gzip.open if str(path).endswith('.gz') else open
    return opener(path, *args, **kw)
def parse_copyright(copyright):
"""Parse copyrights to ensure each item contains only one copyright and
not a string of several copyrights
@@ -153,11 +157,11 @@ class PathDepth():
def get_paths(self, pd):
    """Return the entry stored in ``self.path_depth`` under key *pd*."""
    paths = self.path_depth[pd]
    return paths
class BomGenerator():
def __init__(self, bom_dir, relaxed, dpkg_status, verbose, pretty, comments, copyright, copyright_limit):
def __init__(self, bom_dir, dpkg_status, verbose, pretty, comments, copyright, copyright_limit):
self.bom_dir = bom_dir
self.relaxed = relaxed
self.dpkg_status = dpkg_status
self.dpkg_status = dpkg_status
self.verbose = verbose
self.pretty = pretty
self.comments = comments
@@ -165,14 +169,14 @@ class BomGenerator():
self.copyright_limit = int(copyright_limit)
def get_license_str(self, paragraph):
    """Return the license string reported for *paragraph*.

    The base value is the first element of ``paragraph.license``. When
    ``self.comments`` is set and the paragraph has a comment, the comment
    is appended after a single space.
    """
    license_str = paragraph.license[0]
    # Only append a comment that exists: str(None) would otherwise put a
    # literal "None" into the reported license.
    if self.comments and paragraph.comment:
        license_str += ' ' + str(paragraph.comment)
    return license_str
def get_copyright_str(self, paragraph):
copyright = paragraph.copyright
copyright = paragraph.copyright
if len(copyright) > self.copyright_limit:
return copyright[:self.copyright_limit]
@@ -182,10 +186,66 @@ class BomGenerator():
return copyright
def find_files_paragraphs(self, copyright_info, f_src):
    """Collect every files paragraph of *copyright_info* matching *f_src*.

    Paragraphs are returned in the order produced by
    ``copyright_info.all_files_paragraphs()``; the list is empty when
    nothing matches.
    """
    return [
        paragraph
        for paragraph in copyright_info.all_files_paragraphs()
        if paragraph.matches(f_src)
    ]
# Get the license for file by searching the exact file path
def get_license(self, copyright_info, f_src):
p = copyright_info.find_files_paragraph(f_src)
if p != None:
# p = copyright_info.find_files_paragraphs(f_src)
# This only returns the last matching paragraph. That is because entries in the copyright files are exceptions
# to the previous one:
# - "*": GPL-2
# - "src/*": GPL-2+
# - "src/feature/*": GPL-3
# a file named 'src/feature/lists.c' will be GPL-3 (last entry)
# a file named 'lists.c' will be GPL-2 (first entry)
# So, if dwarf2sources cannot retrieve the correct path to the file
# (gives 'lists.c' instead of 'src/feature/lists.c'),
# it is impossible to know if that is an issue or not because there might be a lists.c file at the root of the
# pkg
#
# To raise a flag here, we would be raising a flag for any package that ships files at the root and the
# copyright has a paragraph for subfolders with wildcards (paragraphs always have different licenses).
Please register or sign in to reply
#
# That gets trickier if only a part of the file path is present
#
# No package that has this issue has been found.
#
p = self.find_files_paragraphs(copyright_info, f_src)
# Check for ambiguities if there are multiple licenses and f_src has no path associated
if len(p) > 1 and '/' not in f_src:
    • What about a scenario like

      or

      and dwarf2sources reporting feature/list.c after trying to remove the prefix?

      • Author Maintainer

        That's the trickiest case, because if parts of the path can be missing in f_src from dwarf2sources, then any paragraph with a wildcard can match all f_src.

        In other words, if you can't trust and make any assumptions about the path to the file, then any wildcard (even with a path like src/lib2/*) can be matched.

        In your second example, feature/list.c can match the 3 wildcards. Even if we had a lib1/list.c, we can't trust the path: there could be a src/lib2/lib1/list.c or src/lib1/lib1/list.c or src/lib1/list.c.

        We could add some heuristics or try to match parts of the path but that's unreliable and becomes quite heavy when we could just drop wildcards everywhere and be way safer

      • Let me clarify a bit, since I just mentioned some potential issues without adding suggestions.

        First, I think that you are on the right track; you just need to cover some additional cases and have a clear approach based on some facts:

        1- Does the copyright report contain wildcards? 2- Were we able to calculate a valid prefix that always matches? This would indicate that the debug info is "nice".

        If 1 is true and 2 is true -> we are safe. If 1 is true and 2 is false -> we are in a kind of ambiguity, and we should switch to the full report to have more info. If 1 is false and 2 is true -> we are safe. If 1 is false and 2 is false -> we need to investigate.

        Please take into account, as mentioned in !479 (comment 56398), that there may be just a few cases where things are really tricky, so, as I suggested, the first thing is to analyze those.

        As mentioned, the idea with these changes is to detect potential ambiguities and report them, so that we can try switching to the full report to reduce them.

      • Please register or sign in to reply
Please register or sign in to reply
found_pkg = False
found_sub_wc = False
# Check if the file is specified by full name
for par in p:
Please register or sign in to reply
if f_src in par.files:
found_pkg = True
# Check if we have paragraphs for subfolders with wildcards
Please register or sign in to reply
for par in p:
for f in par.files:
if "/*" in f:
found_sub_wc = True
break
if found_sub_wc:
break
Please register or sign in to reply
# Show matching paragraphs
if not found_pkg and found_sub_wc:
for par in p:
print(f"{f_src} --> {par.license} ({par.files})", file=sys.stderr)
return AMBIGUOUS_LICENSE_INFO_FOUND
# fake previous behaviour
p = p[-1] if p else None
if p:
return self.get_license_str(p)
return NO_LICENSE_INFO_FOUND
@@ -193,30 +253,11 @@ class BomGenerator():
# Get the copyright for file by searching the exact file path
def get_copyright(self, copyright_info, f_src):
    """Return the copyright string of the files paragraph found for *f_src*.

    Looks the path up with ``copyright_info.find_files_paragraph`` and
    formats the result with ``self.get_copyright_str``. Returns
    ``NO_COPYRIGHT_INFO_FOUND`` when the lookup yields ``None``.
    """
    paragraph = copyright_info.find_files_paragraph(f_src)
    # Identity test: a paragraph object that compares equal to None or is
    # falsy (e.g. an empty mapping) is still a lookup hit.
    if paragraph is not None:
        return self.get_copyright_str(paragraph)
    return NO_COPYRIGHT_INFO_FOUND
# Get the license for file by searching a match with the last part
# of the report generated by FOSSology as we can't rely on the folder
# used to upload the source code
def get_license_relaxed(self, copyright_info, f_src):
    """Return the license of the first paragraph listing a path ending in *f_src*.

    A listed path matches when it equals *f_src* or ends with *f_src*
    preceded by a ``/``, so ``xlists.c`` no longer matches ``lists.c``.
    Returns ``NO_LICENSE_INFO_FOUND`` when no listed path matches.
    """
    # Anchor the suffix at a path-component boundary; a bare endswith()
    # also accepts names that merely share trailing characters.
    suffix = f_src if f_src.startswith('/') else '/' + f_src
    for paragraph in copyright_info.all_files_paragraphs():
        for reported_path in paragraph.files:
            if reported_path == f_src or reported_path.endswith(suffix):
                return self.get_license_str(paragraph)
    return NO_LICENSE_INFO_FOUND
def get_copyright_relaxed(self, copyright_info, f_src):
    """Return the copyright of the first paragraph listing a path ending in *f_src*.

    A listed path matches when it equals *f_src* or ends with *f_src*
    preceded by a ``/``, so ``xlists.c`` no longer matches ``lists.c``.
    Returns ``NO_COPYRIGHT_INFO_FOUND`` when no listed path matches.
    """
    # Anchor the suffix at a path-component boundary; a bare endswith()
    # also accepts names that merely share trailing characters.
    suffix = f_src if f_src.startswith('/') else '/' + f_src
    for paragraph in copyright_info.all_files_paragraphs():
        for reported_path in paragraph.files:
            if reported_path == f_src or reported_path.endswith(suffix):
                return self.get_copyright_str(paragraph)
    return NO_COPYRIGHT_INFO_FOUND
def get_copyright_file(self, filenames):
if COPYRIGHT_REPORT in filenames:
return COPYRIGHT_REPORT
@@ -260,7 +301,7 @@ class BomGenerator():
for p in path_depth.get_paths(pd_key):
path_prefix_len = unit_file_path.find('/' + p + '/')
if path_prefix_len != -1:
return unit_file_path[:path_prefix_len+1]
return unit_file_path[:path_prefix_len + 1]
return ''
def scan_rust_unit(self, f_dir, f_src, copyright_info, out_licenses, out_copyright):
@@ -296,18 +337,12 @@ class BomGenerator():
out_copyright.add(NO_COPYRIGHT_INFO_FOUND)
def scan_regular_unit(self, unit_file_path, copyright_info, out_licenses, out_copyright):
if not self.relaxed:
license = self.get_license(copyright_info, unit_file_path)
else:
license = self.get_license_relaxed(copyright_info, unit_file_path)
out_licenses.add(license)
license = self.get_license(copyright_info, unit_file_path)
out_licenses.add(license)
if self.copyright:
if not self.relaxed:
copyright = self.get_copyright(copyright_info, unit_file_path)
else:
copyright = self.get_copyright_relaxed(copyright_info, unit_file_path)
out_copyright.add(copyright)
if self.copyright:
copyright = self.get_copyright(copyright_info, unit_file_path)
out_copyright.add(copyright)
def scan_units(self, binary_name, package_copyright_files, units):
binary_licenses = set()
@@ -316,7 +351,8 @@ class BomGenerator():
if not package_copyright_files.primary_copyright:
if self.copyright:
return {'binary_name': binary_name, 'binary_licenses': [NO_LICENSE_REPORT_FOUND], 'binary_copyright': [NO_COPYRIGHT_REPORT_FOUND], 'sources': sources}
return {'binary_name': binary_name, 'binary_licenses': [NO_LICENSE_REPORT_FOUND],
'binary_copyright': [NO_COPYRIGHT_REPORT_FOUND], 'sources': sources}
else:
return {'binary_name': binary_name, 'binary_licenses': [NO_LICENSE_REPORT_FOUND], 'sources': sources}
@@ -343,7+379,7 @@
if unit_file_path.startswith('/usr/share/cargo/registry'):
self.scan_rust_unit(f_dir, f_src, file_copyright_info, file_licenses, file_copyright)
else:
unit_file_path = unit_file_path.removeprefix(path_prefix)
Please register or sign in to reply
self.scan_regular_unit(unit_file_path, file_copyright_info, file_licenses, file_copyright)
if self.verbose > VERBOSE_BINARY:
@@ -355,7+391,7 @@
if self.copyright:
list_binary_copyright = parse_copyright(binary_copyright)
return {'binary_name': binary_name, 'binary_licenses': list(binary_licenses), 'binary_copyright': list_binary_copyright, 'sources': sources}
return {'binary_name': binary_name, 'binary_licenses': list(binary_licenses),
'binary_copyright': list_binary_copyright, 'sources': sources}
else:
return {'binary_name': binary_name, 'binary_licenses': list(binary_licenses), 'sources': sources}
@@ -374,7 +411,8 @@ class BomGenerator():
if self.copyright:
list_package_copyright = parse_copyright(package_copyright)
return {'package_name': package_name, 'package_licenses': list(package_licenses), 'package_copyright': list_package_copyright, 'binaries': binaries}
return {'package_name': package_name, 'package_licenses': list(package_licenses),
'package_copyright': list_package_copyright, 'binaries': binaries}
else:
return {'package_name': package_name, 'package_licenses': list(package_licenses), 'binaries': binaries}
@@ -447,25 +485,26 @@ class BomGenerator():
kwargs["indent"] = 4
print(json.dumps(image, **kwargs))
def main(argv):
global RELAXED_SEARCH
def main(argv):
parser = argparse.ArgumentParser()
parser.add_argument ("-c","--comments", action='store_true', help="include license comments")
parser.add_argument ("-C","--copyright", action='store_true', help="include copyright information")
parser.add_argument ("-l","--copyright_limit", default=COPYRIGHT_LENGTH, help="limit maximum number of characters in copyright information")
parser.add_argument ("-d","--dir", default=DEFAULT_BOM_DIR, help="directory to search for information")
parser.add_argument ("-p","--pretty", action='store_true', help="indent the JSON output")
parser.add_argument ("-r","--relaxed", action='store_true', help="use relaxed search")
parser.add_argument ("-s","--dpkg-status", default=DEFAULT_DPKG_STATUS, help="dpkg status file")
parser.add_argument ("-v","--verbose", type=int, default=VERBOSE_IMAGE,
help="verbose use in output 0: image, 1: package, 2: binary, 3: source")
parser.add_argument("-c", "--comments", action='store_true', help="include license comments")
parser.add_argument("-C", "--copyright", action='store_true', help="include copyright information")
parser.add_argument("-l", "--copyright_limit", default=COPYRIGHT_LENGTH,
help="limit maximum number of characters in copyright information")
parser.add_argument("-d", "--dir", default=DEFAULT_BOM_DIR, help="directory to search for information")
parser.add_argument("-p", "--pretty", action='store_true', help="indent the JSON output")
parser.add_argument("-s", "--dpkg-status", default=DEFAULT_DPKG_STATUS, help="dpkg status file")
parser.add_argument("-v", "--verbose", type=int, default=VERBOSE_IMAGE,
help="verbose use in output 0: image, 1: package, 2: binary, 3: source")
args = parser.parse_args ()
args = parser.parse_args()
bom_generator = BomGenerator(args.dir, args.relaxed, args.dpkg_status, args.verbose, args.pretty, args.comments, args.copyright, args.copyright_limit)
bom_generator = BomGenerator(args.dir, args.dpkg_status, args.verbose, args.pretty, args.comments,
args.copyright, args.copyright_limit)
bom_generator.check_packages_copyright()
if __name__ == '__main__':
main(sys.argv[1:])
Loading