-
Frederic Danis authored
License scan for `pkg/dash` and `tests\dash` should not re-use the scan results from a previous scan of the other project. Separate license scan using different group id, which is done using different users.
Frederic Danis authored: License scan for `pkg/dash` and `tests\dash` should not re-use the scan results from a previous scan of the other project. Separate license scan using different group id, which is done using different users.
ci-license-scan 16.64 KiB
#!/usr/bin/env python3
# SPDX-License-Identifier: MPL-2.0
#
# Copyright © 2019 Collabora Ltd
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
from pathlib import Path
from io import BufferedReader
from itertools import chain
import sh
import sys
import yaml
import textwrap
import unicodedata
import fossology
# This is necessary to eliminate references (anchors/aliases) in the generated
# YAML: Perl tools use YAML::Tiny which doesn’t support references.
# The override is installed on the SafeDumper class itself, so it affects
# every dump made through that dumper.
yaml.SafeDumper.ignore_aliases = lambda *args : True
from debian.copyright import Copyright, FilesParagraph, NotMachineReadableError
# Callable wrapper around the external `scan-copyrights` program; main() runs
# it to produce the raw debian/apertis/copyright.new report.
scan_copyrights = sh.Command('scan-copyrights')
# Default scan rules, written in gitignore syntax. configure_scanner() converts
# them with gitignore2pat(): plain patterns end up in the "ignore" list, while
# patterns negated with "!" end up in the "check" list. Lines starting with "#"
# inside the string are dropped by gitignore2list() before conversion.
builtin_ignores = """
# Ignore Apertis files
debian/apertis/*
# Ignore possible upstream signing keys
debian/**/*.pgp
debian/**/*.gpg
debian/**/*.asc
# Ignore translations
**/po/*
# Ignore general backup files
*~
# Ignore emacs recovery files
.#*
# Ignore vi swap files
.*.swp
# File or directory names that should be ignored
DEADJOE
.cvsignore
.arch-inventory
.bzrignore
.gitignore
CVS
RCS
.pc
.deps
{arch}
.arch-ids
.svn
.hg
.hgtags
_darcs
.git
.shelf
_MTN
.bzr
.bzr.backup
.bzrtags
# Autotools build script not affecting the licensing of artifacts
config.guess
config.sub
# Scan everything else
!**
"""
# gitignore format to regex conversion
# internally, paths all start with ./, so when we need to anchor patterns,
# we need to prepend ^\./, not just ^
# pathspec implements the full spec, but in case it’s not there we can get
# very close, except for a few corner cases, using fnmatch
try:
    from pathspec.patterns.gitwildmatch import GitWildMatchPattern

    def gitpattern2re(pattern):
        """
        Convert one gitignore-style pattern into a regex using pathspec.

        Returns a ``(regex, positive)`` tuple: ``regex`` is anchored on the
        leading ``./`` every scanned path carries, ``positive`` is False when
        the pattern was negated with ``!``.
        """
        # '**' and '!**' are special-cased to match every path
        if pattern == '**':
            return '.*', True
        if pattern == '!**':
            return '.*', False
        regex, positive = GitWildMatchPattern.pattern_to_regex(pattern)
        # re-anchor on ./ : only the first ^ is the start-of-string anchor
        regex = regex.replace('^', r'^\./', 1)
        return regex, positive
except ImportError:
    # Only a missing pathspec should trigger the fallback; any other error
    # is a real problem and must not be hidden.
    import warnings
    warnings.warn("pathspec not available, copyright ignore patterns will not match exactly")
    import fnmatch
    import re
    # a '.*' which is not immediately followed by another '.*'
    NOTDOUBLESTAR = re.compile(r'\.\*(?!\.\*)')

    def gitpattern2re(pattern):
        """
        Approximate conversion of one gitignore-style pattern into a regex
        using fnmatch, for when pathspec is not installed.

        Returns a ``(regex, positive)`` tuple: ``regex`` is anchored on the
        leading ``./`` every scanned path carries, ``positive`` is False when
        the pattern was negated with ``!``.
        """
        # '**' and '!**' are special-cased to match every path
        if pattern == '**':
            return '.*', True
        if pattern == '!**':
            return '.*', False
        if pattern.startswith('!'):
            positive = False
            pattern = pattern[1:]
        else:
            positive = True
        if '/' in pattern:
            # patterns containing a slash are relative to the tree root
            if pattern.startswith('/'):
                pattern = pattern[1:]
            # a trailing slash designates a directory: match what is below it
            if pattern.endswith('/'):
                pattern += '**'
            regex = fnmatch.translate(pattern)
            # single stars must not cross directory separators, double stars may
            regex = NOTDOUBLESTAR.sub('[^/]*', regex).replace('.*[^/]*/', '.*').replace('.*[^/]*', '.*')
        else:
            regex = fnmatch.translate(pattern)
        return rf"^\./{regex}", positive
# use colordiff when available for better readability
try:
    diff = sh.Command('colordiff')
except sh.CommandNotFound:
    # colordiff is optional, plain diff produces the same output without colours
    diff = sh.Command('diff')
# Unified diff between the approved and the freshly generated copyright file,
# run in the foreground so its output goes straight to the terminal/CI log.
copyrightdiff = diff.bake('-au', '--label=approved-copyright', '--label=unapproved-copyright', _fg=True)
def merge_scan_patterns(*dicts):
    """
    Merge several ``{action: {kind: [values]}}`` dictionaries into a new one.

    The first dictionary mentioning an action contributes a copy of its
    section as-is; sections from later dictionaries are folded in kind by
    kind as sorted, de-duplicated lists. A kind whose merged list would be
    empty is left untouched. The input dictionaries are not modified.

    >>> dict1 = {'ignore': {'suffixes': ['jpg', 'png']}, 'check': {'pattern': ['foo.*']}}
    >>> dict2 = {'ignore': {'suffixes': ['pdf', 'cpp']}, 'check': {'pattern': ['[A-Z]*']}}
    >>> dict3 = {'ignore': {'pattern': []}}
    >>> merge_scan_patterns(dict1, dict2)
    {'ignore': {'suffixes': ['cpp', 'jpg', 'pdf', 'png']}, 'check': {'pattern': ['[A-Z]*', 'foo.*']}}
    >>> merge_scan_patterns(dict1, dict3)
    {'ignore': {'suffixes': ['jpg', 'png']}, 'check': {'pattern': ['foo.*']}}
    >>> merge_scan_patterns(dict2, dict3)
    {'ignore': {'suffixes': ['pdf', 'cpp']}, 'check': {'pattern': ['[A-Z]*']}}
    >>> merge_scan_patterns(dict3, dict2, dict1)
    {'ignore': {'pattern': [], 'suffixes': ['cpp', 'jpg', 'pdf', 'png']}, 'check': {'pattern': ['[A-Z]*', 'foo.*']}}
    """
    merged = {}
    for patterns in dicts:
        for action, bits in patterns.items():
            if action not in merged:
                # first occurrence: take the section verbatim (shallow copy)
                merged[action] = bits.copy()
                continue
            section = merged[action]
            for bit, values in bits.items():
                combined = sorted(set(section.get(bit, []) + values))
                if combined:
                    section[bit] = combined
    return merged
def gitignore2list(s):
    """
    Turn gitignore-formatted text into a list of ``(regex, positive)`` tuples.

    Lines starting with ``#`` and lines that are empty once stripped are
    skipped; every other line is converted with gitpattern2re().

    >>> gitignore2list('''# test file
    ... *.txt
    ...
    ... **/foo/*jpg
    ... **/*jpg
    ... *.cache/
    ... !*.js
    ... *.cache/**/test''') #doctest: +ELLIPSIS
    [('^...', True), ('^...', True), ('^...', True), ('^...', True), ('^...', False), ('^...', True)]
    """
    converted = []
    for raw_line in s.splitlines():
        # the comment check is done on the raw line, before stripping
        if raw_line.startswith('#'):
            continue
        glob = raw_line.strip()
        if glob:
            converted.append(gitpattern2re(glob))
    return converted
def gitignore2pat(s):
    """
    Convert gitignore patterns to the Dpkg::Copyright::Scanner-compatible format

    Regular patterns are listed under ``ignore``, patterns negated with ``!``
    under ``check``.

    >>> gitignore2pat('''# test file
    ... *.txt
    ...
    ... **/foo/*jpg
    ... **/*jpg
    ... *.cache/
    ... !*.js
    ... *.cache/**/test''') #doctest: +ELLIPSIS
    {'ignore': {'pattern': [..., ..., ...]}, 'check': {'pattern': [...]}}
    """
    converted = gitignore2list(s)
    return {
        'ignore': {
            'pattern': [regex for regex, positive in converted if positive]
        },
        'check': {
            'pattern': [regex for regex, positive in converted if not positive]
        }
    }
def parse_whitelist(filename: str) -> dict:
    """
    Attempt to open and parse a whitelist, do nothing if it’s not there

    :param filename: path of a whitelist written in gitignore syntax
    :return: the patterns as returned by gitignore2pat(), or an empty dict
        when ``filename`` is not an existing regular file
    """
    # Path objects are not used as context managers here: that was a no-op,
    # is deprecated since Python 3.11 and raises on Python 3.13+.
    whitelist = Path(filename)
    if not whitelist.is_file():
        return {}
    return gitignore2pat(whitelist.read_text())
def detect_license_for(path, c=None):
    """
    Look up the copyright and license recorded for ``path``.

    :param path: file path or glob to look up
    :param c: parsed copyright object to query; when None, debian/copyright
        is opened and parsed (non-strict) for this call
    :return: ``(copyright, license)`` of the matching Files paragraph, or
        ``(None, None)`` when no paragraph matches
    """
    if c is None:
        with open('debian/copyright', 'rb') as copyright_file:
            c = Copyright(copyright_file, strict=False)
    # patterns ending with * do not always match correctly (TODO: why?)
    # remove the * to make sure they do
    lookup = path[:-1] if path.endswith('/*') else path
    paragraph = c.find_files_paragraph(lookup)
    if not paragraph:
        return None, None
    return paragraph.copyright, paragraph.license
def is_gibberish(s: str) -> bool:
"""
Try to tell whether the input is some random gibberish, not a copyright statement
>>> is_gibberish('ÿêÒªÿêÓ¬ÿëÓ®ÿëÔ®ÿëÕ°ÿëÖ°ÿìÕ±ÿìÖ±ÿì×´ÿìصÿíضÿíÙ')
True
>>> is_gibberish('2011 Jürgen Möller')
False
>>> is_gibberish("2000 李健秋")
False
>>> is_gibberish('Æf-w¬6f*äIy,2bÓñ.\x982§VS')
True
>>> is_gibberish('ÿy\x8d\x8aÿt}kÿoiEÿiP')
True
>>> is_gibberish('f%^2<')
True
"""
# FIXME: Fix scan-copyright to say 'no-info-found' instead of spitting random stuff
# See https://phabricator.apertis.org/T6677 for details
if len(s) < 6:
return True
if any(unicodedata.category(c)[0] == 'C' for c in s):
return True
gibberishness = sum(1 if (c >= '\x7b') and (c <= '\xff') else 0 for c in s)
if len(s) < 16 and s.count(' ') == 0 and gibberishness >= 6:
return True
if gibberishness >= 0.6 * len(s):
return True
return False
def reindent_multiline(s: str) -> str:
    """
    Strip leading whitespace from every line of ``s`` and indent each
    continuation line (every line but the first) with a single space.
    """
    return '\n '.join(map(str.lstrip, s.split('\n')))
def _load_yaml_mapping(path):
    """
    Best-effort load of the YAML mapping stored in the file at ``path``.

    Returns an empty dict when the file is missing, cannot be read or parsed,
    or does not hold a mapping (for instance because it is empty).
    """
    if not path.is_file():
        return {}
    try:
        data = yaml.safe_load(path.read_text())
    except (OSError, ValueError, yaml.YAMLError) as e:
        print(f"WARN: Ignoring unreadable {path}: {e}", file=sys.stderr)
        return {}
    # yaml.safe_load() yields None for an empty document
    return data if isinstance(data, dict) else {}


def configure_scanner(whitelists):
    """
    Merge our metadata with that of the Debian package

    Two files are (re)written under debian/:

    * copyright-scan-patterns.yml always: the patterns already in that file
      merged with the builtin ignores and every whitelist in ``whitelists``;
    * fill.copyright.blanks.yml only when there is something to write: the
      overrides already in that file, a ``debian/`` entry derived from
      debian/copyright when available and not already present, then the
      overrides from debian/apertis/copyright.yml, which take precedence.

    The debian/apertis directory is created if it does not exist.

    :param whitelists: iterable of whitelist file paths in gitignore syntax;
        missing files are skipped
    """
    # Path objects are not used as context managers: that was a no-op and
    # raises on Python 3.13+.
    scan_patterns = Path('debian') / 'copyright-scan-patterns.yml'
    debian_patterns = _load_yaml_mapping(scan_patterns)
    extra_patterns = [gitignore2pat(builtin_ignores)]
    extra_patterns += [parse_whitelist(whitelist) for whitelist in whitelists]
    new_yaml = yaml.safe_dump(merge_scan_patterns(debian_patterns, *extra_patterns), default_style='|')
    scan_patterns.write_text(new_yaml)

    (Path('debian') / 'apertis').mkdir(parents=True, exist_ok=True)
    debian_copyright_overrides = Path('debian') / 'fill.copyright.blanks.yml'
    copyright_overrides = _load_yaml_mapping(debian_copyright_overrides)

    # when Debian ships machine-readable debian/copyright, extract license for debian/*
    try:
        debian_copyright, debian_license = detect_license_for('debian/*')
    except (OSError, ValueError, NotMachineReadableError):
        # missing or unparsable debian/copyright: nothing to reuse
        debian_copyright, debian_license = None, None
    if debian_license and 'debian/' not in copyright_overrides:
        copyright_overrides['debian/'] = {
            'license': debian_license.synopsis,
            'copyright': debian_copyright
        }

    apertis_copyright_overrides = Path('debian') / 'apertis' / 'copyright.yml'
    copyright_overrides.update(_load_yaml_mapping(apertis_copyright_overrides))

    if copyright_overrides:
        debian_copyright_overrides.write_text(yaml.safe_dump(copyright_overrides, default_style='|'))
class NullFilter(BufferedReader):
    """
    Buffered reader replacing control bytes with ``.`` in what it returns.

    Every byte below 0x20 except HT, LF and CR is replaced, most importantly
    NULs, which make the copyright parser choke.

    The control bytes end up in the input stream because the scanning tool
    attempts to parse binary files as text and spits bits of them into its
    output. The parser chokes because it *apparently* uses logic similar to
    str.splitlines() internally, breaking lines on the wrong characters.
    This needs to be fixed in the Python package implementing it.

    Only read() and readline() are overridden.
    """
    # FIXME: Fix scan-copyrights to better detect non-textual files
    # FIXME: Fix debian.copyright.Copyright to better handle non-well-formed input
    # See https://phabricator.apertis.org/T6677 for details

    # all C0 control bytes but HT (9), LF (10) and CR (13)
    forbidden = bytes(code for code in range(32) if code not in (9, 10, 13))
    transtable = bytes.maketrans(forbidden, b'.' * len(forbidden))

    def read(self, size=-1):
        return super().read(size).translate(NullFilter.transtable)

    def readline(self, size=-1):
        return super().readline(size).translate(NullFilter.transtable)
class MutableCopyright(Copyright):
    """
    Copyright subclass giving direct access to the parent's paragraph container.

    main() uses it to test whether a Files paragraph is still present and to
    remove it in place.
    """
    @property
    def paragraphs(self):
        # Reaches into the name-mangled private ``__paragraphs`` attribute of
        # Copyright; NOTE(review): this relies on an implementation detail of
        # the debian.copyright module — confirm when upgrading it.
        return self._Copyright__paragraphs
def main():
    """
    Command-line entry point: scan the source tree in the current directory
    and compare the result with the approved copyright report.

    Steps:

    1. configure the scanner through configure_scanner();
    2. run scan-copyrights into debian/apertis/copyright.new;
    3. optionally (when all FOSSology and source options are given) upload the
       source to FOSSology and store its dep5 report in
       debian/apertis/copyright.fossology;
    4. post-process copyright.new: reuse the information from debian/copyright
       for files whose license is UNKNOWN, replace gibberish copyright
       statements with ``no-info-found`` and collect disallowed licenses;
    5. diff debian/apertis/copyright against debian/apertis/copyright.new.

    Exits with status 2 when the scan output cannot be parsed, and with
    status 1 when differences are found and --fail-on-change is set, or when
    files with UNKNOWN or blacklisted licenses remain.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--fail-on-change', dest='fail_on_change', action='store_true', default=False, help='fail on any changes')
    parser.add_argument('--blacklist-license', dest='blacklist', metavar='LICENSE', action='append', default=[], help='license to blacklist')
    # Kept in its own destination: sharing one with the appending option above
    # made the two options overwrite each other (or crash) when combined.
    parser.add_argument('--blacklist-licenses', dest='blacklist_licenses', metavar='LICENSES', type=str, default='', help='space-separated licenses to blacklist')
    parser.add_argument('--extra-whitelist', dest='whitelists', metavar='WHITELIST', action='append', default=['debian/apertis/copyright.whitelist'], help='extra file whitelist')
    parser.add_argument('--fossology-host', dest='fossology_host', help='FOSSology host URL to use')
    parser.add_argument('--fossology-username', dest='fossology_username', help='FOSSology username')
    parser.add_argument('--fossology-password', dest='fossology_password', help='FOSSology password')
    parser.add_argument('--source-url', dest='source_url', help='git source URL to scan')
    parser.add_argument('--source-branch', dest='source_branch', help='git source branch to scan')
    args = parser.parse_args()

    print("%s fail on change" % ("Will" if args.fail_on_change else "Will not"))

    disallowlist = list(args.blacklist) + args.blacklist_licenses.split()
    if disallowlist:
        print(f"Disallowed licenses: {disallowlist}")
    # licenses are compared case-insensitively, ignoring a trailing '+'
    disallowed = {d.lower() for d in disallowlist}

    print(f"Using whitelists: {args.whitelists}")

    configure_scanner(args.whitelists)

    # make sure the approved report exists so the final diff has two inputs
    Path('debian/apertis/copyright').touch(exist_ok=True)

    sys.stdout.flush()
    with open('debian/apertis/copyright.new', 'wt') as f:
        print('Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n', file=f)
        scan_copyrights(_out=f)

    if args.fossology_host and args.fossology_username and args.fossology_password and args.source_url and args.source_branch:
        with open('debian/apertis/copyright.fossology', 'wt') as f:
            args.source_url = args.source_url.rstrip('/')
            # the upload is named after the last URL component, minus '.git'
            name = args.source_url.split('/')[-1]
            suffix = '.git'
            if name.endswith(suffix):
                name = name[:-len(suffix)]

            foss_project = fossology.ApertisFossology(args.fossology_host, args.fossology_username, args.fossology_password)
            foss_project.upload(args.source_url, args.source_branch, name)
            group_id = foss_project.get_group_id()
            reuse_id = foss_project.get_previous_upload_analysis_id(name, group_id)
            foss_project.analyse(reuse_id, group_id)
            report = foss_project.get_report('dep5')
            print(report, file=f)

    bad_licenses = set()
    unknown_licensed = False
    # copyright statement -> license -> Files paragraphs to be replaced
    fixups = dict()
    fixups_applied = set()

    # open for parsing as binary since copyrights may (incorrectly) contain binary data
    with open('debian/copyright', 'rb') as f:
        try:
            debian_copyrights = Copyright(f, strict=False)
        except (ValueError, NotMachineReadableError):
            print("WARN: No machine-readable debian/copyright found, can’t use licensing information from Debian.", file=sys.stderr)
            debian_copyrights = Copyright()

    with open('debian/apertis/copyright.new', 'rb') as scanned:
        try:
            c = MutableCopyright(NullFilter(scanned), strict=False)
            for p in c.all_files_paragraphs():
                if p.license:
                    if p.license.synopsis == 'UNKNOWN':
                        # NOTE(review): the paragraph is queued once per
                        # resolved file, so a multi-file paragraph can
                        # contribute its whole file list several times below
                        # — confirm whether that is intended.
                        for path in p.files:
                            known_copyright, known_license = detect_license_for(path, debian_copyrights)
                            if known_copyright is not None and known_license is not None:
                                if known_license.synopsis.rstrip('+').lower() in disallowed:
                                    bad_licenses.add(known_license.synopsis)
                                fixups.setdefault(known_copyright, {}).setdefault(known_license, []).append(p)
                            else:
                                unknown_licensed = True
                    elif p.license.synopsis.rstrip('+').lower() in disallowed:
                        bad_licenses.add(p.license.synopsis)
                if p.copyright and is_gibberish(p.copyright):
                    p.copyright = 'no-info-found'

            if fixups:
                for fc, fls in fixups.items():
                    for fl, fps in fls.items():
                        files = list(chain.from_iterable(fp.files for fp in fps))
                        # drop the UNKNOWN paragraphs, one merged paragraph
                        # carrying the Debian information replaces them
                        for fp in fps:
                            if fp in c.paragraphs:
                                c.paragraphs.remove(fp)
                        new_p = FilesParagraph.create(
                            files=files,
                            copyright=reindent_multiline(fc),
                            license=fl
                        )
                        c.add_files_paragraph(new_p)
                        fixups_applied.update(files)
                # single string so that every listed file gets the same indent
                print("Reused licensing information from debian/copyright for:\n"
                      + textwrap.indent("\n".join(sorted(fixups_applied)), " " * 2),
                      file=sys.stderr)

            with open('debian/apertis/copyright.new', 'wt') as f:
                c.dump(f)
        except ValueError:
            print("ERROR: Can't parse debian/copyright, verify copyright statement for binary gibberish.", file=sys.stderr)
            sys.exit(2)

    try:
        copyrightdiff('debian/apertis/copyright', 'debian/apertis/copyright.new')
    except sh.ErrorReturnCode as e:
        # diff exits with 1 when the files differ, anything else is an error
        if e.exit_code != 1:
            raise
        if args.fail_on_change:
            print('\nERROR: Differences found, reconcillation needed.', file=sys.stderr)
            sys.exit(1)

    if unknown_licensed or bad_licenses:
        print("\nERROR:", file=sys.stderr)
        if unknown_licensed:
            print("  Files with UNKNOWN license found.", file=sys.stderr)
        for l in bad_licenses:
            print(f"  Blacklisted license {l} found.", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
    # some doctest magic: `--test` as first argument runs the doctests of
    # this file verbosely instead of the scanner
    wants_doctests = sys.argv[1:2] == ['--test']
    if not wants_doctests:
        main()
    else:
        import doctest
        sys.argv[1] = '-v'
        doctest.testmod()