ci-license-scan

#!/usr/bin/env python3
# SPDX-License-Identifier: MPL-2.0
#
# Copyright © 2019 Collabora Ltd
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import argparse
from pathlib import Path
from io import BufferedReader
from itertools import chain
import sh
import sys
import yaml
import textwrap
import unicodedata
import fossology

# Make the safe dumper never emit YAML anchors/aliases (references) for
# objects that appear more than once in the dumped data: the Perl tools
# reading the generated YAML use YAML::Tiny, which doesn’t support references.
# NOTE: this patches yaml.SafeDumper itself, so it applies to every
# yaml.safe_dump() call made by this script.
yaml.SafeDumper.ignore_aliases = lambda *args : True

from debian.copyright import Copyright, FilesParagraph, NotMachineReadableError

# Handle on the external scan-copyrights program; main() runs it with its
# standard output redirected into debian/apertis/copyright.new.
# NOTE(review): sh presumably resolves the program when the Command is
# created, so a missing scan-copyrights would fail here at import — confirm.
scan_copyrights = sh.Command('scan-copyrights')

# Built-in patterns in gitignore syntax, always passed through gitignore2pat()
# by configure_scanner(): plain patterns end up in the scanner's 'ignore'
# list, patterns prefixed with '!' (here only the final '!**') end up in its
# 'check' list. Lines starting with '#' are comments and are dropped.
builtin_ignores = """
# Ignore Apertis files
debian/apertis/*
# Ignore possible upstream signing keys
debian/**/*.pgp
debian/**/*.gpg
debian/**/*.asc
# Ignore translations
**/po/*
# Ignore general backup files
*~
# Ignore emacs recovery files
.#*
# Ignore vi swap files
.*.swp
# File or directory names that should be ignored
DEADJOE
.cvsignore
.arch-inventory
.bzrignore
.gitignore
CVS
RCS
.pc
.deps
{arch}
.arch-ids
.svn
.hg
.hgtags
_darcs
.git
.shelf
_MTN
.bzr
.bzr.backup
.bzrtags
# Autotools build script not affecting the licensing of artifacts
config.guess
config.sub
# Scan everything else
!**
"""

# gitignore format to regex conversion
# internally, all paths start with ./, so when we need to anchor patterns,
# we need to prepend ^\./, not just ^
# pathspec implements the full spec, but in case it’s not there we can get
# very close, except for a few corner cases, using fnmatch

try:
    from pathspec.patterns.gitwildmatch import GitWildMatchPattern

    def gitpattern2re(pattern):
        """
        Convert one gitignore pattern to a regex using pathspec.

        Returns a (regex, positive) tuple; positive is False for negated
        ('!'-prefixed) patterns. The bare '**' and '!**' patterns are
        special-cased to match everything.
        """
        if pattern == '**':
            return '.*', True
        if pattern == '!**':
            return '.*', False
        regex, positive = GitWildMatchPattern.pattern_to_regex(pattern)
        # paths are matched with a leading ./, so anchor right after it;
        # raw string: '\.' is not a valid escape in a normal string literal
        regex = regex.replace('^', r'^\./', 1)
        return regex, positive
except ImportError:
    # only a missing pathspec should select the approximate fallback
    import warnings
    warnings.warn("pathspec not available, copyright ignore patterns will not match exactly")
    import fnmatch
    import re

    # a '.*' in the translated regex which is not followed by another '.*',
    # i.e. one that does not come from the first half of a '**'
    NOTDOUBLESTAR = re.compile(r'\.\*(?!\.\*)')

    def gitpattern2re(pattern):
        """
        Approximate conversion of one gitignore pattern to a regex using fnmatch.

        Returns a (regex, positive) tuple; positive is False for negated
        ('!'-prefixed) patterns. The bare '**' and '!**' patterns are
        special-cased to match everything.
        """
        if pattern == '**':
            return '.*', True
        if pattern == '!**':
            return '.*', False
        if pattern.startswith('!'):
            positive = False
            pattern = pattern[1:]
        else:
            positive = True
        if '/' in pattern:
            if pattern.startswith('/'):
                pattern = pattern[1:]
            if pattern.endswith('/'):
                # a trailing slash means "everything below this directory"
                pattern += '**'
            regex = fnmatch.translate(pattern)
            # single '*' must not cross directory separators, '**' may
            regex = re.sub(NOTDOUBLESTAR, '[^/]*', regex).replace('.*[^/]*/', '.*').replace('.*[^/]*', '.*')
        else:
            regex = fnmatch.translate(pattern)
        # raw f-string: '\.' is not a valid escape in a normal string literal
        return rf"^\./{regex}", positive

# use colordiff when available for better readability
try:
    diff = sh.Command('colordiff')
except sh.CommandNotFound:
    # only a missing colordiff should trigger the fallback to plain diff;
    # anything else (e.g. KeyboardInterrupt) must not be swallowed
    diff = sh.Command('diff')

# unified diff treating all files as text, labelling the two sides; _fg=True
# runs the command in the foreground so its output goes straight to the terminal
copyrightdiff = diff.bake('-au', '--label=approved-copyright', '--label=unapproved-copyright', _fg=True)

def merge_scan_patterns(*dicts):
    """
    Merge scan-pattern dictionaries of the form {action: {kind: [values]}}.

    The first occurrence of an action is copied as-is; for later occurrences
    the value lists of each kind are unioned, de-duplicated and sorted.
    Empty or None sources, and actions whose value is None (as produced by
    empty YAML documents or keys), are tolerated and contribute nothing.
    The returned dictionary does not share list objects with the inputs.

    >>> dict1 = {'ignore': {'suffixes': ['jpg', 'png']}, 'check': {'pattern': ['foo.*']}}
    >>> dict2 = {'ignore': {'suffixes': ['pdf', 'cpp']}, 'check': {'pattern': ['[A-Z]*']}}
    >>> dict3 = {'ignore': {'pattern': []}}
    >>> merge_scan_patterns(dict1, dict2)
    {'ignore': {'suffixes': ['cpp', 'jpg', 'pdf', 'png']}, 'check': {'pattern': ['[A-Z]*', 'foo.*']}}
    >>> merge_scan_patterns(dict1, dict3)
    {'ignore': {'suffixes': ['jpg', 'png']}, 'check': {'pattern': ['foo.*']}}
    >>> merge_scan_patterns(dict2, dict3)
    {'ignore': {'suffixes': ['pdf', 'cpp']}, 'check': {'pattern': ['[A-Z]*']}}
    >>> merge_scan_patterns(dict3, dict2, dict1)
    {'ignore': {'pattern': [], 'suffixes': ['cpp', 'jpg', 'pdf', 'png']}, 'check': {'pattern': ['[A-Z]*', 'foo.*']}}
    >>> merge_scan_patterns(None, dict1)
    {'ignore': {'suffixes': ['jpg', 'png']}, 'check': {'pattern': ['foo.*']}}
    """
    target = {}
    for source in dicts:
        if not source:
            # e.g. yaml.safe_load() returns None for an empty document
            continue
        for action, bits in source.items():
            bits = bits or {}
            if action not in target:
                # copy the lists too, so callers' data is never aliased
                target[action] = {
                    bit: list(values) if isinstance(values, (list, tuple)) else values
                    for bit, values in bits.items()
                }
                continue
            merged = target[action]
            for bit, values in bits.items():
                new_bits = sorted(set(merged.get(bit) or []).union(values or []))
                if new_bits:
                    merged[bit] = new_bits
    return target

def gitignore2list(s):
    """
    Turn the text of a gitignore-style file into a list of (regex, positive)
    tuples, one per pattern, in file order. Lines starting with '#' and
    lines that are empty once stripped are skipped.

    >>> gitignore2list('''# test file
    ... *.txt
    ...
    ... **/foo/*jpg
    ... **/*jpg
    ... *.cache/
    ... !*.js
    ... *.cache/**/test''') #doctest: +ELLIPSIS
    [('^...', True), ('^...', True), ('^...', True), ('^...', True), ('^...', False), ('^...', True)]
    """
    globs = []
    for raw_line in s.splitlines():
        # the comment check is done on the unstripped line
        if raw_line.startswith('#'):
            continue
        globs.append(raw_line.strip())
    converted = []
    for glob in globs:
        if glob:
            converted.append(gitpattern2re(glob))
    return converted

def gitignore2pat(s):
    """
    Convert gitignore patterns to the Dpkg::Copyright::Scanner-compatible format

    Regular patterns are collected under 'ignore', negated ('!') patterns
    under 'check', each as a list of regexes stored in a 'pattern' key.

    >>> gitignore2pat('''# test file
    ... *.txt
    ...
    ... **/foo/*jpg
    ... **/*jpg
    ... *.cache/
    ... !*.js
    ... *.cache/**/test''') #doctest: +ELLIPSIS
    {'ignore': {'pattern': [..., ..., ...]}, 'check': {'pattern': [...]}}
    """
    converted = gitignore2list(s)
    to_ignore = [regex for regex, positive in converted if positive]
    to_check = [regex for regex, positive in converted if not positive]
    return {'ignore': {'pattern': to_ignore}, 'check': {'pattern': to_check}}

def parse_whitelist(filename: str):
    """
    Attempt to open and parse a whitelist, do nothing if it’s not there

    Returns the patterns of the gitignore-style file in the format produced
    by gitignore2pat(), or an empty dict when filename is not an existing
    regular file.
    """
    # Path objects are not used as context managers here: that support was
    # deprecated in Python 3.11 and removed in 3.13
    whitelist = Path(filename)
    if not whitelist.is_file():
        return {}
    return gitignore2pat(whitelist.read_text())

def detect_license_for(path, c=None):
    """
    Look up the copyright and license recorded for path.

    c is an already parsed copyright object providing find_files_paragraph();
    when it is None, debian/copyright is opened and parsed non-strictly.
    Returns the (copyright, license) of the matching paragraph, or
    (None, None) when no paragraph is found.
    """
    if c is None:
        with open('debian/copyright', 'rb') as copyright_file:
            c = Copyright(copyright_file, strict=False)
    # patterns ending with * do not always match correctly (TODO: why?)
    # remove the * to make sure they do
    lookup = path.rstrip('*') if path.endswith('/*') else path
    paragraph = c.find_files_paragraph(lookup)
    if not paragraph:
        return None, None
    return paragraph.copyright, paragraph.license

def is_gibberish(s: str) -> bool:
    """
    Heuristically decide whether the input is random junk rather than a copyright statement

    The input is considered gibberish when it is shorter than 6 characters,
    contains any character of a Unicode 'C' (control/other) category, or is
    dominated by characters in the '\\x7b'..'\\xff' range.

    >>> is_gibberish('ÿêÒªÿêÓ¬ÿëÓ®ÿëÔ®ÿëÕ°ÿëÖ°ÿìÕ±ÿìÖ±ÿì×´ÿìØµÿíØ¶ÿíÙ')
    True
    >>> is_gibberish('2011 Jürgen Möller')
    False
    >>> is_gibberish("2000 李健秋")
    False
    >>> is_gibberish('Æf-w¬6f*äIy,2bÓñ.\x982§VS')
    True
    >>> is_gibberish('ÿy\x8d\x8aÿt}kÿoiEÿiP')
    True
    >>> is_gibberish('f%^2<')
    True
    """
    # FIXME: Fix scan-copyright to say 'no-info-found' instead of spitting random stuff
    # See https://phabricator.apertis.org/T6677 for details
    if len(s) < 6:
        return True
    for char in s:
        if unicodedata.category(char).startswith('C'):
            return True
    # number of characters falling into the '{'..'ÿ' range
    high_chars = sum('\x7b' <= char <= '\xff' for char in s)
    short_unspaced = len(s) < 16 and ' ' not in s
    if short_unspaced and high_chars >= 6:
        return True
    return high_chars >= 0.6 * len(s)

def reindent_multiline(s: str) -> str:
    """
    Strip the leading whitespace of every line and re-join the lines so that
    each line after the first is indented by exactly one space.
    """
    stripped_lines = (line.lstrip() for line in s.split('\n'))
    return '\n '.join(stripped_lines)

def _load_yaml_mapping(path):
    """
    Best-effort load of a YAML mapping from path.

    Returns an empty dict when the file does not exist, cannot be read or
    parsed, is empty, or does not hold a mapping at its top level.
    """
    if not path.is_file():
        return {}
    try:
        data = yaml.safe_load(path.read_text())
    except Exception as e:
        # stay best-effort, but don't swallow KeyboardInterrupt/SystemExit
        print(f"WARN: Ignoring unreadable {path}: {e}", file=sys.stderr)
        return {}
    if data is None:
        # empty document
        return {}
    if not isinstance(data, dict):
        print(f"WARN: Ignoring {path}: not a mapping", file=sys.stderr)
        return {}
    return data

def configure_scanner(whitelists):
    """
    Merge our metadata with that of the Debian package

    Rewrites debian/copyright-scan-patterns.yml with the package's own
    patterns merged with the built-in ignores and the given whitelist files,
    then, when there is anything to write, rewrites
    debian/fill.copyright.blanks.yml with the package's overrides, the
    license found for debian/* in debian/copyright (unless 'debian/' is
    already overridden) and the overrides from debian/apertis/copyright.yml.
    """
    # Path objects are deliberately not used as context managers: that
    # support was deprecated in Python 3.11 and removed in 3.13
    debian_dir = Path('debian')

    scan_patterns = debian_dir / 'copyright-scan-patterns.yml'
    debian_patterns = _load_yaml_mapping(scan_patterns)
    extra_patterns = [gitignore2pat(builtin_ignores)]
    extra_patterns += [parse_whitelist(whitelist) for whitelist in whitelists]
    new_yaml = yaml.safe_dump(merge_scan_patterns(debian_patterns, *extra_patterns), default_style='|')
    scan_patterns.write_text(new_yaml)

    (debian_dir / 'apertis').mkdir(parents=True, exist_ok=True)
    debian_copyright_overrides = debian_dir / 'fill.copyright.blanks.yml'
    copyright_overrides = _load_yaml_mapping(debian_copyright_overrides)
    try:
        # when Debian ships machine-readable debian/copyright, extract license for debian/*
        debian_copyright, debian_license = detect_license_for('debian/*')
        if debian_license and 'debian/' not in copyright_overrides:
            copyright_overrides['debian/'] = {
                'license': debian_license.synopsis,
                'copyright': debian_copyright
            }
    except Exception:
        # missing or not machine-readable debian/copyright: nothing to reuse
        pass
    # Apertis overrides take precedence over everything collected so far
    copyright_overrides.update(_load_yaml_mapping(debian_dir / 'apertis' / 'copyright.yml'))
    if copyright_overrides:
        debian_copyright_overrides.write_text(yaml.safe_dump(copyright_overrides, default_style='|'))

class NullFilter(BufferedReader):
    """
    Buffered reader replacing control bytes (most importantly NULs, but never
    HT, LF or CR) with '.' in everything returned by read() and readline().

    Such bytes end up in the input because the scanning tool attempts to parse
    binary files as text and spits bits of them into its output, and they make
    the copyright parser choke: *apparently* it uses logic similar to
    str.splitlines() internally, breaking lines on the wrong characters. This
    needs to be fixed in the Python package implementing it.
    """
    # FIXME: Fix scan-copyrights to better detect non-textual files
    # FIXME: Fix debian.copyright.Copyright to better handle non-well-formed input
    # See https://phabricator.apertis.org/T6677 for details

    # every byte below 0x20 except HT (9), LF (10) and CR (13)
    forbidden = bytes(code for code in range(32) if code not in (9, 10, 13))
    transtable = bytes.maketrans(forbidden, b'.' * len(forbidden))

    def read(self, size=-1):
        return super().read(size).translate(NullFilter.transtable)

    def readline(self, size=-1):
        return super().readline(size).translate(NullFilter.transtable)

class MutableCopyright(Copyright):
    """
    Copyright subclass exposing the underlying paragraph collection, so that
    callers (see main()) can remove paragraphs from it.
    """
    @property
    def paragraphs(self):
        # _Copyright__paragraphs is presumably the name-mangled form of a
        # private __paragraphs attribute of debian.copyright.Copyright;
        # NOTE(review): this depends on python-debian internals — confirm
        # when upgrading the library.
        return self._Copyright__paragraphs

def main():
    """
    Command-line entry point.

    Configures the scanner, runs scan-copyrights into
    debian/apertis/copyright.new, optionally fetches a DEP-5 report from
    FOSSology, reuses licensing information from debian/copyright for files
    the scanner reported as UNKNOWN, rewrites copyright.new and diffs it
    against debian/apertis/copyright.

    Exits with status 1 when differences are found and --fail-on-change is
    given, or when UNKNOWN or blacklisted licenses remain; exits with
    status 2 when the scanner output cannot be parsed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--fail-on-change', dest='fail_on_change', action='store_true', default=False, help='fail on any changes')
    parser.add_argument('--blacklist-license', dest='blacklist', metavar='LICENSE', action='append', default=[], help='license to blacklist')
    # stored separately from --blacklist-license: sharing one dest made the two
    # options overwrite each other (or fail when appending to a string)
    parser.add_argument('--blacklist-licenses', dest='blacklist_licenses', metavar='LICENSES', type=str, default='', help='space-separated licenses to blacklist')
    parser.add_argument('--extra-whitelist', dest='whitelists', metavar='WHITELIST', action='append', default=['debian/apertis/copyright.whitelist'], help='extra file whitelist')
    parser.add_argument('--fossology-host', dest='fossology_host', help='FOSSology host URL to use')
    parser.add_argument('--fossology-username', dest='fossology_username', help='FOSSology username')
    parser.add_argument('--fossology-password', dest='fossology_password', help='FOSSology password')
    parser.add_argument('--source-url', dest='source_url', help='git source URL to scan')
    parser.add_argument('--source-branch', dest='source_branch', help='git source branch to scan')
    args = parser.parse_args()
    print("%s fail on change" % ("Will" if args.fail_on_change else "Will not"))
    disallowlist = list(args.blacklist) + args.blacklist_licenses.split()
    if disallowlist:
        print(f"Disallowed licenses: {disallowlist}")
    # license names are compared case-insensitively
    disallowlist = [d.lower() for d in disallowlist]
    print(f"Using whitelists: {args.whitelists}")
    configure_scanner(args.whitelists)
    Path('debian/apertis/copyright').touch(exist_ok=True)
    # flush our own output before child processes write to the same stream
    sys.stdout.flush()

    with open('debian/apertis/copyright.new', 'wt') as f:
        print('Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n', file=f)
        scan_copyrights(_out=f)
    if args.fossology_host and args.fossology_username and args.fossology_password and args.source_url and args.source_branch:
        with open('debian/apertis/copyright.fossology', 'wt') as f:
            args.source_url = args.source_url.rstrip('/')
            # project name: last component of the URL without a .git suffix
            name = args.source_url.split('/')[-1]
            suffix = '.git'
            if name.endswith(suffix):
                name = name[:-len(suffix)]
            foss_project = fossology.ApertisFossology(args.fossology_host, args.fossology_username, args.fossology_password)
            foss_project.upload(args.source_url, args.source_branch, name)
            group_id = foss_project.get_group_id()
            reuse_id = foss_project.get_previous_upload_analysis_id(name, group_id)
            foss_project.analyse(reuse_id, group_id)
            report = foss_project.get_report('dep5')
            print(report, file=f)
    bad_licenses = set()
    unknown_licensed = False
    # {copyright: {license: [paragraphs to replace]}}
    fixups = dict()
    fixups_applied = set()
    try:
        with open('debian/copyright', 'rb') as f:
            debian_copyrights = Copyright(f, strict=False)
    except (FileNotFoundError, ValueError, NotMachineReadableError):
        # a missing file is handled like a non-machine-readable one
        print("WARN: No machine-readable debian/copyright found, can’t use licensing information from Debian.", file=sys.stderr)
        debian_copyrights = Copyright()
    # open for parsing as binary since copyrights may (incorrectly) contain binary data
    with open('debian/apertis/copyright.new', 'rb') as scanned:
        try:
            # keep a reference to the filter for as long as the parsed data is used
            filtered = NullFilter(scanned)
            c = MutableCopyright(filtered, strict=False)
            for p in c.all_files_paragraphs():
                if p.license:
                    if p.license.synopsis == 'UNKNOWN':
                        for path in p.files:
                            found_copyright, found_license = detect_license_for(path, debian_copyrights)
                            if found_copyright is not None and found_license is not None:
                                if found_license.synopsis.rstrip('+').lower() in disallowlist:
                                    bad_licenses.add(found_license.synopsis)
                                reusing = fixups.setdefault(found_copyright, {}).setdefault(found_license, [])
                                # register each paragraph once, even if several of its
                                # files match, so its files are not listed repeatedly
                                if not any(known is p for known in reusing):
                                    reusing.append(p)
                            else:
                                unknown_licensed = True
                    elif p.license.synopsis.rstrip('+').lower() in disallowlist:
                        bad_licenses.add(p.license.synopsis)
                if p.copyright and is_gibberish(p.copyright):
                    p.copyright = 'no-info-found'
            if fixups:
                for fc, fls in fixups.items():
                    for fl, fps in fls.items():
                        files = list(chain.from_iterable(fp.files for fp in fps))
                        # replace the UNKNOWN paragraphs with a single one
                        # carrying the information found in debian/copyright
                        for fp in fps:
                            if fp in c.paragraphs:
                                c.paragraphs.remove(fp)
                        new_p = FilesParagraph.create(
                            files=files,
                            copyright=reindent_multiline(fc),
                            license=fl
                        )
                        c.add_files_paragraph(new_p)
                        fixups_applied.update(files)
                print("Reused licensing information from debian/copyright for:\n",
                    textwrap.indent("\n".join(sorted(fixups_applied)), " "*2),
                    file=sys.stderr)
            with open('debian/apertis/copyright.new', 'wt') as out:
                c.dump(out)
        except ValueError:
            # the file being parsed here is the scanner output, not debian/copyright
            print("ERROR: Can't parse debian/apertis/copyright.new, verify copyright statement for binary gibberish.", file=sys.stderr)
            sys.exit(2)
    try:
        copyrightdiff('debian/apertis/copyright', 'debian/apertis/copyright.new')
    except sh.ErrorReturnCode as e:
        # diff exits with 1 when the files differ, anything else is a real error
        if e.exit_code != 1:
            raise
        if args.fail_on_change:
            print('\nERROR: Differences found, reconcillation needed.', file=sys.stderr)
            sys.exit(1)
    if unknown_licensed or bad_licenses:
        print("\nERROR:", file=sys.stderr)
        if unknown_licensed:
            print(" Files with UNKNOWN license found.", file=sys.stderr)
        # sorted for a reproducible report
        for bad_license in sorted(bad_licenses):
            print(f" Blacklisted license {bad_license} found.", file=sys.stderr)
        sys.exit(1)

if __name__ == '__main__':
    # some doctest magic
    if len(sys.argv) > 1 and sys.argv[1] == '--test':
        import doctest
        # doctest.testmod() turns on verbose output when '-v' is in sys.argv
        sys.argv[1] = '-v'
        # report failing doctests through the exit status instead of
        # always exiting successfully
        sys.exit(1 if doctest.testmod().failed else 0)
    else:
        main()