Skip to content
Snippets Groups Projects
Commit 2f7e27e6 authored by Ryan Gonzalez
Browse files

dh_setup_copyright: Improve performance of matching files on large trees


The previous version of this code was `O(m*n)` where `m` = the number of
files in the source tree and `n` = the number of files referenced by
the binaries. In most cases, those numbers are quite small, but on large
packages they can grow incredibly large. For instance, for rustc's main
binary, `m > 300k` and `n > 33k`, resulting in each outer loop iteration
taking an average of ~1.9s. That would result in a runtime of over 17
hours, which is a rather absurd bump to the build time.

Instead, we can reorganize the code so that the source tree contents are
stored in a hash, indexed by basename. That turns the entire inner
matching loop into a single hash lookup, bringing the outer loop runtime
to a worst-case single-digit number of milliseconds.

Fixes: infrastructure/apertis-issues#595

Signed-off-by: Ryan Gonzalez <ryan.gonzalez@collabora.com>
parent 74473f7b
No related branches found
No related tags found
3 merge requests: !37 Merge changes from apertis/v2024-updates into apertis/v2024, !36 Backport v2024 <- v2025dev2: Reproducibility and performance fixes, !35 dh_setup_copyright: Reproducibility and performance fixes
Pipeline #713529 passed
Pipeline: debhelper

#713531

    ......@@ -317,6 +317,20 @@ sub scan_binary_shlibs {
    }
    }
    # Builds an index of the local source files keyed by file name.
    #
    # Takes no arguments. Returns a reference to a hash in which each key is a
    # basename and each value is an array reference holding the canonicalized
    # path of every file reported by find_local that carries that basename.
    sub collect_sources_by_basename {
        # Whatever find_local reports under the current directory, with the
        # 'debian' entry excluded.
        my @tree_files = find_local('.', { 'exclude' => ['debian'] });

        my %by_name;
        for my $path (@tree_files) {
            # Normalise the path so that noise such as a leading `./` is gone.
            my $clean = File::Spec->canonpath($path);

            # The push autovivifies the per-basename list on first sight.
            push @{ $by_name{ basename($clean) } }, $clean;
        }

        return \%by_name;
    }
    # Returns the number of path components shared between the tails of the two
    # arguments.
    #
    ......@@ -335,30 +349,21 @@ sub common_path_suffix_length {
    }
    sub build_copied_sources_map {
    my ($tmpdir, $others, $metadata) = @_;
    my ($tmpdir, $others, $sources_by_basename, $metadata) = @_;
    my $copied_sources = $metadata->{'copied_sources'};
    my @sources = find_local('.', { 'exclude' => ['debian'] });
    foreach my $file (@$others) {
    my $base = basename $file;
    my $unscored_matches = $sources_by_basename->{basename $file};
    next if !defined $unscored_matches;
    # Assign a "score" to each matching file based on the number of path
    # components in common, so that we'll be more likely to match e.g.
    # "/usr/include/test" as "src/include/test" than "examples/test".
    my @matches = ();
    foreach my $source (@sources) {
    # Clean up stuff like `./`.
    $source = File::Spec->canonpath($source);
    if ($base =~ /(^|\/)\Q@{[basename $source]}\E$/) {
    my $score = common_path_suffix_length $file, $source;
    push @matches, [$score, $source];
    }
    }
    if (!@matches) {
    next;
    }
    my @matches = map {
    my $score = common_path_suffix_length $file, $_;
    [$score, $_]
    } @$unscored_matches;
    my $file_sha256 = Digest::SHA->new('256')->addfile("$tmpdir/$file", 'b')->hexdigest;
    # Skip if...
    ......@@ -442,7 +447,9 @@ for my $package (@{$dh{DOPACKAGES}}) {
    extract_filenames($fname_list, $tmp, @binaries);
    process_external_sources("$tmp/$fname_list", "$tmp/$fname_external_dir", \%metadata);
    scan_binary_shlibs($tmp, \@binaries, \%metadata);
    build_copied_sources_map($tmp, \@others, \%metadata);
    my $sources_by_basename = collect_sources_by_basename;
    build_copied_sources_map($tmp, \@others, $sources_by_basename, \%metadata);
    if (%metadata) {
    open my $meta_handle, '>', "$tmp/$fname_metadata" or die "Failed to open $fname_metadata: $!";
    ......
    0% Loading or .
    You are about to add 0 people to the discussion. Proceed with caution.
    Finish editing this message first!
    Please register or sign in to comment