ncdu-zig/ncdubinexp.pl

#!/usr/bin/perl
# SPDX-FileCopyrightText: Yorhel <projects@yorhel.nl>
# SPDX-License-Identifier: MIT


# Usage: ncdubinexp.pl [options] <export.ncdu
# Or: ncdu -O- | ncdubinexp.pl [options]
#
# Reads and validates a binary ncdu export file and optionally prints out
# various diagnostic data and statistics.
#
# Options:
#   blocks    - print a listing of all blocks as they are read
#   items     - print a listing of all items as they are read
#   dirs      - print out dir listing stats
#   stats     - print some overview stats
#
# This script is highly inefficient in both RAM and CPU, not suitable for large
# exports.
# This script does not permit unknown blocks or item keys, although that is
# technically valid.


use v5.36;
use autodie;
use bytes;
no warnings 'portable';
use List::Util 'min', 'max';
use CBOR::XS;  # Does not officially support recent perl versions, but it's the only CPAN module that supports streaming.
use Compress::Zstd;

my $printblocks = grep $_ eq 'blocks', @ARGV;
my $printitems = grep $_ eq 'items', @ARGV;
my $printdirs = grep $_ eq 'dirs', @ARGV;
my $printstats = grep $_ eq 'stats', @ARGV;

my %datablocks;
my %items;
my $root_itemref;
my $datablock_len = 0;
my $rawdata_len = 0;
my $minitemsperblock = 1e10;
my $maxitemsperblock = 0;

{
    die "Input too short\n" if 8 != read STDIN, my $sig, 8;
    die "Invalid file signature\n" if $sig ne "\xbfncduEX1";
}

my @itemkeys = qw/
    type
    name
    prev
    asize
    dsize
    dev
    rderr
    cumasize
    cumdsize
    shrasize
    shrdsize
    items
    sub
    ino
    nlink
    uid
    gid
    mode
    mtime
/;


sub datablock($prefix, $off, $blklen, $content) {
    die "$prefix: Data block too small\n" if length $content < 8;
    die "$prefix: Data block too large\n" if length $content >= (1<<24);

    my($num, $rawlen) = unpack 'NN', $content;
    die "$prefix: Uncompressed data block size too large\n" if $rawlen >= (1<<24);
    die sprintf "%s: Duplicate block id %d (first at %010x)", $prefix, $num, $datablocks{$num}>>24 if $datablocks{$num};
    $datablocks{$num} = ($off << 24) | $blklen;

    my $compressed = substr $content, 8;
    $printblocks && printf "%s: data block %d  rawlen %d (%.2f)\n", $prefix, $num, $rawlen, $rawlen/(length($compressed))*100;

    $datablock_len += length($compressed);
    $rawdata_len += $rawlen;

    my $rawdata = decompress($compressed);
    die "$prefix: Block id $num failed decompression\n" if !defined $rawdata;
    die sprintf "%s: Block id %d decompressed to %d bytes but expected %d\n",
        $prefix, $num, length($rawdata), $rawlen if $rawlen != length $rawdata;
    cbordata($num, $rawdata);
}


sub fmtitem($val) {
    join '  ', map "$_:$val->{$_}", grep exists $val->{$_}, @itemkeys;
}


sub cbordata($blknum, $data) {
    my $cbor = CBOR::XS->new_safe;
    my $off = 0;
    my $nitems = 0;
    while ($off < length $data) { # This substr madness is prolly quite slow
        my($val, $len) = $cbor->decode_prefix(substr $data, $off);
        my $itemref = ($blknum << 24) | $off;
        $off += $len;
        $nitems++;

        # Basic validation of the CBOR data. Doesn't validate that every value
        # has the correct CBOR type or that integers are within range.
        $val = { _itemref => $itemref, map {
            die sprintf "#%010x: Invalid CBOR key '%s'\n", $itemref, $_ if !/^[0-9]+$/ || !$itemkeys[$_];
            my($k, $v) = ($itemkeys[$_], $val->{$_});
            die sprintf "#%010x: Invalid value for key '%s': '%s'\n", $itemref, $k, $v
                if ref $v eq 'ARRAY' || ref $v eq 'HASH' || !defined $v || !(
                    $k eq 'type' ? ($v =~ /^(-[1-4]|[0-3])$/) :
                    $k eq 'prev' || $k eq 'sub' || $k eq 'prevlnk' ? 1 : # itemrefs are validated separately
                    $k eq 'name' ? length $v :
                    $k eq 'rderr' ? Types::Serialiser::is_bool($v) :
                    /^[0-9]+$/
                );
            ($k,$v)
        } keys %$val };

        $printitems && printf "#%010x: %s\n", $itemref, fmtitem $val;
        $items{$itemref} = $val;
    }
    $minitemsperblock = $nitems if $minitemsperblock > $nitems;
    $maxitemsperblock = $nitems if $maxitemsperblock < $nitems;
}


sub indexblock($prefix, $content) {
    $printblocks && print "$prefix: index block\n";

    my $maxnum = max keys %datablocks;
    die "$prefix: index block size incorrect for $maxnum+1 data blocks\n" if length($content) != 8*($maxnum+1) + 8;

    my @ints = unpack 'Q>*', $content;
    $root_itemref = pop @ints;

    for my $i (0..$#ints-1) {
        if (!$datablocks{$i}) {
            die "$prefix: index entry for missing block (#$i) must be 0\n" if $ints[$i] != 0;
        } else {
            die sprintf "%s: invalid index entry for block #%d (got %016x expected %016x)\n",
                $prefix, $i, $ints[$i], $datablocks{$i}
                if $ints[$i] != $datablocks{$i};
        }
    }
}


while (1) {
    my $off = tell STDIN;
    my $prefix = sprintf '%010x', $off;
    die "$prefix Input too short, expected block header\n" if 4 != read STDIN, my $blkhead, 4;
    $blkhead = unpack 'N', $blkhead;
    my $blkid = $blkhead >> 28;
    my $blklen = $blkhead & 0x0fffffff;

    $prefix .= "[$blklen]";
    die "$prefix: Short read on block content\n" if $blklen - 8 != read STDIN, my $content, $blklen - 8;
    die "$prefix: Input too short, expected block footer\n" if 4 != read STDIN, my $blkfoot, 4;
    die "$prefix: Block footer does not match header\n" if $blkhead != unpack 'N', $blkfoot;

    if ($blkid == 0) {
        datablock($prefix, $off, $blklen, $content);
    } elsif ($blkid == 1) {
        indexblock($prefix, $content);
        last;
    } else {
        die "$prefix Unknown block id $blkid\n";
    }
}

{
    die sprintf "0x%08x: Data after index block\n", tell(STDIN) if 0 != read STDIN, my $x, 1;
}


# Each item must be referenced exactly once from either a 'prev' or 'sub' key,
# $nodup verifies the "at most once" part.
sub resolve($cur, $key, $nodup) {
    my $ref = exists $cur->{$key} ? $cur->{$key} : return;
    my $item = $ref < 0
        ? ($items{ $cur->{_itemref} + $ref } || die sprintf "#%010x: Invalid relative itemref %s: %d\n", $cur->{_itemref}, $key, $ref)
        : ($items{$ref} || die sprintf "#%010x: Invalid reference %s to #%010x\n", $cur->{_itemref}, $key, $ref);
    die sprintf "Item #%010x referenced more than once, from #%010x and #%010x\n", $item->{_itemref}, $item->{_lastseen}, $cur->{_itemref}
        if $nodup && defined $item->{_lastseen};
    $item->{_lastseen} = $cur->{_itemref} if $nodup;
    return $item;
}

my @dirblocks; # [ path, nitems, nblocks ]
my %dirblocks; # nblocks => ndirs

sub traverse($parent, $path) {
    my $sub = resolve($parent, 'sub', 1);
    my %blocks;
    my $items = 0;
    while ($sub) {
        $items++;
        $blocks{ $sub->{_itemref} >> 24 }++;
        traverse($sub, "$path/$sub->{name}") if $sub->{type} == 0;
        $sub = resolve($sub, 'prev', 1);
    }
    push @dirblocks, [ $path, $items, scalar keys %blocks ] if scalar keys %blocks > 1;
    $dirblocks{ keys %blocks }++ if $items > 0;
    $items && $printdirs && printf "#%010x: %d items in %d blocks (%d .. %d)  %s\n",
        $parent->{_itemref}, $items, scalar keys %blocks,
        min(values %blocks), max(values %blocks), $path;
}


{
    my $root = $items{$root_itemref} || die sprintf "Invalid root itemref: %010x\n", $root_itemref;
    $root->{_lastseen} = 0xffffffffff;
    traverse($root, $root->{name});

    my($noref) = grep !$_->{_lastseen}, values %items;
    die sprintf "No reference found to #%010x\n", $noref->{_itemref} if $noref;
}

if ($printstats) {
    my $nblocks = keys %datablocks;
    my $nitems = keys %items;
    printf "     Total items: %d\n", $nitems;
    printf "    Total blocks: %d\n", $nblocks;
    printf " Items per block: %.1f (%d .. %d)\n", $nitems / $nblocks, $minitemsperblock, $maxitemsperblock;
    printf "  Avg block size: %d compressed, %d raw (%.1f)\n", $datablock_len/$nblocks, $rawdata_len/$nblocks, $rawdata_len/$datablock_len*100;
    printf "   Avg item size: %.1f compressed, %.1f raw\n", $datablock_len/$nitems, $rawdata_len/$nitems;

    @dirblocks = sort { $b->[2] <=> $a->[2] } @dirblocks;
    print "\nBlocks per directory listing histogram\n";
    printf "  %5d %6d\n", $_, $dirblocks{$_} for sort { $a <=> $b } keys %dirblocks;
    print "\nMost blocks per directory listing\n";
    print "     items blks  path\n";
    printf "%10d %4d  %s\n", @{$dirblocks[$_]}[1,2,0] for (0..min 9, $#dirblocks);
}
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00			`#!/usr/bin/perl`
			`# SPDX-FileCopyrightText: Yorhel <projects@yorhel.nl>`
			`# SPDX-License-Identifier: MIT`


			`# Usage: ncdubinexp.pl [options] <export.ncdu`
			`# Or: ncdu -O- \| ncdubinexp.pl [options]`
			`#`
			`# Reads and validates a binary ncdu export file and optionally prints out`
			`# various diagnostic data and statistics.`
			`#`
			`# Options:`
			`# blocks - print a listing of all blocks as they are read`
			`# items - print a listing of all items as they are read`
			`# dirs - print out dir listing stats`
			`# stats - print some overview stats`
			`#`
			`# This script is highly inefficient in both RAM and CPU, not suitable for large`
			`# exports.`
			`# This script does not permit unknown blocks or item keys, although that is`
			`# technically valid.`


			`use v5.36;`
			`use autodie;`
			`use bytes;`
			`no warnings 'portable';`
			`use List::Util 'min', 'max';`
			`use CBOR::XS; # Does not officially support recent perl versions, but it's the only CPAN module that supports streaming.`
Stick with zstd-4 + 64k block, add --compress-level, fix 32bit build And do dynamic buffer allocation for bin_export, removing 128k of .rodata that I accidentally introduced earlier and reducing memory use for parallel scans. Static binaries now also include the minimal version of zstd, current sizes for x86_64 are: 582k ncdu-2.5 601k ncdu-new-nocompress 765k ncdu-new-zstd That's not great, but also not awful. Even zlib or LZ4 would've resulted in a 700k binary. 2024-08-03 03:16:44 -08:00			`use Compress::Zstd;`
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00
			`my $printblocks = grep $_ eq 'blocks', @ARGV;`
			`my $printitems = grep $_ eq 'items', @ARGV;`
			`my $printdirs = grep $_ eq 'dirs', @ARGV;`
			`my $printstats = grep $_ eq 'stats', @ARGV;`

			`my %datablocks;`
			`my %items;`
			`my $root_itemref;`
			`my $datablock_len = 0;`
			`my $rawdata_len = 0;`
			`my $minitemsperblock = 1e10;`
			`my $maxitemsperblock = 0;`

			`{`
			`die "Input too short\n" if 8 != read STDIN, my $sig, 8;`
			`die "Invalid file signature\n" if $sig ne "\xbfncduEX1";`
			`}`

			`my @itemkeys = qw/`
			`type`
			`name`
			`prev`
			`asize`
			`dsize`
			`dev`
			`rderr`
			`cumasize`
			`cumdsize`
			`shrasize`
			`shrdsize`
			`items`
			`sub`
			`ino`
			`nlink`
			`uid`
			`gid`
			`mode`
			`mtime`
			`/;`


			`sub datablock($prefix, $off, $blklen, $content) {`
			`die "$prefix: Data block too small\n" if length $content < 8;`
binfmt: Support larger (non-data) block sizes I realized that the 16 MiB limitation implied that the index block could only hold ((2^24)-16)/8 =~ 2 mil data block pointers. At the default 64k data block size that means an export can only reference up to ~128 GiB of uncompressed data. That's pretty limiting. This change increases the maximum size of the index block to 256 MiB, supporting ~33 mil data block pointers and ~2 TiB of uncompressed data with the default data block size. 2024-08-08 23:29:26 -08:00			`die "$prefix: Data block too large\n" if length $content >= (1<<24);`
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00
			`my($num, $rawlen) = unpack 'NN', $content;`
binfmt: Support larger (non-data) block sizes I realized that the 16 MiB limitation implied that the index block could only hold ((2^24)-16)/8 =~ 2 mil data block pointers. At the default 64k data block size that means an export can only reference up to ~128 GiB of uncompressed data. That's pretty limiting. This change increases the maximum size of the index block to 256 MiB, supporting ~33 mil data block pointers and ~2 TiB of uncompressed data with the default data block size. 2024-08-08 23:29:26 -08:00			`die "$prefix: Uncompressed data block size too large\n" if $rawlen >= (1<<24);`
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00			`die sprintf "%s: Duplicate block id %d (first at %010x)", $prefix, $num, $datablocks{$num}>>24 if $datablocks{$num};`
			`$datablocks{$num} = ($off << 24) \| $blklen;`

Stick with zstd-4 + 64k block, add --compress-level, fix 32bit build And do dynamic buffer allocation for bin_export, removing 128k of .rodata that I accidentally introduced earlier and reducing memory use for parallel scans. Static binaries now also include the minimal version of zstd, current sizes for x86_64 are: 582k ncdu-2.5 601k ncdu-new-nocompress 765k ncdu-new-zstd That's not great, but also not awful. Even zlib or LZ4 would've resulted in a 700k binary. 2024-08-03 03:16:44 -08:00			`my $compressed = substr $content, 8;`
			`$printblocks && printf "%s: data block %d rawlen %d (%.2f)\n", $prefix, $num, $rawlen, $rawlen/(length($compressed))*100;`
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00
Stick with zstd-4 + 64k block, add --compress-level, fix 32bit build And do dynamic buffer allocation for bin_export, removing 128k of .rodata that I accidentally introduced earlier and reducing memory use for parallel scans. Static binaries now also include the minimal version of zstd, current sizes for x86_64 are: 582k ncdu-2.5 601k ncdu-new-nocompress 765k ncdu-new-zstd That's not great, but also not awful. Even zlib or LZ4 would've resulted in a 700k binary. 2024-08-03 03:16:44 -08:00			`$datablock_len += length($compressed);`
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00			`$rawdata_len += $rawlen;`

Stick with zstd-4 + 64k block, add --compress-level, fix 32bit build And do dynamic buffer allocation for bin_export, removing 128k of .rodata that I accidentally introduced earlier and reducing memory use for parallel scans. Static binaries now also include the minimal version of zstd, current sizes for x86_64 are: 582k ncdu-2.5 601k ncdu-new-nocompress 765k ncdu-new-zstd That's not great, but also not awful. Even zlib or LZ4 would've resulted in a 700k binary. 2024-08-03 03:16:44 -08:00			`my $rawdata = decompress($compressed);`
			`die "$prefix: Block id $num failed decompression\n" if !defined $rawdata;`
			`die sprintf "%s: Block id %d decompressed to %d bytes but expected %d\n",`
			`$prefix, $num, length($rawdata), $rawlen if $rawlen != length $rawdata;`
			`cbordata($num, $rawdata);`
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00			`}`


			`sub fmtitem($val) {`
			`join ' ', map "$_:$val->{$_}", grep exists $val->{$_}, @itemkeys;`
			`}`


			`sub cbordata($blknum, $data) {`
			`my $cbor = CBOR::XS->new_safe;`
			`my $off = 0;`
			`my $nitems = 0;`
			`while ($off < length $data) { # This substr madness is prolly quite slow`
			`my($val, $len) = $cbor->decode_prefix(substr $data, $off);`
			`my $itemref = ($blknum << 24) \| $off;`
			`$off += $len;`
			`$nitems++;`

			`# Basic validation of the CBOR data. Doesn't validate that every value`
			`# has the correct CBOR type or that integers are within range.`
			`$val = { _itemref => $itemref, map {`
			`die sprintf "#%010x: Invalid CBOR key '%s'\n", $itemref, $_ if !/^[0-9]+$/ \|\| !$itemkeys[$_];`
			`my($k, $v) = ($itemkeys[$_], $val->{$_});`
			`die sprintf "#%010x: Invalid value for key '%s': '%s'\n", $itemref, $k, $v`
			`if ref $v eq 'ARRAY' \|\| ref $v eq 'HASH' \|\| !defined $v \|\| !(`
			`$k eq 'type' ? ($v =~ /^(-[1-4]\|[0-3])$/) :`
			`$k eq 'prev' \|\| $k eq 'sub' \|\| $k eq 'prevlnk' ? 1 : # itemrefs are validated separately`
			`$k eq 'name' ? length $v :`
			`$k eq 'rderr' ? Types::Serialiser::is_bool($v) :`
			`/^[0-9]+$/`
			`);`
			`($k,$v)`
			`} keys %$val };`

			`$printitems && printf "#%010x: %s\n", $itemref, fmtitem $val;`
			`$items{$itemref} = $val;`
			`}`
			`$minitemsperblock = $nitems if $minitemsperblock > $nitems;`
			`$maxitemsperblock = $nitems if $maxitemsperblock < $nitems;`
			`}`


			`sub indexblock($prefix, $content) {`
			`$printblocks && print "$prefix: index block\n";`

			`my $maxnum = max keys %datablocks;`
			`die "$prefix: index block size incorrect for $maxnum+1 data blocks\n" if length($content) != 8*($maxnum+1) + 8;`

			`my @ints = unpack 'Q>*', $content;`
			`$root_itemref = pop @ints;`

			`for my $i (0..$#ints-1) {`
			`if (!$datablocks{$i}) {`
			`die "$prefix: index entry for missing block (#$i) must be 0\n" if $ints[$i] != 0;`
			`} else {`
			`die sprintf "%s: invalid index entry for block #%d (got %016x expected %016x)\n",`
			`$prefix, $i, $ints[$i], $datablocks{$i}`
			`if $ints[$i] != $datablocks{$i};`
			`}`
			`}`
			`}`


			`while (1) {`
			`my $off = tell STDIN;`
			`my $prefix = sprintf '%010x', $off;`
			`die "$prefix Input too short, expected block header\n" if 4 != read STDIN, my $blkhead, 4;`
			`$blkhead = unpack 'N', $blkhead;`
binfmt: Support larger (non-data) block sizes I realized that the 16 MiB limitation implied that the index block could only hold ((2^24)-16)/8 =~ 2 mil data block pointers. At the default 64k data block size that means an export can only reference up to ~128 GiB of uncompressed data. That's pretty limiting. This change increases the maximum size of the index block to 256 MiB, supporting ~33 mil data block pointers and ~2 TiB of uncompressed data with the default data block size. 2024-08-08 23:29:26 -08:00			`my $blkid = $blkhead >> 28;`
			`my $blklen = $blkhead & 0x0fffffff;`
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00
			`$prefix .= "[$blklen]";`
			`die "$prefix: Short read on block content\n" if $blklen - 8 != read STDIN, my $content, $blklen - 8;`
			`die "$prefix: Input too short, expected block footer\n" if 4 != read STDIN, my $blkfoot, 4;`
			`die "$prefix: Block footer does not match header\n" if $blkhead != unpack 'N', $blkfoot;`

binfmt: Support larger (non-data) block sizes I realized that the 16 MiB limitation implied that the index block could only hold ((2^24)-16)/8 =~ 2 mil data block pointers. At the default 64k data block size that means an export can only reference up to ~128 GiB of uncompressed data. That's pretty limiting. This change increases the maximum size of the index block to 256 MiB, supporting ~33 mil data block pointers and ~2 TiB of uncompressed data with the default data block size. 2024-08-08 23:29:26 -08:00			`if ($blkid == 0) {`
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00			`datablock($prefix, $off, $blklen, $content);`
binfmt: Support larger (non-data) block sizes I realized that the 16 MiB limitation implied that the index block could only hold ((2^24)-16)/8 =~ 2 mil data block pointers. At the default 64k data block size that means an export can only reference up to ~128 GiB of uncompressed data. That's pretty limiting. This change increases the maximum size of the index block to 256 MiB, supporting ~33 mil data block pointers and ~2 TiB of uncompressed data with the default data block size. 2024-08-08 23:29:26 -08:00			`} elsif ($blkid == 1) {`
Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. 2024-07-27 08:40:48 -08:00			`indexblock($prefix, $content);`
			`last;`
			`} else {`
			`die "$prefix Unknown block id $blkid\n";`
			`}`
			`}`

			`{`
			`die sprintf "0x%08x: Data after index block\n", tell(STDIN) if 0 != read STDIN, my $x, 1;`
			`}`



			`# Each item must be referenced exactly once from either a 'prev' or 'sub' key,`
			`# $nodup verifies the "at most once" part.`
			`sub resolve($cur, $key, $nodup) {`
			`my $ref = exists $cur->{$key} ? $cur->{$key} : return;`
			`my $item = $ref < 0`
			`? ($items{ $cur->{_itemref} + $ref } \|\| die sprintf "#%010x: Invalid relative itemref %s: %d\n", $cur->{_itemref}, $key, $ref)`
			`: ($items{$ref} \|\| die sprintf "#%010x: Invalid reference %s to #%010x\n", $cur->{_itemref}, $key, $ref);`
			`die sprintf "Item #%010x referenced more than once, from #%010x and #%010x\n", $item->{_itemref}, $item->{_lastseen}, $cur->{_itemref}`
			`if $nodup && defined $item->{_lastseen};`
			`$item->{_lastseen} = $cur->{_itemref} if $nodup;`
			`return $item;`
			`}`

			`my @dirblocks; # [ path, nitems, nblocks ]`
			`my %dirblocks; # nblocks => ndirs`

			`sub traverse($parent, $path) {`
			`my $sub = resolve($parent, 'sub', 1);`
			`my %blocks;`
			`my $items = 0;`
			`while ($sub) {`
			`$items++;`
			`$blocks{ $sub->{_itemref} >> 24 }++;`
			`traverse($sub, "$path/$sub->{name}") if $sub->{type} == 0;`
			`$sub = resolve($sub, 'prev', 1);`
			`}`
			`push @dirblocks, [ $path, $items, scalar keys %blocks ] if scalar keys %blocks > 1;`
			`$dirblocks{ keys %blocks }++ if $items > 0;`
			`$items && $printdirs && printf "#%010x: %d items in %d blocks (%d .. %d) %s\n",`
			`$parent->{_itemref}, $items, scalar keys %blocks,`
			`min(values %blocks), max(values %blocks), $path;`
			`}`


			`{`
			`my $root = $items{$root_itemref} \|\| die sprintf "Invalid root itemref: %010x\n", $root_itemref;`
			`$root->{_lastseen} = 0xffffffffff;`
			`traverse($root, $root->{name});`

			`my($noref) = grep !$_->{_lastseen}, values %items;`
			`die sprintf "No reference found to #%010x\n", $noref->{_itemref} if $noref;`
			`}`

			`if ($printstats) {`
			`my $nblocks = keys %datablocks;`
			`my $nitems = keys %items;`
			`printf " Total items: %d\n", $nitems;`
			`printf " Total blocks: %d\n", $nblocks;`
			`printf " Items per block: %.1f (%d .. %d)\n", $nitems / $nblocks, $minitemsperblock, $maxitemsperblock;`
			`printf " Avg block size: %d compressed, %d raw (%.1f)\n", $datablock_len/$nblocks, $rawdata_len/$nblocks, $rawdata_len/$datablock_len*100;`
			`printf " Avg item size: %.1f compressed, %.1f raw\n", $datablock_len/$nitems, $rawdata_len/$nitems;`

			`@dirblocks = sort { $b->[2] <=> $a->[2] } @dirblocks;`
			`print "\nBlocks per directory listing histogram\n";`
			`printf " %5d %6d\n", $_, $dirblocks{$_} for sort { $a <=> $b } keys %dirblocks;`
			`print "\nMost blocks per directory listing\n";`
			`print " items blks path\n";`
			`printf "%10d %4d %s\n", @{$dirblocks[$_]}[1,2,0] for (0..min 9, $#dirblocks);`
			`}`