From f25bc5cbf4c402868c2d3037503c157975e57023 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sat, 27 Jul 2024 18:40:48 +0200 Subject: [PATCH] Experimental new export format The goals of this format being: - Streaming parallel export with minimal mandatory buffering. - Exported data includes cumulative directory stats, so reader doesn't have to go through the entire tree to calculate these. - Fast-ish directory listings without reading the entire file. - Built-in compression. Current implementation is missing compression, hardlink counting and actually reading the file. Also need to tune and measure stuff. --- ncdubinexp.pl | 240 +++++++++++++++++++++++++++++++ src/bin_export.zig | 340 ++++++++++++++++++++++++++++++++++++++++++++ src/json_import.zig | 2 +- src/main.zig | 28 ++-- src/mem_src.zig | 4 +- src/scan.zig | 8 +- src/sink.zig | 24 +++- 7 files changed, 627 insertions(+), 19 deletions(-) create mode 100755 ncdubinexp.pl create mode 100644 src/bin_export.zig diff --git a/ncdubinexp.pl b/ncdubinexp.pl new file mode 100755 index 0000000..6651062 --- /dev/null +++ b/ncdubinexp.pl @@ -0,0 +1,240 @@ +#!/usr/bin/perl +# SPDX-FileCopyrightText: Yorhel +# SPDX-License-Identifier: MIT + + +# Usage: ncdubinexp.pl [options] >24 if $datablocks{$num}; + $datablocks{$num} = ($off << 24) | $blklen; + + $printblocks && printf "%s: data block %d rawlen %d (%.2f)\n", $prefix, $num, $rawlen, $rawlen/(length($content)-8)*100; + + $datablock_len += length($content)-8; + $rawdata_len += $rawlen; + + # TODO: Decompress + cbordata($num, substr $content, 8); +} + + +sub fmtitem($val) { + join ' ', map "$_:$val->{$_}", grep exists $val->{$_}, @itemkeys; +} + + +sub cbordata($blknum, $data) { + my $cbor = CBOR::XS->new_safe; + my $off = 0; + my $nitems = 0; + while ($off < length $data) { # This substr madness is prolly quite slow + my($val, $len) = $cbor->decode_prefix(substr $data, $off); + my $itemref = ($blknum << 24) | $off; + $off += $len; + next if !defined $val; + $nitems++; + + # Basic validation of the CBOR data. Doesn't validate that every value + # has the correct CBOR type or that integers are within range. + $val = { _itemref => $itemref, map { + die sprintf "#%010x: Invalid CBOR key '%s'\n", $itemref, $_ if !/^[0-9]+$/ || !$itemkeys[$_]; + my($k, $v) = ($itemkeys[$_], $val->{$_}); + die sprintf "#%010x: Invalid value for key '%s': '%s'\n", $itemref, $k, $v + if ref $v eq 'ARRAY' || ref $v eq 'HASH' || !defined $v || !( + $k eq 'type' ? ($v =~ /^(-[1-4]|[0-3])$/) : + $k eq 'prev' || $k eq 'sub' || $k eq 'prevlnk' ? 1 : # itemrefs are validated separately + $k eq 'name' ? length $v : + $k eq 'rderr' ? 
Types::Serialiser::is_bool($v) : + /^[0-9]+$/ + ); + ($k,$v) + } keys %$val }; + + $printitems && printf "#%010x: %s\n", $itemref, fmtitem $val; + $items{$itemref} = $val; + } + $minitemsperblock = $nitems if $minitemsperblock > $nitems; + $maxitemsperblock = $nitems if $maxitemsperblock < $nitems; +} + + +sub indexblock($prefix, $content) { + $printblocks && print "$prefix: index block\n"; + + my $maxnum = max keys %datablocks; + die "$prefix: index block size incorrect for $maxnum+1 data blocks\n" if length($content) != 8*($maxnum+1) + 8; + + my @ints = unpack 'Q>*', $content; + $root_itemref = pop @ints; + + for my $i (0..$#ints-1) { + if (!$datablocks{$i}) { + die "$prefix: index entry for missing block (#$i) must be 0\n" if $ints[$i] != 0; + } else { + die sprintf "%s: invalid index entry for block #%d (got %016x expected %016x)\n", + $prefix, $i, $ints[$i], $datablocks{$i} + if $ints[$i] != $datablocks{$i}; + } + } +} + + +while (1) { + my $off = tell STDIN; + my $prefix = sprintf '%010x', $off; + die "$prefix Input too short, expected block header\n" if 4 != read STDIN, my $blkhead, 4; + $blkhead = unpack 'N', $blkhead; + my $blkid = $blkhead >> 24; + my $blklen = $blkhead & 0xffffff; + + $prefix .= "[$blklen]"; + die "$prefix: Short read on block content\n" if $blklen - 8 != read STDIN, my $content, $blklen - 8; + die "$prefix: Input too short, expected block footer\n" if 4 != read STDIN, my $blkfoot, 4; + die "$prefix: Block footer does not match header\n" if $blkhead != unpack 'N', $blkfoot; + + if ($blkid == 1) { + datablock($prefix, $off, $blklen, $content); + } elsif ($blkid == 2) { + indexblock($prefix, $content); + last; + } else { + die "$prefix Unknown block id $blkid\n"; + } +} + +{ + die sprintf "0x%08x: Data after index block\n", tell(STDIN) if 0 != read STDIN, my $x, 1; +} + + + +# Each item must be referenced exactly once from either a 'prev' or 'sub' key, +# $nodup verifies the "at most once" part. +sub resolve($cur, $key, $nodup) { + my $ref = exists $cur->{$key} ? $cur->{$key} : return; + my $item = $ref < 0 + ? ($items{ $cur->{_itemref} + $ref } || die sprintf "#%010x: Invalid relative itemref %s: %d\n", $cur->{_itemref}, $key, $ref) + : ($items{$ref} || die sprintf "#%010x: Invalid reference %s to #%010x\n", $cur->{_itemref}, $key, $ref); + die sprintf "Item #%010x referenced more than once, from #%010x and #%010x\n", $item->{_itemref}, $item->{_lastseen}, $cur->{_itemref} + if $nodup && defined $item->{_lastseen}; + $item->{_lastseen} = $cur->{_itemref} if $nodup; + return $item; +} + +my @dirblocks; # [ path, nitems, nblocks ] +my %dirblocks; # nblocks => ndirs + +sub traverse($parent, $path) { + my $sub = resolve($parent, 'sub', 1); + my %blocks; + my $items = 0; + while ($sub) { + $items++; + $blocks{ $sub->{_itemref} >> 24 }++; + traverse($sub, "$path/$sub->{name}") if $sub->{type} == 0; + $sub = resolve($sub, 'prev', 1); + } + push @dirblocks, [ $path, $items, scalar keys %blocks ] if scalar keys %blocks > 1; + $dirblocks{ keys %blocks }++ if $items > 0; + $items && $printdirs && printf "#%010x: %d items in %d blocks (%d .. 
%d) %s\n", + $parent->{_itemref}, $items, scalar keys %blocks, + min(values %blocks), max(values %blocks), $path; +} + + +{ + my $root = $items{$root_itemref} || die sprintf "Invalid root itemref: %010x\n", $root_itemref; + $root->{_lastseen} = 0xffffffffff; + traverse($root, $root->{name}); + + my($noref) = grep !$_->{_lastseen}, values %items; + die sprintf "No reference found to #%010x\n", $noref->{_itemref} if $noref; + + # TODO: Verify 'link' references +} + +if ($printstats) { + my $nblocks = keys %datablocks; + my $nitems = keys %items; + printf " Total items: %d\n", $nitems; + printf " Total blocks: %d\n", $nblocks; + printf " Items per block: %.1f (%d .. %d)\n", $nitems / $nblocks, $minitemsperblock, $maxitemsperblock; + printf " Avg block size: %d compressed, %d raw (%.1f)\n", $datablock_len/$nblocks, $rawdata_len/$nblocks, $rawdata_len/$datablock_len*100; + printf " Avg item size: %.1f compressed, %.1f raw\n", $datablock_len/$nitems, $rawdata_len/$nitems; + + @dirblocks = sort { $b->[2] <=> $a->[2] } @dirblocks; + print "\nBlocks per directory listing histogram\n"; + printf " %5d %6d\n", $_, $dirblocks{$_} for sort { $a <=> $b } keys %dirblocks; + print "\nMost blocks per directory listing\n"; + print " items blks path\n"; + printf "%10d %4d %s\n", @{$dirblocks[$_]}[1,2,0] for (0..min 9, $#dirblocks); +} diff --git a/src/bin_export.zig b/src/bin_export.zig new file mode 100644 index 0000000..54904f3 --- /dev/null +++ b/src/bin_export.zig @@ -0,0 +1,340 @@ +// SPDX-FileCopyrightText: Yorhel +// SPDX-License-Identifier: MIT + +const std = @import("std"); +const main = @import("main.zig"); +const sink = @import("sink.zig"); +const util = @import("util.zig"); +const ui = @import("ui.zig"); + +pub const global = struct { + var fd: std.fs.File = undefined; + var index = std.ArrayList(u8).init(main.allocator); + var file_off: u64 = 0; + var lock: std.Thread.Mutex = .{}; + var root_itemref: u64 = 0; + // TODO: + // var links: Map dev -> ino -> (last_offset, size, blocks, nlink) +}; + +const BLOCK_SIZE: usize = 64*1024; + +const ItemType = enum(i3) { + dir = 0, + reg = 1, + nonreg = 2, + link = 3, + err = -1, + pattern = -2, + otherfs = -3, + kernfs = -4 +}; + +const ItemKey = enum(u5) { + // all items + type = 0, // ItemType + name = 1, // bytes + prev = 2, // itemref + // Only for non-specials + asize = 3, // u64 + dsize = 4, // u64 + // Only for .dir + dev = 5, // u64 only if different from parent dir + rderr = 6, // bool true = error reading directory list, false = error in sub-item, absent = no error + cumasize = 7, // u64 + cumdsize = 8, // u64 + shrasize = 9, // u64 + shrdsize = 10, // u64 + items = 11, // u32 + sub = 12, // itemref only if dir is not empty + // Only for .link + ino = 13, // u64 + nlink = 14, // u32 + prevlnk = 15, // itemref + // Extended mode + uid = 16, // u32 + gid = 17, // u32 + mode = 18, // u16 + mtime = 19, // u64 +}; + +// Pessimistic upper bound on the encoded size of an item, excluding the name field. +// 2 bytes for map start/end, 10 per field (2 for the key, 9 for a full u64). 
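+//
+// For illustration (assuming extended mode is off and this is the first item
+// in its directory, so 'prev' is omitted): a regular file "foo" with
+// asize=1000 and dsize=4096 is emitted as
+//
+//   bf              indefinite-length map
+//   00 01           type  = 1 (reg)
+//   01 43 66 6f 6f  name  = "foo" (3-byte CBOR byte string)
+//   03 19 03 e8     asize = 1000
+//   04 19 10 00     dsize = 4096
+//   ff              break
+//
+// Keys 0..19 always encode in a single byte and a full u64 value in at most
+// 9 bytes, so the 11-byte per-field reservation below is comfortably
+// pessimistic.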
+const MAX_ITEM_LEN = 2 + 11 * @typeInfo(ItemKey).Enum.fields.len; + +const CborMajor = enum(u3) { pos, neg, bytes, text, array, map, tag, simple }; + +inline fn bigu16(v: u16) [2]u8 { return @bitCast(std.mem.nativeToBig(u16, v)); } +inline fn bigu32(v: u32) [4]u8 { return @bitCast(std.mem.nativeToBig(u32, v)); } +inline fn bigu64(v: u64) [8]u8 { return @bitCast(std.mem.nativeToBig(u64, v)); } + +inline fn blockHeader(id: u8, len: u32) [4]u8 { return bigu32((@as(u32, id) << 24) | len); } + +inline fn cborByte(major: CborMajor, arg: u5) u8 { return (@as(u8, @intFromEnum(major)) << 5) | arg; } + + +pub const Thread = struct { + buf: [BLOCK_SIZE]u8 = undefined, + off: usize = BLOCK_SIZE, + block_num: u32 = std.math.maxInt(u32), + itemref: u64 = 0, // ref of item currently being written + + // Temporary buffer for headers and compression + tmp: [BLOCK_SIZE+128]u8 = undefined, + + fn createBlock(t: *Thread) []const u8 { + if (t.block_num == std.math.maxInt(u32) or t.off <= 1) return ""; + + // TODO: Compression + const blocklen: u32 = @intCast(t.off + 16); + t.tmp[0..4].* = blockHeader(1, blocklen); + t.tmp[4..8].* = bigu32(t.block_num); + t.tmp[8..12].* = bigu32(@intCast(t.off)); + @memcpy(t.tmp[12..][0..t.off], t.buf[0..t.off]); + t.tmp[12+t.off..][0..4].* = blockHeader(1, blocklen); + return t.tmp[0..blocklen]; + } + + fn flush(t: *Thread, expected_len: usize) void { + @setCold(true); + const block = createBlock(t); + + global.lock.lock(); + defer global.lock.unlock(); + // This can only really happen when the root path exceeds BLOCK_SIZE, + // in which case we would probably have error'ed out earlier anyway. + if (expected_len > t.buf.len) ui.die("Error writing data: path too long.\n", .{}); + + if (block.len > 0) { + global.index.items[4..][t.block_num*8..][0..8].* = bigu64((global.file_off << 24) + block.len); + global.file_off += block.len; + global.fd.writeAll(block) catch |e| + ui.die("Error writing to file: {s}.\n", .{ ui.errorString(e) }); + } + + t.off = 0; + t.block_num = @intCast((global.index.items.len - 4) / 8); + global.index.appendSlice(&[1]u8{0}**8) catch unreachable; + // Start the first block with a CBOR 'null', so that itemrefs can never be 0. + if (t.block_num == 0) t.cborHead(.simple, 22); + } + + fn cborHead(t: *Thread, major: CborMajor, arg: u64) void { + if (arg <= 23) { + t.buf[t.off] = cborByte(major, @intCast(arg)); + t.off += 1; + } else if (arg <= std.math.maxInt(u8)) { + t.buf[t.off] = cborByte(major, 24); + t.buf[t.off+1] = @truncate(arg); + t.off += 2; + } else if (arg <= std.math.maxInt(u16)) { + t.buf[t.off] = cborByte(major, 25); + t.buf[t.off+1..][0..2].* = bigu16(@intCast(arg)); + t.off += 3; + } else if (arg <= std.math.maxInt(u32)) { + t.buf[t.off] = cborByte(major, 26); + t.buf[t.off+1..][0..4].* = bigu32(@intCast(arg)); + t.off += 5; + } else { + t.buf[t.off] = cborByte(major, 27); + t.buf[t.off+1..][0..8].* = bigu64(arg); + t.off += 9; + } + } + + fn cborIndef(t: *Thread, major: CborMajor) void { + t.buf[t.off] = cborByte(major, 31); + t.off += 1; + } + + fn itemKey(t: *Thread, key: ItemKey) void { + t.cborHead(.pos, @intFromEnum(key)); + } + + fn itemRef(t: *Thread, key: ItemKey, ref: u64) void { + if (ref == 0) return; + t.itemKey(key); + // Full references compress like shit and most of the references point + // into the same block, so optimize that case by using a negative + // offset instead. 
+ if ((ref >> 24) == t.block_num) t.cborHead(.neg, t.itemref - ref - 1) + else t.cborHead(.pos, ref); + } + + // Reserve space for a new item, write out the type, prev and name fields and return the itemref. + fn itemStart(t: *Thread, itype: ItemType, prev_item: u64, name: []const u8) u64 { + const min_len = name.len + MAX_ITEM_LEN; + if (t.off + min_len > t.buf.len) t.flush(min_len); + + t.itemref = (@as(u64, t.block_num) << 24) | t.off; + t.cborIndef(.map); + t.itemKey(.type); + if (@intFromEnum(itype) >= 0) t.cborHead(.pos, @intCast(@intFromEnum(itype))) + else t.cborHead(.neg, @intCast(-1 - @intFromEnum(itype))); + t.itemKey(.name); + t.cborHead(.bytes, name.len); + @memcpy(t.buf[t.off..][0..name.len], name); + t.off += name.len; + t.itemRef(.prev, prev_item); + return t.itemref; + } + + fn itemExt(t: *Thread, stat: *const sink.Stat) void { + if (!main.config.extended) return; + t.itemKey(.uid); + t.cborHead(.pos, stat.ext.uid); + t.itemKey(.gid); + t.cborHead(.pos, stat.ext.gid); + t.itemKey(.mode); + t.cborHead(.pos, stat.ext.mode); + t.itemKey(.mtime); + t.cborHead(.pos, stat.ext.mtime); + } + + fn itemEnd(t: *Thread) void { + t.cborIndef(.simple); + } +}; + + +pub const Dir = struct { + // TODO: When items are written out into blocks depth-first, parent dirs + // will end up getting their items distributed over many blocks, which will + // significantly slow down reading that dir's listing. It may be worth + // buffering some items at the Dir level before flushing them out to the + // Thread buffer. + + // The lock protects all of the below, and is necessary because final() + // accesses the parent dir and may be called from other threads. + // I'm not expecting much lock contention, but it's possible to turn + // last_item into an atomic integer and other fields could be split up for + // subdir use. 
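+    // (final() is invoked from sink.Dir.unref() once the last reference to a
+    // directory is dropped, which may happen on whichever worker thread
+    // finishes its last sub-item; that is why final() has to take the
+    // parent's lock.)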
+ lock: std.Thread.Mutex = .{}, + last_sub: u64 = 0, + stat: sink.Stat, + items: u32 = 0, + size: u64 = 0, + blocks: u64 = 0, + err: bool = false, + suberr: bool = false, + // TODO: set of links + //shared_size: u64, + //shared_blocks: u64, + + pub fn addSpecial(d: *Dir, t: *Thread, name: []const u8, sp: sink.Special) void { + d.lock.lock(); + defer d.lock.unlock(); + d.items += 1; + if (sp == .err) d.suberr = true; + const it: ItemType = switch (sp) { + .err => .err, + .other_fs => .otherfs, + .kernfs => .kernfs, + .excluded => .pattern, + }; + d.last_sub = t.itemStart(it, d.last_sub, name); + t.itemEnd(); + } + + pub fn addStat(d: *Dir, t: *Thread, name: []const u8, stat: *const sink.Stat) void { + d.lock.lock(); + defer d.lock.unlock(); + d.items += 1; + if (!stat.hlinkc) { + d.size +|= stat.size; + d.blocks +|= stat.blocks; + } + const it: ItemType = if (stat.hlinkc) .link else if (stat.reg) .reg else .nonreg; + d.last_sub = t.itemStart(it, d.last_sub, name); + t.itemKey(.asize); + t.cborHead(.pos, stat.size); + t.itemKey(.dsize); + t.cborHead(.pos, util.blocksToSize(stat.blocks)); + // TODO: hardlink stuff + t.itemExt(stat); + t.itemEnd(); + } + + pub fn addDir(d: *Dir, stat: *const sink.Stat) Dir { + d.lock.lock(); + defer d.lock.unlock(); + d.items += 1; + d.size +|= stat.size; + d.blocks +|= stat.blocks; + return .{ .stat = stat.* }; + } + + pub fn setReadError(d: *Dir) void { + d.lock.lock(); + defer d.lock.unlock(); + d.err = true; + } + + pub fn final(d: *Dir, t: *Thread, name: []const u8, parent: ?*Dir) void { + if (parent) |p| p.lock.lock(); + defer if (parent) |p| p.lock.unlock(); + + // TODO: hardlink stuff + if (parent) |p| { + p.items += d.items; + p.size +|= d.size; + p.blocks +|= d.blocks; + if (d.suberr or d.err) p.suberr = true; + + p.last_sub = t.itemStart(.dir, p.last_sub, name); + } else + global.root_itemref = t.itemStart(.dir, 0, name); + + t.itemKey(.asize); + t.cborHead(.pos, d.stat.size); + t.itemKey(.dsize); + t.cborHead(.pos, util.blocksToSize(d.stat.blocks)); + if (parent == null or parent.?.stat.dev != d.stat.dev) { + t.itemKey(.dev); + t.cborHead(.pos, d.stat.dev); + } + if (d.err or d.suberr) { + t.itemKey(.rderr); + t.cborHead(.simple, if (d.err) 21 else 20); + } + t.itemKey(.cumasize); + t.cborHead(.pos, d.size); + t.itemKey(.cumdsize); + t.cborHead(.pos, util.blocksToSize(d.blocks)); + t.itemKey(.items); + t.cborHead(.pos, d.items); + t.itemRef(.sub, d.last_sub); + t.itemExt(&d.stat); + t.itemEnd(); + } +}; + + +pub fn createRoot(stat: *const sink.Stat) Dir { + return .{ .stat = stat.* }; +} + +pub fn done(threads: []sink.Thread) void { + for (threads) |*t| t.sink.bin.flush(0); + + while (std.mem.endsWith(u8, global.index.items, &[1]u8{0}**8)) + global.index.shrinkRetainingCapacity(global.index.items.len - 8); + global.index.appendSlice(&bigu64(global.root_itemref)) catch unreachable; + global.index.appendSlice(&blockHeader(2, @intCast(global.index.items.len + 4))) catch unreachable; + global.index.items[0..4].* = blockHeader(2, @intCast(global.index.items.len)); + global.fd.writeAll(global.index.items) catch |e| + ui.die("Error writing to file: {s}.\n", .{ ui.errorString(e) }); + global.index.clearAndFree(); + + global.fd.close(); +} + +pub fn setupOutput(fd: std.fs.File) void { + global.fd = fd; + fd.writeAll("\xbfncduEX1") catch |e| + ui.die("Error writing to file: {s}.\n", .{ ui.errorString(e) }); + global.file_off = 8; + + // Placeholder for the index block header. 
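+    // Once done() has filled it in, the index block has the following layout
+    // (all integers big-endian):
+    //
+    //   4 bytes            header: (2 << 24) | total block length
+    //   8 bytes per block  (file offset << 24) | block length, indexed by
+    //                      block number; trailing all-zero entries are
+    //                      stripped
+    //   8 bytes            itemref of the root directory
+    //   4 bytes            footer, identical to the header
+    //
+    // The four placeholder bytes appended here are overwritten with the real
+    // header in done().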
+ global.index.appendSlice("aaaa") catch unreachable; +} diff --git a/src/json_import.zig b/src/json_import.zig index 20864c1..eff37f9 100644 --- a/src/json_import.zig +++ b/src/json_import.zig @@ -448,7 +448,7 @@ fn item(ctx: *Ctx, parent: ?*sink.Dir, dev: u64) void { if (ctx.special == .err) dir.setReadError(ctx.sink); while (ctx.p.elem(false)) item(ctx, dir, ndev); ctx.sink.setDir(parent); - dir.unref(); + dir.unref(ctx.sink); } else if (ctx.special) |s| { parent.?.addSpecial(ctx.sink, name, s); if (ctx.stat.dir and ctx.p.elem(false)) ctx.p.die("unexpected contents in an excluded directory"); diff --git a/src/main.zig b/src/main.zig index 7477c70..73fdce0 100644 --- a/src/main.zig +++ b/src/main.zig @@ -8,6 +8,7 @@ const model = @import("model.zig"); const scan = @import("scan.zig"); const json_import = @import("json_import.zig"); const json_export = @import("json_export.zig"); +const bin_export = @import("bin_export.zig"); const sink = @import("sink.zig"); const mem_src = @import("mem_src.zig"); const mem_sink = @import("mem_sink.zig"); @@ -23,6 +24,7 @@ test "imports" { _ = scan; _ = json_import; _ = json_export; + _ = bin_export; _ = sink; _ = mem_src; _ = mem_sink; @@ -475,7 +477,8 @@ pub fn main() void { var scan_dir: ?[:0]const u8 = null; var import_file: ?[:0]const u8 = null; - var export_file: ?[:0]const u8 = null; + var export_json: ?[:0]const u8 = null; + var export_bin: ?[:0]const u8 = null; var quit_after_scan = false; { const arglist = std.process.argsAlloc(allocator) catch unreachable; @@ -491,8 +494,10 @@ pub fn main() void { } if (opt.is("-h") or opt.is("-?") or opt.is("--help")) help() else if (opt.is("-v") or opt.is("-V") or opt.is("--version")) version() - else if (opt.is("-o") and export_file != null) ui.die("The -o flag can only be given once.\n", .{}) - else if (opt.is("-o")) export_file = allocator.dupeZ(u8, args.arg()) catch unreachable + else if (opt.is("-o") and (export_json != null or export_bin != null)) ui.die("The -o flag can only be given once.\n", .{}) + else if (opt.is("-o")) export_json = allocator.dupeZ(u8, args.arg()) catch unreachable + else if (opt.is("-O") and (export_json != null or export_bin != null)) ui.die("The -O flag can only be given once.\n", .{}) + else if (opt.is("-O")) export_bin = allocator.dupeZ(u8, args.arg()) catch unreachable else if (opt.is("-f") and import_file != null) ui.die("The -f flag can only be given once.\n", .{}) else if (opt.is("-f")) import_file = allocator.dupeZ(u8, args.arg()) catch unreachable else if (opt.is("--ignore-config")) {} @@ -512,25 +517,32 @@ pub fn main() void { const out_tty = stdout.isTty(); const in_tty = stdin.isTty(); if (config.scan_ui == null) { - if (export_file) |f| { + if (export_json orelse export_bin) |f| { if (!out_tty or std.mem.eql(u8, f, "-")) config.scan_ui = .none else config.scan_ui = .line; } else config.scan_ui = .full; } - if (!in_tty and import_file == null and export_file == null and !quit_after_scan) + if (!in_tty and import_file == null and export_json == null and export_bin == null and !quit_after_scan) ui.die("Standard input is not a TTY. 
Did you mean to import a file using '-f -'?\n", .{}); - config.nc_tty = !in_tty or (if (export_file) |f| std.mem.eql(u8, f, "-") else false); + config.nc_tty = !in_tty or (if (export_json orelse export_bin) |f| std.mem.eql(u8, f, "-") else false); event_delay_timer = std.time.Timer.start() catch unreachable; defer ui.deinit(); - if (export_file) |f| { + if (export_json) |f| { const file = if (std.mem.eql(u8, f, "-")) stdout else std.fs.cwd().createFileZ(f, .{}) catch |e| ui.die("Error opening export file: {s}.\n", .{ui.errorString(e)}); json_export.setupOutput(file); sink.global.sink = .json; + } else if (export_bin) |f| { + const file = + if (std.mem.eql(u8, f, "-")) stdout + else std.fs.cwd().createFileZ(f, .{}) + catch |e| ui.die("Error opening export file: {s}.\n", .{ui.errorString(e)}); + bin_export.setupOutput(file); + sink.global.sink = .bin; } if (import_file) |f| { @@ -543,7 +555,7 @@ pub fn main() void { else |_| (scan_dir orelse "."); scan.scan(path) catch |e| ui.die("Error opening directory: {s}.\n", .{ui.errorString(e)}); } - if (quit_after_scan or export_file != null) return; + if (quit_after_scan or export_json != null or export_bin != null) return; config.can_shell = config.can_shell orelse !config.imported; config.can_delete = config.can_delete orelse !config.imported; diff --git a/src/mem_src.zig b/src/mem_src.zig index 988904f..cd11660 100644 --- a/src/mem_src.zig +++ b/src/mem_src.zig @@ -46,7 +46,7 @@ fn rec(ctx: *Ctx, dir: *sink.Dir, entry: *model.Entry) void { var it = d.sub; while (it) |e| : (it = e.next) rec(ctx, ndir, e); ctx.sink.setDir(dir); - ndir.unref(); + ndir.unref(ctx.sink); return; } if (entry.file()) |f| { @@ -74,6 +74,6 @@ pub fn run(d: *model.Dir) void { var it = d.sub; while (it) |e| : (it = e.next) rec(&ctx, root, e); - root.unref(); + root.unref(ctx.sink); sink.done(); } diff --git a/src/scan.zig b/src/scan.zig index cb59b0b..9e4a81c 100644 --- a/src/scan.zig +++ b/src/scan.zig @@ -162,10 +162,10 @@ const Dir = struct { return d; } - fn destroy(d: *Dir) void { + fn destroy(d: *Dir, t: *Thread) void { d.pat.deinit(); d.fd.close(); - d.sink.unref(); + d.sink.unref(t.sink); main.allocator.destroy(d); } }; @@ -231,7 +231,7 @@ const Thread = struct { var edir = dir.fd.openDirZ(name, .{ .no_follow = true, .iterate = true }) catch { const s = dir.sink.addDir(t.sink, name, &stat); s.setReadError(t.sink); - s.unref(); + s.unref(t.sink); return; }; @@ -275,7 +275,7 @@ const Thread = struct { if (entry) |e| t.scanOne(d, e.name) else { t.sink.setDir(null); - t.stack.pop().destroy(); + t.stack.pop().destroy(t); } } } diff --git a/src/sink.zig b/src/sink.zig index 436cdf2..bd067a1 100644 --- a/src/sink.zig +++ b/src/sink.zig @@ -7,6 +7,7 @@ const model = @import("model.zig"); const mem_src = @import("mem_src.zig"); const mem_sink = @import("mem_sink.zig"); const json_export = @import("json_export.zig"); +const bin_export = @import("bin_export.zig"); const ui = @import("ui.zig"); const util = @import("util.zig"); @@ -78,6 +79,7 @@ pub const Dir = struct { const Out = union(enum) { mem: mem_sink.Dir, json: json_export.Dir, + bin: bin_export.Dir, }; pub fn addSpecial(d: *Dir, t: *Thread, name: []const u8, sp: Special) void { @@ -85,6 +87,7 @@ pub const Dir = struct { switch (d.out) { .mem => |*m| m.addSpecial(&t.sink.mem, name, sp), .json => |*j| j.addSpecial(name, sp), + .bin => |*b| b.addSpecial(&t.sink.bin, name, sp), } if (sp == .err) { global.last_error_lock.lock(); @@ -103,6 +106,7 @@ pub const Dir = struct { switch (d.out) { .mem => |*m| _ = 
m.addStat(&t.sink.mem, name, stat), .json => |*j| j.addStat(name, stat), + .bin => |*b| b.addStat(&t.sink.bin, name, stat), } } @@ -119,6 +123,7 @@ pub const Dir = struct { .out = switch (d.out) { .mem => |*m| .{ .mem = m.addDir(&t.sink.mem, name, stat) }, .json => |*j| .{ .json = j.addDir(name, stat) }, + .bin => |*b| .{ .bin = b.addDir(stat) }, }, }; d.ref(); @@ -130,6 +135,7 @@ pub const Dir = struct { switch (d.out) { .mem => |*m| m.setReadError(), .json => |*j| j.setReadError(), + .bin => |*b| b.setReadError(), } global.last_error_lock.lock(); defer global.last_error_lock.unlock(); @@ -158,16 +164,17 @@ pub const Dir = struct { _ = d.refcnt.fetchAdd(1, .monotonic); } - pub fn unref(d: *Dir) void { + pub fn unref(d: *Dir, t: *Thread) void { if (d.refcnt.fetchSub(1, .release) != 1) return; d.refcnt.fence(.acquire); switch (d.out) { .mem => |*m| m.final(if (d.parent) |p| &p.out.mem else null), .json => |*j| j.final(), + .bin => |*b| b.final(&t.sink.bin, d.name, if (d.parent) |p| &p.out.bin else null), } - if (d.parent) |p| p.unref(); + if (d.parent) |p| p.unref(t); if (d.name.len > 0) main.allocator.free(d.name); main.allocator.destroy(d); } @@ -184,6 +191,7 @@ pub const Thread = struct { sink: union { mem: mem_sink.Thread, json: void, + bin: bin_export.Thread, } = .{.mem = .{}}, fn addBytes(t: *Thread, bytes: u64) void { @@ -215,7 +223,7 @@ pub const Thread = struct { pub const global = struct { pub var state: enum { done, err, zeroing, hlcnt, running } = .running; pub var threads: []Thread = undefined; - pub var sink: enum { json, mem } = .mem; + pub var sink: enum { json, mem, bin } = .mem; pub var last_error: ?[:0]u8 = null; var last_error_lock = std.Thread.Mutex{}; @@ -235,7 +243,13 @@ pub fn createThreads(num: usize) []Thread { if (global.last_error) |p| main.allocator.free(p); global.last_error = null; global.threads = main.allocator.alloc(Thread, num) catch unreachable; - for (global.threads) |*t| t.* = .{}; + for (global.threads) |*t| t.* = .{ + .sink = switch (global.sink) { + .mem => .{ .mem = .{} }, + .json => .{ .json = {} }, + .bin => .{ .bin = .{} }, + }, + }; return global.threads; } @@ -245,6 +259,7 @@ pub fn done() void { switch (global.sink) { .mem => mem_sink.done(), .json => json_export.done(), + .bin => bin_export.done(global.threads), } global.state = .done; main.allocator.free(global.threads); @@ -268,6 +283,7 @@ pub fn createRoot(path: []const u8, stat: *const Stat) *Dir { .out = switch (global.sink) { .mem => .{ .mem = mem_sink.createRoot(path, stat) }, .json => .{ .json = json_export.createRoot(path, stat) }, + .bin => .{ .bin = bin_export.createRoot(stat) }, }, }; return d;
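
For reference, a minimal reader-side sketch (not part of this patch; the
readIndex helper and its error handling are hypothetical, assuming current Zig
std file APIs) of how a consumer could verify the magic written by
setupOutput() and locate the index block written by done():

    const std = @import("std");

    /// Returns the raw index block: 4-byte header, one 8-byte entry per data
    /// block ((file offset << 24) | block length), the 8-byte root itemref,
    /// and a 4-byte footer equal to the header.
    pub fn readIndex(alloc: std.mem.Allocator, fd: std.fs.File) ![]u8 {
        var magic: [8]u8 = undefined;
        if (try fd.preadAll(&magic, 0) != 8 or !std.mem.eql(u8, &magic, "\xbfncduEX1"))
            return error.InvalidFormat;

        // The index block is the last block in the file; its footer repeats
        // the header, so the final 4 bytes give its id and total length.
        const size = try fd.getEndPos();
        var foot: [4]u8 = undefined;
        if (try fd.preadAll(&foot, size - 4) != 4) return error.InvalidFormat;
        const head = std.mem.readInt(u32, &foot, .big);
        if ((head >> 24) != 2) return error.InvalidFormat; // not an index block
        const len = head & 0xffffff;
        if (len < 16 or len > size - 8) return error.InvalidFormat;

        const block = try alloc.alloc(u8, len);
        errdefer alloc.free(block);
        if (try fd.preadAll(block, size - len) != len) return error.InvalidFormat;
        if (std.mem.readInt(u32, block[0..4], .big) != head) return error.InvalidFormat;
        return block;
    }

From there, the entry for block number (itemref >> 24) gives the file offset
and length of the data block holding a given item, so individual directory
listings can be read without scanning the whole file.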