Add hardlink counting support for the new export format

This ended up a little different from what I had originally planned.

The bad part is that my idea for the 'prevlnk' references wasn't going
to work out. For one, the reader has no efficient way to determine the
head reference of such a list, and implementing a lookup table would be
pretty costly and complex. And second, even with those references
working they'd be pretty useless, because there's no way to go from an
itemref to a full path. I don't see an easy way to solve these
problems, so I'm afraid the efficient hardlink list feature will have
to be disabled when reading from this new format. :(

The good news is that removing these references simplifies the hardlink
counting implementation and removes the requirement for a global inode
map and associated mutex. \o/

Performance is looking really good so far, too.
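
To make the "no global inode map" point concrete: each Dir now owns its own inode map while it is being written, and that map is folded into the parent when the directory is finalized, which is why the global dev -> ino map (and its mutex) from the old TODO is no longer needed. A rough sketch of the idea in Zig follows; Dir, addLink and mergeInto are made-up names for illustration, not the actual ncdu API, and the real code in the diff below tracks more state and decides in countLinks() what propagates to the parent.

    const std = @import("std");

    // Hypothetical, stripped-down version of the per-directory bookkeeping:
    // each directory owns a map from inode number to link statistics.
    const Inode = struct { size: u64, blocks: u64, nlink: u32, nfound: u32 };

    const Dir = struct {
        inodes: std.AutoHashMap(u64, Inode),

        fn init(alloc: std.mem.Allocator) Dir {
            return .{ .inodes = std.AutoHashMap(u64, Inode).init(alloc) };
        }

        // Record one hardlinked file encountered in this directory.
        fn addLink(d: *Dir, ino: u64, size: u64, blocks: u64, nlink: u32) !void {
            const e = try d.inodes.getOrPut(ino);
            if (!e.found_existing) {
                e.value_ptr.* = .{ .size = size, .blocks = blocks, .nlink = nlink, .nfound = 1 };
            } else {
                e.value_ptr.nfound += 1;
            }
        }

        // When the directory is finalized, fold its map into the parent.
        fn mergeInto(d: *Dir, p: *Dir) !void {
            var it = d.inodes.iterator();
            while (it.next()) |kv| {
                const e = try p.inodes.getOrPut(kv.key_ptr.*);
                if (!e.found_existing) {
                    e.value_ptr.* = kv.value_ptr.*;
                } else {
                    e.value_ptr.nfound += kv.value_ptr.nfound;
                }
            }
            d.inodes.deinit();
        }
    };

    test "links to the same inode are counted per directory and merged upward" {
        const alloc = std.testing.allocator;
        var child = Dir.init(alloc);
        var parent = Dir.init(alloc);
        defer parent.inodes.deinit();

        // The same inode (nlink=3) seen twice inside `child`.
        try child.addLink(42, 1024, 2, 3);
        try child.addLink(42, 1024, 2, 3);
        try std.testing.expectEqual(@as(u32, 2), child.inodes.get(42).?.nfound);

        // One link lives elsewhere, so the entry travels up to the parent.
        try child.mergeInto(&parent);
        try std.testing.expectEqual(@as(u32, 2), parent.inodes.get(42).?.nfound);
    }
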
Yorhel 2024-07-31 19:32:49 +02:00
parent ebaa9b6a89
commit 5a0c8c6175
2 changed files with 103 additions and 18 deletions


@@ -62,7 +62,6 @@ my @itemkeys = qw/
     sub
     ino
     nlink
-    prevlnk
     uid
     gid
     mode
@@ -218,8 +217,6 @@ sub traverse($parent, $path) {
my($noref) = grep !$_->{_lastseen}, values %items;
die sprintf "No reference found to #%010x\n", $noref->{_itemref} if $noref;
# TODO: Verify 'link' references
}
if ($printstats) {


@@ -18,8 +18,6 @@ pub const global = struct {
     var file_off: u64 = 0;
     var lock: std.Thread.Mutex = .{};
     var root_itemref: u64 = 0;
-    // TODO:
-    // var links: Map dev -> ino -> (last_offset, size, blocks, nlink)
 };

 const BLOCK_SIZE: usize = 512*1024; // XXX: Current maximum for benchmarking, should just stick with a fixed block size.
@@ -55,12 +53,11 @@ const ItemKey = enum(u5) {
     // Only for .link
     ino = 13, // u64
     nlink = 14, // u32
-    prevlnk = 15, // itemref
     // Extended mode
-    uid = 16, // u32
-    gid = 17, // u32
-    mode = 18, // u16
-    mtime = 19, // u64
+    uid = 15, // u32
+    gid = 16, // u32
+    mode = 17, // u16
+    mtime = 18, // u64
 };

 // Pessimistic upper bound on the encoded size of an item, excluding the name field.
@@ -251,9 +248,18 @@ pub const Dir = struct {
     blocks: u64 = 0,
     err: bool = false,
     suberr: bool = false,
-    // TODO: set of links
-    //shared_size: u64,
-    //shared_blocks: u64,
+    shared_size: u64 = 0,
+    shared_blocks: u64 = 0,
+    inodes: Inodes = Inodes.init(main.allocator),
+
+    const Inodes = std.AutoHashMap(u64, Inode);
+    const Inode = struct {
+        size: u64,
+        blocks: u64,
+        nlink: u32,
+        nfound: u32,
+    };

     pub fn addSpecial(d: *Dir, t: *Thread, name: []const u8, sp: sink.Special) void {
         d.lock.lock();
@@ -284,7 +290,21 @@
         t.cborHead(.pos, stat.size);
         t.itemKey(.dsize);
         t.cborHead(.pos, util.blocksToSize(stat.blocks));
-        // TODO: hardlink stuff
+        if (stat.hlinkc) {
+            const lnk = d.inodes.getOrPut(stat.ino) catch unreachable;
+            if (!lnk.found_existing) lnk.value_ptr.* = .{
+                .size = stat.size,
+                .blocks = stat.blocks,
+                .nlink = stat.nlink,
+                .nfound = 1,
+            } else lnk.value_ptr.nfound += 1;
+
+            t.itemKey(.ino);
+            t.cborHead(.pos, stat.ino);
+            t.itemKey(.nlink);
+            t.cborHead(.pos, stat.nlink);
+        }
         t.itemExt(stat);
         t.itemEnd();
     }
@@ -304,20 +324,80 @@
         d.err = true;
     }

+    // XXX: older JSON exports did not include the nlink count and have
+    // this field set to '0'. We can deal with that when importing to
+    // mem_sink, but the hardlink counting algorithm used here really does need
+    // that information. Current code makes sure to count such links only once
+    // per dir, but does not count them towards the shared_* fields. That
+    // behavior is similar to ncdu 1.x, but the difference between memory
+    // import and this file export might be surprising.
+    fn countLinks(d: *Dir, parent: ?*Dir) void {
+        var parent_new: u32 = 0;
+        var it = d.inodes.iterator();
+        while (it.next()) |kv| {
+            const v = kv.value_ptr;
+            d.size +|= v.size;
+            d.blocks +|= v.blocks;
+            if (v.nlink > 1 and v.nfound <= v.nlink) {
+                d.shared_size +|= v.size;
+                d.shared_blocks +|= v.blocks;
+            }
+            const p = parent orelse continue;
+            // All contained in this dir, no need to keep this entry around
+            if (v.nlink > 0 and v.nfound >= v.nlink) {
+                p.size +|= v.size;
+                p.blocks +|= v.blocks;
+                _ = d.inodes.remove(kv.key_ptr.*);
+            } else if (!p.inodes.contains(kv.key_ptr.*))
+                parent_new += 1;
+        }
+
+        // Merge remaining inodes into parent
+        const p = parent orelse return;
+        if (d.inodes.count() == 0) return;
+
+        // If parent is empty, just transfer
+        if (p.inodes.count() == 0) {
+            p.inodes.deinit();
+            p.inodes = d.inodes;
+            d.inodes = Inodes.init(main.allocator); // So we can deinit() without affecting parent
+
+        // Otherwise, merge
+        } else {
+            p.inodes.ensureUnusedCapacity(parent_new) catch unreachable;
+            it = d.inodes.iterator();
+            while (it.next()) |kv| {
+                const v = kv.value_ptr;
+                const plnk = p.inodes.getOrPutAssumeCapacity(kv.key_ptr.*);
+                if (!plnk.found_existing) plnk.value_ptr.* = v.*
+                else plnk.value_ptr.*.nfound += v.nfound;
+            }
+        }
+    }
+
     pub fn final(d: *Dir, t: *Thread, name: []const u8, parent: ?*Dir) void {
         if (parent) |p| p.lock.lock();
         defer if (parent) |p| p.lock.unlock();
-        // TODO: hardlink stuff
         if (parent) |p| {
+            // Different dev? Don't merge the 'inodes' sets, just count the
+            // links here first so the sizes get added to the parent.
+            if (p.stat.dev != d.stat.dev) d.countLinks(null);
             p.items += d.items;
             p.size +|= d.size;
             p.blocks +|= d.blocks;
             if (d.suberr or d.err) p.suberr = true;
+            // Same dir, merge inodes
+            if (p.stat.dev == d.stat.dev) d.countLinks(p);
             p.last_sub = t.itemStart(.dir, p.last_sub, name);
-        } else
+        } else {
+            d.countLinks(null);
             global.root_itemref = t.itemStart(.dir, 0, name);
+        }
+        d.inodes.deinit();

         t.itemKey(.asize);
         t.cborHead(.pos, d.stat.size);
@@ -332,9 +412,17 @@
             t.cborHead(.simple, if (d.err) 21 else 20);
         }
         t.itemKey(.cumasize);
-        t.cborHead(.pos, d.size);
+        t.cborHead(.pos, d.size +| d.stat.size);
        t.itemKey(.cumdsize);
-        t.cborHead(.pos, util.blocksToSize(d.blocks));
+        t.cborHead(.pos, util.blocksToSize(d.blocks +| d.stat.blocks));
+        if (d.shared_size > 0) {
+            t.itemKey(.shrasize);
+            t.cborHead(.pos, d.shared_size);
+        }
+        if (d.shared_blocks > 0) {
+            t.itemKey(.shrdsize);
+            t.cborHead(.pos, util.blocksToSize(d.shared_blocks));
+        }
         t.itemKey(.items);
         t.cborHead(.pos, d.items);
         t.itemRef(.sub, d.last_sub);
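
As a footnote to the XXX comment and countLinks() above: the per-inode accounting boils down to two comparisons on nlink and nfound. The sketch below is illustrative only (countInode and Totals are made-up names; just the comparisons mirror the code above) and walks through a few concrete numbers, including the nlink == 0 case from old JSON imports:

    const std = @import("std");

    // Per-inode record and per-directory totals; field names mirror the diff above.
    const Inode = struct { size: u64, blocks: u64, nlink: u32, nfound: u32 };
    const Totals = struct { size: u64 = 0, blocks: u64 = 0, shared_size: u64 = 0, shared_blocks: u64 = 0 };

    // Apply the counting rule to a single collected inode. Returns true when
    // every link to it was found inside this directory.
    fn countInode(t: *Totals, v: Inode) bool {
        t.size +|= v.size; // counted once per directory, regardless of nfound
        t.blocks +|= v.blocks;
        if (v.nlink > 1 and v.nfound <= v.nlink) {
            t.shared_size +|= v.size;
            t.shared_blocks +|= v.blocks;
        }
        return v.nlink > 0 and v.nfound >= v.nlink;
    }

    test "hardlink accounting rules" {
        var t: Totals = .{};
        // nlink=3 but only 2 links inside this dir: shared, not yet complete.
        try std.testing.expect(!countInode(&t, .{ .size = 100, .blocks = 8, .nlink = 3, .nfound = 2 }));
        try std.testing.expectEqual(@as(u64, 100), t.shared_size);
        // nlink=2 and both links found here: shared, and fully contained in this dir.
        try std.testing.expect(countInode(&t, .{ .size = 30, .blocks = 1, .nlink = 2, .nfound = 2 }));
        // nlink=0 (old JSON import): counted once here, never marked shared.
        try std.testing.expect(!countInode(&t, .{ .size = 50, .blocks = 8, .nlink = 0, .nfound = 1 }));
        try std.testing.expectEqual(@as(u64, 180), t.size);
        try std.testing.expectEqual(@as(u64, 130), t.shared_size);
    }

The parent-side handling is elided here: when countInode() returns true the real code adds the sizes straight to the parent and drops the entry, otherwise the entry is merged into the parent's inodes map.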