ncdu-zig/src/model.zig

const std = @import("std");
const main = @import("main.zig");

// While an arena allocator is optimal for almost all scenarios in which ncdu
// is used, it doesn't allow for re-using deleted nodes after doing a delete or
// refresh operation, so a long-running ncdu session with regular refreshes
// will leak memory, but I'd say that's worth the efficiency gains.
// (TODO: Measure, though. Might as well use a general purpose allocator if the
// memory overhead turns out to be insignificant.)
// Entry.create() takes the memory for every tree node from this arena.
var allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);

// Adds two unsigned integers of the same type, clamping the result to the
// largest value of that type instead of overflowing.
// Asserts that the operand type is unsigned.
fn saturateAdd(a: anytype, b: @TypeOf(a)) @TypeOf(a) {
    const T = @TypeOf(a);
    std.debug.assert(@typeInfo(T).Int.signedness == .unsigned);
    // std.math.add() reports overflow as an error; map that to the maximum.
    const sum = std.math.add(T, a, b) catch return std.math.maxInt(T);
    return sum;
}

// Subtracts 'b' from 'a' for unsigned integers of the same type, clamping the
// result to the smallest value of that type instead of wrapping around.
// Asserts that the operand type is unsigned.
fn saturateSub(a: anytype, b: @TypeOf(a)) @TypeOf(a) {
    const T = @TypeOf(a);
    std.debug.assert(@typeInfo(T).Int.signedness == .unsigned);
    // std.math.sub() reports underflow as an error; map that to the minimum.
    const diff = std.math.sub(T, a, b) catch return std.math.minInt(T);
    return diff;
}

// Tag stored in Entry.etype; tells which of Dir, Link or File an Entry is the
// first part of. Two bits wide so that it packs into the Entry header.
pub const EType = packed enum(u2) { dir, link, file };

// Memory layout:
//      Dir + name (+ alignment + Ext)
//  or: Link + name (+ alignment + Ext)
//  or: File + name (+ alignment + Ext)
//
// Entry is always the first part of Dir, Link and File, so a pointer cast to
// *Entry is always safe and an *Entry can be cast to the full type.
// (TODO: What are the aliasing rules for Zig? There is a 'noalias' keyword,
// but does that mean all unmarked pointers are allowed to alias?)
// (TODO: The 'alignment' in the layout above is a lie, none of these structs
// or fields have any sort of alignment. This is great for saving memory but
// perhaps not very great for code size or performance. Might want to
// experiment with setting some alignment and measure the impact)
// (TODO: Putting Ext before the Entry pointer may be a little faster; removes
// the need to iterate over the name)
pub const Entry = packed struct {
    etype: EType,
    isext: bool,
    blocks: u61, // 512-byte blocks
    size: u64,
    next: ?*Entry,

    const Self = @This();

    // Returns this entry as a Dir, or null when it is not a directory.
    pub fn dir(self: *Self) ?*Dir {
        if (self.etype != .dir) return null;
        return @ptrCast(*Dir, self);
    }

    // Returns this entry as a Link, or null when it is not a hardlink.
    pub fn link(self: *Self) ?*Link {
        if (self.etype != .link) return null;
        return @ptrCast(*Link, self);
    }

    // Returns this entry as a File, or null when it is not a plain file.
    pub fn file(self: *Self) ?*File {
        if (self.etype != .file) return null;
        return @ptrCast(*File, self);
    }

    // Byte offset, from the start of the entry, at which the name is stored
    // for the given entry type.
    fn name_offset(etype: EType) usize {
        switch (etype) {
            .dir => return @byteOffsetOf(Dir, "name"),
            .link => return @byteOffsetOf(Link, "name"),
            .file => return @byteOffsetOf(File, "name"),
        }
    }

    // Returns the 0-terminated name that is stored right after the fixed-size
    // part of the entry. The length is found by scanning for the terminator.
    pub fn name(self: *const Self) [:0]const u8 {
        const addr = @ptrToInt(self) + name_offset(self.etype);
        const start = @intToPtr([*:0]u8, addr);
        const len = std.mem.lenZ(start);
        return start[0..len :0];
    }

    // Returns the Ext data that follows the name, or null when 'isext' is not set.
    pub fn ext(self: *Self) ?*Ext {
        if (self.isext) {
            // First byte past the name and its 0 terminator.
            const name_end = @ptrToInt(self) + name_offset(self.etype) + self.name().len + 1;
            return @intToPtr(*Ext, std.mem.alignForward(name_end, @alignOf(Ext)));
        }
        return null;
    }

    // Allocates a zero-initialized entry of the given type from the arena,
    // with room for the name, its terminator and (when 'isext' is set) an Ext.
    // Only 'etype', 'isext' and the name are filled in; everything else is 0.
    // Fails if the arena cannot provide the memory.
    pub fn create(etype: EType, isext: bool, ename: []const u8) !*Entry {
        const name_start = name_offset(etype);
        const base_size = name_start + ename.len + 1;
        var size = base_size;
        if (isext) size = std.mem.alignForward(base_size, @alignOf(Ext)) + @sizeOf(Ext);

        const buf = try allocator.allocator.allocWithOptions(u8, size, @alignOf(Entry), null);
        // Zeroing the whole buffer also provides the name's 0 terminator.
        std.mem.set(u8, buf, 0);

        const e = @ptrCast(*Entry, buf);
        e.etype = etype;
        e.isext = isext;
        std.mem.copy(u8, buf[name_start .. name_start + ename.len], ename);
        return e;
    }

    // Set the 'err' flag on Dirs and Files, propagating 'suberr' to parents.
    // Must not be called on a Link.
    pub fn set_err(self: *Self, parents: *const Parents) void {
        if (self.dir()) |d| {
            d.err = true;
        } else if (self.file()) |f| {
            f.err = true;
        } else {
            unreachable;
        }

        var it = parents.iter();
        // When this entry is itself the top of the stack, it is not its own parent: skip it.
        if (&parents.top().entry == self) _ = it.next();
        while (it.next()) |p| {
            // A parent that is already flagged implies that its own parents are flagged, too.
            if (p.suberr) break;
            p.suberr = true;
        }
    }

    // Insert this entry into the tree at the given directory, updating parent sizes and item counts.
    // (TODO: This function creates an unrecoverable mess on OOM, need to do something better)
    pub fn insert(self: *Entry, parents: *const Parents) !void {
        // Prepend to the child list of the directory on top of the stack.
        const parent = parents.top();
        self.next = parent.sub;
        parent.sub = self;
        if (self.dir()) |d| std.debug.assert(d.sub == null);

        const dev = parent.dev;
        const hardlink = self.link();
        // Set if this is the first time we've found this hardlink in the bottom-most directory of the given dev.
        // Means we should count it for other-dev parent dirs, too.
        var new_hl = false;

        var it = parents.iter();
        while (it.next()) |p| {
            var add_total = false;

            if (hardlink) |l| {
                if (dev != p.dev) {
                    // Hardlink in a subdirectory with a different device, only count it the first time.
                    add_total = new_hl;
                } else {
                    const key = HardlinkNode{ .ino = l.ino, .dir = p, .num_files = 1 };
                    const slot = try devices.items[dev].hardlinks.getOrPut(key);
                    new_hl = !slot.found_existing;
                    if (slot.found_existing) slot.entry.key.num_files += 1;

                    const seen = slot.entry.key.num_files;
                    if (seen == 1) {
                        // First time we encounter this file in this dir, count it.
                        add_total = true;
                        p.shared_size = saturateAdd(p.shared_size, self.size);
                        p.shared_blocks = saturateAdd(p.shared_blocks, self.blocks);
                        p.shared_items = saturateAdd(p.shared_items, 1);
                    } else if (seen == l.nlink) {
                        // Encountered this file in this dir the same number of times as its link count, meaning it's not shared with other dirs.
                        p.shared_size = saturateSub(p.shared_size, self.size);
                        p.shared_blocks = saturateSub(p.shared_blocks, self.blocks);
                        p.shared_items = saturateSub(p.shared_items, 1);
                    }
                }
            } else {
                // Anything that is not a hardlink counts towards every parent.
                add_total = true;
            }

            if (add_total) {
                p.total_size = saturateAdd(p.total_size, self.size);
                p.total_blocks = saturateAdd(p.total_blocks, self.blocks);
                p.total_items = saturateAdd(p.total_items, 1);
            }
        }
    }
};

// Compact device identifier stored in Dir.dev; getDevId() hands these out as
// indexes into the global 'devices' list.
const DevId = u30; // Can be reduced to make room for more flags in Dir.

pub const Dir = packed struct {
    entry: Entry,

    // Head of the list of entries in this directory, linked through
    // Entry.next. Entry.insert() prepends new entries here.
    sub: ?*Entry,

    // total_*: Total size of all unique files + dirs. Non-shared hardlinks are counted only once.
    //   (i.e. the space you'll need if you created a filesystem with only this dir)
    // shared_*: Unique hardlinks that still have references outside of this directory.
    //   (i.e. the space you won't reclaim by deleting this dir)
    // (space reclaimed by deleting a dir =~ total_ - shared_)
    total_blocks: u64,
    shared_blocks: u64,
    total_size: u64,
    shared_size: u64,
    total_items: u32,
    shared_items: u32,
    // TODO: ncdu1 only keeps track of a total item count including duplicate hardlinks.
    // That number seems useful, too. Include it somehow?

    // Indexes into the global 'devices' array
    dev: DevId,

    // 'err' is set by Entry.set_err() on the entry it is called on, 'suberr'
    // is set by the same function on the parent directories of that entry.
    err: bool,
    suberr: bool,

    // Only used to find the @byteOffsetOf, the name is written at this point as a 0-terminated string.
    // (Old C habits die hard)
    name: u8,
};

// File that's been hardlinked (i.e. nlink > 1)
pub const Link = packed struct {
    entry: Entry,
    // Inode number; Entry.insert() uses it together with each parent Dir as
    // the key into the per-device hardlink table.
    ino: u64,
    // dev is inherited from the parent Dir
    // Link count of the file. Entry.insert() compares it against the number
    // of times the file has been found in a directory.
    nlink: u32,
    // Placeholder marking where the 0-terminated name starts, see Entry.name_offset().
    name: u8,
};

// Anything that's not an (indexed) directory or hardlink. Excluded directories are also "Files".
pub const File = packed struct {
    entry: Entry,

    // Set by Entry.set_err() when it is called on this entry.
    err: bool,
    // The four flags below are neither read nor written in this file.
    // NOTE(review): their meaning is presumed from the names only, confirm
    // against the code that sets them.
    excluded: bool,
    other_fs: bool,
    kernfs: bool,
    notreg: bool,
    // Fills the five flags above up to eight bits; the "name offsets" test
    // checks that 'name' starts on a byte boundary.
    _pad: u3,

    // Placeholder marking where the 0-terminated name starts, see Entry.name_offset().
    name: u8,
};

// Optional extra data that Entry.create() reserves room for after the name
// when 'isext' is set; Entry.ext() returns a pointer to it.
// NOTE(review): the fields look like the usual stat() values (modification
// time, owner, group, file mode) but nothing in this file reads or writes
// them, so confirm units and encoding against the code that fills them in.
pub const Ext = packed struct {
    mtime: u64,
    uid: u32,
    gid: u32,
    mode: u16,
};


// Hardlink handling:
//
//   Global lookup table of dev -> (ino,*Dir) -> num_files
//
//   num_files is how many times the file has been found in the particular dir.
//   num_links is the file's st_nlink count.
//
//   Adding a hardlink: O(parents)
//
//     for dir in file.parents:
//       add to dir.total_* if it's not yet in the lookup table
//       add to num_files in the lookup table
//       add to dir.shared_* where num_files == 1
//
//   Removing a hardlink: O(parents)
//
//     for dir in file.parents:
//       subtract from num_files in the lookup table
//       subtract from dir.total_* if num_files == 0
//       subtract from dir.shared_* if num_files == num_links-1
//       remove from lookup table if num_files == 0
//
//   Re-calculating full hardlink stats (only possible when also storing sizes):
//
//     reset total_* and shared_* for all dirs
//     for (file,dir) in lookup_table:
//       dir.total_* += file
//       if file.num_links != dir.num_files:
//         dir.shared_* += file
//
// Problem: num_links is not available in ncdu JSON dumps, will have to assume
//   that there are no shared hardlinks outside of the given dump.
//
// Problem: This data structure does not provide a way to easily list all paths
//   with the same dev,ino. ncdu provides this list in the info window. Doesn't
//   seem too commonly used, can still be provided by a slow full scan of the
//   tree.


// 20 bytes per hardlink/Dir entry, everything in a single allocation.
// (Should really be aligned to 8 bytes and hence take up 24 bytes, but let's see how this works out)
//
// getEntry() allows modification of the key without re-insertion (this is unsafe in the general case, but works fine for modifying num_files)
//
// Potential problem: HashMap uses a 32bit item counter, which may be exceeded in extreme scenarios.
// (ncdu itself doesn't support more than 31bit-counted files, but this table is hardlink_count*parent_dirs and may grow a bit)

// Key of the per-device hardlink table. Only (ino, dir) identify a node;
// 'num_files' is a counter that travels along inside the key.
const HardlinkNode = packed struct {
    ino: u64,
    dir: *Dir,
    num_files: u32,

    const Self = @This();

    // hash() assumes a struct layout, hence the 'packed struct'
    // Hashes the raw bytes of 'ino' and 'dir' and leaves 'num_files' out, so
    // that the hash agrees with eql().
    fn hash(self: Self) u64 {
        const key_len = @byteOffsetOf(Self, "dir") + @sizeOf(*Dir);
        const bytes = @ptrCast([*]const u8, &self);
        return std.hash.Wyhash.hash(0, bytes[0..key_len]);
    }

    // Two nodes are equal when they refer to the same inode in the same directory.
    fn eql(a: Self, b: Self) bool {
        if (a.ino != b.ino) return false;
        return a.dir == b.dir;
    }
};

// Device entry, this is used for two reasons:
// 1. st_dev ids are 64-bit, but in a typical filesystem there's only a few
//    unique ids, hence we can save RAM by only storing smaller DevId's in Dir
//    entries and using that as an index to a lookup table.
// 2. Keeping track of hardlink counts for each dir and inode, as described above.
//
// (Device entries are never deallocated)
const Device = struct {
    // The full 64-bit device id, as passed to getDevId() and returned by getDev().
    dev: u64,
    // Hardlink table for this device, filled in by Entry.insert().
    hardlinks: Hardlinks = Hardlinks.init(main.allocator),

    // Used as a set: the value type is void, all data lives in the HardlinkNode key.
    const Hardlinks = std.HashMap(HardlinkNode, void, HardlinkNode.hash, HardlinkNode.eql, 80);
};

// List of known devices, indexed by DevId. Within this file it is only ever
// appended to (by getDevId()).
var devices: std.ArrayList(Device) = std.ArrayList(Device).init(main.allocator);
// Maps a full 64-bit device id to its index in 'devices'.
var dev_lookup: std.AutoHashMap(u64, DevId) = std.AutoHashMap(u64, DevId).init(main.allocator);

// Returns the compact id for the given 64-bit device id, registering the
// device first if it has not been seen before.
// Fails if either the lookup table or the device list cannot grow; in the
// latter case the lookup table entry is removed again.
pub fn getDevId(dev: u64) !DevId {
    const slot = try dev_lookup.getOrPut(dev);
    if (slot.found_existing) return slot.entry.value;

    // New device: undo the lookup table insertion if appending below fails.
    errdefer dev_lookup.removeAssertDiscard(dev);
    const id = @intCast(DevId, devices.items.len);
    slot.entry.value = id;
    try devices.append(.{ .dev = dev });
    return id;
}

// Returns the full 64-bit device id that belongs to the given compact id.
// 'id' must have been returned by getDevId().
pub fn getDev(id: DevId) u64 {
    const device = &devices.items[id];
    return device.dev;
}

// Root directory of the tree. Starts out undefined and is not assigned
// anywhere in this file; it has to be set before Parents.top(), the Parents
// iterator or Parents.path() are used, as those fall back to it.
pub var root: *Dir = undefined;

// Stack of parent directories, convenient helper when constructing and traversing the tree.
// The 'root' node is always implicitly at the bottom of the stack.
pub const Parents = struct {
    stack: std.ArrayList(*Dir) = std.ArrayList(*Dir).init(main.allocator),

    const Self = @This();

    // Makes 'dir' the new top of the stack. Fails if the stack cannot grow.
    pub fn push(self: *Self, dir: *Dir) !void {
        try self.stack.append(dir);
    }

    // Attempting to remove the root node is considered a bug.
    pub fn pop(self: *Self) void {
        _ = self.stack.pop();
    }

    // Returns the directory on top of the stack, or 'root' when nothing has been pushed.
    pub fn top(self: *const Self) *Dir {
        const items = self.stack.items;
        if (items.len == 0) return root;
        return items[items.len - 1];
    }

    pub const Iterator = struct {
        lst: *const Self,
        index: usize = 0, // 0 = top of the stack, counts upwards to go down

        // Returns the next directory, going from the top of the stack down to
        // 'root', which is returned last; null after that.
        pub fn next(it: *Iterator) ?*Dir {
            const items = it.lst.stack.items;
            if (it.index > items.len) return null;
            it.index += 1;
            // One step past the explicit stack comes the implicit root.
            if (it.index > items.len) return root;
            return items[items.len - it.index];
        }
    };

    // Iterate from top to bottom of the stack.
    pub fn iter(self: *const Self) Iterator {
        return Iterator{ .lst = self };
    }

    // Writes the path of the top directory to 'wr': the name of 'root'
    // followed by the names on the stack from bottom to top, separated by '/'.
    // No separator is written after the root name when it already ends in one.
    pub fn path(self: *const Self, wr: anytype) !void {
        const root_name = root.entry.name();
        try wr.writeAll(root_name);

        const count = self.stack.items.len;
        var i: usize = 0;
        while (i < count) : (i += 1) {
            if (i != 0 or root_name[root_name.len - 1] != '/') try wr.writeByte('/');
            try wr.writeAll(self.stack.items[i].entry.name());
        }
    }
};

test "name offsets" {
    // Entry.name_offset() addresses the name with byte arithmetic, so the
    // 'name' field of every entry type has to start on a byte boundary.
    // expectEqual() takes the expected value first; passing it in that order
    // keeps the failure message correct.
    std.testing.expectEqual(0, @bitOffsetOf(Dir, "name") % 8);
    std.testing.expectEqual(0, @bitOffsetOf(Link, "name") % 8);
    std.testing.expectEqual(0, @bitOffsetOf(File, "name") % 8);
}

test "entry" {
    // An allocation failure is reported as a test error instead of being
    // declared unreachable.
    const e = try Entry.create(.file, false, "hello");
    // std.testing.expect() rather than std.debug.assert(), so that the checks
    // are not treated as optimizer assumptions in release builds.
    std.testing.expect(e.etype == .file);
    std.testing.expect(!e.isext);
    // Expected value first, matching the signature of expectEqualStrings().
    std.testing.expectEqualStrings("hello", e.name());
}
WIP: Experimenting with a rewrite to Zig & a new data model The new data model is supposed to solve a few problems with ncdu 1.x's 'struct dir': - Reduce memory overhead, - Fix extremely slow counting of hard links in some scenarios (issue #121) - Add support for counting 'shared' data with other directories (issue #36) Quick memory usage comparison of my root directory with ~3.5 million files (normal / extended mode): ncdu 1.15.1: 379M / 451M new (unaligned): 145M / 178M new (aligned): 155M / 200M There's still a /lot/ of to-do's left before this is usable, however, and there's a bunch of issues I haven't really decided on yet, such as which TUI library to use. Backporting this data model to the C version of ncdu is also possible, but somewhat painful. Let's first see how far I get with Zig. 2021-04-29 02:48:45 -08:00			`const std = @import("std");`
			`const main = @import("main.zig");`

			`// While an arena allocator is optimimal for almost all scenarios in which ncdu`
			`// is used, it doesn't allow for re-using deleted nodes after doing a delete or`
			`// refresh operation, so a long-running ncdu session with regular refreshes`
			`// will leak memory, but I'd say that's worth the efficiency gains.`
			`// (TODO: Measure, though. Might as well use a general purpose allocator if the`
			`// memory overhead turns out to be insignificant.)`
			`var allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);`

Correct int truncating/saturating + avoid one toPosixPath() 2021-05-01 00:39:57 -08:00			`fn saturateAdd(a: anytype, b: @TypeOf(a)) @TypeOf(a) {`
			`std.debug.assert(@typeInfo(@TypeOf(a)).Int.signedness == .unsigned);`
			`return std.math.add(@TypeOf(a), a, b) catch std.math.maxInt(@TypeOf(a));`
			`}`

			`fn saturateSub(a: anytype, b: @TypeOf(a)) @TypeOf(a) {`
			`std.debug.assert(@typeInfo(@TypeOf(a)).Int.signedness == .unsigned);`
			`return std.math.sub(@TypeOf(a), a, b) catch std.math.minInt(@TypeOf(a));`
			`}`

WIP: Experimenting with a rewrite to Zig & a new data model The new data model is supposed to solve a few problems with ncdu 1.x's 'struct dir': - Reduce memory overhead, - Fix extremely slow counting of hard links in some scenarios (issue #121) - Add support for counting 'shared' data with other directories (issue #36) Quick memory usage comparison of my root directory with ~3.5 million files (normal / extended mode): ncdu 1.15.1: 379M / 451M new (unaligned): 145M / 178M new (aligned): 155M / 200M There's still a /lot/ of to-do's left before this is usable, however, and there's a bunch of issues I haven't really decided on yet, such as which TUI library to use. Backporting this data model to the C version of ncdu is also possible, but somewhat painful. Let's first see how far I get with Zig. 2021-04-29 02:48:45 -08:00			`pub const EType = packed enum(u2) { dir, link, file };`

			`// Memory layout:`
			`// Dir + name (+ alignment + Ext)`
			`// or: Link + name (+ alignment + Ext)`
			`// or: File + name (+ alignment + Ext)`
			`//`
			`// Entry is always the first part of Dir, Link and File, so a pointer cast to`
			`// Entry is always safe and an Entry can be casted to the full type.`
			`// (TODO: What are the aliassing rules for Zig? There is a 'noalias' keyword,`
			`// but does that mean all unmarked pointers are allowed to alias?)`
			`// (TODO: The 'alignment' in the layout above is a lie, none of these structs`
			`// or fields have any sort of alignment. This is great for saving memory but`
			`// perhaps not very great for code size or performance. Might want to`
			`// experiment with setting some alignment and measure the impact)`
			`// (TODO: Putting Ext before the Entry pointer may be a little faster; removes`
			`// the need to iterate over the name)`
			`pub const Entry = packed struct {`
			`etype: EType,`
			`isext: bool,`
			`blocks: u61, // 512-byte blocks`
			`size: u64,`
			`next: ?*Entry,`

			`const Self = @This();`

			`pub fn dir(self: Self) ?Dir {`
			`return if (self.etype == .dir) @ptrCast(*Dir, self) else null;`
			`}`

			`pub fn link(self: Self) ?Link {`
			`return if (self.etype == .link) @ptrCast(*Link, self) else null;`
			`}`

			`pub fn file(self: Self) ?File {`
			`return if (self.etype == .file) @ptrCast(*File, self) else null;`
			`}`

			`fn name_offset(etype: EType) usize {`
			`return switch (etype) {`
			`.dir => @byteOffsetOf(Dir, "name"),`
			`.link => @byteOffsetOf(Link, "name"),`
			`.file => @byteOffsetOf(File, "name"),`
			`};`
			`}`

			`pub fn name(self: *const Self) [:0]const u8 {`
			`const ptr = @intToPtr([*:0]u8, @ptrToInt(self) + name_offset(self.etype));`
			`return ptr[0..std.mem.lenZ(ptr) :0];`
			`}`

			`pub fn ext(self: Self) ?Ext {`
			`if (!self.isext) return null;`
			`const n = self.name();`
			`return @intToPtr(*Ext, std.mem.alignForward(@ptrToInt(self) + name_offset(self.etype) + n.len + 1, @alignOf(Ext)));`
			`}`

			`pub fn create(etype: EType, isext: bool, ename: []const u8) !*Entry {`
			`const base_size = name_offset(etype) + ename.len + 1;`
			`const size = (if (isext) std.mem.alignForward(base_size, @alignOf(Ext))+@sizeOf(Ext) else base_size);`
			`var ptr = try allocator.allocator.allocWithOptions(u8, size, @alignOf(Entry), null);`
			`std.mem.set(u8, ptr, 0); // kind of ugly, but does the trick`
			`var e = @ptrCast(*Entry, ptr);`
			`e.etype = etype;`
			`e.isext = isext;`
			`var name_ptr = @intToPtr([*]u8, @ptrToInt(e) + name_offset(etype));`
			`std.mem.copy(u8, name_ptr[0..ename.len], ename);`
			`//std.debug.warn("{any}\n", .{ @ptrCast([*]u8, e)[0..size] });`
			`return e;`
			`}`

			`// Set the 'err' flag on Dirs and Files, propagating 'suberr' to parents.`
			`pub fn set_err(self: Self, parents: const Parents) void {`
			`if (self.dir()) \|d\| d.err = true`
			`else if (self.file()) \|f\| f.err = true`
			`else unreachable;`
			`var it = parents.iter();`
			`if (&parents.top().entry == self) _ = it.next();`
			`while (it.next()) \|p\| {`
			`if (p.suberr) break;`
			`p.suberr = true;`
			`}`
			`}`

			`// Insert this entry into the tree at the given directory, updating parent sizes and item counts.`
			`// (TODO: This function creates an unrecoverable mess on OOM, need to do something better)`
			`pub fn insert(self: Entry, parents: const Parents) !void {`
			`self.next = parents.top().sub;`
			`parents.top().sub = self;`
			`if (self.dir()) \|d\| std.debug.assert(d.sub == null);`

			`const dev = parents.top().dev;`
			`// Set if this is the first time we've found this hardlink in the bottom-most directory of the given dev.`
			`// Means we should count it for other-dev parent dirs, too.`
			`var new_hl = false;`

			`// TODO: Saturating add/substract`
			`var it = parents.iter();`
			`while(it.next()) \|p\| {`
			`var add_total = false;`

			`// Hardlink in a subdirectory with a different device, only count it the first time.`
			`if (self.link() != null and dev != p.dev) {`
			`add_total = new_hl;`

			`} else if (self.link()) \|l\| {`
			`const n = HardlinkNode{ .ino = l.ino, .dir = p, .num_files = 1 };`
			`var d = try devices.items[dev].hardlinks.getOrPut(n);`
			`new_hl = !d.found_existing;`
			`if (d.found_existing) d.entry.key.num_files += 1;`
			`// First time we encounter this file in this dir, count it.`
			`if (d.entry.key.num_files == 1) {`
			`add_total = true;`
Correct int truncating/saturating + avoid one toPosixPath() 2021-05-01 00:39:57 -08:00			`p.shared_size = saturateAdd(p.shared_size, self.size);`
			`p.shared_blocks = saturateAdd(p.shared_blocks, self.blocks);`
			`p.shared_items = saturateAdd(p.shared_items, 1);`
WIP: Experimenting with a rewrite to Zig & a new data model The new data model is supposed to solve a few problems with ncdu 1.x's 'struct dir': - Reduce memory overhead, - Fix extremely slow counting of hard links in some scenarios (issue #121) - Add support for counting 'shared' data with other directories (issue #36) Quick memory usage comparison of my root directory with ~3.5 million files (normal / extended mode): ncdu 1.15.1: 379M / 451M new (unaligned): 145M / 178M new (aligned): 155M / 200M There's still a /lot/ of to-do's left before this is usable, however, and there's a bunch of issues I haven't really decided on yet, such as which TUI library to use. Backporting this data model to the C version of ncdu is also possible, but somewhat painful. Let's first see how far I get with Zig. 2021-04-29 02:48:45 -08:00			`// Encountered this file in this dir the same number of times as its link count, meaning it's not shared with other dirs.`
			`} else if(d.entry.key.num_files == l.nlink) {`
Correct int truncating/saturating + avoid one toPosixPath() 2021-05-01 00:39:57 -08:00			`p.shared_size = saturateSub(p.shared_size, self.size);`
			`p.shared_blocks = saturateSub(p.shared_blocks, self.blocks);`
			`p.shared_items = saturateSub(p.shared_items, 1);`
WIP: Experimenting with a rewrite to Zig & a new data model The new data model is supposed to solve a few problems with ncdu 1.x's 'struct dir': - Reduce memory overhead, - Fix extremely slow counting of hard links in some scenarios (issue #121) - Add support for counting 'shared' data with other directories (issue #36) Quick memory usage comparison of my root directory with ~3.5 million files (normal / extended mode): ncdu 1.15.1: 379M / 451M new (unaligned): 145M / 178M new (aligned): 155M / 200M There's still a /lot/ of to-do's left before this is usable, however, and there's a bunch of issues I haven't really decided on yet, such as which TUI library to use. Backporting this data model to the C version of ncdu is also possible, but somewhat painful. Let's first see how far I get with Zig. 2021-04-29 02:48:45 -08:00			`}`
			`} else {`
			`add_total = true;`
			`}`
			`if(add_total) {`
Correct int truncating/saturating + avoid one toPosixPath() 2021-05-01 00:39:57 -08:00			`p.total_size = saturateAdd(p.total_size, self.size);`
			`p.total_blocks = saturateAdd(p.total_blocks, self.blocks);`
			`p.total_items = saturateAdd(p.total_items, 1);`
WIP: Experimenting with a rewrite to Zig & a new data model The new data model is supposed to solve a few problems with ncdu 1.x's 'struct dir': - Reduce memory overhead, - Fix extremely slow counting of hard links in some scenarios (issue #121) - Add support for counting 'shared' data with other directories (issue #36) Quick memory usage comparison of my root directory with ~3.5 million files (normal / extended mode): ncdu 1.15.1: 379M / 451M new (unaligned): 145M / 178M new (aligned): 155M / 200M There's still a /lot/ of to-do's left before this is usable, however, and there's a bunch of issues I haven't really decided on yet, such as which TUI library to use. Backporting this data model to the C version of ncdu is also possible, but somewhat painful. Let's first see how far I get with Zig. 2021-04-29 02:48:45 -08:00			`}`
			`}`
			`}`
			`};`

			`const DevId = u30; // Can be reduced to make room for more flags in Dir.`

			`pub const Dir = packed struct {`
			`entry: Entry,`

			`sub: ?*Entry,`

			`// total_*: Total size of all unique files + dirs. Non-shared hardlinks are counted only once.`
			`// (i.e. the space you'll need if you created a filesystem with only this dir)`
			`// shared_*: Unique hardlinks that still have references outside of this directory.`
			`// (i.e. the space you won't reclaim by deleting this dir)`
			`// (space reclaimed by deleting a dir =~ total_ - shared_)`
			`total_blocks: u64,`
			`shared_blocks: u64,`
			`total_size: u64,`
			`shared_size: u64,`
			`total_items: u32,`
			`shared_items: u32,`
			`// TODO: ncdu1 only keeps track of a total item count including duplicate hardlinks.`
			`// That number seems useful, too. Include it somehow?`

			`// Indexes into the global 'devices' array`
			`dev: DevId,`

			`err: bool,`
			`suberr: bool,`

			`// Only used to find the @byteOffsetOff, the name is written at this point as a 0-terminated string.`
			`// (Old C habits die hard)`
			`name: u8,`
			`};`

			`// File that's been hardlinked (i.e. nlink > 1)`
			`pub const Link = packed struct {`
			`entry: Entry,`
			`ino: u64,`
			`// dev is inherited from the parent Dir`
			`nlink: u32,`
			`name: u8,`
			`};`

			`// Anything that's not an (indexed) directory or hardlink. Excluded directories are also "Files".`
			`pub const File = packed struct {`
			`entry: Entry,`

			`err: bool,`
			`excluded: bool,`
			`other_fs: bool,`
			`kernfs: bool,`
			`notreg: bool,`
			`_pad: u3,`

			`name: u8,`
			`};`

			`pub const Ext = packed struct {`
			`mtime: u64,`
Correct int truncating/saturating + avoid one toPosixPath() 2021-05-01 00:39:57 -08:00			`uid: u32,`
			`gid: u32,`
WIP: Experimenting with a rewrite to Zig & a new data model The new data model is supposed to solve a few problems with ncdu 1.x's 'struct dir': - Reduce memory overhead, - Fix extremely slow counting of hard links in some scenarios (issue #121) - Add support for counting 'shared' data with other directories (issue #36) Quick memory usage comparison of my root directory with ~3.5 million files (normal / extended mode): ncdu 1.15.1: 379M / 451M new (unaligned): 145M / 178M new (aligned): 155M / 200M There's still a /lot/ of to-do's left before this is usable, however, and there's a bunch of issues I haven't really decided on yet, such as which TUI library to use. Backporting this data model to the C version of ncdu is also possible, but somewhat painful. Let's first see how far I get with Zig. 2021-04-29 02:48:45 -08:00			`mode: u16,`
			`};`


			`// Hardlink handling:`
			`//`
			`// Global lookup table of dev -> (ino,*Dir) -> num_files`
			`//`
			`// num_files is how many times the file has been found in the particular dir.`
			`// num_links is the file's st_nlink count.`
			`//`
			`// Adding a hardlink: O(parents)`
			`//`
			`// for dir in file.parents:`
			`// add to dir.total_* if it's not yet in the lookup table`
			`// add to num_files in the lookup table`
			`// add to dir.shared_* where num_files == 1`
			`//`
			`// Removing a hardlink: O(parents)`
			`//`
			`// for dir in file.parents:`
			`// subtract from num_files in the lookup table`
			`// subtract from dir.total_* if num_files == 0`
			`// subtract from dir.shared_* if num_files == num_links-1`
			`// remove from lookup table if num_files == 0`
			`//`
			`// Re-calculating full hardlink stats (only possible when also storing sizes):`
			`//`
			`// reset total_* and shared_* for all dirs`
			`// for (file,dir) in lookup_table:`
			`// dir.total_* += file`
			`// if file.num_links != dir.num_files:`
			`// dir.shared_* += file`
			`//`
			`// Problem: num_links is not available in ncdu JSON dumps, will have to assume`
			`// that there are no shared hardlinks outside of the given dump.`
			`//`
			`// Problem: This data structure does not provide a way to easily list all paths`
			`// with the same dev,ino. ncdu provides this list in the info window. Doesn't`
			`// seem too commonly used, can still be provided by a slow full scan of the`
			`// tree.`


			`// 20 bytes per hardlink/Dir entry, everything in a single allocation.`
			`// (Should really be aligned to 8 bytes and hence take up 24 bytes, but let's see how this works out)`
			`//`
			`// getEntry() allows modification of the key without re-insertion (this is unsafe in the general case, but works fine for modifying num_files)`
			`//`
			`// Potential problem: HashMap uses a 32bit item counter, which may be exceeded in extreme scenarios.`
			`// (ncdu itself doesn't support more than 31bit-counted files, but this table is hardlink_count*parent_dirs and may grow a bit)`

			`const HardlinkNode = packed struct {`
			`ino: u64,`
			`dir: *Dir,`
			`num_files: u32,`

			`const Self = @This();`

			`// hash() assumes a struct layout, hence the 'packed struct'`
			`fn hash(self: Self) u64 { return std.hash.Wyhash.hash(0, @ptrCast([]const u8, &self)[0..@byteOffsetOf(Self, "dir")+@sizeOf(Dir)]); }`
			`fn eql(a: Self, b: Self) bool { return a.ino == b.ino and a.dir == b.dir; }`
			`};`

			`// Device entry, this is used for two reasons:`
			`// 1. st_dev ids are 64-bit, but in a typical filesystem there's only a few`
			`// unique ids, hence we can save RAM by only storing smaller DevId's in Dir`
			`// entries and using that as an index to a lookup table.`
			`// 2. Keeping track of hardlink counts for each dir and inode, as described above.`
			`//`
			`// (Device entries are never deallocated)`
			`const Device = struct {`
			`dev: u64,`
			`hardlinks: Hardlinks = Hardlinks.init(main.allocator),`

			`const Hardlinks = std.HashMap(HardlinkNode, void, HardlinkNode.hash, HardlinkNode.eql, 80);`
			`};`

			`var devices: std.ArrayList(Device) = std.ArrayList(Device).init(main.allocator);`
			`var dev_lookup: std.AutoHashMap(u64, DevId) = std.AutoHashMap(u64, DevId).init(main.allocator);`

			`pub fn getDevId(dev: u64) !DevId {`
			`var d = try dev_lookup.getOrPut(dev);`
			`if (!d.found_existing) {`
			`errdefer dev_lookup.removeAssertDiscard(dev);`
			`d.entry.value = @intCast(DevId, devices.items.len);`
			`try devices.append(.{ .dev = dev });`
			`}`
			`return d.entry.value;`
			`}`

			`pub fn getDev(id: DevId) u64 {`
			`return devices.items[id].dev;`
			`}`

			`pub var root: *Dir = undefined;`

			`// Stack of parent directories, convenient helper when constructing and traversing the tree.`
			`// The 'root' node is always implicitely at the bottom of the stack.`
			`pub const Parents = struct {`
			`stack: std.ArrayList(Dir) = std.ArrayList(Dir).init(main.allocator),`

			`const Self = @This();`

			`pub fn push(self: Self, dir: Dir) !void {`
			`return self.stack.append(dir);`
			`}`

			`// Attempting to remove the root node is considered a bug.`
			`pub fn pop(self: *Self) void {`
			`_ = self.stack.pop();`
			`}`

			`pub fn top(self: const Self) Dir {`
			`return if (self.stack.items.len == 0) root else self.stack.items[self.stack.items.len-1];`
			`}`

			`pub const Iterator = struct {`
			`lst: *const Self,`
			`index: usize = 0, // 0 = top of the stack, counts upwards to go down`

			`pub fn next(it: Iterator) ?Dir {`
			`const len = it.lst.stack.items.len;`
			`if (it.index > len) return null;`
			`it.index += 1;`
			`return if (it.index > len) root else it.lst.stack.items[len-it.index];`
			`}`
			`};`

			`// Iterate from top to bottom of the stack.`
			`pub fn iter(self: *const Self) Iterator {`
			`return .{ .lst = self };`
			`}`
Fix some scanning bugs + support --exclude-caches and --follow-symlinks Supporting kernfs checking is going to be a bit more annoying. And so is exclude patterns. Ugh. 2021-04-30 09:15:29 -08:00
			`pub fn path(self: *const Self, wr: anytype) !void {`
			`const r = root.entry.name();`
			`try wr.writeAll(r);`
			`var i: usize = 0;`
			`while (i < self.stack.items.len) {`
			`if (i != 0 or r[r.len-1] != '/') try wr.writeByte('/');`
			`try wr.writeAll(self.stack.items[i].entry.name());`
			`i += 1;`
			`}`
			`}`
WIP: Experimenting with a rewrite to Zig & a new data model The new data model is supposed to solve a few problems with ncdu 1.x's 'struct dir': - Reduce memory overhead, - Fix extremely slow counting of hard links in some scenarios (issue #121) - Add support for counting 'shared' data with other directories (issue #36) Quick memory usage comparison of my root directory with ~3.5 million files (normal / extended mode): ncdu 1.15.1: 379M / 451M new (unaligned): 145M / 178M new (aligned): 155M / 200M There's still a /lot/ of to-do's left before this is usable, however, and there's a bunch of issues I haven't really decided on yet, such as which TUI library to use. Backporting this data model to the C version of ncdu is also possible, but somewhat painful. Let's first see how far I get with Zig. 2021-04-29 02:48:45 -08:00			`};`

			`test "name offsets" {`
			`std.testing.expectEqual(@bitOffsetOf(Dir, "name") % 8, 0);`
			`std.testing.expectEqual(@bitOffsetOf(Link, "name") % 8, 0);`
			`std.testing.expectEqual(@bitOffsetOf(File, "name") % 8, 0);`
			`}`

			`test "entry" {`
			`var e = Entry.create(.file, false, "hello") catch unreachable;`
			`std.debug.assert(e.etype == .file);`
			`std.debug.assert(!e.isext);`
			`std.testing.expectEqualStrings(e.name(), "hello");`
			`}`