ncdu-zig/src/model.zig

372 lines
14 KiB
Zig
Raw Normal View History

const std = @import("std");
const main = @import("main.zig");
// While an arena allocator is optimimal for almost all scenarios in which ncdu
// is used, it doesn't allow for re-using deleted nodes after doing a delete or
// refresh operation, so a long-running ncdu session with regular refreshes
// will leak memory, but I'd say that's worth the efficiency gains.
// (TODO: Measure, though. Might as well use a general purpose allocator if the
// memory overhead turns out to be insignificant.)
var allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);
fn saturateAdd(a: anytype, b: @TypeOf(a)) @TypeOf(a) {
std.debug.assert(@typeInfo(@TypeOf(a)).Int.signedness == .unsigned);
return std.math.add(@TypeOf(a), a, b) catch std.math.maxInt(@TypeOf(a));
}
fn saturateSub(a: anytype, b: @TypeOf(a)) @TypeOf(a) {
std.debug.assert(@typeInfo(@TypeOf(a)).Int.signedness == .unsigned);
return std.math.sub(@TypeOf(a), a, b) catch std.math.minInt(@TypeOf(a));
}
pub const EType = packed enum(u2) { dir, link, file };
// Memory layout:
// Dir + name (+ alignment + Ext)
// or: Link + name (+ alignment + Ext)
// or: File + name (+ alignment + Ext)
//
// Entry is always the first part of Dir, Link and File, so a pointer cast to
// *Entry is always safe and an *Entry can be casted to the full type.
// (TODO: What are the aliassing rules for Zig? There is a 'noalias' keyword,
// but does that mean all unmarked pointers are allowed to alias?)
// (TODO: The 'alignment' in the layout above is a lie, none of these structs
// or fields have any sort of alignment. This is great for saving memory but
// perhaps not very great for code size or performance. Might want to
// experiment with setting some alignment and measure the impact)
// (TODO: Putting Ext before the Entry pointer may be a little faster; removes
// the need to iterate over the name)
pub const Entry = packed struct {
etype: EType,
isext: bool,
blocks: u61, // 512-byte blocks
size: u64,
next: ?*Entry,
const Self = @This();
pub fn dir(self: *Self) ?*Dir {
return if (self.etype == .dir) @ptrCast(*Dir, self) else null;
}
pub fn link(self: *Self) ?*Link {
return if (self.etype == .link) @ptrCast(*Link, self) else null;
}
pub fn file(self: *Self) ?*File {
return if (self.etype == .file) @ptrCast(*File, self) else null;
}
fn name_offset(etype: EType) usize {
return switch (etype) {
.dir => @byteOffsetOf(Dir, "name"),
.link => @byteOffsetOf(Link, "name"),
.file => @byteOffsetOf(File, "name"),
};
}
pub fn name(self: *const Self) [:0]const u8 {
const ptr = @intToPtr([*:0]u8, @ptrToInt(self) + name_offset(self.etype));
return ptr[0..std.mem.lenZ(ptr) :0];
}
pub fn ext(self: *Self) ?*Ext {
if (!self.isext) return null;
const n = self.name();
return @intToPtr(*Ext, std.mem.alignForward(@ptrToInt(self) + name_offset(self.etype) + n.len + 1, @alignOf(Ext)));
}
pub fn create(etype: EType, isext: bool, ename: []const u8) !*Entry {
const base_size = name_offset(etype) + ename.len + 1;
const size = (if (isext) std.mem.alignForward(base_size, @alignOf(Ext))+@sizeOf(Ext) else base_size);
var ptr = try allocator.allocator.allocWithOptions(u8, size, @alignOf(Entry), null);
std.mem.set(u8, ptr, 0); // kind of ugly, but does the trick
var e = @ptrCast(*Entry, ptr);
e.etype = etype;
e.isext = isext;
var name_ptr = @intToPtr([*]u8, @ptrToInt(e) + name_offset(etype));
std.mem.copy(u8, name_ptr[0..ename.len], ename);
//std.debug.warn("{any}\n", .{ @ptrCast([*]u8, e)[0..size] });
return e;
}
// Set the 'err' flag on Dirs and Files, propagating 'suberr' to parents.
pub fn set_err(self: *Self, parents: *const Parents) void {
if (self.dir()) |d| d.err = true
else if (self.file()) |f| f.err = true
else unreachable;
var it = parents.iter();
if (&parents.top().entry == self) _ = it.next();
while (it.next()) |p| {
if (p.suberr) break;
p.suberr = true;
}
}
// Insert this entry into the tree at the given directory, updating parent sizes and item counts.
// (TODO: This function creates an unrecoverable mess on OOM, need to do something better)
pub fn insert(self: *Entry, parents: *const Parents) !void {
self.next = parents.top().sub;
parents.top().sub = self;
if (self.dir()) |d| std.debug.assert(d.sub == null);
const dev = parents.top().dev;
// Set if this is the first time we've found this hardlink in the bottom-most directory of the given dev.
// Means we should count it for other-dev parent dirs, too.
var new_hl = false;
// TODO: Saturating add/substract
var it = parents.iter();
while(it.next()) |p| {
var add_total = false;
// Hardlink in a subdirectory with a different device, only count it the first time.
if (self.link() != null and dev != p.dev) {
add_total = new_hl;
} else if (self.link()) |l| {
const n = HardlinkNode{ .ino = l.ino, .dir = p, .num_files = 1 };
var d = try devices.items[dev].hardlinks.getOrPut(n);
new_hl = !d.found_existing;
if (d.found_existing) d.entry.key.num_files += 1;
// First time we encounter this file in this dir, count it.
if (d.entry.key.num_files == 1) {
add_total = true;
p.shared_size = saturateAdd(p.shared_size, self.size);
p.shared_blocks = saturateAdd(p.shared_blocks, self.blocks);
p.shared_items = saturateAdd(p.shared_items, 1);
// Encountered this file in this dir the same number of times as its link count, meaning it's not shared with other dirs.
} else if(d.entry.key.num_files == l.nlink) {
p.shared_size = saturateSub(p.shared_size, self.size);
p.shared_blocks = saturateSub(p.shared_blocks, self.blocks);
p.shared_items = saturateSub(p.shared_items, 1);
}
} else {
add_total = true;
}
if(add_total) {
p.total_size = saturateAdd(p.total_size, self.size);
p.total_blocks = saturateAdd(p.total_blocks, self.blocks);
p.total_items = saturateAdd(p.total_items, 1);
}
}
}
};
const DevId = u30; // Can be reduced to make room for more flags in Dir.
pub const Dir = packed struct {
entry: Entry,
sub: ?*Entry,
// total_*: Total size of all unique files + dirs. Non-shared hardlinks are counted only once.
// (i.e. the space you'll need if you created a filesystem with only this dir)
// shared_*: Unique hardlinks that still have references outside of this directory.
// (i.e. the space you won't reclaim by deleting this dir)
// (space reclaimed by deleting a dir =~ total_ - shared_)
total_blocks: u64,
shared_blocks: u64,
total_size: u64,
shared_size: u64,
total_items: u32,
shared_items: u32,
// TODO: ncdu1 only keeps track of a total item count including duplicate hardlinks.
// That number seems useful, too. Include it somehow?
// Indexes into the global 'devices' array
dev: DevId,
err: bool,
suberr: bool,
// Only used to find the @byteOffsetOff, the name is written at this point as a 0-terminated string.
// (Old C habits die hard)
name: u8,
};
// File that's been hardlinked (i.e. nlink > 1)
pub const Link = packed struct {
entry: Entry,
ino: u64,
// dev is inherited from the parent Dir
nlink: u32,
name: u8,
};
// Anything that's not an (indexed) directory or hardlink. Excluded directories are also "Files".
pub const File = packed struct {
entry: Entry,
err: bool,
excluded: bool,
other_fs: bool,
kernfs: bool,
notreg: bool,
_pad: u3,
name: u8,
};
pub const Ext = packed struct {
mtime: u64,
uid: u32,
gid: u32,
mode: u16,
};
// Hardlink handling:
//
// Global lookup table of dev -> (ino,*Dir) -> num_files
//
// num_files is how many times the file has been found in the particular dir.
// num_links is the file's st_nlink count.
//
// Adding a hardlink: O(parents)
//
// for dir in file.parents:
// add to dir.total_* if it's not yet in the lookup table
// add to num_files in the lookup table
// add to dir.shared_* where num_files == 1
//
// Removing a hardlink: O(parents)
//
// for dir in file.parents:
// subtract from num_files in the lookup table
// subtract from dir.total_* if num_files == 0
// subtract from dir.shared_* if num_files == num_links-1
// remove from lookup table if num_files == 0
//
// Re-calculating full hardlink stats (only possible when also storing sizes):
//
// reset total_* and shared_* for all dirs
// for (file,dir) in lookup_table:
// dir.total_* += file
// if file.num_links != dir.num_files:
// dir.shared_* += file
//
// Problem: num_links is not available in ncdu JSON dumps, will have to assume
// that there are no shared hardlinks outside of the given dump.
//
// Problem: This data structure does not provide a way to easily list all paths
// with the same dev,ino. ncdu provides this list in the info window. Doesn't
// seem too commonly used, can still be provided by a slow full scan of the
// tree.
// 20 bytes per hardlink/Dir entry, everything in a single allocation.
// (Should really be aligned to 8 bytes and hence take up 24 bytes, but let's see how this works out)
//
// getEntry() allows modification of the key without re-insertion (this is unsafe in the general case, but works fine for modifying num_files)
//
// Potential problem: HashMap uses a 32bit item counter, which may be exceeded in extreme scenarios.
// (ncdu itself doesn't support more than 31bit-counted files, but this table is hardlink_count*parent_dirs and may grow a bit)
const HardlinkNode = packed struct {
ino: u64,
dir: *Dir,
num_files: u32,
const Self = @This();
// hash() assumes a struct layout, hence the 'packed struct'
fn hash(self: Self) u64 { return std.hash.Wyhash.hash(0, @ptrCast([*]const u8, &self)[0..@byteOffsetOf(Self, "dir")+@sizeOf(*Dir)]); }
fn eql(a: Self, b: Self) bool { return a.ino == b.ino and a.dir == b.dir; }
};
// Device entry, this is used for two reasons:
// 1. st_dev ids are 64-bit, but in a typical filesystem there's only a few
// unique ids, hence we can save RAM by only storing smaller DevId's in Dir
// entries and using that as an index to a lookup table.
// 2. Keeping track of hardlink counts for each dir and inode, as described above.
//
// (Device entries are never deallocated)
const Device = struct {
dev: u64,
hardlinks: Hardlinks = Hardlinks.init(main.allocator),
const Hardlinks = std.HashMap(HardlinkNode, void, HardlinkNode.hash, HardlinkNode.eql, 80);
};
var devices: std.ArrayList(Device) = std.ArrayList(Device).init(main.allocator);
var dev_lookup: std.AutoHashMap(u64, DevId) = std.AutoHashMap(u64, DevId).init(main.allocator);
pub fn getDevId(dev: u64) !DevId {
var d = try dev_lookup.getOrPut(dev);
if (!d.found_existing) {
errdefer dev_lookup.removeAssertDiscard(dev);
d.entry.value = @intCast(DevId, devices.items.len);
try devices.append(.{ .dev = dev });
}
return d.entry.value;
}
pub fn getDev(id: DevId) u64 {
return devices.items[id].dev;
}
pub var root: *Dir = undefined;
// Stack of parent directories, convenient helper when constructing and traversing the tree.
// The 'root' node is always implicitely at the bottom of the stack.
pub const Parents = struct {
stack: std.ArrayList(*Dir) = std.ArrayList(*Dir).init(main.allocator),
const Self = @This();
pub fn push(self: *Self, dir: *Dir) !void {
return self.stack.append(dir);
}
// Attempting to remove the root node is considered a bug.
pub fn pop(self: *Self) void {
_ = self.stack.pop();
}
pub fn top(self: *const Self) *Dir {
return if (self.stack.items.len == 0) root else self.stack.items[self.stack.items.len-1];
}
pub const Iterator = struct {
lst: *const Self,
index: usize = 0, // 0 = top of the stack, counts upwards to go down
pub fn next(it: *Iterator) ?*Dir {
const len = it.lst.stack.items.len;
if (it.index > len) return null;
it.index += 1;
return if (it.index > len) root else it.lst.stack.items[len-it.index];
}
};
// Iterate from top to bottom of the stack.
pub fn iter(self: *const Self) Iterator {
return .{ .lst = self };
}
pub fn path(self: *const Self, wr: anytype) !void {
const r = root.entry.name();
try wr.writeAll(r);
var i: usize = 0;
while (i < self.stack.items.len) {
if (i != 0 or r[r.len-1] != '/') try wr.writeByte('/');
try wr.writeAll(self.stack.items[i].entry.name());
i += 1;
}
}
};
test "name offsets" {
std.testing.expectEqual(@bitOffsetOf(Dir, "name") % 8, 0);
std.testing.expectEqual(@bitOffsetOf(Link, "name") % 8, 0);
std.testing.expectEqual(@bitOffsetOf(File, "name") % 8, 0);
}
test "entry" {
var e = Entry.create(.file, false, "hello") catch unreachable;
std.debug.assert(e.etype == .file);
std.debug.assert(!e.isext);
std.testing.expectEqualStrings(e.name(), "hello");
}