ncdu-zig/src/bin_reader.zig

// SPDX-FileCopyrightText: Yorhel <projects@yorhel.nl>
// SPDX-License-Identifier: MIT
const std = @import("std");
const main = @import("main.zig");
const model = @import("model.zig");
const util = @import("util.zig");
const sink = @import("sink.zig");
const ui = @import("ui.zig");
const bin_export = @import("bin_export.zig");
const c = @import("c.zig").c;
const CborMajor = bin_export.CborMajor;
const ItemKey = bin_export.ItemKey;
// Two ways to read a bin export:
//
// 1. Streaming import
// - Read blocks sequentially, assemble items into model.Entry's and stitch
// them together on the go.
// - Does not use the sink.zig API, since sub-level items are read before their parent dirs.
// - Useful when:
// - User attempts to do a refresh or delete while browsing a file opened through (2)
// - Reading from a stream
//
// 2. Random access browsing
// - Read final block first to get the root item, then have browser.zig fetch
// dir listings from this file.
// - The default reader mode; requires much less memory than (1) and provides
// a snappier first-browsing experience.
//
// The approach from (2) can also be used to walk through the entire directory
// tree and stream it to sink.zig (either for importing or converting to JSON).
// That would allow for better code reuse and low-memory conversion, but
// performance will not be as good as a direct streaming read. Needs
// benchmarks.
//
// This file only implements (2) at the moment.
pub const global = struct {
var fd: std.fs.File = undefined;
var index: []u8 = undefined;
var blocks: [8]Block = [1]Block{.{}}**8;
var counter: u64 = 0;
// Last itemref being read/parsed. This is a hack to provide *some* context on error.
// Providing more context mainly just bloats the binary and decreases
// performance for fairly little benefit. Nobody's going to debug a corrupted export.
var lastitem: ?u64 = null;
};
const Block = struct {
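    // num == maxInt(u32) marks an unused cache slot; `last` is an LRU
    // counter used by readBlock() to pick an eviction victim.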
num: u32 = std.math.maxInt(u32),
last: u64 = 0,
data: []u8 = undefined,
};
inline fn bigu16(v: [2]u8) u16 { return std.mem.bigToNative(u16, @bitCast(v)); }
inline fn bigu32(v: [4]u8) u32 { return std.mem.bigToNative(u32, @bitCast(v)); }
inline fn bigu64(v: [8]u8) u64 { return std.mem.bigToNative(u64, @bitCast(v)); }
fn die() noreturn {
@branchHint(.cold);
if (global.lastitem) |e| ui.die("Error reading item {x} from file\n", .{e})
else ui.die("Error reading from file\n", .{});
}
fn readBlock(num: u32) []const u8 {
// Simple linear search, only suitable if we keep the number of in-memory blocks small.
var block: *Block = &global.blocks[0];
for (&global.blocks) |*b| {
if (b.num == num) {
if (b.last != global.counter) {
global.counter += 1;
b.last = global.counter;
}
return b.data;
}
if (block.last > b.last) block = b;
}
if (block.num != std.math.maxInt(u32))
main.allocator.free(block.data);
block.num = num;
global.counter += 1;
block.last = global.counter;
if (num > global.index.len/8 - 1) die();
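    // Each index entry is a big-endian u64 packing a 40-bit file offset in
    // the high bits and a 24-bit block length in the low bits.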
const offlen = bigu64(global.index[num*8..][0..8].*);
const off = offlen >> 24;
const len = offlen & 0xffffff;
if (len <= 12) die();
// Only read the compressed data part, assume block header, number and footer are correct.
const buf = main.allocator.alloc(u8, @intCast(len - 12)) catch unreachable;
defer main.allocator.free(buf);
const rdlen = global.fd.preadAll(buf, off + 8)
catch |e| ui.die("Error reading from file: {s}\n", .{ui.errorString(e)});
if (rdlen != buf.len) die();
const rawlen = c.ZSTD_getFrameContentSize(buf.ptr, buf.len);
if (rawlen <= 0 or rawlen >= (1<<24)) die();
block.data = main.allocator.alloc(u8, @intCast(rawlen)) catch unreachable;
const res = c.ZSTD_decompress(block.data.ptr, block.data.len, buf.ptr, buf.len);
if (res != block.data.len) ui.die("Error decompressing block {} (expected {} got {})\n", .{ num, block.data.len, res });
return block.data;
}
const CborReader = struct {
buf: []const u8,
fn head(r: *CborReader) CborVal {
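        // The initial byte holds the major type in its top 3 bits; the low 5
        // bits either encode a small argument directly (0-23), select a
        // 1/2/4/8-byte big-endian argument (0x18-0x1b) or, for 0x1f, mark an
        // indefinite-length item or break.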
if (r.buf.len < 1) die();
var v = CborVal{
.rd = r,
.major = @enumFromInt(r.buf[0] >> 5),
.indef = false,
.arg = 0,
};
switch (r.buf[0] & 0x1f) {
0x00...0x17 => |n| {
v.arg = n;
r.buf = r.buf[1..];
},
0x18 => {
if (r.buf.len < 2) die();
v.arg = r.buf[1];
r.buf = r.buf[2..];
},
0x19 => {
if (r.buf.len < 3) die();
v.arg = bigu16(r.buf[1..3].*);
r.buf = r.buf[3..];
},
0x1a => {
if (r.buf.len < 5) die();
v.arg = bigu32(r.buf[1..5].*);
r.buf = r.buf[5..];
},
0x1b => {
if (r.buf.len < 9) die();
v.arg = bigu64(r.buf[1..9].*);
r.buf = r.buf[9..];
},
0x1f => switch (v.major) {
.bytes, .text, .array, .map, .simple => {
v.indef = true;
r.buf = r.buf[1..];
},
else => die(),
},
else => die(),
}
return v;
}
// Read the next CBOR value, skipping any tags
fn next(r: *CborReader) CborVal {
while (true) {
const v = r.head();
if (v.major != .tag) return v;
}
}
};
const CborVal = struct {
rd: *CborReader,
major: CborMajor,
indef: bool,
arg: u64,
fn end(v: *const CborVal) bool {
return v.major == .simple and v.indef;
}
fn int(v: *const CborVal, T: type) T {
switch (v.major) {
.pos => return std.math.cast(T, v.arg) orelse die(),
.neg => {
if (std.math.minInt(T) == 0) die();
if (v.arg > std.math.maxInt(T)) die();
return -@as(T, @intCast(v.arg)) + (-1);
},
else => die(),
}
}
fn isTrue(v: *const CborVal) bool {
return v.major == .simple and v.arg == 21;
}
// Read either a byte or text string.
// Doesn't validate UTF-8 strings, doesn't support indefinite-length strings.
fn bytes(v: *const CborVal) []const u8 {
if (v.indef or (v.major != .bytes and v.major != .text)) die();
if (v.rd.buf.len < v.arg) die();
defer v.rd.buf = v.rd.buf[@intCast(v.arg)..];
return v.rd.buf[0..@intCast(v.arg)];
}
// Skip current value.
fn skip(v: *const CborVal) void {
// indefinite-length bytes, text, array or map; skip till break marker.
if (v.major != .simple and v.indef) {
while (true) {
const n = v.rd.next();
if (n.end()) return;
n.skip();
}
}
switch (v.major) {
.bytes, .text => {
if (v.rd.buf.len < v.arg) die();
v.rd.buf = v.rd.buf[@intCast(v.arg)..];
},
.array => {
if (v.arg > (1<<24)) die();
for (0..@intCast(v.arg)) |_| v.rd.next().skip();
},
.map => {
if (v.arg > (1<<24)) die();
for (0..@intCast(v.arg*|2)) |_| v.rd.next().skip();
},
else => {},
}
}
fn etype(v: *const CborVal) model.EType {
const n = v.int(i32);
return std.meta.intToEnum(model.EType, n)
catch if (n < 0) .pattern else .nonreg;
}
fn itemref(v: *const CborVal, cur: u64) u64 {
if (v.major == .pos) return v.arg;
if (v.major == .neg) {
if (v.arg >= (cur & 0xffffff)) die();
return cur - v.arg - 1;
}
return die();
}
};
test "CBOR int parsing" {
inline for (.{
.{ .in = "\x00", .t = u1, .exp = 0 },
.{ .in = "\x01", .t = u1, .exp = 1 },
.{ .in = "\x18\x18", .t = u8, .exp = 0x18 },
.{ .in = "\x18\xff", .t = u8, .exp = 0xff },
.{ .in = "\x19\x07\xff", .t = u64, .exp = 0x7ff },
.{ .in = "\x19\xff\xff", .t = u64, .exp = 0xffff },
.{ .in = "\x1a\x00\x01\x00\x00", .t = u64, .exp = 0x10000 },
.{ .in = "\x1b\x7f\xff\xff\xff\xff\xff\xff\xff", .t = i64, .exp = std.math.maxInt(i64) },
.{ .in = "\x1b\xff\xff\xff\xff\xff\xff\xff\xff", .t = u64, .exp = std.math.maxInt(u64) },
.{ .in = "\x1b\xff\xff\xff\xff\xff\xff\xff\xff", .t = i65, .exp = std.math.maxInt(u64) },
.{ .in = "\x20", .t = i1, .exp = -1 },
.{ .in = "\x38\x18", .t = i8, .exp = -0x19 },
.{ .in = "\x39\x01\xf3", .t = i16, .exp = -500 },
.{ .in = "\x3a\xfe\xdc\xba\x97", .t = i33, .exp = -0xfedc_ba98 },
.{ .in = "\x3b\x7f\xff\xff\xff\xff\xff\xff\xff", .t = i64, .exp = std.math.minInt(i64) },
.{ .in = "\x3b\xff\xff\xff\xff\xff\xff\xff\xff", .t = i65, .exp = std.math.minInt(i65) },
}) |t| {
var r = CborReader{.buf = t.in};
try std.testing.expectEqual(@as(t.t, t.exp), r.next().int(t.t));
try std.testing.expectEqual(0, r.buf.len);
}
}
test "CBOR string parsing" {
var r = CborReader{.buf="\x40"};
try std.testing.expectEqualStrings("", r.next().bytes());
r.buf = "\x45\x00\x01\x02\x03\x04x";
try std.testing.expectEqualStrings("\x00\x01\x02\x03\x04", r.next().bytes());
try std.testing.expectEqualStrings("x", r.buf);
r.buf = "\x78\x241234567890abcdefghijklmnopqrstuvwxyz-end";
try std.testing.expectEqualStrings("1234567890abcdefghijklmnopqrstuvwxyz", r.next().bytes());
try std.testing.expectEqualStrings("-end", r.buf);
}
test "CBOR skip parsing" {
inline for (.{
"\x00",
"\x40",
"\x41a",
"\x5f\xff",
"\x5f\x41a\xff",
"\x80",
"\x81\x00",
"\x9f\xff",
"\x9f\x9f\xff\xff",
"\x9f\x9f\x81\x00\xff\xff",
"\xa0",
"\xa1\x00\x01",
"\xbf\xff",
"\xbf\xc0\x00\x9f\xff\xff",
}) |s| {
var r = CborReader{.buf = s ++ "garbage"};
r.next().skip();
try std.testing.expectEqualStrings(r.buf, "garbage");
}
}
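// Not part of the original suite: a minimal sanity check for itemref
// decoding, assuming the semantics implemented above (a positive integer is
// an absolute reference, a negative one is relative to the current ref).
test "CBOR itemref parsing" {
    var r = CborReader{.buf = "\x05\x20"};
    try std.testing.expectEqual(@as(u64, 5), r.next().itemref(0x1_000100));
    // 0x20 is CBOR -1 and refers to the item just before the current one.
    try std.testing.expectEqual(@as(u64, 0x1_0000ff), r.next().itemref(0x1_000100));
}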
const ItemParser = struct {
r: CborReader,
len: ?u64 = null,
const Field = struct {
key: ItemKey,
val: CborVal,
};
fn init(buf: []const u8) ItemParser {
var r = ItemParser{.r = .{.buf = buf}};
const head = r.r.next();
if (head.major != .map) die();
if (!head.indef) r.len = head.arg;
return r;
}
fn key(r: *ItemParser) ?CborVal {
if (r.len) |*l| {
if (l.* == 0) return null;
l.* -= 1;
return r.r.next();
} else {
const v = r.r.next();
return if (v.end()) null else v;
}
}
// Skips over any fields that don't fit into an ItemKey.
fn next(r: *ItemParser) ?Field {
while (r.key()) |k| {
if (k.major == .pos and k.arg <= std.math.maxInt(@typeInfo(ItemKey).@"enum".tag_type)) return .{
.key = @enumFromInt(k.arg),
.val = r.r.next(),
} else {
k.skip();
r.r.next().skip();
}
}
return null;
}
};
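// Not part of the original suite: ItemParser should yield no fields for an
// empty map, whether definite- or indefinite-length.
test "ItemParser empty map" {
    var p = ItemParser.init("\xa0");
    try std.testing.expect(p.next() == null);
    var q = ItemParser.init("\xbf\xff");
    try std.testing.expect(q.next() == null);
}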
// The returned parser reads from a block buffer that is only valid until the next readItem().
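// An itemref packs a 32-bit block number in the upper bits and a 24-bit byte
// offset into the decompressed block in the lower 24 bits.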
fn readItem(ref: u64) ItemParser {
global.lastitem = ref;
if (ref >= (1 << (24 + 32))) die();
const block = readBlock(@intCast(ref >> 24));
if ((ref & 0xffffff) >= block.len) die();
return ItemParser.init(block[@intCast(ref & 0xffffff)..]);
}
const Import = struct {
sink: *sink.Thread,
stat: sink.Stat = .{},
fields: Fields = .{},
p: ItemParser = undefined,
const Fields = struct {
name: []const u8 = "",
rderr: bool = false,
prev: ?u64 = null,
sub: ?u64 = null,
};
fn readFields(ctx: *Import, ref: u64) void {
ctx.p = readItem(ref);
var hastype = false;
while (ctx.p.next()) |kv| switch (kv.key) {
.type => {
ctx.stat.etype = kv.val.etype();
hastype = true;
},
.name => ctx.fields.name = kv.val.bytes(),
.prev => ctx.fields.prev = kv.val.itemref(ref),
.asize => ctx.stat.size = kv.val.int(u64),
.dsize => ctx.stat.blocks = @intCast(kv.val.int(u64)/512),
.dev => ctx.stat.dev = kv.val.int(u64),
.rderr => ctx.fields.rderr = kv.val.isTrue(),
.sub => ctx.fields.sub = kv.val.itemref(ref),
.ino => ctx.stat.ino = kv.val.int(u64),
.nlink => ctx.stat.nlink = kv.val.int(u31),
.uid => { ctx.stat.ext.uid = kv.val.int(u32); ctx.stat.ext.pack.hasuid = true; },
.gid => { ctx.stat.ext.gid = kv.val.int(u32); ctx.stat.ext.pack.hasgid = true; },
.mode => { ctx.stat.ext.mode = kv.val.int(u16); ctx.stat.ext.pack.hasmode = true; },
.mtime => { ctx.stat.ext.mtime = kv.val.int(u64); ctx.stat.ext.pack.hasmtime = true; },
else => kv.val.skip(),
};
if (!hastype) die();
if (ctx.fields.name.len == 0) die();
}
fn import(ctx: *Import, ref: u64, parent: ?*sink.Dir, dev: u64) void {
ctx.stat = .{ .dev = dev };
ctx.fields = .{};
ctx.readFields(ref);
if (ctx.stat.etype == .dir) {
const prev = ctx.fields.prev;
const dir =
if (parent) |d| d.addDir(ctx.sink, ctx.fields.name, &ctx.stat)
else sink.createRoot(ctx.fields.name, &ctx.stat);
ctx.sink.setDir(dir);
if (ctx.fields.rderr) dir.setReadError(ctx.sink);
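            // Walk the child list: each recursive import() overwrites
            // ctx.fields, leaving .prev pointing at the next sibling.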
ctx.fields.prev = ctx.fields.sub;
while (ctx.fields.prev) |n| ctx.import(n, dir, ctx.stat.dev);
ctx.sink.setDir(parent);
dir.unref(ctx.sink);
ctx.fields.prev = prev;
} else {
const p = parent orelse die();
if (@intFromEnum(ctx.stat.etype) < 0)
p.addSpecial(ctx.sink, ctx.fields.name, ctx.stat.etype)
else
p.addStat(ctx.sink, ctx.fields.name, &ctx.stat);
}
if ((ctx.sink.files_seen.load(.monotonic) & 65) == 0)
main.handleEvent(false, false);
}
};
// Resolve an itemref and return a newly allocated entry.
// Dir.parent and Link.next/prev are left uninitialized.
pub fn get(ref: u64, alloc: std.mem.Allocator) *model.Entry {
const parser = readItem(ref);
var etype: ?model.EType = null;
var name: []const u8 = "";
var p = parser;
var ext = model.Ext{};
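    // First pass: collect the fields needed to determine the entry's type,
    // name and extended attributes before allocating it.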
while (p.next()) |kv| {
switch (kv.key) {
.type => etype = kv.val.etype(),
.name => name = kv.val.bytes(),
.uid => { ext.uid = kv.val.int(u32); ext.pack.hasuid = true; },
.gid => { ext.gid = kv.val.int(u32); ext.pack.hasgid = true; },
.mode => { ext.mode = kv.val.int(u16); ext.pack.hasmode = true; },
.mtime => { ext.mtime = kv.val.int(u64); ext.pack.hasmtime = true; },
else => kv.val.skip(),
}
}
if (etype == null or name.len == 0) die();
var entry = model.Entry.create(alloc, etype.?, main.config.extended and !ext.isEmpty(), name);
entry.next = .{ .ref = std.math.maxInt(u64) };
if (entry.ext()) |e| e.* = ext;
if (entry.dir()) |d| d.sub = .{ .ref = std.math.maxInt(u64) };
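    // Second pass: fill in the remaining, type-dependent fields now that the
    // entry has been allocated.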
p = parser;
while (p.next()) |kv| switch (kv.key) {
.prev => entry.next = .{ .ref = kv.val.itemref(ref) },
.asize => { if (entry.pack.etype != .dir) entry.size = kv.val.int(u64); },
.dsize => { if (entry.pack.etype != .dir) entry.pack.blocks = @intCast(kv.val.int(u64)/512); },
.rderr => { if (entry.dir()) |d| {
if (kv.val.isTrue()) d.pack.err = true
else d.pack.suberr = true;
} },
.dev => { if (entry.dir()) |d| d.pack.dev = model.devices.getId(kv.val.int(u64)); },
.cumasize => entry.size = kv.val.int(u64),
.cumdsize => entry.pack.blocks = @intCast(kv.val.int(u64)/512),
.shrasize => { if (entry.dir()) |d| d.shared_size = kv.val.int(u64); },
.shrdsize => { if (entry.dir()) |d| d.shared_blocks = kv.val.int(u64)/512; },
.items => { if (entry.dir()) |d| d.items = util.castClamp(u32, kv.val.int(u64)); },
.sub => { if (entry.dir()) |d| d.sub = .{ .ref = kv.val.itemref(ref) }; },
.ino => { if (entry.link()) |l| l.ino = kv.val.int(u64); },
.nlink => { if (entry.link()) |l| l.pack.nlink = kv.val.int(u31); },
else => kv.val.skip(),
};
return entry;
}
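// The last 8 bytes of the index hold the itemref of the root directory.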
pub fn getRoot() u64 {
return bigu64(global.index[global.index.len-8..][0..8].*);
}
// Walk through the directory tree in depth-first order and pass results to sink.zig.
// Depth-first is required for JSON export, but more efficient strategies are
// possible for other sinks. Parallel import is also an option, but that's more
// complex and likely less efficient than a streaming import.
pub fn import() void {
const sink_threads = sink.createThreads(1);
var ctx = Import{.sink = &sink_threads[0]};
ctx.import(getRoot(), null, 0);
sink.done();
}
// Assumes that the file signature has already been read and validated.
pub fn open(fd: std.fs.File) !void {
global.fd = fd;
// Do not use fd.getEndPos(), as that requires a newer kernel with statx() support (#261).
try fd.seekFromEnd(0);
const size = try fd.getPos();
if (size < 16) return error.EndOfStream;
// Read index block
var buf: [4]u8 = undefined;
if (try fd.preadAll(&buf, size - 4) != 4) return error.EndOfStream;
const index_header = bigu32(buf);
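    // Footer: block type (1 = index) in the top 4 bits, block length in the
    // rest; the length must be a multiple of 8.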
if ((index_header >> 28) != 1 or (index_header & 7) != 0) die();
const len = (index_header & 0x0fffffff) - 8; // excluding block header & footer
if (len >= size) die();
global.index = main.allocator.alloc(u8, len) catch unreachable;
if (try fd.preadAll(global.index, size - len - 4) != global.index.len) return error.EndOfStream;
}