From 025e5ee99e7438afb02a020b32ba1538e2292476 Mon Sep 17 00:00:00 2001
From: Yorhel
Date: Fri, 2 Aug 2024 13:17:42 +0200
Subject: [PATCH] Add import function for the new binary format

This isn't the low-memory browsing experience I was hoping to implement
yet, but it serves as a good way to test the new format and such a
sink-based import is useful to have anyway.

Performance is much better than I had expected, and I haven't even
profiled anything yet.
---
 src/bin_export.zig  |   9 +-
 src/bin_reader.zig  | 453 ++++++++++++++++++++++++++++++++++++++++++++
 src/json_import.zig |  10 +-
 src/main.zig        |  20 +-
 src/model.zig       |   4 +-
 src/ui.zig          |   1 +
 6 files changed, 484 insertions(+), 13 deletions(-)
 create mode 100644 src/bin_reader.zig

diff --git a/src/bin_export.zig b/src/bin_export.zig
index b0f70dc..4aef619 100644
--- a/src/bin_export.zig
+++ b/src/bin_export.zig
@@ -23,7 +23,9 @@ pub const global = struct {
 
 const BLOCK_SIZE: usize = 512*1024; // XXX: Current maximum for benchmarking, should just stick with a fixed block size.
 
-const ItemKey = enum(u5) {
+pub const SIGNATURE = "\xbfncduEX1";
+
+pub const ItemKey = enum(u5) {
     // all items
     type = 0, // EType
     name = 1, // bytes
@@ -48,13 +50,14 @@ const ItemKey = enum(u5) {
     gid = 16, // u32
     mode = 17, // u16
     mtime = 18, // u64
+    _,
 };
 
 // Pessimistic upper bound on the encoded size of an item, excluding the name field.
 // 2 bytes for map start/end, 11 per field (2 for the key, 9 for a full u64).
 const MAX_ITEM_LEN = 2 + 11 * @typeInfo(ItemKey).Enum.fields.len;
 
-const CborMajor = enum(u3) { pos, neg, bytes, text, array, map, tag, simple };
+pub const CborMajor = enum(u3) { pos, neg, bytes, text, array, map, tag, simple };
 
 inline fn bigu16(v: u16) [2]u8 { return @bitCast(std.mem.nativeToBig(u16, v)); }
 inline fn bigu32(v: u32) [4]u8 { return @bitCast(std.mem.nativeToBig(u32, v)); }
@@ -436,7 +439,7 @@ pub fn done(threads: []sink.Thread) void {
 
 pub fn setupOutput(fd: std.fs.File) void {
     global.fd = fd;
-    fd.writeAll("\xbfncduEX1") catch |e|
+    fd.writeAll(SIGNATURE) catch |e|
         ui.die("Error writing to file: {s}.\n", .{ ui.errorString(e) });
 
     global.file_off = 8;
diff --git a/src/bin_reader.zig b/src/bin_reader.zig
new file mode 100644
index 0000000..fc22000
--- /dev/null
+++ b/src/bin_reader.zig
@@ -0,0 +1,453 @@
+// SPDX-FileCopyrightText: Yorhel
+// SPDX-License-Identifier: MIT
+
+const std = @import("std");
+const main = @import("main.zig");
+const model = @import("model.zig");
+const util = @import("util.zig");
+const sink = @import("sink.zig");
+const ui = @import("ui.zig");
+const bin_export = @import("bin_export.zig");
+
+const CborMajor = bin_export.CborMajor;
+const ItemKey = bin_export.ItemKey;
+
+// Two ways to read a bin export:
+//
+// 1. Streaming import
+//    - Read blocks sequentially, assemble items into model.Entry's and stitch
+//      them together on the go.
+//    - Does not use the sink.zig API, since sub-level items are read before their parent dirs.
+//    - Useful when:
+//      - User attempts to do a refresh or delete while browsing a file through (2)
+//      - Reading from a stream
+//
+// 2. Random access browsing
+//    - Read final block first to get the root item, then have browser.zig fetch
+//      dir listings from this file.
+//    - The default reader mode, requires much less memory than (1) and provides
+//      a snappier first-browsing experience.
+//
+// The approach from (2) can also be used to walk through the entire directory
+// tree and stream it to sink.zig (either for importing or converting to JSON).
+// That would allow for better code reuse and low-memory conversion, but
+// performance will not be as good as a direct streaming read. Needs
+// benchmarks.
+//
+// This file only implements (2) at the moment.
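+//
+// Layout notes (derived from the code below and from bin_export.zig): the
+// file starts with an 8-byte signature, followed by data blocks and a final
+// index block. Each index entry is 8 big-endian bytes, packing a block's
+// file offset in the upper bits and its stored length in the low 24 bits.
+// An itemref similarly packs a block number in the upper bits and a byte
+// offset into the block's uncompressed data in the low 24 bits; the last 8
+// bytes of the index hold the itemref of the root directory.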
+
+pub const global = struct {
+    var fd: std.fs.File = undefined;
+    var index: []u8 = undefined;
+    var blocks: [8]Block = [1]Block{.{}}**8;
+    var counter: u64 = 0;
+};
+
+
+const Block = struct {
+    num: u32 = std.math.maxInt(u32),
+    last: u64 = 0,
+    data: []u8 = undefined,
+};
+
+
+inline fn bigu16(v: [2]u8) u16 { return std.mem.bigToNative(u16, @bitCast(v)); }
+inline fn bigu32(v: [4]u8) u32 { return std.mem.bigToNative(u32, @bitCast(v)); }
+inline fn bigu64(v: [8]u8) u64 { return std.mem.bigToNative(u64, @bitCast(v)); }
+
+fn die(comptime s: []const u8, arg: anytype) noreturn {
+    ui.die("Error reading from file: "++s++"\n", arg);
+}
+
+
+fn readBlock(num: u32) []const u8 {
+    // Simple linear search, only suitable if we keep the number of in-memory blocks small.
+    var block: *Block = &global.blocks[0];
+    for (&global.blocks) |*b| {
+        if (b.num == num) {
+            if (b.last != global.counter) {
+                global.counter += 1;
+                b.last = global.counter;
+            }
+            return b.data;
+        }
+        if (block.last > b.last) block = b;
+    }
+    if (block.num != std.math.maxInt(u32))
+        main.allocator.free(block.data);
+    block.num = num;
+    global.counter += 1;
+    block.last = global.counter;
+
+    if (num > global.index.len/8 - 1) die("invalid reference to block {}", .{num});
+    const offlen = bigu64(global.index[num*8..][0..8].*);
+    if ((offlen & 0xffffff) < 16) die("block {} is missing or too small", .{num});
+
+    const buf = main.allocator.alloc(u8, (offlen & 0xffffff) - 12) catch unreachable;
+    defer main.allocator.free(buf);
+    const rdlen = global.fd.preadAll(buf, (offlen >> 24) + 8)
+        catch |e| die("{s}", .{ui.errorString(e)});
+    if (rdlen != buf.len) die("short read for block {}", .{num});
+
+    const rawlen = bigu32(buf[0..4].*);
+    if (rawlen >= (1<<24)) die("too large uncompressed size for {}", .{num});
+    block.data = main.allocator.alloc(u8, rawlen) catch unreachable;
+
+    // TODO: decompress
+    @memcpy(block.data, buf[4..][0..rawlen]);
+    return block.data;
+}
+
+
+const CborReader = struct {
+    buf: []const u8,
+
+    fn head(r: *CborReader) !CborVal {
+        if (r.buf.len < 1) return error.EndOfStream;
+        var v = CborVal{
+            .rd = r,
+            .major = @enumFromInt(r.buf[0] >> 5),
+            .indef = false,
+            .arg = 0,
+        };
+        switch (r.buf[0] & 0x1f) {
+            0x00...0x17 => |n| {
+                v.arg = n;
+                r.buf = r.buf[1..];
+            },
+            0x18 => {
+                if (r.buf.len < 2) return error.EndOfStream;
+                v.arg = r.buf[1];
+                r.buf = r.buf[2..];
+            },
+            0x19 => {
+                if (r.buf.len < 3) return error.EndOfStream;
+                v.arg = bigu16(r.buf[1..3].*);
+                r.buf = r.buf[3..];
+            },
+            0x1a => {
+                if (r.buf.len < 5) return error.EndOfStream;
+                v.arg = bigu32(r.buf[1..5].*);
+                r.buf = r.buf[5..];
+            },
+            0x1b => {
+                if (r.buf.len < 9) return error.EndOfStream;
+                v.arg = bigu64(r.buf[1..9].*);
+                r.buf = r.buf[9..];
+            },
+            0x1f => switch (v.major) {
+                .bytes, .text, .array, .map, .simple => {
+                    v.indef = true;
+                    r.buf = r.buf[1..];
+                },
+                else => return error.Invalid,
+            },
+            else => return error.Invalid,
+        }
+        return v;
+    }
+
+    // Read the next CBOR value, skipping any tags
+    fn next(r: *CborReader) !CborVal {
+        while (true) {
+            const v = try r.head();
+            if (v.major != .tag) return v;
+        }
+    }
+};
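+
+// For reference, a few head bytes as decoded by CborReader.head() above:
+// 0x18 is major 0 (unsigned int) with a one-byte argument following, 0x5f is
+// major 2 (byte string) with the indefinite-length bit set, and 0xff is
+// major 7 with the indefinite bit, i.e. the "break" marker that terminates
+// an indefinite-length item.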
+
+const CborVal = struct {
+    rd: *CborReader,
+    major: CborMajor,
+    indef: bool,
+    arg: u64,
+
+    fn end(v: *const CborVal) bool {
+        return v.major == .simple and v.indef;
+    }
+
+    fn int(v: *const CborVal, T: type) !T {
+        switch (v.major) {
+            .pos => return std.math.cast(T, v.arg) orelse return error.IntegerOutOfRange,
+            .neg => {
+                if (std.math.minInt(T) == 0) return error.IntegerOutOfRange;
+                if (v.arg > std.math.maxInt(T)) return error.IntegerOutOfRange;
+                return -@as(T, @intCast(v.arg)) + (-1);
+            },
+            else => return error.InvalidValue,
+        }
+    }
+
+    // Read either a byte or text string.
+    // Doesn't validate UTF-8 strings, doesn't support indefinite-length strings.
+    fn bytes(v: *const CborVal) ![]const u8 {
+        if (v.indef or (v.major != .bytes and v.major != .text)) return error.InvalidValue;
+        if (v.rd.buf.len < v.arg) return error.EndOfStream;
+        defer v.rd.buf = v.rd.buf[v.arg..];
+        return v.rd.buf[0..v.arg];
+    }
+
+    // Skip current value.
+    fn skip(v: *const CborVal) !void {
+        // indefinite-length bytes, text, array or map; skip till break marker.
+        if (v.major != .simple and v.indef) {
+            while (true) {
+                const n = try v.rd.next();
+                if (n.end()) return;
+                try n.skip();
+            }
+        }
+        switch (v.major) {
+            .bytes, .text => {
+                if (v.rd.buf.len < v.arg) return error.EndOfStream;
+                v.rd.buf = v.rd.buf[v.arg..];
+            },
+            .array => {
+                for (0..v.arg) |_| try (try v.rd.next()).skip();
+            },
+            .map => {
+                for (0..v.arg*|2) |_| try (try v.rd.next()).skip();
+            },
+            else => {},
+        }
+    }
+
+    fn etype(v: *const CborVal) !model.EType {
+        const n = try v.int(i32);
+        return std.meta.intToEnum(model.EType, n)
+            catch if (n < 0) .pattern else .nonreg;
+    }
+
+    fn itemref(v: *const CborVal, cur: u64) !u64 {
+        if (v.major == .pos) return v.arg;
+        if (v.major == .neg) {
+            if (v.arg > (1<<24)) return error.InvalidBackReference;
+            return cur - v.arg - 1;
+        }
+        return error.InvalidReference;
+    }
+};
+
+
+test "CBOR int parsing" {
+    inline for (.{
+        .{ .in = "\x00", .t = u1, .exp = 0 },
+        .{ .in = "\x01", .t = u1, .exp = 1 },
+        .{ .in = "\x18\x18", .t = u8, .exp = 0x18 },
+        .{ .in = "\x18\xff", .t = u8, .exp = 0xff },
+        .{ .in = "\x19\x07\xff", .t = u64, .exp = 0x7ff },
+        .{ .in = "\x19\xff\xff", .t = u64, .exp = 0xffff },
+        .{ .in = "\x1a\x00\x01\x00\x00", .t = u64, .exp = 0x10000 },
+        .{ .in = "\x1b\x7f\xff\xff\xff\xff\xff\xff\xff", .t = i64, .exp = std.math.maxInt(i64) },
+        .{ .in = "\x1b\xff\xff\xff\xff\xff\xff\xff\xff", .t = u64, .exp = std.math.maxInt(u64) },
+        .{ .in = "\x1b\xff\xff\xff\xff\xff\xff\xff\xff", .t = i65, .exp = std.math.maxInt(u64) },
+        .{ .in = "\x20", .t = i1, .exp = -1 },
+        .{ .in = "\x38\x18", .t = i8, .exp = -0x19 },
+        .{ .in = "\x39\x01\xf3", .t = i16, .exp = -500 },
+        .{ .in = "\x3a\xfe\xdc\xba\x97", .t = i33, .exp = -0xfedc_ba98 },
+        .{ .in = "\x3b\x7f\xff\xff\xff\xff\xff\xff\xff", .t = i64, .exp = std.math.minInt(i64) },
+        .{ .in = "\x3b\xff\xff\xff\xff\xff\xff\xff\xff", .t = i65, .exp = std.math.minInt(i65) },
+    }) |t| {
+        var r = CborReader{.buf = t.in};
+        try std.testing.expectEqual(@as(t.t, t.exp), (try r.next()).int(t.t));
+        try std.testing.expectEqual(0, r.buf.len);
+    }
+    var r = CborReader{.buf="\x02"};
+    try std.testing.expectError(error.IntegerOutOfRange, (try r.next()).int(u1));
+    r.buf = "\x18";
+    try std.testing.expectError(error.EndOfStream, r.next());
+    r.buf = "\x1e";
+    try std.testing.expectError(error.Invalid, r.next());
+    r.buf = "\x1f";
+    try std.testing.expectError(error.Invalid, r.next());
+}
+
+test "CBOR string parsing" {
+    var r = CborReader{.buf="\x40"};
+    try std.testing.expectEqualStrings("", try (try r.next()).bytes());
+    r.buf = "\x45\x00\x01\x02\x03\x04x";
+    try std.testing.expectEqualStrings("\x00\x01\x02\x03\x04", try (try r.next()).bytes());
+    try std.testing.expectEqualStrings("x", r.buf);
+    r.buf = "\x78\x241234567890abcdefghijklmnopqrstuvwxyz-end";
+    try std.testing.expectEqualStrings("1234567890abcdefghijklmnopqrstuvwxyz", try (try r.next()).bytes());
+    try std.testing.expectEqualStrings("-end", r.buf);
+}
+
+test "CBOR skip parsing" {
+    inline for (.{
+        "\x00",
+        "\x40",
+        "\x41a",
+        "\x5f\xff",
+        "\x5f\x41a\xff",
+        "\x80",
+        "\x81\x00",
+        "\x9f\xff",
+        "\x9f\x9f\xff\xff",
+        "\x9f\x9f\x81\x00\xff\xff",
+        "\xa0",
+        "\xa1\x00\x01",
+        "\xbf\xff",
+        "\xbf\xc0\x00\x9f\xff\xff",
+    }) |s| {
+        var r = CborReader{.buf = s ++ "garbage"};
+        try (try r.next()).skip();
+        try std.testing.expectEqualStrings(r.buf, "garbage");
+    }
+}
+
+const ItemParser = struct {
+    r: CborReader,
+    len: ?usize = null,
+
+    const Field = struct {
+        key: ItemKey,
+        val: CborVal,
+    };
+
+    fn init(buf: []const u8) !ItemParser {
+        var r = ItemParser{.r = .{.buf = buf}};
+        const head = try r.r.next();
+        if (head.major != .map) return error.Invalid;
+        if (!head.indef) r.len = head.arg;
+        return r;
+    }
+
+    fn key(r: *ItemParser) !?CborVal {
+        if (r.len) |*l| {
+            if (l.* == 0) return null;
+            l.* -= 1;
+            return try r.r.next();
+        } else {
+            const v = try r.r.next();
+            return if (v.end()) null else v;
+        }
+    }
+
+    // Skips over any fields that don't fit into an ItemKey.
+    fn next(r: *ItemParser) !?Field {
+        while (try r.key()) |k| {
+            if (k.int(@typeInfo(ItemKey).Enum.tag_type)) |n| return .{
+                .key = @enumFromInt(n),
+                .val = try r.r.next(),
+            } else |_| {
+                try k.skip();
+                try (try r.r.next()).skip();
+            }
+        }
+        return null;
+    }
+};
+
+// Returned buffer is valid until the next readItem().
+fn readItem(ref: u64) ItemParser {
+    if (ref >= (1 << (24 + 32))) die("invalid itemref {x}", .{ref});
+    const block = readBlock(@intCast(ref >> 24));
+    if ((ref & 0xffffff) > block.len) die("itemref {x} exceeds block size", .{ref});
+    return ItemParser.init(block[(ref & 0xffffff)..]) catch die("invalid item at ref {x}", .{ref});
+}
+
+const Import = struct {
+    sink: *sink.Thread,
+    stat: sink.Stat = .{},
+    fields: Fields = .{},
+    p: ItemParser = undefined,
+
+    const Fields = struct {
+        name: []const u8 = "",
+        rderr: bool = false,
+        prev: ?u64 = null,
+        sub: ?u64 = null,
+    };
+
+    fn readFields(ctx: *Import, ref: u64) !void {
+        ctx.p = readItem(ref);
+        var hastype = false;
+
+        while (try ctx.p.next()) |kv| switch (kv.key) {
+            .type => {
+                ctx.stat.etype = try kv.val.etype();
+                hastype = true;
+            },
+            .name => ctx.fields.name = try kv.val.bytes(),
+            .prev => ctx.fields.prev = try kv.val.itemref(ref),
+            .asize => ctx.stat.size = try kv.val.int(u64),
+            .dsize => ctx.stat.blocks = @intCast(try kv.val.int(u64)/512),
+            .dev => ctx.stat.dev = try kv.val.int(u64),
+            .rderr => ctx.fields.rderr = kv.val.major == .simple and kv.val.arg == 21,
+            .sub => ctx.fields.sub = try kv.val.itemref(ref),
+            .ino => ctx.stat.ino = try kv.val.int(u64),
+            .nlink => ctx.stat.nlink = try kv.val.int(u31),
+            .uid => ctx.stat.ext.uid = try kv.val.int(u32),
+            .gid => ctx.stat.ext.gid = try kv.val.int(u32),
+            .mode => ctx.stat.ext.mode = try kv.val.int(u16),
+            .mtime => ctx.stat.ext.mtime = try kv.val.int(u64),
+            else => try kv.val.skip(),
+        };
+
+        if (!hastype) return error.MissingType;
+        if (ctx.fields.name.len == 0) return error.MissingName;
+    }
+
+    fn import(ctx: *Import, ref: u64, parent: ?*sink.Dir, dev: u64) void {
+        ctx.stat = .{ .dev = dev };
+        ctx.fields = .{};
+        ctx.readFields(ref) catch |e| die("item {x}: {}", .{ref, e});
+
+        if (ctx.stat.etype == .dir) {
+            const prev = ctx.fields.prev;
+            const dir =
+                if (parent) |d| d.addDir(ctx.sink, ctx.fields.name, &ctx.stat)
+                else sink.createRoot(ctx.fields.name, &ctx.stat);
+            ctx.sink.setDir(dir);
+            if (ctx.fields.rderr) dir.setReadError(ctx.sink);
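+
+            // Walk the linked list of children. Each recursive import() call
+            // overwrites ctx.fields, so when it returns, ctx.fields.prev
+            // holds the 'prev' reference of the child that was just imported,
+            // which is what advances this loop to the next sibling.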
+            ctx.fields.prev = ctx.fields.sub;
+            while (ctx.fields.prev) |n| ctx.import(n, dir, ctx.stat.dev);
+
+            ctx.sink.setDir(parent);
+            dir.unref(ctx.sink);
+            ctx.fields.prev = prev;
+
+        } else {
+            const p = parent orelse die("root item must be a directory", .{});
+            if (@intFromEnum(ctx.stat.etype) < 0)
+                p.addSpecial(ctx.sink, ctx.fields.name, ctx.stat.etype)
+            else
+                p.addStat(ctx.sink, ctx.fields.name, &ctx.stat);
+        }
+
+        if ((ctx.sink.files_seen.load(.monotonic) & 65) == 0)
+            main.handleEvent(false, false);
+    }
+};
+
+// Walk through the directory tree in depth-first order and pass results to sink.zig.
+// Depth-first is required for JSON export, but more efficient strategies are
+// possible for other sinks. Parallel import is also an option, but that's more
+// complex and likely less efficient than a streaming import.
+pub fn import() void {
+    const sink_threads = sink.createThreads(1);
+    var ctx = Import{.sink = &sink_threads[0]};
+    ctx.import(bigu64(global.index[global.index.len-8..][0..8].*), null, 0);
+    sink.done();
+}
+
+// Assumes that the file signature has already been read and validated.
+pub fn open(fd: std.fs.File) !void {
+    global.fd = fd;
+
+    const size = try fd.getEndPos();
+    if (size < 16) return error.EndOfStream;
+
+    // Read index block
+    var buf: [4]u8 = undefined;
+    if (try fd.preadAll(&buf, size - 4) != 4) return error.EndOfStream;
+    const index_header = bigu32(buf);
+    if ((index_header >> 24) != 2 or (index_header & 7) != 0)
+        die("invalid index block ({x})", .{index_header});
+    const len = (index_header & 0x00ffffff) - 8; // excluding block header & footer
+    if (len >= size) die("invalid index block size", .{});
+    global.index = main.allocator.alloc(u8, len) catch unreachable;
+    if (try fd.preadAll(global.index, size - len - 4) != global.index.len) return error.EndOfStream;
+}
diff --git a/src/json_import.zig b/src/json_import.zig
index 50fc893..aeee46b 100644
--- a/src/json_import.zig
+++ b/src/json_import.zig
@@ -467,16 +467,12 @@ fn item(ctx: *Ctx, parent: ?*sink.Dir, dev: u64) void {
 }
 
 
-pub fn import(path: [:0]const u8) void {
+pub fn import(fd: std.fs.File, head: []const u8) void {
     const sink_threads = sink.createThreads(1);
     defer sink.done();
 
-    const fd = if (std.mem.eql(u8, "-", path)) std.io.getStdIn()
-        else std.fs.cwd().openFileZ(path, .{})
-        catch |e| ui.die("Error reading file: {s}.\n", .{ui.errorString(e)});
-    defer fd.close();
-
-    var p = Parser{.rd = fd};
+    var p = Parser{.rd = fd, .rdsize = head.len};
+    @memcpy(p.buf[0..head.len], head);
     p.array();
     if (p.uint(u16) != 1) p.die("incompatible major format version");
     if (!p.elem(false)) p.die("expected array element");
diff --git a/src/main.zig b/src/main.zig
index 0fc9f7c..bfca8ae 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -9,6 +9,7 @@ const scan = @import("scan.zig");
 const json_import = @import("json_import.zig");
 const json_export = @import("json_export.zig");
 const bin_export = @import("bin_export.zig");
+const bin_reader = @import("bin_reader.zig");
 const sink = @import("sink.zig");
 const mem_src = @import("mem_src.zig");
 const mem_sink = @import("mem_sink.zig");
@@ -25,6 +26,7 @@ test "imports" {
     _ = json_import;
     _ = json_export;
     _ = bin_export;
+    _ = bin_reader;
     _ = sink;
     _ = mem_src;
     _ = mem_sink;
@@ -443,6 +445,22 @@ fn readExcludeFile(path: [:0]const u8) !void {
     }
 }
 
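+// Peek at the first 8 bytes of the import file: a matching signature selects
+// the binary reader, anything else is handed to the JSON importer together
+// with the bytes already read.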
+fn readImport(path: [:0]const u8) !void {
+    const fd =
+        if (std.mem.eql(u8, "-", path)) std.io.getStdIn()
+        else try std.fs.cwd().openFileZ(path, .{});
+    defer fd.close();
+
+    // TODO: While we're at it, recognize and handle compressed JSON
+    var buf: [8]u8 = undefined;
+    try fd.reader().readNoEof(&buf);
+    if (std.mem.eql(u8, &buf, bin_export.SIGNATURE)) {
+        try bin_reader.open(fd);
+        bin_reader.import();
+    } else
+        json_import.import(fd, &buf);
+}
+
 
 pub fn main() void {
     ui.main_thread = std.Thread.getCurrentId();
@@ -559,7 +577,7 @@ pub fn main() void {
     }
 
     if (import_file) |f| {
-        json_import.import(f);
+        readImport(f) catch |e| ui.die("Error reading file '{s}': {s}.\n", .{f, ui.errorString(e)});
         config.imported = true;
     } else {
         var buf = [_]u8{0} ** (std.fs.MAX_PATH_BYTES+1);
diff --git a/src/model.zig b/src/model.zig
index 67817a9..29a11b1 100644
--- a/src/model.zig
+++ b/src/model.zig
@@ -462,8 +462,8 @@ pub var root: *Dir = undefined;
 test "entry" {
     var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
     defer arena.deinit();
-    var e = Entry.create(arena.allocator(), .file, false, "hello");
-    try std.testing.expectEqual(e.pack.etype, .file);
+    var e = Entry.create(arena.allocator(), .reg, false, "hello");
+    try std.testing.expectEqual(e.pack.etype, .reg);
     try std.testing.expect(!e.pack.isext);
     try std.testing.expectEqualStrings(e.name(), "hello");
 }
diff --git a/src/ui.zig b/src/ui.zig
index a1bf12f..10d39ce 100644
--- a/src/ui.zig
+++ b/src/ui.zig
@@ -84,6 +84,7 @@ pub fn errorString(e: anyerror) [:0]const u8 {
         error.ReadOnlyFilesystem => "Read-only filesystem",
         error.SymlinkLoop => "Symlink loop",
         error.SystemFdQuotaExceeded => "System file descriptor limit exceeded",
+        error.EndOfStream => "Unexpected end of file",
         else => @errorName(e),
     };
 }
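
Note, not part of the patch above: a minimal, self-contained sketch of how the reader interprets index entries and itemrefs, as implied by readBlock(), readItem() and import() in src/bin_reader.zig. The names below (ItemRef, unpackIndexEntry) are illustrative only and do not exist in the ncdu sources.

const std = @import("std");

// An itemref packs a block number in the upper bits and a byte offset into
// that block's uncompressed data in the low 24 bits (see readItem()).
const ItemRef = struct {
    block: u32,
    offset: u24,

    fn unpack(ref: u64) ItemRef {
        std.debug.assert(ref < (1 << (24 + 32))); // readItem() rejects larger refs
        return .{
            .block = @as(u32, @intCast(ref >> 24)),
            .offset = @as(u24, @truncate(ref)),
        };
    }
};

// An index entry is 8 big-endian bytes per block: the block's file offset in
// the upper bits and its stored length in the low 24 bits (see readBlock()).
fn unpackIndexEntry(offlen: u64) struct { file_off: u64, len: u24 } {
    return .{ .file_off = offlen >> 24, .len = @as(u24, @truncate(offlen)) };
}

test "itemref and index entry unpacking" {
    const r = ItemRef.unpack((3 << 24) | 0x123);
    try std.testing.expectEqual(@as(u32, 3), r.block);
    try std.testing.expectEqual(@as(u24, 0x123), r.offset);

    const e = unpackIndexEntry((0x1000 << 24) | 512);
    try std.testing.expectEqual(@as(u64, 0x1000), e.file_off);
    try std.testing.expectEqual(@as(u24, 512), e.len);
}

The 24-bit offset field is also what limits a block's uncompressed size to 16 MiB, matching the rawlen check in readBlock().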