From d2e8dd8a901edcb3a6576332b942cbc9cd192df0 Mon Sep 17 00:00:00 2001
From: Yorhel
Date: Tue, 16 Jul 2024 13:09:54 +0200
Subject: [PATCH] Reimplement JSON import + minor fixes

Previous import code did not correctly handle a non-empty directory with the
"read_error" flag set. I have no clue if that can ever happen in practice, but
at least ncdu 1.x can theoretically emit such JSON so we handle it now.

Also fixes mtime display of "special" files. i.e. don't display the mtime of the
parent directory - that's confusing.

Split a generic-ish JSON parser out of the import code for easier reasoning and
implemented a few more performance improvements as well. New code is ~30% faster
in both ReleaseSafe and ReleaseFast.
---
 src/browser.zig     |   2 +-
 src/json_import.zig | 533 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.zig        |   6 +-
 src/sink.zig        |   6 +-
 4 files changed, 543 insertions(+), 4 deletions(-)
 create mode 100644 src/json_import.zig

diff --git a/src/browser.zig b/src/browser.zig
index caef8f4..3c87a54 100644
--- a/src/browser.zig
+++ b/src/browser.zig
@@ -281,7 +281,7 @@ const Row = struct {
         if (!main.config.show_mtime or self.col + 37 > ui.cols) return;
         defer self.col += 27;
         ui.move(self.row, self.col+1);
-        const ext = (if (self.item) |e| e.ext() else @as(?*model.Ext, null)) orelse dir_parent.entry.ext();
+        const ext = if (self.item) |e| e.ext() else dir_parent.entry.ext();
         if (ext) |e| ui.addts(self.bg, e.mtime)
         else ui.addstr(" no mtime");
     }
diff --git a/src/json_import.zig b/src/json_import.zig
new file mode 100644
index 0000000..bd29fe6
--- /dev/null
+++ b/src/json_import.zig
@@ -0,0 +1,533 @@
+// SPDX-FileCopyrightText: Yorhel
+// SPDX-License-Identifier: MIT
+
+const std = @import("std");
+const main = @import("main.zig");
+const util = @import("util.zig");
+const model = @import("model.zig");
+const sink = @import("sink.zig");
+const ui = @import("ui.zig");
+
+
+// Using a custom JSON parser here because, while std.json is great, it does
+// perform strict UTF-8 validation. Which is correct, of course, but ncdu dumps
+// are not always correct JSON as they may contain non-UTF-8 paths encoded as
+// strings.
+
+// Pull-style parser over a buffered file. Malformed input is never reported
+// to the caller; every error path ends in die(), which does not return.
+const Parser = struct {
+    rd: std.fs.File,
+    // Read position within buf, and number of valid bytes in buf.
+    rdoff: usize = 0,
+    rdsize: usize = 0,
+    // Position reported in error messages.
+    byte: u64 = 1,
+    line: u64 = 1,
+    buf: [16*1024]u8 = undefined,
+
+    // Abort the import with a message that includes the current position.
+    fn die(p: *Parser, str: []const u8) noreturn {
+        ui.die("Error importing file on line {}:{}: {s}.\n", .{ p.line, p.byte, str });
+    }
+
+    // Feed back a byte that has just been returned by nextByte()
+    // Feeding back the 0 that signals EOF is a no-op: nextByte() does not
+    // advance on EOF, so rdoff is still 0 and there is nothing to un-read.
+    fn undoNextByte(p: *Parser, b: u8) void {
+        if (p.rdoff == 0) return;
+        p.byte -= 1;
+        p.rdoff -= 1;
+        p.buf[p.rdoff] = b;
+    }
+
+    // Refill buf from the file; rdsize is 0 afterwards if nothing was read.
+    fn fill(p: *Parser) void {
+        @setCold(true);
+        p.rdoff = 0;
+        p.rdsize = p.rd.read(&p.buf) catch |e| switch (e) {
+            error.IsDir => p.die("not a file"), // should be detected at open() time, but no flag for that...
+            error.SystemResources => p.die("out of memory"),
+            else => p.die("I/O error"),
+        };
+    }
+
+    // Returns 0 on EOF.
+    // (or if the file contains a 0 byte, but that's invalid anyway)
+    // (Returning a '?u8' here is nicer but kills performance by about +30%)
+    fn nextByte(p: *Parser) u8 {
+        if (p.rdoff == p.rdsize) {
+            p.fill();
+            if (p.rdsize == 0) return 0;
+        }
+        p.byte += 1;
+        defer p.rdoff += 1;
+        return (&p.buf)[p.rdoff];
+    }
+
+    // next non-whitespace byte
+    fn nextChr(p: *Parser) u8 {
+        while (true) switch (p.nextByte()) {
+            '\n' => {
+                p.line += 1;
+                p.byte = 1;
+            },
+            ' ', '\t', '\r' => {},
+            else => |b| return b,
+        };
+    }
+
+    // Consume exactly the bytes of 'lit', or die.
+    fn expectLit(p: *Parser, lit: []const u8) void {
+        for (lit) |b| if (b != p.nextByte()) p.die("invalid JSON");
+    }
+
+    // Read a single hexadecimal digit and return its value.
+    fn hexdig(p: *Parser) u16 {
+        const b = p.nextByte();
+        return switch (b) {
+            '0'...'9' => b - '0',
+            'a'...'f' => b - 'a' + 10,
+            'A'...'F' => b - 'A' + 10,
+            else => p.die("invalid hex digit"),
+        };
+    }
+
+    // Read the four hex digits of a \uXXXX escape, most significant first.
+    fn hexQuad(p: *Parser) u16 {
+        var v: u16 = 0;
+        for (0..4) |_| v = (v << 4) | p.hexdig();
+        return v;
+    }
+
+    // Read a string (after the ") into buf.
+    // Any characters beyond the size of the buffer are consumed but otherwise discarded.
+    // \u escapes are written as UTF-8; surrogate pairs are combined first.
+    fn stringContent(p: *Parser, buf: []u8) []u8 {
+        var n: usize = 0;
+        while (true) switch (p.nextByte()) {
+            '"' => break,
+            '\\' => switch (p.nextByte()) {
+                '"' => if (n < buf.len) { buf[n] = '"'; n += 1; },
+                '\\'=> if (n < buf.len) { buf[n] = '\\';n += 1; },
+                '/' => if (n < buf.len) { buf[n] = '/'; n += 1; },
+                'b' => if (n < buf.len) { buf[n] = 0x8; n += 1; },
+                'f' => if (n < buf.len) { buf[n] = 0xc; n += 1; },
+                'n' => if (n < buf.len) { buf[n] = 0xa; n += 1; },
+                'r' => if (n < buf.len) { buf[n] = 0xd; n += 1; },
+                't' => if (n < buf.len) { buf[n] = 0x9; n += 1; },
+                'u' => {
+                    const first = p.hexQuad();
+                    var char: u21 = first;
+                    switch (first) {
+                        0xdc00...0xdfff => p.die("unexpected low surrogate"),
+                        0xd800...0xdbff => {
+                            // Code points beyond U+FFFF are written as a UTF-16 surrogate pair.
+                            p.expectLit("\\u");
+                            const second = p.hexQuad();
+                            if (second < 0xdc00 or second > 0xdfff) p.die("invalid low surrogate");
+                            char = 0x10000 + ((@as(u21, first) - 0xd800) << 10) + (second - 0xdc00);
+                        },
+                        else => {},
+                    }
+                    // Surrogates were handled above, so encoding cannot fail.
+                    if (n + 6 < buf.len)
+                        n += std.unicode.utf8Encode(char, buf[n..n+5]) catch unreachable;
+                },
+                else => p.die("invalid escape sequence"),
+            },
+            0x20, 0x21, 0x23...0x5b, 0x5d...0xff => |b| if (n < buf.len) { buf[n] = b; n += 1; },
+            else => p.die("invalid character in string"),
+        };
+        return buf[0..n];
+    }
+
+    // Read a complete string, including the opening quote, into buf.
+    fn string(p: *Parser, buf: []u8) []u8 {
+        if (p.nextChr() != '"') p.die("expected string");
+        return p.stringContent(buf);
+    }
+
+    // Parse the remainder of an unsigned integer whose first digit 'head' has
+    // already been consumed. Dies if the value does not fit in T.
+    fn uintTail(p: *Parser, head: u8, comptime T: type) T {
+        if (head == '0') return 0;
+        var v: T = head - '0'; // Assumption: T >= u8
+        while (true) switch (p.nextByte()) {
+            '0'...'9' => |b| {
+                // Checked arithmetic: comparing a wrapped result against the
+                // old value does not catch every overflow.
+                const shifted = std.math.mul(T, v, 10) catch p.die("integer out of range");
+                v = std.math.add(T, shifted, b - '0') catch p.die("integer out of range");
+            },
+            else => |b| break p.undoNextByte(b),
+        };
+        return v;
+    }
+
+    // Parse an unsigned integer of type T; dies if the next value is not one.
+    fn uint(p: *Parser, comptime T: type) T {
+        switch (p.nextChr()) {
+            '0'...'9' => |b| return p.uintTail(b, T),
+            else => p.die("expected number"),
+        }
+    }
+
+    // Parse a 'true' or 'false' literal.
+    fn boolean(p: *Parser) bool {
+        switch (p.nextChr()) {
+            't' => { p.expectLit("rue"); return true; },
+            'f' => { p.expectLit("alse"); return false; },
+            else => p.die("expected boolean"),
+        }
+    }
+
+    // Consume the opening brace of an object; read its members with key().
+    fn obj(p: *Parser) void {
+        if (p.nextChr() != '{') p.die("expected object");
+    }
+
+    // Read the next object key into buf and consume the ':' that follows it.
+    // Returns null once the closing brace has been consumed. 'first' must be
+    // true only for the first call after the opening brace.
+    fn key(p: *Parser, first: bool, buf: []u8) ?[]u8 {
+        const k = switch (p.nextChr()) {
+            ',' => blk: {
+                if (first) p.die("invalid JSON");
+                break :blk p.string(buf);
+            },
+            '"' => blk: {
+                if (!first) p.die("invalid JSON");
+                break :blk p.stringContent(buf);
+            },
+            '}' => return null,
+            else => p.die("invalid JSON"),
+        };
+        if (p.nextChr() != ':') p.die("invalid JSON");
+        return k;
+    }
+
+    // Consume the opening bracket of an array; step through it with elem().
+    fn array(p: *Parser) void {
+        if (p.nextChr() != '[') p.die("expected array");
+    }
+
+    // Returns true if another array element follows (leaving it unread), or
+    // false once the closing bracket has been consumed.
+    fn elem(p: *Parser, first: bool) bool {
+        switch (p.nextChr()) {
+            ',' => if (first) p.die("invalid JSON") else return true,
+            ']' => return false,
+            else => |b| {
+                if (!first) p.die("invalid JSON");
+                p.undoNextByte(b);
+                return true;
+            },
+        }
+    }
+
+    // Skip a JSON value whose first byte 'head' has already been consumed.
+    fn skipContent(p: *Parser, head: u8) void {
+        switch (head) {
+            't' => p.expectLit("rue"),
+            'f' => p.expectLit("alse"),
+            'n' => p.expectLit("ull"),
+            '-', '0'...'9' =>
+                // Numbers are kind of annoying, this "parsing" is invalid and ultra-lazy.
+                while (true) switch (p.nextByte()) {
+                    '-', '+', 'e', 'E', '.', '0'...'9' => {},
+                    else => |b| return p.undoNextByte(b),
+                },
+            '"' => _ = p.stringContent(&[0]u8{}),
+            '[' => {
+                var first = true;
+                while (p.elem(first)) {
+                    first = false;
+                    p.skip();
+                }
+            },
+            '{' => {
+                var first = true;
+                while (p.key(first, &[0]u8{})) |_| {
+                    first = false;
+                    p.skip();
+                }
+            },
+            else => p.die("invalid JSON"),
+        }
+    }
+
+    // Skip the next JSON value.
+    fn skip(p: *Parser) void {
+        p.skipContent(p.nextChr());
+    }
+
+    // Die unless only whitespace remains in the input.
+    fn eof(p: *Parser) void {
+        if (p.nextChr() != 0) p.die("trailing garbage");
+    }
+};
+
+
+// Should really add some invalid JSON test cases as well, but I'd first like
+// to benchmark the performance impact of using error returns instead of
+// calling ui.die().
+test "JSON parser" {
+    const json =
+        \\{
+        \\ "null": null,
+        \\ "true": true,
+        \\ "false": false,
+        \\ "zero":0 ,"uint": 123,
+        \\ "emptyObj": {},
+        \\ "emptyArray": [],
+        \\ "emptyString": "",
+        \\ "encString": "\"\\\/\b\f\n\uBe3F",
+        \\ "surrogates": "\uD83D\uDE00",
+        \\ "numbers": [0,1,20,-300, 3.4 ,0e-10 , -100.023e+13 ]
+        \\}
+    ;
+    var p = Parser{ .rd = undefined, .rdsize = json.len };
+    @memcpy(p.buf[0..json.len], json);
+    p.skip();
+
+    p = Parser{ .rd = undefined, .rdsize = json.len };
+    @memcpy(p.buf[0..json.len], json);
+    var buf: [128]u8 = undefined;
+    p.obj();
+
+    try std.testing.expectEqualStrings(p.key(true, &buf).?, "null");
+    p.skip();
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "true");
+    try std.testing.expect(p.boolean());
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "false");
+    try std.testing.expect(!p.boolean());
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "zero");
+    try std.testing.expectEqual(0, p.uint(u8));
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "uint");
+    try std.testing.expectEqual(123, p.uint(u8));
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "emptyObj");
+    p.obj();
+    try std.testing.expect(p.key(true, &buf) == null);
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "emptyArray");
+    p.array();
+    try std.testing.expect(!p.elem(true));
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "emptyString");
+    try std.testing.expectEqualStrings(p.string(&buf), "");
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "encString");
+    try std.testing.expectEqualStrings(p.string(&buf), "\"\\/\x08\x0c\n\u{be3f}");
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "surrogates");
+    try std.testing.expectEqualStrings(p.string(&buf), "\u{1f600}");
+
+    try std.testing.expectEqualStrings(p.key(false, &buf).?, "numbers");
+    p.skip();
+
+    try std.testing.expect(p.key(true, &buf) == null);
+}
+
+
+// Parser plus the per-item state that itemkey() fills in and item() consumes.
+const Ctx = struct {
+    p: *Parser,
+    sink: *sink.Thread,
+    items_seen: u64 = 0,
+    stat: sink.Stat = .{},
+    special: ?sink.Special = null,
+    namelen: usize = 0,
+    namebuf: [32*1024]u8 = undefined,
+};
+
+
+// Handle one key of an item's info object: the value of a known key is parsed
+// into ctx, the value of any other key is skipped.
+fn itemkey(ctx: *Ctx, key: []const u8) void {
+    const eq = std.mem.eql;
+    switch (if (key.len > 0) key[0] else @as(u8,0)) {
+        'a' => {
+            if (eq(u8, key, "asize")) {
+                ctx.stat.size = ctx.p.uint(u64);
+                return;
+            }
+        },
+        'd' => {
+            if (eq(u8, key, "dsize")) {
+                ctx.stat.blocks = @intCast(ctx.p.uint(u64)>>9);
+                return;
+            }
+            if (eq(u8, key, "dev")) {
+                ctx.stat.dev = ctx.p.uint(u64);
+                return;
+            }
+        },
+        'e' => {
+            if (eq(u8, key, "excluded")) {
+                var buf: [32]u8 = undefined;
+                const typ = ctx.p.string(&buf);
+                // "frmlnk" is also possible, but currently considered equivalent to "pattern".
+                if (eq(u8, typ, "otherfs")) ctx.special = .other_fs
+                else if (eq(u8, typ, "kernfs")) ctx.special = .kernfs
+                else ctx.special = .excluded;
+                return;
+            }
+        },
+        'g' => {
+            if (eq(u8, key, "gid")) {
+                ctx.stat.ext.gid = ctx.p.uint(u32);
+                return;
+            }
+        },
+        'h' => {
+            if (eq(u8, key, "hlnkc")) {
+                ctx.stat.hlinkc = ctx.p.boolean();
+                return;
+            }
+        },
+        'i' => {
+            if (eq(u8, key, "ino")) {
+                ctx.stat.ino = ctx.p.uint(u64);
+                return;
+            }
+        },
+        'm' => {
+            if (eq(u8, key, "mode")) {
+                ctx.stat.ext.mode = ctx.p.uint(u16);
+                return;
+            }
+            if (eq(u8, key, "mtime")) {
+                ctx.stat.ext.mtime = ctx.p.uint(u64);
+                // Accept decimal numbers, but discard the fractional part because our data model doesn't support it.
+                switch (ctx.p.nextByte()) {
+                    '.' =>
+                        while (true) switch (ctx.p.nextByte()) {
+                            '0'...'9' => {},
+                            else => |b| return ctx.p.undoNextByte(b),
+                        },
+                    else => |b| return ctx.p.undoNextByte(b),
+                }
+            }
+        },
+        'n' => {
+            if (eq(u8, key, "name")) {
+                if (ctx.namelen != 0) ctx.p.die("duplicate key");
+                ctx.namelen = ctx.p.string(&ctx.namebuf).len;
+                if (ctx.namelen > ctx.namebuf.len-5) ctx.p.die("too long file name");
+                return;
+            }
+            if (eq(u8, key, "nlink")) {
+                ctx.stat.nlink = ctx.p.uint(u31);
+                if (!ctx.stat.dir and ctx.stat.nlink > 1)
+                    ctx.stat.hlinkc = true;
+                return;
+            }
+            if (eq(u8, key, "notreg")) {
+                ctx.stat.reg = !ctx.p.boolean();
+                return;
+            }
+        },
+        'r' => {
+            if (eq(u8, key, "read_error")) {
+                if (ctx.p.boolean())
+                    ctx.special = .err;
+                return;
+            }
+        },
+        'u' => {
+            if (eq(u8, key, "uid")) {
+                ctx.stat.ext.uid = ctx.p.uint(u32);
+                return;
+            }
+        },
+        else => {},
+    }
+    ctx.p.skip();
+}
+
+
+// Parse one item and pass it to the sink. A directory is an array whose first
+// element is the info object, followed by its children; any other item is a
+// bare info object. 'dev' is the initial device number for the item's stat.
+fn item(ctx: *Ctx, parent: ?*sink.Dir, dev: u64) void {
+    ctx.stat = .{ .dev = dev };
+    ctx.namelen = 0;
+    ctx.special = null;
+    ctx.stat.dir = switch (ctx.p.nextChr()) {
+        '[' => blk: {
+            ctx.p.obj();
+            break :blk true;
+        },
+        '{' => false,
+        else => ctx.p.die("expected object or array"),
+    };
+    if (parent == null and !ctx.stat.dir) ctx.p.die("parent item must be a directory");
+
+    var keybuf: [32]u8 = undefined;
+    var first = true;
+    while (ctx.p.key(first, &keybuf)) |k| {
+        first = false;
+        itemkey(ctx, k);
+    }
+    if (ctx.namelen == 0) ctx.p.die("missing \"name\" field");
+    const name = (&ctx.namebuf)[0..ctx.namelen];
+
+    if (ctx.stat.dir and (ctx.special == null or ctx.special == .err)) {
+        const ndev = ctx.stat.dev;
+        const dir =
+            if (parent) |d| d.addDir(ctx.sink, name, &ctx.stat)
+            else sink.createRoot(name, &ctx.stat);
+        ctx.sink.setDir(dir);
+        if (ctx.special == .err) dir.setReadError(ctx.sink);
+        while (ctx.p.elem(false)) item(ctx, dir, ndev);
+        ctx.sink.setDir(parent);
+        dir.unref();
+    } else if (ctx.special) |s| {
+        parent.?.addSpecial(ctx.sink, name, s);
+        if (ctx.stat.dir and ctx.p.elem(false)) ctx.p.die("unexpected contents in an excluded directory");
+    } else {
+        parent.?.addStat(ctx.sink, name, &ctx.stat);
+    }
+
+    ctx.items_seen += 1;
+    if ((ctx.items_seen & 1023) == 0)
+        main.handleEvent(false, false);
+}
+
+
+// Import a JSON dump from 'path', or from standard input if path is "-".
+pub fn import(path: [:0]const u8) void {
+    const sink_threads = sink.createThreads(1);
+    defer sink.done();
+
+    const fd = if (std.mem.eql(u8, "-", path)) std.io.getStdIn()
+               else std.fs.cwd().openFileZ(path, .{})
+                    catch |e| ui.die("Error reading file: {s}.\n", .{ui.errorString(e)});
+    defer fd.close();
+
+    var p = Parser{.rd = fd};
+    p.array();
+    if (p.uint(u16) != 1) p.die("incompatible major format version");
+    if (!p.elem(false)) p.die("expected array element");
+    _ = p.uint(u16); // minor version, ignored for now
+    if (!p.elem(false)) p.die("expected array element");
+
+    // metadata object
+    p.obj();
+    p.skipContent('{');
+
+    // Items
+    if (!p.elem(false)) p.die("expected array element");
+    var ctx = Ctx{.p = &p, .sink = &sink_threads[0]};
+    item(&ctx, null, 0);
+
+    // accept more trailing elements
+    while (p.elem(false)) p.skip();
+    p.eof();
+}
diff --git a/src/main.zig b/src/main.zig
index 41fce0a..ecaac4f 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -6,6 +6,7 @@ pub const program_version = "2.4";
 const std = @import("std");
 const model = @import("model.zig");
 const scan = @import("scan.zig");
+const json_import = @import("json_import.zig");
 const sink = @import("sink.zig");
 const ui = @import("ui.zig");
 const browser = @import("browser.zig");
@@ -17,6 +18,7 @@ const c = @cImport(@cInclude("locale.h"));
 test "imports" {
     _ = model;
     _ = scan;
+    _ = json_import;
     _ = sink;
     _ = ui;
     _ = browser;
@@ -517,8 +519,8 @@ pub fn main() void {
             catch |e| ui.die("Error opening export file: {s}.\n", .{ui.errorString(e)})
     ) else null;
 
-    if (import_file) |_| {
-        //scan.importRoot(f, out_file);
+    if (import_file) |f| {
+        json_import.import(f);
         config.imported = true;
     } else {
         var buf = [_]u8{0} ** (std.fs.MAX_PATH_BYTES+1);
diff --git a/src/sink.zig b/src/sink.zig
index 55df10a..e711b0c 100644
--- a/src/sink.zig
+++ b/src/sink.zig
@@ -271,9 +271,13 @@ pub const Dir = struct {
         switch (d.out) {
             .mem => |*m| m.setReadError(),
         }
+        state.last_error_lock.lock();
+        defer state.last_error_lock.unlock();
+        if (state.last_error) |p| main.allocator.free(p);
+        state.last_error = d.path();
     }
 
-    fn path(d: *Dir) [:0]const u8 {
+    fn path(d: *Dir) [:0]u8 {
         var components = std.ArrayList([]const u8).init(main.allocator);
         defer components.deinit();
         var it: ?*Dir = d;