mirror of
https://code.blicky.net/yorhel/ncdu.git
synced 2026-01-18 03:28:40 -09:00
I had a feeling my last workaround wasn't correct, turns out my basic assumption about ZSTD_decompressStream() was wrong: rather than guaranteeing some output when there's enough input, it always guarantees to consume input when there's space in the output. Fixed the code and adjusted the buffers again.
562 lines
18 KiB
Zig
562 lines
18 KiB
Zig
// SPDX-FileCopyrightText: Yorhel <projects@yorhel.nl>
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
const std = @import("std");
|
|
const main = @import("main.zig");
|
|
const util = @import("util.zig");
|
|
const model = @import("model.zig");
|
|
const sink = @import("sink.zig");
|
|
const ui = @import("ui.zig");
|
|
const c = @import("c.zig").c;
|
|
|
|
|
|
/// Streaming Zstandard decompressor that pulls its compressed input from a file.
///
/// Instances are heap-allocated by `create()` and must be released with
/// `destroy()`.
const ZstdReader = struct {
    /// Decompression context; non-null once `create()` has returned.
    ctx: ?*c.ZSTD_DStream,
    /// Window into `inbuf` with the compressed bytes that still have to be consumed.
    in: c.ZSTD_inBuffer,
    /// Return value of the most recent ZSTD_decompressStream() call.
    /// `read()` treats 0 as "no frame in progress" when it hits end-of-file.
    lastret: usize = 0,
    inbuf: [c.ZSTD_BLOCKSIZE_MAX + 16]u8, // This is ZSTD_DStreamInSize() + a little bit extra

    /// Allocate a reader and prime its input buffer with `head`, the bytes
    /// that were already read from the start of the file.
    /// `head.len` must not exceed the size of `inbuf`.
    fn create(head: []const u8) *ZstdReader {
        // NOTE(review): presumably main.allocator never reports failure — confirm.
        const r = main.allocator.create(ZstdReader) catch unreachable;
        @memcpy(r.inbuf[0..head.len], head);
        r.in = .{
            .src = &r.inbuf,
            .size = head.len,
            .pos = 0,
        };
        // create() hands out uninitialized memory, so the field default for
        // `lastret` is not applied automatically; set it before read() can
        // look at it.
        r.lastret = 0;
        // Keep retrying until a context could be allocated; ui.oom() is
        // called after every failed attempt.
        while (true) {
            r.ctx = c.ZSTD_createDStream();
            if (r.ctx != null) break;
            ui.oom();
        }
        return r;
    }

    /// Free the decompression context and the reader itself.
    fn destroy(r: *ZstdReader) void {
        _ = c.ZSTD_freeDStream(r.ctx);
        main.allocator.destroy(r);
    }

    /// Decompress into `out`, reading more compressed data from `f` whenever
    /// the input window is empty.
    ///
    /// Returns the number of bytes written to `out`, or 0 when the file ended
    /// while the last decompress call had returned 0. Fails with
    /// `error.ZstdDecompressError` if the decoder reports an error or if the
    /// file ends while `lastret` is non-zero; errors from `f.read()` are
    /// passed through.
    fn read(r: *ZstdReader, f: std.fs.File, out: []u8) !usize {
        while (true) {
            if (r.in.size == r.in.pos) {
                // All buffered input has been consumed: refill from the file.
                r.in.pos = 0;
                r.in.size = try f.read(&r.inbuf);
                if (r.in.size == 0) {
                    if (r.lastret == 0) return 0;
                    return error.ZstdDecompressError; // Early EOF
                }
            }

            var arg = c.ZSTD_outBuffer{ .dst = out.ptr, .size = out.len, .pos = 0 };
            r.lastret = c.ZSTD_decompressStream(r.ctx, &arg, &r.in);
            if (c.ZSTD_isError(r.lastret) != 0) return error.ZstdDecompressError;
            // No output yet means the decoder only consumed input; go around
            // again, refilling the input window first if it is now empty.
            if (arg.pos > 0) return arg.pos;
        }
    }
};
|
|
|
|
|
|
// Using a custom JSON parser here because, while std.json is great, it does
|
|
// perform strict UTF-8 validation. Which is correct, of course, but ncdu dumps
|
|
// are not always correct JSON as they may contain non-UTF-8 paths encoded as
|
|
// strings.
|
|
|
|
/// Minimal pull parser for the JSON found in ncdu dump files.
///
/// Input is read from `rd` (through `zstd` when that is set) into `buf`.
/// Every syntax or I/O problem is reported through `die()`, which does not
/// return, so none of the parsing methods return errors.
const Parser = struct {
    rd: std.fs.File,
    zstd: ?*ZstdReader = null,
    /// Index in `buf` of the next byte to hand out.
    rdoff: usize = 0,
    /// Number of valid bytes in `buf`.
    rdsize: usize = 0,
    /// Position reported in error messages; nextChr() resets `byte` to 1 and
    /// bumps `line` on every newline it skips.
    byte: u64 = 1,
    line: u64 = 1,
    buf: [129 * 1024]u8 = undefined,

    /// Abort the import with `str` as the reason, prefixed by the current position.
    fn die(p: *Parser, str: []const u8) noreturn {
        ui.die("Error importing file on line {}:{}: {s}.\n", .{ p.line, p.byte, str });
    }

    // Feed back a byte that has just been returned by nextByte()
    fn undoNextByte(p: *Parser, b: u8) void {
        // nextByte() signals EOF by returning 0 *without* consuming anything,
        // which leaves rdoff == rdsize == 0. There is nothing to feed back in
        // that case and decrementing would underflow rdoff; the next
        // nextByte() call simply runs into EOF again.
        if (p.rdoff == 0) return;
        p.byte -= 1;
        p.rdoff -= 1;
        p.buf[p.rdoff] = b;
    }

    /// Replace the buffer contents with the next chunk of input.
    /// Leaves `rdsize` at 0 when the reader returned no more data.
    fn fill(p: *Parser) void {
        @setCold(true);
        p.rdoff = 0;
        p.rdsize = (if (p.zstd) |z| z.read(p.rd, &p.buf) else p.rd.read(&p.buf)) catch |e| switch (e) {
            error.IsDir => p.die("not a file"), // should be detected at open() time, but no flag for that...
            error.SystemResources => p.die("out of memory"),
            error.ZstdDecompressError => p.die("decompression error"),
            else => p.die("I/O error"),
        };
    }

    // Returns 0 on EOF.
    // (or if the file contains a 0 byte, but that's invalid anyway)
    // (Returning a '?u8' here is nicer but kills performance by about +30%)
    fn nextByte(p: *Parser) u8 {
        if (p.rdoff == p.rdsize) {
            p.fill();
            if (p.rdsize == 0) return 0;
        }
        p.byte += 1;
        defer p.rdoff += 1;
        return (&p.buf)[p.rdoff];
    }

    // next non-whitespace byte
    fn nextChr(p: *Parser) u8 {
        while (true) switch (p.nextByte()) {
            '\n' => {
                p.line += 1;
                p.byte = 1;
            },
            ' ', '\t', '\r' => {},
            else => |b| return b,
        };
    }

    /// Consume exactly the bytes of `lit`, or die.
    fn expectLit(p: *Parser, lit: []const u8) void {
        for (lit) |b| if (b != p.nextByte()) p.die("invalid JSON");
    }

    /// Consume one hexadecimal digit and return its value (0-15).
    fn hexdig(p: *Parser) u16 {
        const b = p.nextByte();
        return switch (b) {
            '0'...'9' => b - '0',
            'a'...'f' => b - 'a' + 10,
            'A'...'F' => b - 'A' + 10,
            else => p.die("invalid hex digit"),
        };
    }

    /// Continuation of stringContent() that also handles escape sequences
    /// and buffer overflow. `head` is the byte stringContent() stopped at and
    /// `off` the number of bytes it already stored in `buf`.
    fn stringContentSlow(p: *Parser, buf: []u8, head: u8, off: usize) []u8 {
        @setCold(true);
        var b = head;
        var n = off;
        while (true) {
            switch (b) {
                '"' => break,
                '\\' => switch (p.nextByte()) {
                    '"' => if (n < buf.len) { buf[n] = '"'; n += 1; },
                    '\\' => if (n < buf.len) { buf[n] = '\\'; n += 1; },
                    '/' => if (n < buf.len) { buf[n] = '/'; n += 1; },
                    'b' => if (n < buf.len) { buf[n] = 0x8; n += 1; },
                    'f' => if (n < buf.len) { buf[n] = 0xc; n += 1; },
                    'n' => if (n < buf.len) { buf[n] = 0xa; n += 1; },
                    'r' => if (n < buf.len) { buf[n] = 0xd; n += 1; },
                    't' => if (n < buf.len) { buf[n] = 0x9; n += 1; },
                    'u' => {
                        const first = (p.hexdig()<<12) + (p.hexdig()<<8) + (p.hexdig()<<4) + p.hexdig();
                        var unit = @as(u21, first);
                        if (std.unicode.utf16IsLowSurrogate(first)) p.die("Unexpected low surrogate");
                        if (std.unicode.utf16IsHighSurrogate(first)) {
                            // A high surrogate must be followed by a second \uXXXX escape.
                            p.expectLit("\\u");
                            const second = (p.hexdig()<<12) + (p.hexdig()<<8) + (p.hexdig()<<4) + p.hexdig();
                            unit = std.unicode.utf16DecodeSurrogatePair(&.{first, second}) catch p.die("Invalid low surrogate");
                        }
                        // The code point is dropped when the buffer is (nearly) full.
                        if (n + 6 < buf.len)
                            n += std.unicode.utf8Encode(unit, buf[n..n+5]) catch unreachable;
                    },
                    else => p.die("invalid escape sequence"),
                },
                0x20, 0x21, 0x23...0x5b, 0x5d...0xff => if (n < buf.len) { buf[n] = b; n += 1; },
                else => p.die("invalid character in string"),
            }
            b = p.nextByte();
        }
        return buf[0..n];
    }

    // Read a string (after the ") into buf.
    // Any characters beyond the size of the buffer are consumed but otherwise discarded.
    fn stringContent(p: *Parser, buf: []u8) []u8 {
        // The common case (for ncdu dumps): string fits in the given buffer and does not contain any escapes.
        var n: usize = 0;
        var b = p.nextByte();
        while (n < buf.len and b >= 0x20 and b != '"' and b != '\\') {
            buf[n] = b;
            n += 1;
            b = p.nextByte();
        }
        if (b == '"') return buf[0..n];
        return p.stringContentSlow(buf, b, n);
    }

    /// Parse a complete string value, including the opening quote.
    fn string(p: *Parser, buf: []u8) []u8 {
        if (p.nextChr() != '"') p.die("expected string");
        return p.stringContent(buf);
    }

    /// Parse the remainder of an unsigned integer whose first digit `head`
    /// has already been consumed. Dies when the value does not fit in `T`.
    fn uintTail(p: *Parser, head: u8, T: anytype) T {
        if (head == '0') return 0;
        var v: T = head - '0'; // Assumption: T >= u8
        // Assumption: we don't parse JSON "documents" that are a bare uint.
        while (true) switch (p.nextByte()) {
            '0'...'9' => |b| {
                // Checked arithmetic: comparing a wrapped result against the
                // previous value misses overflows where the wrapped product
                // happens to be larger (e.g. "300" parsed as a u8).
                const scaled = std.math.mul(T, v, 10) catch p.die("integer out of range");
                v = std.math.add(T, scaled, b - '0') catch p.die("integer out of range");
            },
            else => |b| break p.undoNextByte(b),
        };
        return v;
    }

    /// Skip whitespace and parse an unsigned integer of type `T`.
    fn uint(p: *Parser, T: anytype) T {
        switch (p.nextChr()) {
            '0'...'9' => |b| return p.uintTail(b, T),
            else => p.die("expected number"),
        }
    }

    /// Skip whitespace and parse `true` or `false`.
    fn boolean(p: *Parser) bool {
        switch (p.nextChr()) {
            't' => { p.expectLit("rue"); return true; },
            'f' => { p.expectLit("alse"); return false; },
            else => p.die("expected boolean"),
        }
    }

    /// Consume the opening brace of an object.
    fn obj(p: *Parser) void {
        if (p.nextChr() != '{') p.die("expected object");
    }

    /// Read the next key of an object, including the ':' that follows it.
    /// `first` tells whether this is the first key, i.e. whether a separating
    /// comma is forbidden or required. Returns null at the closing brace.
    fn key(p: *Parser, first: bool, buf: []u8) ?[]u8 {
        const k = switch (p.nextChr()) {
            ',' => blk: {
                if (first) p.die("invalid JSON");
                break :blk p.string(buf);
            },
            '"' => blk: {
                if (!first) p.die("invalid JSON");
                break :blk p.stringContent(buf);
            },
            '}' => return null,
            else => p.die("invalid JSON"),
        };
        if (p.nextChr() != ':') p.die("invalid JSON");
        return k;
    }

    /// Consume the opening bracket of an array.
    fn array(p: *Parser) void {
        if (p.nextChr() != '[') p.die("expected array");
    }

    /// Advance to the next array element. Returns false at the closing
    /// bracket. `first` tells whether a separating comma is forbidden (first
    /// element) or required.
    fn elem(p: *Parser, first: bool) bool {
        switch (p.nextChr()) {
            ',' => if (first) p.die("invalid JSON") else return true,
            ']' => return false,
            else => |b| {
                if (!first) p.die("invalid JSON");
                // Start of the first element: leave it for the caller.
                p.undoNextByte(b);
                return true;
            },
        }
    }

    /// Skip the rest of a value whose first byte `head` was already consumed.
    fn skipContent(p: *Parser, head: u8) void {
        switch (head) {
            't' => p.expectLit("rue"),
            'f' => p.expectLit("alse"),
            'n' => p.expectLit("ull"),
            '-', '0'...'9' =>
                // Numbers are kind of annoying, this "parsing" is invalid and ultra-lazy.
                while (true) switch (p.nextByte()) {
                    '-', '+', 'e', 'E', '.', '0'...'9' => {},
                    else => |b| return p.undoNextByte(b),
                },
            '"' => _ = p.stringContent(&[0]u8{}),
            '[' => {
                var first = true;
                while (p.elem(first)) {
                    first = false;
                    p.skip();
                }
            },
            '{' => {
                var first = true;
                while (p.key(first, &[0]u8{})) |_| {
                    first = false;
                    p.skip();
                }
            },
            else => p.die("invalid JSON"),
        }
    }

    /// Skip one complete JSON value.
    fn skip(p: *Parser) void {
        p.skipContent(p.nextChr());
    }

    /// Die unless only whitespace remains in the input.
    fn eof(p: *Parser) void {
        if (p.nextChr() != 0) p.die("trailing garbage");
    }
};
|
|
|
|
|
|
// Should really add some invalid JSON test cases as well, but I'd first like
|
|
// to benchmark the performance impact of using error returns instead of
|
|
// calling ui.die().
|
|
test "JSON parser" {
    const expect = std.testing.expect;
    const expectEqual = std.testing.expectEqual;
    // Note the argument order: expected value first, parsed value second, so
    // that failure messages label the two sides correctly.
    const expectEqualStrings = std.testing.expectEqualStrings;

    const json =
        \\{
        \\ "null": null,
        \\ "true": true,
        \\ "false": false,
        \\ "zero":0 ,"uint": 123,
        \\ "emptyObj": {},
        \\ "emptyArray": [],
        \\ "emptyString": "",
        \\ "encString": "\"\\\/\b\f\n\uBe3F",
        \\ "numbers": [0,1,20,-300, 3.4 ,0e-10 , -100.023e+13 ]
        \\}
    ;

    // First pass: the whole document can be skipped as a single value.
    // The buffer is pre-filled, so `rd` is never touched.
    var p = Parser{ .rd = undefined, .rdsize = json.len };
    @memcpy(p.buf[0..json.len], json);
    p.skip();

    // Second pass: walk the object key by key.
    p = Parser{ .rd = undefined, .rdsize = json.len };
    @memcpy(p.buf[0..json.len], json);
    var buf: [128]u8 = undefined;
    p.obj();

    try expectEqualStrings("null", p.key(true, &buf).?);
    p.skip();

    try expectEqualStrings("true", p.key(false, &buf).?);
    try expect(p.boolean());

    try expectEqualStrings("false", p.key(false, &buf).?);
    try expect(!p.boolean());

    try expectEqualStrings("zero", p.key(false, &buf).?);
    try expectEqual(0, p.uint(u8));

    try expectEqualStrings("uint", p.key(false, &buf).?);
    try expectEqual(123, p.uint(u8));

    try expectEqualStrings("emptyObj", p.key(false, &buf).?);
    p.obj();
    try expect(p.key(true, &buf) == null);

    try expectEqualStrings("emptyArray", p.key(false, &buf).?);
    p.array();
    try expect(!p.elem(true));

    try expectEqualStrings("emptyString", p.key(false, &buf).?);
    try expectEqualStrings("", p.string(&buf));

    try expectEqualStrings("encString", p.key(false, &buf).?);
    try expectEqualStrings("\"\\/\x08\x0c\n\u{be3f}", p.string(&buf));

    try expectEqualStrings("numbers", p.key(false, &buf).?);
    p.skip();

    // Closing brace of the outer object; keys were seen before, so this is
    // not the first position.
    try expect(p.key(false, &buf) == null);
}
|
|
|
|
|
|
/// Mutable state shared by item() and itemkey() while importing one dump.
const Ctx = struct {
    /// Parser that supplies the JSON input.
    p: *Parser,
    /// Sink thread that receives the imported entries.
    sink: *sink.Thread,

    /// Number of valid bytes in `namebuf`; item() resets it to 0 for every
    /// entry and treats 0 as "no name seen".
    namelen: usize = 0,
    /// Storage for the "name" value of the entry being parsed.
    namebuf: [32 * 1024]u8 = undefined,

    /// Metadata collected for the entry being parsed; reset by item().
    stat: sink.Stat = .{},
    /// Set by itemkey() when a directory entry carries a true "read_error".
    rderr: bool = false,
};
|
|
|
|
|
|
/// Parse the value that belongs to object key `key` and store it in `ctx`.
/// Values of unrecognized keys are skipped.
fn itemkey(ctx: *Ctx, key: []const u8) void {
    const p = ctx.p;
    const st = &ctx.stat;

    // Dispatch on the first byte so that most keys need a single comparison.
    const initial: u8 = if (key.len == 0) 0 else key[0];
    switch (initial) {
        'a' => if (std.mem.eql(u8, key, "asize")) {
            st.size = p.uint(u64);
            return;
        },
        'd' => {
            if (std.mem.eql(u8, key, "dsize")) {
                // The dump stores bytes; shift right by 9 to get 512-byte units.
                st.blocks = @intCast(p.uint(u64) >> 9);
                return;
            } else if (std.mem.eql(u8, key, "dev")) {
                st.dev = p.uint(u64);
                return;
            }
        },
        'e' => if (std.mem.eql(u8, key, "excluded")) {
            var typbuf: [32]u8 = undefined;
            const typ = p.string(&typbuf);
            // "frmlnk" is also possible, but currently considered equivalent to "pattern".
            if (std.mem.eql(u8, typ, "otherfs") or std.mem.eql(u8, typ, "othfs")) {
                st.etype = .otherfs;
            } else if (std.mem.eql(u8, typ, "kernfs")) {
                st.etype = .kernfs;
            } else {
                st.etype = .pattern;
            }
            return;
        },
        'g' => if (std.mem.eql(u8, key, "gid")) {
            st.ext.gid = p.uint(u32);
            st.ext.pack.hasgid = true;
            return;
        },
        'h' => if (std.mem.eql(u8, key, "hlnkc")) {
            if (p.boolean()) st.etype = .link;
            return;
        },
        'i' => if (std.mem.eql(u8, key, "ino")) {
            st.ino = p.uint(u64);
            return;
        },
        'm' => {
            if (std.mem.eql(u8, key, "mode")) {
                st.ext.mode = p.uint(u16);
                st.ext.pack.hasmode = true;
                return;
            } else if (std.mem.eql(u8, key, "mtime")) {
                st.ext.mtime = p.uint(u64);
                st.ext.pack.hasmtime = true;
                // Accept decimal numbers, but discard the fractional part because our data model doesn't support it.
                var b = p.nextByte();
                if (b == '.') {
                    b = p.nextByte();
                    while (b >= '0' and b <= '9') b = p.nextByte();
                }
                // Hand the first byte that is not part of the number back to the parser.
                p.undoNextByte(b);
                return;
            }
        },
        'n' => {
            if (std.mem.eql(u8, key, "name")) {
                if (ctx.namelen != 0) p.die("duplicate key");
                const name = p.string(&ctx.namebuf);
                ctx.namelen = name.len;
                if (name.len > ctx.namebuf.len - 5) p.die("too long file name");
                return;
            } else if (std.mem.eql(u8, key, "nlink")) {
                st.nlink = p.uint(u31);
                if (st.nlink > 1 and st.etype != .dir) st.etype = .link;
                return;
            } else if (std.mem.eql(u8, key, "notreg")) {
                if (p.boolean()) st.etype = .nonreg;
                return;
            }
        },
        'r' => if (std.mem.eql(u8, key, "read_error")) {
            if (p.boolean()) {
                // Directories only get flagged; anything else becomes an error entry.
                if (st.etype == .dir) {
                    ctx.rderr = true;
                } else {
                    st.etype = .err;
                }
            }
            return;
        },
        'u' => if (std.mem.eql(u8, key, "uid")) {
            st.ext.uid = p.uint(u32);
            st.ext.pack.hasuid = true;
            return;
        },
        else => {},
    }

    // Unknown key: ignore its value.
    p.skip();
}
|
|
|
|
|
|
/// Parse one entry and hand it to the sink. An entry is either a plain
/// object or an array that starts with the entry's object; for an array
/// whose entry is still of type `.dir` after its keys have been read, the
/// remaining array elements are parsed recursively as its children.
///
/// `parent` is the directory the entry belongs to (null for the root entry)
/// and `dev` is the device number the entry starts out with before its own
/// keys are applied.
fn item(ctx: *Ctx, parent: ?*sink.Dir, dev: u64) void {
    ctx.stat = .{ .dev = dev };
    ctx.namelen = 0;
    ctx.rderr = false;
    const isdir = switch (ctx.p.nextChr()) {
        '[' => blk: {
            ctx.p.obj();
            break :blk true;
        },
        '{' => false,
        else => ctx.p.die("expected object or array"),
    };
    if (parent == null and !isdir) ctx.p.die("parent item must be a directory");
    ctx.stat.etype = if (isdir) .dir else .reg;

    var keybuf: [32]u8 = undefined;
    var first = true;
    while (ctx.p.key(first, &keybuf)) |k| {
        first = false;
        itemkey(ctx, k);
    }
    if (ctx.namelen == 0) ctx.p.die("missing \"name\" field");
    const name = (&ctx.namebuf)[0..ctx.namelen];

    if (ctx.stat.etype == .dir) {
        // Children start out with this directory's device number.
        const ndev = ctx.stat.dev;
        const dir =
            if (parent) |d| d.addDir(ctx.sink, name, &ctx.stat)
            else sink.createRoot(name, &ctx.stat);
        ctx.sink.setDir(dir);
        if (ctx.rderr) dir.setReadError(ctx.sink);
        while (ctx.p.elem(false)) item(ctx, dir, ndev);
        ctx.sink.setDir(parent);
        dir.unref(ctx.sink);
    } else {
        // itemkey() may have changed the type of an array entry to something
        // other than .dir (e.g. through an "excluded" key). For the root
        // entry that leaves no directory to attach it to, so reject the file
        // instead of unwrapping a null parent.
        const up = parent orelse ctx.p.die("parent item must be a directory");
        if (@intFromEnum(ctx.stat.etype) < 0)
            up.addSpecial(ctx.sink, name, ctx.stat.etype)
        else
            up.addStat(ctx.sink, name, &ctx.stat);
        if (isdir and ctx.p.elem(false)) ctx.p.die("unexpected contents in an excluded directory");
    }

    // Only call the event handler for some of the entries.
    // NOTE(review): the mask 65 (0b1000001) is unusual for this kind of
    // throttle; a value such as 63 may have been intended — confirm.
    if ((ctx.sink.files_seen.load(.monotonic) & 65) == 0)
        main.handleEvent(false, false);
}
|
|
|
|
|
|
/// Import an ncdu JSON dump from `fd`. `head` holds the bytes that were
/// already read from the start of the file; when it begins with the Zstandard
/// magic number the input is decompressed on the fly. Any error ends up in
/// Parser.die(), which does not return.
pub fn import(fd: std.fs.File, head: []const u8) void {
    const zstd_magic = "\x28\xb5\x2f\xfd";
    const no_elem = "expected array element";

    const sink_threads = sink.createThreads(1);
    defer sink.done();

    var p = Parser{ .rd = fd };
    defer if (p.zstd) |z| z.destroy();

    if (std.mem.startsWith(u8, head, zstd_magic)) {
        // Compressed input: the reader takes over the bytes read so far.
        p.zstd = ZstdReader.create(head);
    } else {
        // Plain JSON: the bytes read so far become the initial buffer contents.
        @memcpy(p.buf[0..head.len], head);
        p.rdsize = head.len;
    }

    // Top-level layout: [ major, minor, {metadata}, <root item>, ... ]
    p.array();
    const major = p.uint(u16);
    if (major != 1) p.die("incompatible major format version");
    if (!p.elem(false)) p.die(no_elem);
    _ = p.uint(u16); // minor version, ignored for now
    if (!p.elem(false)) p.die(no_elem);

    // metadata object
    p.obj();
    p.skipContent('{');

    // Items
    if (!p.elem(false)) p.die(no_elem);
    var ctx = Ctx{ .p = &p, .sink = &sink_threads[0] };
    item(&ctx, null, 0);

    // accept more trailing elements
    while (p.elem(false)) p.skip();
    p.eof();
}
|