From df5845baadf94aeeada131673c54f3df7d6b6cf5 Mon Sep 17 00:00:00 2001
From: Yorhel <projects@yorhel.nl>
Date: Sat, 26 Oct 2024 19:30:09 +0200
Subject: [PATCH] Support writing zstd-compressed json, add --compress option

---
 ncdu.1              | 27 +++++++++++++----------
 src/json_export.zig | 53 ++++++++++++++++++++++++++++++++++++++++++++-
 src/main.zig        |  3 +++
 3 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/ncdu.1 b/ncdu.1
index e26e1e4..f1e52a7 100644
--- a/ncdu.1
+++ b/ncdu.1
@@ -1,6 +1,6 @@
 .\" SPDX-FileCopyrightText: Yorhel <projects@yorhel.nl>
 .\" SPDX-License-Identifier: MIT
-.Dd September 27, 2024
+.Dd October 26, 2024
 .Dt NCDU 1
 .Os
 .Sh NAME
@@ -21,6 +21,7 @@
 .Op Fl L , \-follow\-symlinks , \-no\-follow\-symlinks
 .Op Fl \-include\-kernfs , \-exclude\-kernfs
 .Op Fl t , \-threads Ar num
+.Op Fl c , \-compress , \-no\-compress
 .Op Fl \-compress\-level Ar num
 .Op Fl 0 , 1 , 2
 .Op Fl q , \-slow\-ui\-updates , \-fast\-ui\-updates
@@ -97,6 +98,11 @@ uncompressed, or a little over 100 KiB when compressed with gzip. This scales
 linearly, so be prepared to handle a few tens of megabytes when dealing with
 millions of files.
 .Pp
+Consider enabling
+.Fl c
+to output Zstandard-compressed JSON, which can significantly reduce the size
+of the exported data.
+.Pp
 When running a multi-threaded scan or when scanning a directory tree that may
 not fit in memory, consider using
 .Fl O
@@ -187,10 +193,14 @@ The binary format (see
 .Fl O )
 does not have this problem and supports efficient exporting with any number of
 threads.
+.It Fl c , \-compress , \-no\-compress
+Enable or disable Zstandard compression when exporting to JSON (see
+.Fl o ) .
 .It Fl \-compress\-level Ar num
 Set the Zstandard compression level when using
 .Fl O
-to create a binary export.
+or
+.Fl c .
 Valid values are 1 (fastest) to 19 (slowest).
 Defaults to 4.
 .El
@@ -495,9 +505,9 @@ you'll want to use
 Since scanning a large directory may take a while, you can scan a directory
 and export the results for later viewing:
 .Bd -literal -offset indent
-ncdu \-1xo\- / | gzip >export.gz
+ncdu \-1cxo export.json.zst /
 # ...some time later:
-zcat export.gz | ncdu \-f\-
+ncdu \-f export.json.zst
 .Ed
 To export from a cron job, make sure to replace
 .Fl 1
@@ -506,15 +516,10 @@ with
 to suppress any unnecessary output.
 .Pp
 You can also export a directory and browse it once scanning is done:
-.Dl ncdu \-o\- | tee export.file | ./ncdu \-f\-
-The same is possible with gzip compression, but is a bit kludgey:
-.Dl ncdu \-o\- | gzip | tee export.gz | gunzip | ./ncdu \-f\-
+.Dl ncdu \-co\- | tee export.json.zst | ./ncdu \-f\-
 .Pp
 To scan a system remotely, but browse through the files locally:
-.Dl ssh \-C user@system ncdu \-o\- / | ./ncdu \-f\-
-The
-.Fl C
-option to ssh enables compression, which will be very useful over slow links.
+.Dl ssh user@system ncdu \-co\- / | ./ncdu \-cf\-
 Remote scanning and local viewing has two major advantages when compared to
 running
 .Nm
diff --git a/src/json_export.zig b/src/json_export.zig
index fe5f255..2885d0a 100644
--- a/src/json_export.zig
+++ b/src/json_export.zig
@@ -7,6 +7,7 @@ const model = @import("model.zig");
 const sink = @import("sink.zig");
 const util = @import("util.zig");
 const ui = @import("ui.zig");
+const c = @import("c.zig").c;
 
 // JSON output is necessarily single-threaded and items MUST be added depth-first.
 
@@ -14,8 +15,55 @@ pub const global = struct {
     var writer: *Writer = undefined;
 };
 
+
+const ZstdWriter = struct {
+    ctx: ?*c.ZSTD_CStream,
+    out: c.ZSTD_outBuffer,
+    outbuf: [c.ZSTD_BLOCKSIZE_MAX + 64]u8,
+
+    fn create() *ZstdWriter {
+        const w = main.allocator.create(ZstdWriter) catch unreachable;
+        w.out = .{
+            .dst = &w.outbuf,
+            .size = w.outbuf.len,
+            .pos = 0,
+        };
+        while (true) {
+            w.ctx = c.ZSTD_createCStream();
+            if (w.ctx != null) break;
+            ui.oom();
+        }
+        _ = c.ZSTD_CCtx_setParameter(w.ctx, c.ZSTD_c_compressionLevel, main.config.complevel);
+        return w;
+    }
+
+    fn destroy(w: *ZstdWriter) void {
+        _ = c.ZSTD_freeCStream(w.ctx);
+        main.allocator.destroy(w);
+    }
+
+    fn write(w: *ZstdWriter, f: std.fs.File, in: []const u8, flush: bool) !void {
+        var arg = c.ZSTD_inBuffer{
+            .src = in.ptr,
+            .size = in.len,
+            .pos = 0,
+        };
+        while (true) {
+            const v = c.ZSTD_compressStream2(w.ctx, &w.out, &arg, if (flush) c.ZSTD_e_end else c.ZSTD_e_continue);
+            if (c.ZSTD_isError(v) != 0) return error.ZstdCompressError;
+            if (flush or w.out.pos > w.outbuf.len / 2) {
+                try f.writeAll(w.outbuf[0..w.out.pos]);
+                w.out.pos = 0;
+            }
+            if (!flush and arg.pos == arg.size) break;
+            if (flush and v == 0) break;
+        }
+    }
+};
+
 pub const Writer = struct {
     fd: std.fs.File,
+    zstd: ?*ZstdWriter = null,
     // Must be large enough to hold PATH_MAX*6 plus some overhead.
     // (The 6 is because, in the worst case, every byte expands to a "\u####"
     // escape, and we do pessimistic estimates here in order to avoid checking
@@ -29,7 +77,8 @@ pub const Writer = struct {
         // This can only really happen when the root path exceeds PATH_MAX,
         // in which case we would probably have error'ed out earlier anyway.
         if (bytes > ctx.buf.len) ui.die("Error writing JSON export: path too long.\n", .{});
-        ctx.fd.writeAll(ctx.buf[0..ctx.off]) catch |e|
+        const buf = ctx.buf[0..ctx.off];
+        (if (ctx.zstd) |z| z.write(ctx.fd, buf, bytes == 0) else ctx.fd.writeAll(buf)) catch |e|
             ui.die("Error writing to file: {s}.\n", .{ ui.errorString(e) });
         ctx.off = 0;
     }
@@ -92,6 +141,7 @@ pub const Writer = struct {
     fn init(out: std.fs.File) *Writer {
         var ctx = main.allocator.create(Writer) catch unreachable;
         ctx.* = .{ .fd = out };
+        if (main.config.compress) ctx.zstd = ZstdWriter.create();
        ctx.write("[1,2,{\"progname\":\"ncdu\",\"progver\":\"" ++ main.program_version ++ "\",\"timestamp\":");
         ctx.writeUint(@intCast(@max(0, std.time.timestamp())));
         ctx.writeByte('}');
@@ -210,6 +260,7 @@ pub fn createRoot(path: []const u8, stat: *const sink.Stat) Dir {
 pub fn done() void {
     global.writer.write("]\n");
     global.writer.flush(0);
+    if (global.writer.zstd) |z| z.destroy();
     global.writer.fd.close();
     main.allocator.destroy(global.writer);
 }
diff --git a/src/main.zig b/src/main.zig
index e5cb866..4db4ec8 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -81,6 +81,7 @@ pub const config = struct {
     pub var exclude_patterns: std.ArrayList([:0]const u8) = std.ArrayList([:0]const u8).init(allocator);
     pub var threads: usize = 1;
     pub var complevel: u8 = 4;
+    pub var compress: bool = false;
     pub var update_delay: u64 = 100*std.time.ns_per_ms;
 
     pub var scan_ui: ?enum { none, line, full } = null;
@@ -276,6 +277,8 @@ fn argConfig(args: *Args, opt: Args.Option) bool {
     else if (opt.is("--include-caches")) config.exclude_caches = false
     else if (opt.is("--exclude-kernfs")) config.exclude_kernfs = true
     else if (opt.is("--include-kernfs")) config.exclude_kernfs = false
+    else if (opt.is("-c") or opt.is("--compress")) config.compress = true
+    else if (opt.is("--no-compress")) config.compress = false
     else if (opt.is("--compress-level")) {
         const val = args.arg();
         config.complevel = std.fmt.parseInt(u8, val, 10) catch ui.die("Invalid number for --compress-level: {s}.\n", .{val});
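
The stream ZstdWriter emits is a single standard Zstandard frame (ZSTD_e_continue
on every buffer flush, ZSTD_e_end exactly once from done()), so an export written
with -c round-trips through any stock decoder, e.g.:

    zstdcat export.json.zst | ncdu -f-

For completeness, a rough sketch of the matching read side as a standalone
program. Illustrative only, not part of this patch: the file name zstdcat.zig
and the build line are assumptions, and it binds libzstd directly via @cImport
instead of the c.zig wrapper used above.

// zstdcat.zig -- illustrative sketch, NOT part of this patch.
// Decompresses a Zstandard stream (e.g. an ncdu -c export) from stdin
// to stdout. Build sketch: zig build-exe zstdcat.zig -lzstd -lc
const std = @import("std");
const c = @cImport(@cInclude("zstd.h"));

pub fn main() !void {
    const stdin = std.io.getStdIn();
    const stdout = std.io.getStdOut();

    const ctx = c.ZSTD_createDStream();
    if (ctx == null) return error.OutOfMemory;
    defer _ = c.ZSTD_freeDStream(ctx);

    var inbuf: [128 * 1024]u8 = undefined;
    var outbuf: [128 * 1024]u8 = undefined;

    while (true) {
        const n = try stdin.read(&inbuf);
        if (n == 0) break; // EOF
        var in = c.ZSTD_inBuffer{ .src = &inbuf, .size = n, .pos = 0 };
        // A single input chunk can produce more output than one buffer
        // holds, so drain the decoder until the chunk is fully consumed.
        while (in.pos < in.size) {
            var out = c.ZSTD_outBuffer{ .dst = &outbuf, .size = outbuf.len, .pos = 0 };
            const v = c.ZSTD_decompressStream(ctx, &out, &in);
            if (c.ZSTD_isError(v) != 0) return error.ZstdDecompressError;
            try stdout.writeAll(outbuf[0..out.pos]);
        }
    }
}

The loop structure deliberately mirrors ZstdWriter.write: feed one chunk at a
time and keep calling the streaming function until the library reports that the
chunk has been consumed, writing out whatever landed in the output buffer after
each call.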