From df5845baadf94aeeada131673c54f3df7d6b6cf5 Mon Sep 17 00:00:00 2001
From: Yorhel <projects@yorhel.nl>
Date: Sat, 26 Oct 2024 19:30:09 +0200
Subject: [PATCH] Support writing zstd-compressed json, add --compress option

---
 ncdu.1              | 27 +++++++++++++----------
 src/json_export.zig | 53 ++++++++++++++++++++++++++++++++++++++++++++-
 src/main.zig        |  3 +++
 3 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/ncdu.1 b/ncdu.1
index e26e1e4..f1e52a7 100644
--- a/ncdu.1
+++ b/ncdu.1
@@ -1,6 +1,6 @@
 .\" SPDX-FileCopyrightText: Yorhel <projects@yorhel.nl>
 .\" SPDX-License-Identifier: MIT
-.Dd September 27, 2024
+.Dd October 26, 2024
 .Dt NCDU 1
 .Os
 .Sh NAME
@@ -21,6 +21,7 @@
 .Op Fl L , \-follow\-symlinks , \-no\-follow\-symlinks
 .Op Fl \-include\-kernfs , \-exclude\-kernfs
 .Op Fl t , \-threads Ar num
+.Op Fl c , \-compress , \-no\-compress
 .Op Fl \-compress\-level Ar num
 .Op Fl 0 , 1 , 2
 .Op Fl q , \-slow\-ui\-updates , \-fast\-ui\-updates
@@ -97,6 +98,11 @@ uncompressed, or a little over 100 KiB when compressed with gzip. This scales
 linearly, so be prepared to handle a few tens of megabytes when dealing with
 millions of files.
 .Pp
+Consider enabling
+.Fl c
+to output Zstandard-compressed JSON, which can significantly reduce the size
+of the exported data.
+.Pp
 When running a multi-threaded scan or when scanning a directory tree that may
 not fit in memory, consider using
 .Fl O
@@ -187,10 +193,14 @@ The binary format (see
 .Fl O )
 does not have this problem and supports efficient exporting with any number of
 threads.
+.It Fl c , \-compress , \-no\-compress
+Enable or disable Zstandard compression when exporting to JSON (see
+.Fl o ) .
 .It Fl \-compress\-level Ar num
 Set the Zstandard compression level when using
 .Fl O
-to create a binary export.
+or
+.Fl c .
 Valid values are 1 (fastest) to 19 (slowest).
 Defaults to 4.
 .El
@@ -495,9 +505,9 @@ you'll want to use
 Since scanning a large directory may take a while, you can scan a directory
 and export the results for later viewing:
 .Bd -literal -offset indent
-ncdu \-1xo\- / | gzip >export.gz
+ncdu \-1cxo export.json.zst /
 # ...some time later:
-zcat export.gz | ncdu \-f\-
+ncdu \-f export.json.zst
 .Ed
 To export from a cron job, make sure to replace
 .Fl 1
@@ -506,15 +516,10 @@ with
 to suppress any unnecessary output.
 .Pp
 You can also export a directory and browse it once scanning is done:
-.Dl ncdu \-o\- | tee export.file | ./ncdu \-f\-
-The same is possible with gzip compression, but is a bit kludgey:
-.Dl ncdu \-o\- | gzip | tee export.gz | gunzip | ./ncdu \-f\-
+.Dl ncdu \-co\- | tee export.json.zst | ./ncdu \-f\-
 .Pp
 To scan a system remotely, but browse through the files locally:
-.Dl ssh \-C user@system ncdu \-o\- / | ./ncdu \-f\-
-The
-.Fl C
-option to ssh enables compression, which will be very useful over slow links.
+.Dl ssh user@system ncdu \-co\- / | ./ncdu \-cf\-
 Remote scanning and local viewing has two major advantages when compared to
 running
 .Nm
diff --git a/src/json_export.zig b/src/json_export.zig
index fe5f255..2885d0a 100644
--- a/src/json_export.zig
+++ b/src/json_export.zig
@@ -7,6 +7,7 @@ const model = @import("model.zig");
 const sink = @import("sink.zig");
 const util = @import("util.zig");
 const ui = @import("ui.zig");
+const c = @import("c.zig").c;
 
 // JSON output is necessarily single-threaded and items MUST be added depth-first.
 
@@ -14,8 +15,55 @@ pub const global = struct {
     var writer: *Writer = undefined;
 };
 
+
+const ZstdWriter = struct {
+    ctx: ?*c.ZSTD_CStream,
+    out: c.ZSTD_outBuffer,
+    outbuf: [c.ZSTD_BLOCKSIZE_MAX + 64]u8,
+
+    fn create() *ZstdWriter {
+        const w = main.allocator.create(ZstdWriter) catch unreachable;
+        w.out = .{
+            .dst = &w.outbuf,
+            .size = w.outbuf.len,
+            .pos = 0,
+        };
+        while (true) {
+            w.ctx = c.ZSTD_createCStream();
+            if (w.ctx != null) break;
+            ui.oom();
+        }
+        _ = c.ZSTD_CCtx_setParameter(w.ctx, c.ZSTD_c_compressionLevel, main.config.complevel);
+        return w;
+    }
+
+    fn destroy(w: *ZstdWriter) void {
+        _ = c.ZSTD_freeCStream(w.ctx);
+        main.allocator.destroy(w);
+    }
+
+    fn write(w: *ZstdWriter, f: std.fs.File, in: []const u8, flush: bool) !void {
+        var arg = c.ZSTD_inBuffer{
+            .src = in.ptr,
+            .size = in.len,
+            .pos = 0,
+        };
+        while (true) {
+            const v = c.ZSTD_compressStream2(w.ctx, &w.out, &arg, if (flush) c.ZSTD_e_end else c.ZSTD_e_continue);
+            if (c.ZSTD_isError(v) != 0) return error.ZstdCompressError;
+            if (flush or w.out.pos > w.outbuf.len / 2) {
+                try f.writeAll(w.outbuf[0..w.out.pos]);
+                w.out.pos = 0;
+            }
+            if (!flush and arg.pos == arg.size) break;
+            if (flush and v == 0) break;
+        }
+    }
+};
+
 pub const Writer = struct {
     fd: std.fs.File,
+    zstd: ?*ZstdWriter = null,
     // Must be large enough to hold PATH_MAX*6 plus some overhead.
     // (The 6 is because, in the worst case, every byte expands to a "\u####"
     // escape, and we do pessimistic estimates here in order to avoid checking
@@ -29,7 +77,8 @@ pub const Writer = struct {
         // This can only really happen when the root path exceeds PATH_MAX,
         // in which case we would probably have error'ed out earlier anyway.
         if (bytes > ctx.buf.len) ui.die("Error writing JSON export: path too long.\n", .{});
-        ctx.fd.writeAll(ctx.buf[0..ctx.off]) catch |e|
+        const buf = ctx.buf[0..ctx.off];
+        (if (ctx.zstd) |z| z.write(ctx.fd, buf, bytes == 0) else ctx.fd.writeAll(buf)) catch |e|
             ui.die("Error writing to file: {s}.\n", .{ ui.errorString(e) });
         ctx.off = 0;
     }
@@ -92,6 +141,7 @@ pub const Writer = struct {
     fn init(out: std.fs.File) *Writer {
         var ctx = main.allocator.create(Writer) catch unreachable;
         ctx.* = .{ .fd = out };
+        if (main.config.compress) ctx.zstd = ZstdWriter.create();
        ctx.write("[1,2,{\"progname\":\"ncdu\",\"progver\":\"" ++ main.program_version ++ "\",\"timestamp\":");
         ctx.writeUint(@intCast(@max(0, std.time.timestamp())));
         ctx.writeByte('}');
@@ -210,6 +260,7 @@ pub fn createRoot(path: []const u8, stat: *const sink.Stat) Dir {
 pub fn done() void {
     global.writer.write("]\n");
     global.writer.flush(0);
+    if (global.writer.zstd) |z| z.destroy();
     global.writer.fd.close();
     main.allocator.destroy(global.writer);
 }
diff --git a/src/main.zig b/src/main.zig
index e5cb866..4db4ec8 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -81,6 +81,7 @@ pub const config = struct {
     pub var exclude_patterns: std.ArrayList([:0]const u8) = std.ArrayList([:0]const u8).init(allocator);
     pub var threads: usize = 1;
     pub var complevel: u8 = 4;
+    pub var compress: bool = false;
     pub var update_delay: u64 = 100*std.time.ns_per_ms;
 
     pub var scan_ui: ?enum { none, line, full } = null;
@@ -276,6 +277,8 @@ fn argConfig(args: *Args, opt: Args.Option) bool {
     else if (opt.is("--include-caches")) config.exclude_caches = false
     else if (opt.is("--exclude-kernfs")) config.exclude_kernfs = true
     else if (opt.is("--include-kernfs")) config.exclude_kernfs = false
+    else if (opt.is("-c") or opt.is("--compress")) config.compress = true
+    else if (opt.is("--no-compress")) config.compress = false
     else if (opt.is("--compress-level")) {
         const val = args.arg();
         config.complevel = std.fmt.parseInt(u8, val, 10) catch ui.die("Invalid number for --compress-level: {s}.\n", .{val});
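
The stream ZstdWriter emits is a single standard Zstandard frame (ZSTD_e_continue
on every buffer flush, ZSTD_e_end exactly once from done()), so an export written
with -c round-trips through any stock decoder, e.g.:

    zstdcat export.json.zst | ncdu -f-

For completeness, a rough sketch of the matching read side as a standalone
program. Illustrative only, not part of this patch: the file name zstdcat.zig
and the build line are assumptions, and it binds libzstd directly via @cImport
instead of the c.zig wrapper used above.

// zstdcat.zig -- illustrative sketch, NOT part of this patch.
// Decompresses a Zstandard stream (e.g. an ncdu -c export) from stdin
// to stdout. Build sketch: zig build-exe zstdcat.zig -lzstd -lc
const std = @import("std");
const c = @cImport(@cInclude("zstd.h"));

pub fn main() !void {
    const stdin = std.io.getStdIn();
    const stdout = std.io.getStdOut();

    const ctx = c.ZSTD_createDStream();
    if (ctx == null) return error.OutOfMemory;
    defer _ = c.ZSTD_freeDStream(ctx);

    var inbuf: [128 * 1024]u8 = undefined;
    var outbuf: [128 * 1024]u8 = undefined;

    while (true) {
        const n = try stdin.read(&inbuf);
        if (n == 0) break; // EOF
        var in = c.ZSTD_inBuffer{ .src = &inbuf, .size = n, .pos = 0 };
        // A single input chunk can produce more output than one buffer
        // holds, so drain the decoder until the chunk is fully consumed.
        while (in.pos < in.size) {
            var out = c.ZSTD_outBuffer{ .dst = &outbuf, .size = outbuf.len, .pos = 0 };
            const v = c.ZSTD_decompressStream(ctx, &out, &in);
            if (c.ZSTD_isError(v) != 0) return error.ZstdDecompressError;
            try stdout.writeAll(outbuf[0..out.pos]);
        }
    }
}

The loop structure deliberately mirrors ZstdWriter.write: feed one chunk at a
time and keep calling the streaming function until the library reports that the
chunk has been consumed, writing out whatever landed in the output buffer after
each call.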