// SPDX-FileCopyrightText: Yorhel
// SPDX-License-Identifier: MIT

const std = @import("std");
const main = @import("main.zig");
const sink = @import("sink.zig");
const util = @import("util.zig");
const ui = @import("ui.zig");
const c = @cImport({
    @cInclude("zlib.h");
    @cInclude("zstd.h");
    @cInclude("lz4.h");
});

pub const global = struct {
    var fd: std.fs.File = undefined;
    var index = std.ArrayList(u8).init(main.allocator);
    var file_off: u64 = 0;
    var lock: std.Thread.Mutex = .{};
    var root_itemref: u64 = 0;
};

const BLOCK_SIZE: usize = 512*1024; // XXX: Current maximum for benchmarking, should just stick with a fixed block size.

const ItemType = enum(i3) { dir = 0, reg = 1, nonreg = 2, link = 3, err = -1, pattern = -2, otherfs = -3, kernfs = -4 };

const ItemKey = enum(u5) {
    // all items
    type = 0, // ItemType
    name = 1, // bytes
    prev = 2, // itemref
    // Only for non-specials
    asize = 3, // u64
    dsize = 4, // u64
    // Only for .dir
    dev = 5,      // u64, only if different from parent dir
    rderr = 6,    // bool; true = error reading directory list, false = error in sub-item, absent = no error
    cumasize = 7, // u64
    cumdsize = 8, // u64
    shrasize = 9, // u64
    shrdsize = 10, // u64
    items = 11,   // u32
    sub = 12,     // itemref, only if dir is not empty
    // Only for .link
    ino = 13,   // u64
    nlink = 14, // u32
    // Extended mode
    uid = 15,   // u32
    gid = 16,   // u32
    mode = 17,  // u16
    mtime = 18, // u64
};

// Pessimistic upper bound on the encoded size of an item, excluding the name field.
// 2 bytes for map start/end, 11 per field (2 for the key, 9 for a full u64).
const MAX_ITEM_LEN = 2 + 11 * @typeInfo(ItemKey).Enum.fields.len;

const CborMajor = enum(u3) { pos, neg, bytes, text, array, map, tag, simple };

inline fn bigu16(v: u16) [2]u8 { return @bitCast(std.mem.nativeToBig(u16, v)); }
inline fn bigu32(v: u32) [4]u8 { return @bitCast(std.mem.nativeToBig(u32, v)); }
inline fn bigu64(v: u64) [8]u8 { return @bitCast(std.mem.nativeToBig(u64, v)); }
inline fn blockHeader(id: u8, len: u32) [4]u8 { return bigu32((@as(u32, id) << 24) | len); }
inline fn cborByte(major: CborMajor, arg: u5) u8 { return (@as(u8, @intFromEnum(major)) << 5) | arg; }

pub const Thread = struct {
    buf: [BLOCK_SIZE]u8 = undefined,
    off: usize = BLOCK_SIZE,
    block_num: u32 = std.math.maxInt(u32),
    itemref: u64 = 0, // ref of item currently being written
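
    // An itemref, as constructed in itemStart(), is the block number shifted
    // left by 24 bits, ORed with the item's byte offset within the
    // uncompressed block. A ref of 0 means "no item"; flush() pads the start
    // of the first block with a CBOR null so no real item ever gets that value.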

    // Temporary buffer for headers and compression.
    // TODO: check with compressBound()/ZSTD_compressBound()
    tmp: [BLOCK_SIZE+128]u8 = undefined,

    fn compressNone(in: []const u8, out: []u8) usize {
        @memcpy(out[0..in.len], in);
        return in.len;
    }

    fn compressZlib(in: []const u8, out: []u8) usize {
        var outlen: c.uLongf = out.len;
        const r = c.compress2(out.ptr, &outlen, in.ptr, in.len, main.config.complevel);
        std.debug.assert(r == c.Z_OK);
        return outlen;
    }

    fn compressZstd(in: []const u8, out: []u8) usize {
        const r = c.ZSTD_compress(out.ptr, out.len, in.ptr, in.len, main.config.complevel);
        std.debug.assert(c.ZSTD_isError(r) == 0);
        return r;
    }

    fn compressLZ4(in: []const u8, out: []u8) usize {
        const r = c.LZ4_compress_default(in.ptr, out.ptr, @intCast(in.len), @intCast(out.len));
        std.debug.assert(r > 0);
        return @intCast(r);
    }

    fn createBlock(t: *Thread) []const u8 {
        if (t.block_num == std.math.maxInt(u32) or t.off <= 1) return "";
        const bodylen = switch (main.config.compression) {
            .none => compressNone(t.buf[0..t.off], t.tmp[12..]),
            .zlib => compressZlib(t.buf[0..t.off], t.tmp[12..]),
            .zstd => compressZstd(t.buf[0..t.off], t.tmp[12..]),
            .lz4 => compressLZ4(t.buf[0..t.off], t.tmp[12..]),
        };
        const blocklen: u32 = @intCast(bodylen + 16);
        t.tmp[0..4].* = blockHeader(1, blocklen);
        t.tmp[4..8].* = bigu32(t.block_num);
        t.tmp[8..12].* = bigu32(@intCast(t.off));
        t.tmp[12+bodylen..][0..4].* = blockHeader(1, blocklen);
        return t.tmp[0..blocklen];
    }

    fn flush(t: *Thread, expected_len: usize) void {
        @setCold(true);
        const block = createBlock(t);

        global.lock.lock();
        defer global.lock.unlock();

        // This can only really happen when the root path exceeds BLOCK_SIZE,
        // in which case we would probably have errored out earlier anyway.
        if (expected_len > t.buf.len) ui.die("Error writing data: path too long.\n", .{});

        if (block.len > 0) {
            global.index.items[4..][t.block_num*8..][0..8].* = bigu64((global.file_off << 24) + block.len);
            global.file_off += block.len;
            global.fd.writeAll(block) catch |e|
                ui.die("Error writing to file: {s}.\n", .{ ui.errorString(e) });
        }

        t.off = 0;
        t.block_num = @intCast((global.index.items.len - 4) / 8);
        global.index.appendSlice(&[1]u8{0}**8) catch unreachable;

        // Start the first block with a CBOR 'null', so that itemrefs can never be 0.
        if (t.block_num == 0) t.cborHead(.simple, 22);
    }

    fn cborHead(t: *Thread, major: CborMajor, arg: u64) void {
        if (arg <= 23) {
            t.buf[t.off] = cborByte(major, @intCast(arg));
            t.off += 1;
        } else if (arg <= std.math.maxInt(u8)) {
            t.buf[t.off] = cborByte(major, 24);
            t.buf[t.off+1] = @truncate(arg);
            t.off += 2;
        } else if (arg <= std.math.maxInt(u16)) {
            t.buf[t.off] = cborByte(major, 25);
            t.buf[t.off+1..][0..2].* = bigu16(@intCast(arg));
            t.off += 3;
        } else if (arg <= std.math.maxInt(u32)) {
            t.buf[t.off] = cborByte(major, 26);
            t.buf[t.off+1..][0..4].* = bigu32(@intCast(arg));
            t.off += 5;
        } else {
            t.buf[t.off] = cborByte(major, 27);
            t.buf[t.off+1..][0..8].* = bigu64(arg);
            t.off += 9;
        }
    }

    fn cborIndef(t: *Thread, major: CborMajor) void {
        t.buf[t.off] = cborByte(major, 31);
        t.off += 1;
    }

    fn itemKey(t: *Thread, key: ItemKey) void { t.cborHead(.pos, @intFromEnum(key)); }

    fn itemRef(t: *Thread, key: ItemKey, ref: u64) void {
        if (ref == 0) return;
        t.itemKey(key);
        // Full references compress poorly and most of the references point
        // into the same block, so optimize that case by using a negative
        // offset instead.
        if ((ref >> 24) == t.block_num) t.cborHead(.neg, t.itemref - ref - 1)
        else t.cborHead(.pos, ref);
    }
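
    // Example of the same-block case in itemRef() above: with the current
    // item at offset 0x40 and the referenced item at offset 0x10 of the same
    // block, the reference is encoded as the CBOR negative integer -48
    // (head argument 0x40 - 0x10 - 1 = 47) rather than as a full itemref.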

    // Reserve space for a new item, write out the type, prev and name fields and return the itemref.
    fn itemStart(t: *Thread, itype: ItemType, prev_item: u64, name: []const u8) u64 {
        const min_len = name.len + MAX_ITEM_LEN;
        if (t.off + min_len > main.config.blocksize) t.flush(min_len);

        t.itemref = (@as(u64, t.block_num) << 24) | t.off;
        t.cborIndef(.map);
        t.itemKey(.type);
        if (@intFromEnum(itype) >= 0) t.cborHead(.pos, @intCast(@intFromEnum(itype)))
        else t.cborHead(.neg, @intCast(-1 - @intFromEnum(itype)));
        t.itemKey(.name);
        t.cborHead(.bytes, name.len);
        @memcpy(t.buf[t.off..][0..name.len], name);
        t.off += name.len;
        t.itemRef(.prev, prev_item);
        return t.itemref;
    }

    fn itemExt(t: *Thread, stat: *const sink.Stat) void {
        if (!main.config.extended) return;
        t.itemKey(.uid);
        t.cborHead(.pos, stat.ext.uid);
        t.itemKey(.gid);
        t.cborHead(.pos, stat.ext.gid);
        t.itemKey(.mode);
        t.cborHead(.pos, stat.ext.mode);
        t.itemKey(.mtime);
        t.cborHead(.pos, stat.ext.mtime);
    }

    fn itemEnd(t: *Thread) void {
        t.cborIndef(.simple);
    }
};


pub const Dir = struct {
    // TODO: When items are written out into blocks depth-first, parent dirs
    // will end up getting their items distributed over many blocks, which will
    // significantly slow down reading that dir's listing. It may be worth
    // buffering some items at the Dir level before flushing them out to the
    // Thread buffer.

    // The lock protects all of the below, and is necessary because final()
    // accesses the parent dir and may be called from other threads.
    // I'm not expecting much lock contention, but it's possible to turn
    // last_sub into an atomic integer and the other fields could be split up
    // for sub-dir use.
    lock: std.Thread.Mutex = .{},

    last_sub: u64 = 0,
    stat: sink.Stat,
    items: u32 = 0,
    size: u64 = 0,
    blocks: u64 = 0,
    err: bool = false,
    suberr: bool = false,
    shared_size: u64 = 0,
    shared_blocks: u64 = 0,
    inodes: Inodes = Inodes.init(main.allocator),

    const Inodes = std.AutoHashMap(u64, Inode);

    const Inode = struct {
        size: u64,
        blocks: u64,
        nlink: u32,
        nfound: u32,
    };

    pub fn addSpecial(d: *Dir, t: *Thread, name: []const u8, sp: sink.Special) void {
        d.lock.lock();
        defer d.lock.unlock();

        d.items += 1;
        if (sp == .err) d.suberr = true;
        const it: ItemType = switch (sp) {
            .err => .err,
            .other_fs => .otherfs,
            .kernfs => .kernfs,
            .excluded => .pattern,
        };
        d.last_sub = t.itemStart(it, d.last_sub, name);
        t.itemEnd();
    }

    pub fn addStat(d: *Dir, t: *Thread, name: []const u8, stat: *const sink.Stat) void {
        d.lock.lock();
        defer d.lock.unlock();

        d.items += 1;
        if (!stat.hlinkc) {
            d.size +|= stat.size;
            d.blocks +|= stat.blocks;
        }
        const it: ItemType = if (stat.hlinkc) .link else if (stat.reg) .reg else .nonreg;
        d.last_sub = t.itemStart(it, d.last_sub, name);
        t.itemKey(.asize);
        t.cborHead(.pos, stat.size);
        t.itemKey(.dsize);
        t.cborHead(.pos, util.blocksToSize(stat.blocks));
        if (stat.hlinkc) {
            const lnk = d.inodes.getOrPut(stat.ino) catch unreachable;
            if (!lnk.found_existing) lnk.value_ptr.* = .{
                .size = stat.size,
                .blocks = stat.blocks,
                .nlink = stat.nlink,
                .nfound = 1,
            } else lnk.value_ptr.nfound += 1;
            t.itemKey(.ino);
            t.cborHead(.pos, stat.ino);
            t.itemKey(.nlink);
            t.cborHead(.pos, stat.nlink);
        }
        t.itemExt(stat);
        t.itemEnd();
    }

    pub fn addDir(d: *Dir, stat: *const sink.Stat) Dir {
        d.lock.lock();
        defer d.lock.unlock();
        d.items += 1;
        d.size +|= stat.size;
        d.blocks +|= stat.blocks;
        return .{ .stat = stat.* };
    }

    pub fn setReadError(d: *Dir) void {
        d.lock.lock();
        defer d.lock.unlock();
        d.err = true;
    }
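
    // Hardlink accounting: 'inodes' records, for every hardlinked inode seen
    // under this dir, its size, total link count (nlink) and the number of
    // links found so far (nfound). countLinks() below folds those entries
    // into this dir's size and shared_* totals and merges whatever is still
    // incomplete into the parent dir.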

    // XXX: Older JSON exports did not include the nlink count and have this
    // field set to '0'. We can deal with that when importing to mem_sink, but
    // the hardlink counting algorithm used here really does need that
    // information. The current code makes sure to count such links only once
    // per dir, but does not count them towards the shared_* fields. That
    // behavior is similar to ncdu 1.x, but the difference between memory
    // import and this file export might be surprising.
    fn countLinks(d: *Dir, parent: ?*Dir) void {
        var parent_new: u32 = 0;
        var it = d.inodes.iterator();
        while (it.next()) |kv| {
            const v = kv.value_ptr;
            d.size +|= v.size;
            d.blocks +|= v.blocks;
            if (v.nlink > 1 and v.nfound <= v.nlink) {
                d.shared_size +|= v.size;
                d.shared_blocks +|= v.blocks;
            }
            const p = parent orelse continue;
            // All contained in this dir, no need to keep this entry around
            if (v.nlink > 0 and v.nfound >= v.nlink) {
                p.size +|= v.size;
                p.blocks +|= v.blocks;
                _ = d.inodes.remove(kv.key_ptr.*);
            } else if (!p.inodes.contains(kv.key_ptr.*))
                parent_new += 1;
        }

        // Merge remaining inodes into parent
        const p = parent orelse return;
        if (d.inodes.count() == 0) return;

        // If parent is empty, just transfer
        if (p.inodes.count() == 0) {
            p.inodes.deinit();
            p.inodes = d.inodes;
            d.inodes = Inodes.init(main.allocator); // So we can deinit() without affecting parent

        // Otherwise, merge
        } else {
            p.inodes.ensureUnusedCapacity(parent_new) catch unreachable;
            it = d.inodes.iterator();
            while (it.next()) |kv| {
                const v = kv.value_ptr;
                const plnk = p.inodes.getOrPutAssumeCapacity(kv.key_ptr.*);
                if (!plnk.found_existing) plnk.value_ptr.* = v.*
                else plnk.value_ptr.*.nfound += v.nfound;
            }
        }
    }

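    // Called when this directory has been read completely: adds its totals to
    // the parent, merges the inode sets (only when parent and child are on
    // the same device) and writes out the directory item itself.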
    pub fn final(d: *Dir, t: *Thread, name: []const u8, parent: ?*Dir) void {
        if (parent) |p| p.lock.lock();
        defer if (parent) |p| p.lock.unlock();

        if (parent) |p| {
            // Different dev? Don't merge the 'inodes' sets, just count the
            // links here first so the sizes get added to the parent.
            if (p.stat.dev != d.stat.dev) d.countLinks(null);

            p.items += d.items;
            p.size +|= d.size;
            p.blocks +|= d.blocks;
            if (d.suberr or d.err) p.suberr = true;

            // Same dev, merge inodes
            if (p.stat.dev == d.stat.dev) d.countLinks(p);

            p.last_sub = t.itemStart(.dir, p.last_sub, name);
        } else {
            d.countLinks(null);
            global.root_itemref = t.itemStart(.dir, 0, name);
        }
        d.inodes.deinit();

        t.itemKey(.asize);
        t.cborHead(.pos, d.stat.size);
        t.itemKey(.dsize);
        t.cborHead(.pos, util.blocksToSize(d.stat.blocks));
        if (parent == null or parent.?.stat.dev != d.stat.dev) {
            t.itemKey(.dev);
            t.cborHead(.pos, d.stat.dev);
        }
        if (d.err or d.suberr) {
            t.itemKey(.rderr);
            t.cborHead(.simple, if (d.err) 21 else 20); // CBOR true = read error, false = error in sub-item
        }
        t.itemKey(.cumasize);
        t.cborHead(.pos, d.size +| d.stat.size);
        t.itemKey(.cumdsize);
        t.cborHead(.pos, util.blocksToSize(d.blocks +| d.stat.blocks));
        if (d.shared_size > 0) {
            t.itemKey(.shrasize);
            t.cborHead(.pos, d.shared_size);
        }
        if (d.shared_blocks > 0) {
            t.itemKey(.shrdsize);
            t.cborHead(.pos, util.blocksToSize(d.shared_blocks));
        }
        t.itemKey(.items);
        t.cborHead(.pos, d.items);
        t.itemRef(.sub, d.last_sub);
        t.itemExt(&d.stat);
        t.itemEnd();
    }
};


pub fn createRoot(stat: *const sink.Stat) Dir {
    return .{ .stat = stat.* };
}

pub fn done(threads: []sink.Thread) void {
    for (threads) |*t| t.sink.bin.flush(0);

    // Drop trailing index entries for blocks that were allocated but never written.
    while (std.mem.endsWith(u8, global.index.items, &[1]u8{0}**8))
        global.index.shrinkRetainingCapacity(global.index.items.len - 8);

    // Append the root itemref and wrap the index in its block headers.
    global.index.appendSlice(&bigu64(global.root_itemref)) catch unreachable;
    global.index.appendSlice(&blockHeader(2, @intCast(global.index.items.len + 4))) catch unreachable;
    global.index.items[0..4].* = blockHeader(2, @intCast(global.index.items.len));
    global.fd.writeAll(global.index.items) catch |e|
        ui.die("Error writing to file: {s}.\n", .{ ui.errorString(e) });
    global.index.clearAndFree();
    global.fd.close();
}

pub fn setupOutput(fd: std.fs.File) void {
    global.fd = fd;
    fd.writeAll("\xbfncduEX1") catch |e|
        ui.die("Error writing to file: {s}.\n", .{ ui.errorString(e) });
    global.file_off = 8;
    // Placeholder for the index block header.
    global.index.appendSlice("aaaa") catch unreachable;
}
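
// Small sanity checks for the low-level encoding helpers above; a sketch added
// for illustration, assuming this file's imports resolve under `zig test`. The
// expected bytes follow directly from blockHeader()/cborByte() and RFC 8949.
test "encoding helpers" {
    // Block header: 8-bit block id in the top byte, length in the remaining bits, big-endian.
    try std.testing.expectEqual([4]u8{ 0x01, 0x12, 0x34, 0x56 }, blockHeader(1, 0x123456));
    // CBOR initial byte: 3-bit major type followed by a 5-bit argument.
    try std.testing.expectEqual(@as(u8, 0x17), cborByte(.pos, 23));
    // CBOR 'null' (simple value 22), used by flush() to pad the start of the first block.
    try std.testing.expectEqual(@as(u8, 0xf6), cborByte(.simple, 22));
}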