diff --git a/src/exclude.zig b/src/exclude.zig new file mode 100644 index 0000000..8e2b793 --- /dev/null +++ b/src/exclude.zig @@ -0,0 +1,322 @@ +// SPDX-FileCopyrightText: 2021-2022 Yoran Heling +// SPDX-License-Identifier: MIT + +const std = @import("std"); +const main = @import("main.zig"); +const c = @cImport(@cInclude("fnmatch.h")); + +// Reference: +// https://manned.org/glob.7 +// https://manned.org/man.b4c7391e/rsync#head17 +// https://manned.org/man.401d6ade/arch/gitignore#head4 +// Patterns: +// Single component (none of these patterns match a '/'): +// * -> match any character sequence +// ? -> match single character +// [abc] -> match a single character in the given list +// [a-c] -> match a single character in the given range +// [!a-c] -> match a single character not in the given range +// # (these are currently still handled by calling libc fnmatch()) +// Anchored patterns: +// /pattern +// /dir/pattern +// /dir/subdir/pattern +// # In both rsync and gitignore, anchored patterns are relative to the +// # directory under consideration. In ncdu they are instead anchored to +// # the filesystem root (i.e. matched against the absolute path). +// Non-anchored patterns: +// somefile +// subdir/foo +// sub*/bar +// # In .gitignore, non-anchored patterns with a slash are implicitely anchored, +// # in rsync they can match anywhere in a path. We follow rsync here. +// Dir patterns (trailing '/' matches only dirs): +// /pattern/ +// somedir/ +// subdir/pattern/ +// +// BREAKING CHANGE: +// ncdu < 2.2 single-component matches may cross directory boundary, e.g. +// 'a*b' matches 'a/b'. This is an old bug, the fix breaks compatibility with +// old exlude patterns. + +const Pattern = struct { + isdir: bool = undefined, + isliteral: bool = undefined, + pattern: [:0]const u8, + sub: ?*const Pattern = undefined, + + fn isLiteral(str: []const u8) bool { + for (str) |chr| switch (chr) { + '[', '*', '?', '\\' => return false, + else => {}, + }; + return true; + } + + fn parse(pat_: []const u8) *const Pattern { + var pat = std.mem.trimLeft(u8, pat_, "/"); + var top = main.allocator.create(Pattern) catch unreachable; + var tail = top; + tail.sub = null; + while (std.mem.indexOfScalar(u8, pat, '/')) |idx| { + tail.pattern = main.allocator.dupeZ(u8, pat[0..idx]) catch unreachable; + tail.isdir = true; + tail.isliteral = isLiteral(tail.pattern); + pat = pat[idx+1..]; + if (std.mem.allEqual(u8, pat, '/')) return top; + + const next = main.allocator.create(Pattern) catch unreachable; + tail.sub = next; + tail = next; + tail.sub = null; + } + tail.pattern = main.allocator.dupeZ(u8, pat) catch unreachable; + tail.isdir = false; + tail.isliteral = isLiteral(tail.pattern); + return top; + } +}; + +test "parse" { + const t1 = Pattern.parse(""); + try std.testing.expectEqualStrings(t1.pattern, ""); + try std.testing.expectEqual(t1.isdir, false); + try std.testing.expectEqual(t1.isliteral, true); + try std.testing.expectEqual(t1.sub, null); + + const t2 = Pattern.parse("//a//"); + try std.testing.expectEqualStrings(t2.pattern, "a"); + try std.testing.expectEqual(t2.isdir, true); + try std.testing.expectEqual(t2.isliteral, true); + try std.testing.expectEqual(t2.sub, null); + + const t3 = Pattern.parse("foo*/bar.zig"); + try std.testing.expectEqualStrings(t3.pattern, "foo*"); + try std.testing.expectEqual(t3.isdir, true); + try std.testing.expectEqual(t3.isliteral, false); + try std.testing.expectEqualStrings(t3.sub.?.pattern, "bar.zig"); + try std.testing.expectEqual(t3.sub.?.isdir, false); + try std.testing.expectEqual(t3.sub.?.isliteral, true); + try std.testing.expectEqual(t3.sub.?.sub, null); + + const t4 = Pattern.parse("/?/sub/dir/"); + try std.testing.expectEqualStrings(t4.pattern, "?"); + try std.testing.expectEqual(t4.isdir, true); + try std.testing.expectEqual(t4.isliteral, false); + try std.testing.expectEqualStrings(t4.sub.?.pattern, "sub"); + try std.testing.expectEqual(t4.sub.?.isdir, true); + try std.testing.expectEqual(t4.sub.?.isliteral, true); + try std.testing.expectEqualStrings(t4.sub.?.sub.?.pattern, "dir"); + try std.testing.expectEqual(t4.sub.?.sub.?.isdir, true); + try std.testing.expectEqual(t4.sub.?.sub.?.isliteral, true); + try std.testing.expectEqual(t4.sub.?.sub.?.sub, null); +} + + +// List of patterns to be matched at one particular level. +// There are 2 different types of lists: those where all patterns have a +// sub-pointer (where the pattern only matches directories at this level, and +// the match result is only used to construct the PatternList of the +// subdirectory) and patterns without a sub-pointer (where the match result +// determines whether the file/dir at this level should be included or not). +fn PatternList(withsub: bool) type { + return struct { + literals: std.HashMapUnmanaged(*const Pattern, Val, Ctx, 80) = .{}, + wild: std.ArrayListUnmanaged(*const Pattern) = .{}, + + // Not a fan of the map-of-arrays approach in the 'withsub' case, it + // has a lot of extra allocations. Linking the Patterns together in a + // list would be nicer, but that involves mutable Patterns, which in + // turn prevents multithreaded scanning. An alternative would be a + // sorted array + binary search, but that slows down lookups. Perhaps a + // custom hashmap with support for duplicate keys? + const Val = if (withsub) std.ArrayListUnmanaged(*const Pattern) else void; + + const Ctx = struct { + pub fn hash(_: Ctx, p: *const Pattern) u64 { + return std.hash.Wyhash.hash(0, p.pattern); + } + pub fn eql(_: Ctx, a: *const Pattern, b: *const Pattern) bool { + return std.mem.eql(u8, a.pattern, b.pattern); + } + }; + + const Self = @This(); + + fn append(self: *Self, pat: *const Pattern) void { + std.debug.assert((pat.sub != null) == withsub); + if (pat.isliteral) { + var e = self.literals.getOrPut(main.allocator, pat) catch unreachable; + if (!e.found_existing) { + e.key_ptr.* = pat; + e.value_ptr.* = .{}; + } + if (!withsub and !pat.isdir and e.key_ptr.*.isdir) e.key_ptr.* = pat; + if (withsub) { + if (pat.sub) |s| e.value_ptr.*.append(main.allocator, s) catch unreachable; + } + + } else self.wild.append(main.allocator, pat) catch unreachable; + } + + fn match(self: *const Self, name: [:0]const u8) ?bool { + var ret: ?bool = null; + if (self.literals.getKey(&.{ .pattern = name })) |p| ret = p.isdir; + for (self.wild.items) |p| { + if (ret == false) return ret; + if (c.fnmatch(p.pattern, name, 0) == 0) ret = p.isdir; + } + return ret; + } + + fn enter(self: *const Self, out: *Patterns, name: [:0]const u8) void { + if (self.literals.get(&.{ .pattern = name })) |lst| for (lst.items) |sub| out.append(sub); + for (self.wild.items) |p| if (c.fnmatch(p.pattern, name, 0) == 0) out.append(p.sub.?); + } + + fn deinit(self: *Self) void { + if (withsub) { + var it = self.literals.valueIterator(); + while (it.next()) |e| e.deinit(main.allocator); + } + self.literals.deinit(main.allocator); + self.wild.deinit(main.allocator); + self.* = undefined; + } + }; +} + +// List of all patterns that should be matched at one level. +pub const Patterns = struct { + nonsub: PatternList(false) = .{}, + sub: PatternList(true) = .{}, + isroot: bool = false, + + fn append(self: *Patterns, pat: *const Pattern) void { + if (pat.sub == null) self.nonsub.append(pat) + else self.sub.append(pat); + } + + // Matches patterns in this level plus unanchored patterns. + // Returns null if nothing matches, otherwise whether the given item should + // only be exluced if it's a directory. + // (Should not be called on root_unanchored) + pub fn match(self: *const Patterns, name: [:0]const u8) ?bool { + const a = self.nonsub.match(name); + if (a == false) return false; + const b = root_unanchored.nonsub.match(name); + if (b == false) return false; + return a orelse b; + } + + // Construct the list of patterns for a subdirectory. + pub fn enter(self: *const Patterns, name: [:0]const u8) Patterns { + var ret = Patterns{}; + self.sub.enter(&ret, name); + root_unanchored.sub.enter(&ret, name); + return ret; + } + + pub fn deinit(self: *Patterns) void { + // getPatterns() result should be deinit()ed, except when it returns the root, + // let's simplify that and simply don't deinit root. + if (self.isroot) return; + self.nonsub.deinit(); + self.sub.deinit(); + self.* = undefined; + } +}; + +// Unanchored patterns that should be checked at every level +var root_unanchored: Patterns = .{}; + +// Patterns anchored at the root +var root: Patterns = .{ .isroot = true }; + +pub fn addPattern(pattern: []const u8) void { + if (pattern.len == 0) return; + const p = Pattern.parse(pattern); + if (pattern[0] == '/') root.append(p) + else root_unanchored.append(p); +} + +// Get the patterns for the given (absolute) path, assuming the given path +// itself hasn't been excluded. This function is slow, directory walking code +// should use Patterns.enter() instead. +pub fn getPatterns(path_: []const u8) Patterns { + var path = std.mem.trim(u8, path_, "/"); + if (path.len == 0) return root; + var pat = root; + defer pat.deinit(); + while (std.mem.indexOfScalar(u8, path, '/')) |idx| { + var name = main.allocator.dupeZ(u8, path[0..idx]) catch unreachable; + defer main.allocator.free(name); + path = path[idx+1..]; + + var sub = pat.enter(name); + pat.deinit(); + pat = sub; + } + + var name = main.allocator.dupeZ(u8, path) catch unreachable; + defer main.allocator.free(name); + return pat.enter(name); +} + + +fn testfoo(p: *const Patterns) !void { + try std.testing.expectEqual(p.match("root"), null); + try std.testing.expectEqual(p.match("bar"), false); + try std.testing.expectEqual(p.match("qoo"), false); + try std.testing.expectEqual(p.match("xyz"), false); + try std.testing.expectEqual(p.match("okay"), null); + try std.testing.expectEqual(p.match("somefile"), false); + var s = p.enter("okay"); + try std.testing.expectEqual(s.match("bar"), null); + try std.testing.expectEqual(s.match("xyz"), null); + try std.testing.expectEqual(s.match("notokay"), false); + s.deinit(); +} + +test "Matching" { + addPattern("/foo/bar"); + addPattern("/foo/qoo/"); + addPattern("/foo/qoo"); + addPattern("/foo/qoo/"); + addPattern("/f??/xyz"); + addPattern("/f??/xyz/"); + addPattern("/*o/somefile"); + addPattern("/a??/okay"); + addPattern("/roo?"); + addPattern("/root/"); + addPattern("excluded"); + addPattern("somefile/"); + addPattern("o*y/not[o]kay"); + + var a0 = getPatterns("/"); + try std.testing.expectEqual(a0.match("a"), null); + try std.testing.expectEqual(a0.match("excluded"), false); + try std.testing.expectEqual(a0.match("somefile"), true); + try std.testing.expectEqual(a0.match("root"), false); + var a1 = a0.enter("foo"); + a0.deinit(); + try testfoo(&a1); + a1.deinit(); + + var b0 = getPatterns("/somedir/somewhere"); + try std.testing.expectEqual(b0.match("a"), null); + try std.testing.expectEqual(b0.match("excluded"), false); + try std.testing.expectEqual(b0.match("root"), null); + try std.testing.expectEqual(b0.match("okay"), null); + var b1 = b0.enter("okay"); + b0.deinit(); + try std.testing.expectEqual(b1.match("excluded"), false); + try std.testing.expectEqual(b1.match("okay"), null); + try std.testing.expectEqual(b1.match("notokay"), false); + b1.deinit(); + + var c0 = getPatterns("/foo/"); + try testfoo(&c0); + c0.deinit(); +} diff --git a/src/main.zig b/src/main.zig index 70e61a5..593cb04 100644 --- a/src/main.zig +++ b/src/main.zig @@ -10,6 +10,7 @@ const ui = @import("ui.zig"); const browser = @import("browser.zig"); const delete = @import("delete.zig"); const util = @import("util.zig"); +const exclude = @import("exclude.zig"); const c = @cImport(@cInclude("locale.h")); // "Custom" allocator that wraps the libc allocator and calls ui.oom() on error. @@ -229,7 +230,7 @@ fn argConfig(args: *Args, opt: Args.Option) bool { else if (opt.is("--no-si")) config.si = false else if (opt.is("-L") or opt.is("--follow-symlinks")) config.follow_symlinks = true else if (opt.is("--no-follow-symlinks")) config.follow_symlinks = false - else if (opt.is("--exclude")) config.exclude_patterns.append(allocator.dupeZ(u8, args.arg()) catch unreachable) catch unreachable + else if (opt.is("--exclude")) exclude.addPattern(args.arg()) else if (opt.is("-X") or opt.is("--exclude-from")) { const arg = args.arg(); readExcludeFile(arg) catch |e| ui.die("Error reading excludes from {s}: {s}.\n", .{ arg, ui.errorString(e) }); @@ -381,11 +382,12 @@ fn readExcludeFile(path: [:0]const u8) !void { defer f.close(); var rd = std.io.bufferedReader(f.reader()).reader(); var buf = std.ArrayList(u8).init(allocator); + defer buf.deinit(); while (true) { rd.readUntilDelimiterArrayList(&buf, '\n', 4096) catch |e| if (e != error.EndOfStream) return e else if (buf.items.len == 0) break; if (buf.items.len > 0) - config.exclude_patterns.append(buf.toOwnedSliceSentinel(0) catch unreachable) catch unreachable; + exclude.addPattern(util.arrayListBufZ(&buf)); } } @@ -552,6 +554,7 @@ test "imports" { _ = @import("model.zig"); _ = @import("ui.zig"); _ = @import("util.zig"); + _ = @import("exclude.zig"); } test "argument parser" { diff --git a/src/scan.zig b/src/scan.zig index 86f7061..68b74c8 100644 --- a/src/scan.zig +++ b/src/scan.zig @@ -6,8 +6,8 @@ const main = @import("main.zig"); const model = @import("model.zig"); const ui = @import("ui.zig"); const util = @import("util.zig"); +const exclude = @import("exclude.zig"); const c_statfs = @cImport(@cInclude("sys/vfs.h")); -const c_fnmatch = @cImport(@cInclude("fnmatch.h")); // Concise stat struct for fields we're interested in, with the types used by the model. @@ -450,7 +450,7 @@ const Context = struct { var active_context: *Context = undefined; // Read and index entries of the given dir. -fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void { +fn scanDir(ctx: *Context, pat: *const exclude.Patterns, dir: std.fs.Dir, dir_dev: u64) void { var it = main.allocator.create(std.fs.Dir.Iterator) catch unreachable; defer main.allocator.destroy(it); it.* = dir.iterate(); @@ -465,19 +465,8 @@ fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void { defer ctx.popPath(); main.handleEvent(false, false); - // XXX: This algorithm is extremely slow, can be optimized with some clever pattern parsing. - const excluded = blk: { - for (main.config.exclude_patterns.items) |pat| { - var path = ctx.pathZ(); - while (path.len > 0) { - if (c_fnmatch.fnmatch(pat, path, 0) == 0) break :blk true; - if (std.mem.indexOfScalar(u8, path, '/')) |idx| path = path[idx+1..:0] - else break; - } - } - break :blk false; - }; - if (excluded) { + const excluded = pat.match(ctx.name); + if (excluded == false) { // matched either a file or directory, so we can exclude this before stat()ing. ctx.addSpecial(.excluded); continue; } @@ -486,7 +475,6 @@ fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void { ctx.addSpecial(.err); continue; }; - if (main.config.same_fs and ctx.stat.dev != dir_dev) { ctx.addSpecial(.other_fs); continue; @@ -503,6 +491,10 @@ fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void { } } else |_| {} } + if (excluded) |e| if (e and ctx.stat.dir) { + ctx.addSpecial(.excluded); + continue; + }; var edir = if (ctx.stat.dir) dir.openDirZ(ctx.name, .{ .access_sub_paths = true, .iterate = true, .no_follow = true }) catch { @@ -530,7 +522,11 @@ fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void { } ctx.addStat(dir_dev); - if (ctx.stat.dir) scanDir(ctx, edir.?, ctx.stat.dev); + if (ctx.stat.dir) { + var subpat = pat.enter(ctx.name); + defer subpat.deinit(); + scanDir(ctx, &subpat, edir.?, ctx.stat.dev); + } } } @@ -568,7 +564,9 @@ pub fn scan() void { return; }; defer dir.close(); - scanDir(active_context, dir, active_context.stat.dev); + var pat = exclude.getPatterns(active_context.pathZ()); + defer pat.deinit(); + scanDir(active_context, &pat, dir, active_context.stat.dev); active_context.popPath(); active_context.final(); }