ncdu-zig/src/exclude.zig
2023-01-19 08:00:27 +01:00

322 lines
12 KiB
Zig

// SPDX-FileCopyrightText: 2021-2023 Yoran Heling <projects@yorhel.nl>
// SPDX-License-Identifier: MIT
const std = @import("std");
const main = @import("main.zig");
const c = @cImport(@cInclude("fnmatch.h"));
// Reference:
// https://manned.org/glob.7
// https://manned.org/man.b4c7391e/rsync#head17
// https://manned.org/man.401d6ade/arch/gitignore#head4
// Patterns:
// Single component (none of these patterns match a '/'):
// * -> match any character sequence
// ? -> match single character
// [abc] -> match a single character in the given list
// [a-c] -> match a single character in the given range
// [!a-c] -> match a single character not in the given range
// # (these are currently still handled by calling libc fnmatch())
// Anchored patterns:
// /pattern
// /dir/pattern
// /dir/subdir/pattern
// # In both rsync and gitignore, anchored patterns are relative to the
// # directory under consideration. In ncdu they are instead anchored to
// # the filesystem root (i.e. matched against the absolute path).
// Non-anchored patterns:
// somefile
// subdir/foo
// sub*/bar
// # In .gitignore, non-anchored patterns with a slash are implicitely anchored,
// # in rsync they can match anywhere in a path. We follow rsync here.
// Dir patterns (trailing '/' matches only dirs):
// /pattern/
// somedir/
// subdir/pattern/
//
// BREAKING CHANGE:
// ncdu < 2.2 single-component matches may cross directory boundary, e.g.
// 'a*b' matches 'a/b'. This is an old bug, the fix breaks compatibility with
// old exlude patterns.
const Pattern = struct {
isdir: bool = undefined,
isliteral: bool = undefined,
pattern: [:0]const u8,
sub: ?*const Pattern = undefined,
fn isLiteral(str: []const u8) bool {
for (str) |chr| switch (chr) {
'[', '*', '?', '\\' => return false,
else => {},
};
return true;
}
fn parse(pat_: []const u8) *const Pattern {
var pat = std.mem.trimLeft(u8, pat_, "/");
var top = main.allocator.create(Pattern) catch unreachable;
var tail = top;
tail.sub = null;
while (std.mem.indexOfScalar(u8, pat, '/')) |idx| {
tail.pattern = main.allocator.dupeZ(u8, pat[0..idx]) catch unreachable;
tail.isdir = true;
tail.isliteral = isLiteral(tail.pattern);
pat = pat[idx+1..];
if (std.mem.allEqual(u8, pat, '/')) return top;
const next = main.allocator.create(Pattern) catch unreachable;
tail.sub = next;
tail = next;
tail.sub = null;
}
tail.pattern = main.allocator.dupeZ(u8, pat) catch unreachable;
tail.isdir = false;
tail.isliteral = isLiteral(tail.pattern);
return top;
}
};
test "parse" {
const t1 = Pattern.parse("");
try std.testing.expectEqualStrings(t1.pattern, "");
try std.testing.expectEqual(t1.isdir, false);
try std.testing.expectEqual(t1.isliteral, true);
try std.testing.expectEqual(t1.sub, null);
const t2 = Pattern.parse("//a//");
try std.testing.expectEqualStrings(t2.pattern, "a");
try std.testing.expectEqual(t2.isdir, true);
try std.testing.expectEqual(t2.isliteral, true);
try std.testing.expectEqual(t2.sub, null);
const t3 = Pattern.parse("foo*/bar.zig");
try std.testing.expectEqualStrings(t3.pattern, "foo*");
try std.testing.expectEqual(t3.isdir, true);
try std.testing.expectEqual(t3.isliteral, false);
try std.testing.expectEqualStrings(t3.sub.?.pattern, "bar.zig");
try std.testing.expectEqual(t3.sub.?.isdir, false);
try std.testing.expectEqual(t3.sub.?.isliteral, true);
try std.testing.expectEqual(t3.sub.?.sub, null);
const t4 = Pattern.parse("/?/sub/dir/");
try std.testing.expectEqualStrings(t4.pattern, "?");
try std.testing.expectEqual(t4.isdir, true);
try std.testing.expectEqual(t4.isliteral, false);
try std.testing.expectEqualStrings(t4.sub.?.pattern, "sub");
try std.testing.expectEqual(t4.sub.?.isdir, true);
try std.testing.expectEqual(t4.sub.?.isliteral, true);
try std.testing.expectEqualStrings(t4.sub.?.sub.?.pattern, "dir");
try std.testing.expectEqual(t4.sub.?.sub.?.isdir, true);
try std.testing.expectEqual(t4.sub.?.sub.?.isliteral, true);
try std.testing.expectEqual(t4.sub.?.sub.?.sub, null);
}
// List of patterns to be matched at one particular level.
// There are 2 different types of lists: those where all patterns have a
// sub-pointer (where the pattern only matches directories at this level, and
// the match result is only used to construct the PatternList of the
// subdirectory) and patterns without a sub-pointer (where the match result
// determines whether the file/dir at this level should be included or not).
fn PatternList(comptime withsub: bool) type {
return struct {
literals: std.HashMapUnmanaged(*const Pattern, Val, Ctx, 80) = .{},
wild: std.ArrayListUnmanaged(*const Pattern) = .{},
// Not a fan of the map-of-arrays approach in the 'withsub' case, it
// has a lot of extra allocations. Linking the Patterns together in a
// list would be nicer, but that involves mutable Patterns, which in
// turn prevents multithreaded scanning. An alternative would be a
// sorted array + binary search, but that slows down lookups. Perhaps a
// custom hashmap with support for duplicate keys?
const Val = if (withsub) std.ArrayListUnmanaged(*const Pattern) else void;
const Ctx = struct {
pub fn hash(_: Ctx, p: *const Pattern) u64 {
return std.hash.Wyhash.hash(0, p.pattern);
}
pub fn eql(_: Ctx, a: *const Pattern, b: *const Pattern) bool {
return std.mem.eql(u8, a.pattern, b.pattern);
}
};
const Self = @This();
fn append(self: *Self, pat: *const Pattern) void {
std.debug.assert((pat.sub != null) == withsub);
if (pat.isliteral) {
var e = self.literals.getOrPut(main.allocator, pat) catch unreachable;
if (!e.found_existing) {
e.key_ptr.* = pat;
e.value_ptr.* = if (withsub) .{} else {};
}
if (!withsub and !pat.isdir and e.key_ptr.*.isdir) e.key_ptr.* = pat;
if (withsub) {
if (pat.sub) |s| e.value_ptr.*.append(main.allocator, s) catch unreachable;
}
} else self.wild.append(main.allocator, pat) catch unreachable;
}
fn match(self: *const Self, name: [:0]const u8) ?bool {
var ret: ?bool = null;
if (self.literals.getKey(&.{ .pattern = name })) |p| ret = p.isdir;
for (self.wild.items) |p| {
if (ret == false) return ret;
if (c.fnmatch(p.pattern.ptr, name.ptr, 0) == 0) ret = p.isdir;
}
return ret;
}
fn enter(self: *const Self, out: *Patterns, name: [:0]const u8) void {
if (self.literals.get(&.{ .pattern = name })) |lst| for (lst.items) |sub| out.append(sub);
for (self.wild.items) |p| if (c.fnmatch(p.pattern.ptr, name.ptr, 0) == 0) out.append(p.sub.?);
}
fn deinit(self: *Self) void {
if (withsub) {
var it = self.literals.valueIterator();
while (it.next()) |e| e.deinit(main.allocator);
}
self.literals.deinit(main.allocator);
self.wild.deinit(main.allocator);
self.* = undefined;
}
};
}
// List of all patterns that should be matched at one level.
pub const Patterns = struct {
nonsub: PatternList(false) = .{},
sub: PatternList(true) = .{},
isroot: bool = false,
fn append(self: *Patterns, pat: *const Pattern) void {
if (pat.sub == null) self.nonsub.append(pat)
else self.sub.append(pat);
}
// Matches patterns in this level plus unanchored patterns.
// Returns null if nothing matches, otherwise whether the given item should
// only be exluced if it's a directory.
// (Should not be called on root_unanchored)
pub fn match(self: *const Patterns, name: [:0]const u8) ?bool {
const a = self.nonsub.match(name);
if (a == false) return false;
const b = root_unanchored.nonsub.match(name);
if (b == false) return false;
return a orelse b;
}
// Construct the list of patterns for a subdirectory.
pub fn enter(self: *const Patterns, name: [:0]const u8) Patterns {
var ret = Patterns{};
self.sub.enter(&ret, name);
root_unanchored.sub.enter(&ret, name);
return ret;
}
pub fn deinit(self: *Patterns) void {
// getPatterns() result should be deinit()ed, except when it returns the root,
// let's simplify that and simply don't deinit root.
if (self.isroot) return;
self.nonsub.deinit();
self.sub.deinit();
self.* = undefined;
}
};
// Unanchored patterns that should be checked at every level
var root_unanchored: Patterns = .{};
// Patterns anchored at the root
var root: Patterns = .{ .isroot = true };
pub fn addPattern(pattern: []const u8) void {
if (pattern.len == 0) return;
const p = Pattern.parse(pattern);
if (pattern[0] == '/') root.append(p)
else root_unanchored.append(p);
}
// Get the patterns for the given (absolute) path, assuming the given path
// itself hasn't been excluded. This function is slow, directory walking code
// should use Patterns.enter() instead.
pub fn getPatterns(path_: []const u8) Patterns {
var path = std.mem.trim(u8, path_, "/");
if (path.len == 0) return root;
var pat = root;
defer pat.deinit();
while (std.mem.indexOfScalar(u8, path, '/')) |idx| {
var name = main.allocator.dupeZ(u8, path[0..idx]) catch unreachable;
defer main.allocator.free(name);
path = path[idx+1..];
var sub = pat.enter(name);
pat.deinit();
pat = sub;
}
var name = main.allocator.dupeZ(u8, path) catch unreachable;
defer main.allocator.free(name);
return pat.enter(name);
}
fn testfoo(p: *const Patterns) !void {
try std.testing.expectEqual(p.match("root"), null);
try std.testing.expectEqual(p.match("bar"), false);
try std.testing.expectEqual(p.match("qoo"), false);
try std.testing.expectEqual(p.match("xyz"), false);
try std.testing.expectEqual(p.match("okay"), null);
try std.testing.expectEqual(p.match("somefile"), false);
var s = p.enter("okay");
try std.testing.expectEqual(s.match("bar"), null);
try std.testing.expectEqual(s.match("xyz"), null);
try std.testing.expectEqual(s.match("notokay"), false);
s.deinit();
}
test "Matching" {
addPattern("/foo/bar");
addPattern("/foo/qoo/");
addPattern("/foo/qoo");
addPattern("/foo/qoo/");
addPattern("/f??/xyz");
addPattern("/f??/xyz/");
addPattern("/*o/somefile");
addPattern("/a??/okay");
addPattern("/roo?");
addPattern("/root/");
addPattern("excluded");
addPattern("somefile/");
addPattern("o*y/not[o]kay");
var a0 = getPatterns("/");
try std.testing.expectEqual(a0.match("a"), null);
try std.testing.expectEqual(a0.match("excluded"), false);
try std.testing.expectEqual(a0.match("somefile"), true);
try std.testing.expectEqual(a0.match("root"), false);
var a1 = a0.enter("foo");
a0.deinit();
try testfoo(&a1);
a1.deinit();
var b0 = getPatterns("/somedir/somewhere");
try std.testing.expectEqual(b0.match("a"), null);
try std.testing.expectEqual(b0.match("excluded"), false);
try std.testing.expectEqual(b0.match("root"), null);
try std.testing.expectEqual(b0.match("okay"), null);
var b1 = b0.enter("okay");
b0.deinit();
try std.testing.expectEqual(b1.match("excluded"), false);
try std.testing.expectEqual(b1.match("okay"), null);
try std.testing.expectEqual(b1.match("notokay"), false);
b1.deinit();
var c0 = getPatterns("/foo/");
try testfoo(&c0);
c0.deinit();
}