mirror of
https://code.blicky.net/yorhel/ncdu.git
synced 2026-01-13 01:08:41 -09:00
322 lines
12 KiB
Zig
322 lines
12 KiB
Zig
// SPDX-FileCopyrightText: 2021-2023 Yoran Heling <projects@yorhel.nl>
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
const std = @import("std");
|
|
const main = @import("main.zig");
|
|
const c = @cImport(@cInclude("fnmatch.h"));
|
|
|
|
// Reference:
|
|
// https://manned.org/glob.7
|
|
// https://manned.org/man.b4c7391e/rsync#head17
|
|
// https://manned.org/man.401d6ade/arch/gitignore#head4
|
|
// Patterns:
|
|
// Single component (none of these patterns match a '/'):
|
|
// * -> match any character sequence
|
|
// ? -> match single character
|
|
// [abc] -> match a single character in the given list
|
|
// [a-c] -> match a single character in the given range
|
|
// [!a-c] -> match a single character not in the given range
|
|
// # (these are currently still handled by calling libc fnmatch())
|
|
// Anchored patterns:
|
|
// /pattern
|
|
// /dir/pattern
|
|
// /dir/subdir/pattern
|
|
// # In both rsync and gitignore, anchored patterns are relative to the
|
|
// # directory under consideration. In ncdu they are instead anchored to
|
|
// # the filesystem root (i.e. matched against the absolute path).
|
|
// Non-anchored patterns:
|
|
// somefile
|
|
// subdir/foo
|
|
// sub*/bar
|
|
// # In .gitignore, non-anchored patterns with a slash are implicitely anchored,
|
|
// # in rsync they can match anywhere in a path. We follow rsync here.
|
|
// Dir patterns (trailing '/' matches only dirs):
|
|
// /pattern/
|
|
// somedir/
|
|
// subdir/pattern/
|
|
//
|
|
// BREAKING CHANGE:
|
|
// ncdu < 2.2 single-component matches may cross directory boundary, e.g.
|
|
// 'a*b' matches 'a/b'. This is an old bug, the fix breaks compatibility with
|
|
// old exlude patterns.
|
|
|
|
const Pattern = struct {
|
|
isdir: bool = undefined,
|
|
isliteral: bool = undefined,
|
|
pattern: [:0]const u8,
|
|
sub: ?*const Pattern = undefined,
|
|
|
|
fn isLiteral(str: []const u8) bool {
|
|
for (str) |chr| switch (chr) {
|
|
'[', '*', '?', '\\' => return false,
|
|
else => {},
|
|
};
|
|
return true;
|
|
}
|
|
|
|
fn parse(pat_: []const u8) *const Pattern {
|
|
var pat = std.mem.trimLeft(u8, pat_, "/");
|
|
var top = main.allocator.create(Pattern) catch unreachable;
|
|
var tail = top;
|
|
tail.sub = null;
|
|
while (std.mem.indexOfScalar(u8, pat, '/')) |idx| {
|
|
tail.pattern = main.allocator.dupeZ(u8, pat[0..idx]) catch unreachable;
|
|
tail.isdir = true;
|
|
tail.isliteral = isLiteral(tail.pattern);
|
|
pat = pat[idx+1..];
|
|
if (std.mem.allEqual(u8, pat, '/')) return top;
|
|
|
|
const next = main.allocator.create(Pattern) catch unreachable;
|
|
tail.sub = next;
|
|
tail = next;
|
|
tail.sub = null;
|
|
}
|
|
tail.pattern = main.allocator.dupeZ(u8, pat) catch unreachable;
|
|
tail.isdir = false;
|
|
tail.isliteral = isLiteral(tail.pattern);
|
|
return top;
|
|
}
|
|
};
|
|
|
|
test "parse" {
|
|
const t1 = Pattern.parse("");
|
|
try std.testing.expectEqualStrings(t1.pattern, "");
|
|
try std.testing.expectEqual(t1.isdir, false);
|
|
try std.testing.expectEqual(t1.isliteral, true);
|
|
try std.testing.expectEqual(t1.sub, null);
|
|
|
|
const t2 = Pattern.parse("//a//");
|
|
try std.testing.expectEqualStrings(t2.pattern, "a");
|
|
try std.testing.expectEqual(t2.isdir, true);
|
|
try std.testing.expectEqual(t2.isliteral, true);
|
|
try std.testing.expectEqual(t2.sub, null);
|
|
|
|
const t3 = Pattern.parse("foo*/bar.zig");
|
|
try std.testing.expectEqualStrings(t3.pattern, "foo*");
|
|
try std.testing.expectEqual(t3.isdir, true);
|
|
try std.testing.expectEqual(t3.isliteral, false);
|
|
try std.testing.expectEqualStrings(t3.sub.?.pattern, "bar.zig");
|
|
try std.testing.expectEqual(t3.sub.?.isdir, false);
|
|
try std.testing.expectEqual(t3.sub.?.isliteral, true);
|
|
try std.testing.expectEqual(t3.sub.?.sub, null);
|
|
|
|
const t4 = Pattern.parse("/?/sub/dir/");
|
|
try std.testing.expectEqualStrings(t4.pattern, "?");
|
|
try std.testing.expectEqual(t4.isdir, true);
|
|
try std.testing.expectEqual(t4.isliteral, false);
|
|
try std.testing.expectEqualStrings(t4.sub.?.pattern, "sub");
|
|
try std.testing.expectEqual(t4.sub.?.isdir, true);
|
|
try std.testing.expectEqual(t4.sub.?.isliteral, true);
|
|
try std.testing.expectEqualStrings(t4.sub.?.sub.?.pattern, "dir");
|
|
try std.testing.expectEqual(t4.sub.?.sub.?.isdir, true);
|
|
try std.testing.expectEqual(t4.sub.?.sub.?.isliteral, true);
|
|
try std.testing.expectEqual(t4.sub.?.sub.?.sub, null);
|
|
}
|
|
|
|
|
|
// List of patterns to be matched at one particular level.
|
|
// There are 2 different types of lists: those where all patterns have a
|
|
// sub-pointer (where the pattern only matches directories at this level, and
|
|
// the match result is only used to construct the PatternList of the
|
|
// subdirectory) and patterns without a sub-pointer (where the match result
|
|
// determines whether the file/dir at this level should be included or not).
|
|
fn PatternList(comptime withsub: bool) type {
|
|
return struct {
|
|
literals: std.HashMapUnmanaged(*const Pattern, Val, Ctx, 80) = .{},
|
|
wild: std.ArrayListUnmanaged(*const Pattern) = .{},
|
|
|
|
// Not a fan of the map-of-arrays approach in the 'withsub' case, it
|
|
// has a lot of extra allocations. Linking the Patterns together in a
|
|
// list would be nicer, but that involves mutable Patterns, which in
|
|
// turn prevents multithreaded scanning. An alternative would be a
|
|
// sorted array + binary search, but that slows down lookups. Perhaps a
|
|
// custom hashmap with support for duplicate keys?
|
|
const Val = if (withsub) std.ArrayListUnmanaged(*const Pattern) else void;
|
|
|
|
const Ctx = struct {
|
|
pub fn hash(_: Ctx, p: *const Pattern) u64 {
|
|
return std.hash.Wyhash.hash(0, p.pattern);
|
|
}
|
|
pub fn eql(_: Ctx, a: *const Pattern, b: *const Pattern) bool {
|
|
return std.mem.eql(u8, a.pattern, b.pattern);
|
|
}
|
|
};
|
|
|
|
const Self = @This();
|
|
|
|
fn append(self: *Self, pat: *const Pattern) void {
|
|
std.debug.assert((pat.sub != null) == withsub);
|
|
if (pat.isliteral) {
|
|
var e = self.literals.getOrPut(main.allocator, pat) catch unreachable;
|
|
if (!e.found_existing) {
|
|
e.key_ptr.* = pat;
|
|
e.value_ptr.* = if (withsub) .{} else {};
|
|
}
|
|
if (!withsub and !pat.isdir and e.key_ptr.*.isdir) e.key_ptr.* = pat;
|
|
if (withsub) {
|
|
if (pat.sub) |s| e.value_ptr.*.append(main.allocator, s) catch unreachable;
|
|
}
|
|
|
|
} else self.wild.append(main.allocator, pat) catch unreachable;
|
|
}
|
|
|
|
fn match(self: *const Self, name: [:0]const u8) ?bool {
|
|
var ret: ?bool = null;
|
|
if (self.literals.getKey(&.{ .pattern = name })) |p| ret = p.isdir;
|
|
for (self.wild.items) |p| {
|
|
if (ret == false) return ret;
|
|
if (c.fnmatch(p.pattern.ptr, name.ptr, 0) == 0) ret = p.isdir;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
fn enter(self: *const Self, out: *Patterns, name: [:0]const u8) void {
|
|
if (self.literals.get(&.{ .pattern = name })) |lst| for (lst.items) |sub| out.append(sub);
|
|
for (self.wild.items) |p| if (c.fnmatch(p.pattern.ptr, name.ptr, 0) == 0) out.append(p.sub.?);
|
|
}
|
|
|
|
fn deinit(self: *Self) void {
|
|
if (withsub) {
|
|
var it = self.literals.valueIterator();
|
|
while (it.next()) |e| e.deinit(main.allocator);
|
|
}
|
|
self.literals.deinit(main.allocator);
|
|
self.wild.deinit(main.allocator);
|
|
self.* = undefined;
|
|
}
|
|
};
|
|
}
|
|
|
|
// List of all patterns that should be matched at one level.
|
|
pub const Patterns = struct {
|
|
nonsub: PatternList(false) = .{},
|
|
sub: PatternList(true) = .{},
|
|
isroot: bool = false,
|
|
|
|
fn append(self: *Patterns, pat: *const Pattern) void {
|
|
if (pat.sub == null) self.nonsub.append(pat)
|
|
else self.sub.append(pat);
|
|
}
|
|
|
|
// Matches patterns in this level plus unanchored patterns.
|
|
// Returns null if nothing matches, otherwise whether the given item should
|
|
// only be exluced if it's a directory.
|
|
// (Should not be called on root_unanchored)
|
|
pub fn match(self: *const Patterns, name: [:0]const u8) ?bool {
|
|
const a = self.nonsub.match(name);
|
|
if (a == false) return false;
|
|
const b = root_unanchored.nonsub.match(name);
|
|
if (b == false) return false;
|
|
return a orelse b;
|
|
}
|
|
|
|
// Construct the list of patterns for a subdirectory.
|
|
pub fn enter(self: *const Patterns, name: [:0]const u8) Patterns {
|
|
var ret = Patterns{};
|
|
self.sub.enter(&ret, name);
|
|
root_unanchored.sub.enter(&ret, name);
|
|
return ret;
|
|
}
|
|
|
|
pub fn deinit(self: *Patterns) void {
|
|
// getPatterns() result should be deinit()ed, except when it returns the root,
|
|
// let's simplify that and simply don't deinit root.
|
|
if (self.isroot) return;
|
|
self.nonsub.deinit();
|
|
self.sub.deinit();
|
|
self.* = undefined;
|
|
}
|
|
};
|
|
|
|
// Unanchored patterns that should be checked at every level
|
|
var root_unanchored: Patterns = .{};
|
|
|
|
// Patterns anchored at the root
|
|
var root: Patterns = .{ .isroot = true };
|
|
|
|
pub fn addPattern(pattern: []const u8) void {
|
|
if (pattern.len == 0) return;
|
|
const p = Pattern.parse(pattern);
|
|
if (pattern[0] == '/') root.append(p)
|
|
else root_unanchored.append(p);
|
|
}
|
|
|
|
// Get the patterns for the given (absolute) path, assuming the given path
|
|
// itself hasn't been excluded. This function is slow, directory walking code
|
|
// should use Patterns.enter() instead.
|
|
pub fn getPatterns(path_: []const u8) Patterns {
|
|
var path = std.mem.trim(u8, path_, "/");
|
|
if (path.len == 0) return root;
|
|
var pat = root;
|
|
defer pat.deinit();
|
|
while (std.mem.indexOfScalar(u8, path, '/')) |idx| {
|
|
var name = main.allocator.dupeZ(u8, path[0..idx]) catch unreachable;
|
|
defer main.allocator.free(name);
|
|
path = path[idx+1..];
|
|
|
|
var sub = pat.enter(name);
|
|
pat.deinit();
|
|
pat = sub;
|
|
}
|
|
|
|
var name = main.allocator.dupeZ(u8, path) catch unreachable;
|
|
defer main.allocator.free(name);
|
|
return pat.enter(name);
|
|
}
|
|
|
|
|
|
fn testfoo(p: *const Patterns) !void {
|
|
try std.testing.expectEqual(p.match("root"), null);
|
|
try std.testing.expectEqual(p.match("bar"), false);
|
|
try std.testing.expectEqual(p.match("qoo"), false);
|
|
try std.testing.expectEqual(p.match("xyz"), false);
|
|
try std.testing.expectEqual(p.match("okay"), null);
|
|
try std.testing.expectEqual(p.match("somefile"), false);
|
|
var s = p.enter("okay");
|
|
try std.testing.expectEqual(s.match("bar"), null);
|
|
try std.testing.expectEqual(s.match("xyz"), null);
|
|
try std.testing.expectEqual(s.match("notokay"), false);
|
|
s.deinit();
|
|
}
|
|
|
|
test "Matching" {
|
|
addPattern("/foo/bar");
|
|
addPattern("/foo/qoo/");
|
|
addPattern("/foo/qoo");
|
|
addPattern("/foo/qoo/");
|
|
addPattern("/f??/xyz");
|
|
addPattern("/f??/xyz/");
|
|
addPattern("/*o/somefile");
|
|
addPattern("/a??/okay");
|
|
addPattern("/roo?");
|
|
addPattern("/root/");
|
|
addPattern("excluded");
|
|
addPattern("somefile/");
|
|
addPattern("o*y/not[o]kay");
|
|
|
|
var a0 = getPatterns("/");
|
|
try std.testing.expectEqual(a0.match("a"), null);
|
|
try std.testing.expectEqual(a0.match("excluded"), false);
|
|
try std.testing.expectEqual(a0.match("somefile"), true);
|
|
try std.testing.expectEqual(a0.match("root"), false);
|
|
var a1 = a0.enter("foo");
|
|
a0.deinit();
|
|
try testfoo(&a1);
|
|
a1.deinit();
|
|
|
|
var b0 = getPatterns("/somedir/somewhere");
|
|
try std.testing.expectEqual(b0.match("a"), null);
|
|
try std.testing.expectEqual(b0.match("excluded"), false);
|
|
try std.testing.expectEqual(b0.match("root"), null);
|
|
try std.testing.expectEqual(b0.match("okay"), null);
|
|
var b1 = b0.enter("okay");
|
|
b0.deinit();
|
|
try std.testing.expectEqual(b1.match("excluded"), false);
|
|
try std.testing.expectEqual(b1.match("okay"), null);
|
|
try std.testing.expectEqual(b1.match("notokay"), false);
|
|
b1.deinit();
|
|
|
|
var c0 = getPatterns("/foo/");
|
|
try testfoo(&c0);
|
|
c0.deinit();
|
|
}
|