From 232a4f874178a9abdf5b0248f41ced5d1b6fb255 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sun, 3 Nov 2024 10:36:49 +0100 Subject: [PATCH] JSON import: support reading escaped UTF-16 surrogate pairs Fixes #245 json/scanner.zig in std notes inconsistencies in the standard as to whether unpaired surrogate halves are allowed. That implementation disallows them and so does this commit. --- src/json_import.zig | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/json_import.zig b/src/json_import.zig index 958ca10..d119952 100644 --- a/src/json_import.zig +++ b/src/json_import.zig @@ -151,9 +151,16 @@ const Parser = struct { 'r' => if (n < buf.len) { buf[n] = 0xd; n += 1; }, 't' => if (n < buf.len) { buf[n] = 0x9; n += 1; }, 'u' => { - const char = (p.hexdig()<<12) + (p.hexdig()<<8) + (p.hexdig()<<4) + p.hexdig(); + const first = (p.hexdig()<<12) + (p.hexdig()<<8) + (p.hexdig()<<4) + p.hexdig(); + var unit = @as(u21, first); + if (std.unicode.utf16IsLowSurrogate(first)) p.die("Unexpected low surrogate"); + if (std.unicode.utf16IsHighSurrogate(first)) { + p.expectLit("\\u"); + const second = (p.hexdig()<<12) + (p.hexdig()<<8) + (p.hexdig()<<4) + p.hexdig(); + unit = std.unicode.utf16DecodeSurrogatePair(&.{first, second}) catch p.die("Invalid low surrogate"); + } if (n + 6 < buf.len) - n += std.unicode.utf8Encode(char, buf[n..n+5]) catch unreachable; + n += std.unicode.utf8Encode(unit, buf[n..n+5]) catch unreachable; }, else => p.die("invalid escape sequence"), },