From d848b8148bb3fdc984dd3cd9176fb24f29fc5d43 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 7 Sep 2022 12:05:58 -0400 Subject: [PATCH] Add a utf8() mode that allows byte/UTF-8 strings as input & output. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JSON::PP has a number of options that indicate a desire to facilitate different applications’ nonstandard needs. For example, latin1() caters to applications that use Latin-1 encoding rather than UTF-8, which violates the JSON specification. Some nontrivial Perl applications forgo character decoding. Their authors/maintainers may not know “perlunitut”’s recommended workflow, or the application may simply not care about Unicode. Either way, in such applications it’s ideal for a JSON encoder & decoder to forgo the usual UTF-8 decode/encode steps. utf8(0) almost achieves this. It falls over, though, if the JSON document contains a Unicode character escape (e.g., "\u00e9"), which JSON::PP decodes as Perl "\xe9". This causes an inconsistency in the decode logic: "é" in UTF-8 will yield a different result from "\u00e9". Ordinarily it works to do encode_utf8( JSON::PP->new->utf8->decode(..) ), but that falls over if applications need to allow non-UTF-8 sequences in JSON inputs. In short, a need exists for this Perl string: qq<"\xff\xc3\xa9\\u00e9"> … to decode to "\xff\xc3\xa9\xc3\xa9". This changeset adds a solution to this problem by changing utf8() from a simple flag to an enum: the existing chars-in-chars-out (0) and bytes-in-chars-out (1) options, plus a new bytes-in-bytes-out option (2). Named constants are added to avoid “magic numbers”. 
--- lib/JSON/PP.pm | 31 +++++++++++++++++++++++++++---- t/023_utf8_bytes.t | 19 +++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 t/023_utf8_bytes.t diff --git a/lib/JSON/PP.pm b/lib/JSON/PP.pm index 2a8b55a..97cdae0 100644 --- a/lib/JSON/PP.pm +++ b/lib/JSON/PP.pm @@ -48,6 +48,10 @@ use constant OLD_PERL => $] < 5.008 ? 1 : 0; use constant USE_B => $ENV{PERL_JSON_PP_USE_B} || 0; use constant CORE_BOOL => defined &builtin::is_bool; +use constant UTF8_CHARS => 0; +use constant UTF8_STANDARD => 1; +use constant UTF8_BYTES => 2; + my $invalid_char_re; BEGIN { @@ -67,7 +71,7 @@ BEGIN { BEGIN { my @xs_compati_bit_properties = qw( - latin1 ascii utf8 indent canonical space_before space_after allow_nonref shrink + latin1 ascii indent canonical space_before space_after allow_nonref shrink allow_blessed convert_blessed relaxed allow_unknown allow_tags ); @@ -109,7 +113,22 @@ BEGIN { } +sub utf8 { + my $value = defined $_[1] ? $_[1] : 1; + if ($value) { + $_[0]->{PROPS}->[P_UTF8] = ($value eq UTF8_BYTES) ? 
$value : UTF8_STANDARD; + } + else { + $_[0]->{PROPS}->[P_UTF8] = UTF8_CHARS; + } + + $_[0]; +} + +sub get_utf8 { + $_[0]->{PROPS}->[P_UTF8] || UTF8_CHARS; +} # Functions @@ -590,7 +609,7 @@ sub allow_bigint { $arg = JSON_PP_encode_latin1($arg); } - if ($utf8) { + if ($utf8 && $utf8 ne UTF8_BYTES) { utf8::encode($arg); } @@ -777,7 +796,7 @@ BEGIN { ($alt_true, $alt_false) = @$self{qw/true false/}; - if ( $utf8 ) { + if ( $utf8 && $utf8 ne UTF8_BYTES ) { $encoding = _detect_utf_encoding($text); if ($encoding ne 'UTF-8' and $encoding ne 'unknown') { require Encode; @@ -906,7 +925,11 @@ BEGIN { my $hex = hex( $u ); if ( chr $u =~ /[[:^ascii:]]/ ) { $is_utf8 = 1; - $s .= JSON_PP_decode_unicode($u) || next; + + my $char = JSON_PP_decode_unicode($u) || next; + + utf8::encode($char) if $utf8 && $utf8 eq UTF8_BYTES; + $s .= $char; } else { $s .= chr $hex; diff --git a/t/023_utf8_bytes.t b/t/023_utf8_bytes.t new file mode 100644 index 0000000..ff595d0 --- /dev/null +++ b/t/023_utf8_bytes.t @@ -0,0 +1,19 @@ +# copied over from JSON::XS and modified to use JSON::PP + +use strict; +use warnings; + +use Test::More; +plan tests => 1; + +use JSON::PP; + +my $source = qq<"\xff\xc3\xa9\\u00e9">; + +my $dec = JSON::PP->new()->utf8( JSON::PP::UTF8_BYTES ); + +is( + $dec->decode($source), + "\xff\xc3\xa9\xc3\xa9", + "utf8(UTF8_BYTES): expected decode", +);