From d848b8148bb3fdc984dd3cd9176fb24f29fc5d43 Mon Sep 17 00:00:00 2001
From: Felipe Gasper <felipe@felipegasper.com>
Date: Wed, 7 Sep 2022 12:05:58 -0400
Subject: [PATCH] Add a utf8() mode that allows byte/UTF-8 strings as input &
 output.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

JSON::PP has a number of options that indicate a desire to facilitate
different applications’ nonstandard needs. For example, latin1() caters
to applications that use Latin-1 encoding rather than UTF-8, which
violates the JSON specification.

Some nontrivial Perl applications forgo character decoding. Their
authors/maintainers may not know “perlunitut”’s recommended workflow,
or the application may simply not care about Unicode. Either way, in
such applications it’s ideal for a JSON encoder & decoder to forgo
the usual UTF-8 decode/encode steps.

utf8(0) almost achieves this. It falls over, though, if the JSON
document contains a Unicode character escape (e.g., "\u00e9"), which
JSON::PP decodes as Perl "\xe9". This causes an inconsistency in the
decode logic: "é" in UTF-8 will yield a different result from "\u00e9".

Ordinarily it works to do encode_utf8( JSON::PP->new->utf8->decode(..) ),
but that falls over if applications need to allow non-UTF-8 sequences
in JSON inputs.

In short, a need exists for this Perl string:

    qq<"\xff\xc3\xa9\\u00e9">

… to decode to "\xff\xc3\xa9\xc3\xa9".

This changeset solves this problem by changing utf8() from a simple
flag to an enum: the existing chars-in-chars-out (0) and
bytes-in-chars-out (1) options, plus a new bytes-in-bytes-out (2) option.
Named constants (UTF8_CHARS, UTF8_STANDARD, UTF8_BYTES) are added to avoid “magic numbers”.
---
 lib/JSON/PP.pm     | 31 +++++++++++++++++++++++++++----
 t/023_utf8_bytes.t | 19 +++++++++++++++++++
 2 files changed, 46 insertions(+), 4 deletions(-)
 create mode 100644 t/023_utf8_bytes.t

diff --git a/lib/JSON/PP.pm b/lib/JSON/PP.pm
index 2a8b55a..97cdae0 100644
--- a/lib/JSON/PP.pm
+++ b/lib/JSON/PP.pm
@@ -48,6 +48,10 @@ use constant OLD_PERL => $] < 5.008 ? 1 : 0;
 use constant USE_B => $ENV{PERL_JSON_PP_USE_B} || 0;
 use constant CORE_BOOL => defined &builtin::is_bool;
 
+use constant UTF8_CHARS => 0;    # utf8() mode: characters in, characters out
+use constant UTF8_STANDARD => 1;    # utf8() mode: bytes in, characters out
+use constant UTF8_BYTES => 2;    # utf8() mode: bytes in, bytes out
+
 my $invalid_char_re;
 
 BEGIN {
@@ -67,7 +71,7 @@ BEGIN {
 
 BEGIN {
     my @xs_compati_bit_properties = qw(
-            latin1 ascii utf8 indent canonical space_before space_after allow_nonref shrink
+            latin1 ascii indent canonical space_before space_after allow_nonref shrink
             allow_blessed convert_blessed relaxed allow_unknown
             allow_tags
     );
@@ -109,7 +113,22 @@ BEGIN {
 
 }
 
+sub utf8 {    # Chainable setter for the UTF-8 mode; stores one of the UTF8_* values.
+    my ($self, $mode) = @_;
 
+    # No argument (or undef) means "enable", i.e. UTF8_STANDARD.
+    $mode = UTF8_STANDARD unless defined $mode;
+    # False turns the mode off; any true value other than UTF8_BYTES
+    # is normalized to UTF8_STANDARD.
+    $self->{PROPS}->[P_UTF8] =
+          !$mode                   ? UTF8_CHARS
+        : ($mode eq UTF8_BYTES)    ? $mode : UTF8_STANDARD;
+    return $self;
+}
+
+sub get_utf8 {    # Getter: the stored UTF-8 mode, or UTF8_CHARS when unset/false.
+    return $_[0]{PROPS}[P_UTF8] || UTF8_CHARS;
+}
 
 # Functions
 
@@ -590,7 +609,7 @@ sub allow_bigint {
             $arg = JSON_PP_encode_latin1($arg);
         }
 
-        if ($utf8) {
+        if ($utf8 && $utf8 ne UTF8_BYTES) {
             utf8::encode($arg);
         }
 
@@ -777,7 +796,7 @@ BEGIN {
 
         ($alt_true, $alt_false) = @$self{qw/true false/};
 
-        if ( $utf8 ) {
+        if ( $utf8 && $utf8 ne UTF8_BYTES ) {
             $encoding = _detect_utf_encoding($text);
             if ($encoding ne 'UTF-8' and $encoding ne 'unknown') {
                 require Encode;
@@ -906,7 +925,11 @@ BEGIN {
                             my $hex = hex( $u );
                             if ( chr $u =~ /[[:^ascii:]]/ ) {
                                 $is_utf8 = 1;
-                                $s .= JSON_PP_decode_unicode($u) || next;
+
+                                my $char = JSON_PP_decode_unicode($u) || next;
+                                # Test truthiness before comparing, like the other UTF8_BYTES checks, so an unset mode never reaches `eq`.
+                                utf8::encode($char) if $utf8 && $utf8 eq UTF8_BYTES;
+                                $s .= $char;
                             }
                             else {
                                 $s .= chr $hex;
diff --git a/t/023_utf8_bytes.t b/t/023_utf8_bytes.t
new file mode 100644
index 0000000..ff595d0
--- /dev/null
+++ b/t/023_utf8_bytes.t
@@ -0,0 +1,19 @@
+# copied over from JSON::XS and modified to use JSON::PP
+
+use strict;
+use warnings;
+
+use Test::More tests => 1;
+use JSON::PP;
+
+# Input mixes a byte that is never valid UTF-8 (\xff), a literal UTF-8 "é", and a \u00e9 escape.
+my $json_text = qq<"\xff\xc3\xa9\\u00e9">;
+
+# In UTF8_BYTES mode the literal bytes are expected back unchanged and the
+# \u00e9 escape is expected back as the UTF-8 bytes \xc3\xa9.
+my $expected = "\xff\xc3\xa9\xc3\xa9";
+
+my $decoder = JSON::PP->new();
+$decoder->utf8( JSON::PP::UTF8_BYTES );
+
+is( $decoder->decode($json_text), $expected, "utf8(UTF8_BYTES): expected decode" );