Skip to content

Commit 29ba746

Browse files
fix: Resolve mismatch with Thrift compact protocol
The [Thrift compact protocol](https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md) is used for Parquet file metadata. [parquet-format-safe](https://github.com/jorgecarleitao/parquet-format-safe) and other Rust implementations of the protocol eagerly read string/binary fields as UTF-8. However, based on the protocol which states that > Strings are first encoded to UTF-8, and then send as binary it cannot be known upfront, without using the schema to disambiguate the field type, whether a field is a string or a binary. This means that when the field is actually a binary field and contains invalid UTF-8, Rust libraries error out when reading the field with `File out of specification: Invalid thrift: bad data`. To fix this, we patch the protocol implementation to correctly interpret string/binary fields as binary.
1 parent b499e46 commit 29ba746

6 files changed

Lines changed: 50 additions & 50 deletions

File tree

src/parquet_format.rs

Lines changed: 42 additions & 42 deletions
Large diffs are not rendered by default.

src/thrift/errors.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ impl Error {
133133
name: "TApplicationException".to_owned(),
134134
})?;
135135

136-
let message_field = TFieldIdentifier::new("message", TType::String, 1);
136+
let message_field = TFieldIdentifier::new("message", TType::Binary, 1);
137137
let type_field = TFieldIdentifier::new("type", TType::I32, 2);
138138

139139
o.write_field_begin(&message_field)?;

src/thrift/protocol/compact.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@ pub(super) fn u8_to_type(b: u8) -> Result<TType> {
324324
0x05 => Ok(TType::I32),
325325
0x06 => Ok(TType::I64),
326326
0x07 => Ok(TType::Double),
327-
0x08 => Ok(TType::String),
327+
0x08 => Ok(TType::Binary),
328328
0x09 => Ok(TType::List),
329329
0x0A => Ok(TType::Set),
330330
0x0B => Ok(TType::Map),

src/thrift/protocol/compact_write.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ pub(super) fn type_to_u8(field_type: TType) -> u8 {
263263
TType::I32 => 0x05,
264264
TType::I64 => 0x06,
265265
TType::Double => 0x07,
266-
TType::String => 0x08,
266+
TType::Binary => 0x08,
267267
TType::List => 0x09,
268268
TType::Set => 0x0A,
269269
TType::Map => 0x0B,

src/thrift/protocol/mod.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ pub trait TInputProtocol: Sized {
162162
TType::I32 => self.read_i32().map(|_| ()),
163163
TType::I64 => self.read_i64().map(|_| ()),
164164
TType::Double => self.read_double().map(|_| ()),
165-
TType::String => self.read_string().map(|_| ()),
165+
TType::Binary => self.read_bytes().map(|_| ()),
166166
TType::Struct => {
167167
self.read_struct_begin()?;
168168
loop {
@@ -703,8 +703,8 @@ pub enum TType {
703703
I32,
704704
/// Signed 64-bit int.
705705
I64,
706-
/// UTF-8 string.
707-
String,
706+
/// Binary or UTF-8 string. Based on the Thrift compact protocol: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
707+
Binary,
708708
/// UTF-7 string. *Unsupported*.
709709
Utf7,
710710
/// Thrift struct.
@@ -732,7 +732,7 @@ impl Display for TType {
732732
TType::I16 => write!(f, "i16"),
733733
TType::I32 => write!(f, "i32"),
734734
TType::I64 => write!(f, "i64"),
735-
TType::String => write!(f, "string"),
735+
TType::Binary => write!(f, "binary"),
736736
TType::Utf7 => write!(f, "UTF7"),
737737
TType::Struct => write!(f, "struct"),
738738
TType::Map => write!(f, "map"),

src/thrift/protocol/stream.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ pub trait TInputStreamProtocol: Send + Sized {
148148
TType::I32 => self.read_i32().await.map(|_| ()),
149149
TType::I64 => self.read_i64().await.map(|_| ()),
150150
TType::Double => self.read_double().await.map(|_| ()),
151-
TType::String => self.read_string().await.map(|_| ()),
151+
TType::Binary => self.read_bytes().await.map(|_| ()),
152152
TType::Struct => {
153153
self.read_struct_begin().await?;
154154
loop {

0 commit comments

Comments
 (0)