Skip to content

Commit 720787a

Browse files
chliang71facebook-github-bot
authored andcommitted
Parquet reader should use parent type for nested types (facebookincubator#7774)
Summary: When traversing the Parquet metadata and evaluating a placeholder node (node created to represent "array" and "map") current implementation determines array(list) or map based on the number of children. But for some Parquet files, this is not necessarily accurate. A better check is to check on the parent of the placeholder node and determine the type of the parent. Pull Request resolved: facebookincubator#7774 Reviewed By: pedroerp Differential Revision: D56243812 Pulled By: Yuhta fbshipit-source-id: e7112e39271021006ad02914c4d1f128d1d38093
1 parent 841a9ba commit 720787a

File tree

4 files changed

+102
-4
lines changed

4 files changed

+102
-4
lines changed

velox/dwio/parquet/reader/ParquetReader.cpp

+17-4
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,10 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
279279
// value children.
280280
if (schema[parentSchemaIdx].converted_type ==
281281
thrift::ConvertedType::MAP) {
282+
// TODO: the group names need to be checked. According to the spec,
283+
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps
284+
// the name of the schema element being 'key_value' is
285+
// also an indication of this is a map type
282286
VELOX_CHECK_EQ(
283287
schemaElement.repetition_type,
284288
thrift::FieldRepetitionType::REPEATED);
@@ -331,10 +335,14 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
331335
} else {
332336
if (schemaElement.repetition_type ==
333337
thrift::FieldRepetitionType::REPEATED) {
334-
VELOX_CHECK_LE(
335-
children.size(), 2, "children size should not be larger than 2");
336-
if (children.size() == 1) {
338+
if (schema[parentSchemaIdx].converted_type ==
339+
thrift::ConvertedType::LIST) {
340+
// TODO: the group names need to be checked. According to spec,
341+
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
342+
// the name of the schema element being 'array' is
343+
// also an indication of this is a list type
337344
// child of LIST
345+
VELOX_CHECK_GE(children.size(), 1);
338346
auto type = TypeFactory<TypeKind::ARRAY>::create(children[0]->type());
339347
return std::make_unique<ParquetTypeWithId>(
340348
std::move(type),
@@ -347,8 +355,13 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
347355
std::nullopt,
348356
maxRepeat,
349357
maxDefine);
350-
} else if (children.size() == 2) {
358+
} else if (
359+
schema[parentSchemaIdx].converted_type ==
360+
thrift::ConvertedType::MAP ||
361+
schema[parentSchemaIdx].converted_type ==
362+
thrift::ConvertedType::MAP_KEY_VALUE) {
351363
// children of MAP
364+
VELOX_CHECK_EQ(children.size(), 2);
352365
auto type = TypeFactory<TypeKind::MAP>::create(
353366
children[0]->type(), children[1]->type());
354367
return std::make_unique<ParquetTypeWithId>(
Binary file not shown.
Binary file not shown.

velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp

+85
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,91 @@ TEST_F(ParquetReaderTest, parseSample) {
9797
sampleSchema(), *rowReader, expected, *leafPool_);
9898
}
9999

100+
TEST_F(ParquetReaderTest, parseUnannotatedList) {
101+
// unannotated_list.parquet has the following the schema
102+
// the list is defined without the middle layer
103+
// message ParquetSchema {
104+
// optional group self (LIST) {
105+
// repeated group self_tuple {
106+
// optional int64 a;
107+
// optional boolean b;
108+
// required binary c (STRING);
109+
// }
110+
// }
111+
// }
112+
const std::string sample(getExampleFilePath("unannotated_list.parquet"));
113+
114+
facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
115+
auto reader = createReader(sample, readerOpts);
116+
117+
EXPECT_EQ(reader->numberOfRows(), 22ULL);
118+
119+
auto type = reader->typeWithId();
120+
EXPECT_EQ(type->size(), 1ULL);
121+
auto col0 = type->childAt(0);
122+
EXPECT_EQ(col0->type()->kind(), TypeKind::ARRAY);
123+
EXPECT_EQ(
124+
std::static_pointer_cast<const ParquetTypeWithId>(col0)->name_, "self");
125+
126+
EXPECT_EQ(col0->size(), 3ULL);
127+
EXPECT_EQ(col0->childAt(0)->type()->kind(), TypeKind::BIGINT);
128+
EXPECT_EQ(
129+
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(0))
130+
->name_,
131+
"a");
132+
133+
EXPECT_EQ(col0->childAt(1)->type()->kind(), TypeKind::BOOLEAN);
134+
EXPECT_EQ(
135+
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(1))
136+
->name_,
137+
"b");
138+
139+
EXPECT_EQ(col0->childAt(2)->type()->kind(), TypeKind::VARCHAR);
140+
EXPECT_EQ(
141+
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(2))
142+
->name_,
143+
"c");
144+
}
145+
146+
TEST_F(ParquetReaderTest, parseUnannotatedMap) {
147+
// unannotated_map.parquet has the following the schema
148+
// the map is defined with a MAP_KEY_VALUE node
149+
// message hive_schema {
150+
// optional group test (MAP) {
151+
// repeated group key_value (MAP_KEY_VALUE) {
152+
// required binary key (STRING);
153+
// optional int64 value;
154+
// }
155+
// }
156+
//}
157+
const std::string filename("unnotated_map.parquet");
158+
const std::string sample(getExampleFilePath(filename));
159+
160+
facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
161+
auto reader = createReader(sample, readerOptions);
162+
auto numRows = reader->numberOfRows();
163+
164+
auto type = reader->typeWithId();
165+
EXPECT_EQ(type->size(), 1ULL);
166+
auto col0 = type->childAt(0);
167+
EXPECT_EQ(col0->type()->kind(), TypeKind::MAP);
168+
EXPECT_EQ(
169+
std::static_pointer_cast<const ParquetTypeWithId>(col0)->name_, "test");
170+
171+
EXPECT_EQ(col0->size(), 2ULL);
172+
EXPECT_EQ(col0->childAt(0)->type()->kind(), TypeKind::VARCHAR);
173+
EXPECT_EQ(
174+
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(0))
175+
->name_,
176+
"key");
177+
178+
EXPECT_EQ(col0->childAt(1)->type()->kind(), TypeKind::BIGINT);
179+
EXPECT_EQ(
180+
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(1))
181+
->name_,
182+
"value");
183+
}
184+
100185
TEST_F(ParquetReaderTest, parseSampleRange1) {
101186
const std::string sample(getExampleFilePath("sample.parquet"));
102187

0 commit comments

Comments
 (0)