Skip to content

Commit 6bbd789

Browse files
libailinlihongwei
authored and
lihongwei
committed
[Feature-#1918][s3] Add support for reading all types of documents supported by Apache Tika, read excel format
1 parent 112f183 commit 6bbd789

File tree

40 files changed

+5600
-28
lines changed

40 files changed

+5600
-28
lines changed

chunjun-connectors/chunjun-connector-s3/pom.xml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,15 @@
7070
</dependency>
7171
<dependency>
7272
<groupId>com.dtstack.chunjun</groupId>
73-
<artifactId>chunjun-connector-format-base</artifactId>
73+
<artifactId>chunjun-format-tika</artifactId>
7474
<version>${project.version}</version>
75+
<scope>provided</scope>
76+
</dependency>
77+
<dependency>
78+
<groupId>com.dtstack.chunjun</groupId>
79+
<artifactId>chunjun-format-excel</artifactId>
80+
<version>${project.version}</version>
81+
<scope>provided</scope>
7582
</dependency>
7683
</dependencies>
7784
<build>

chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/config/S3Config.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
package com.dtstack.chunjun.connector.s3.config;
2020

2121
import com.dtstack.chunjun.config.CommonConfig;
22-
import com.dtstack.chunjun.connector.format.base.config.TikaReadConfig;
22+
import com.dtstack.chunjun.format.excel.config.ExcelFormatConfig;
23+
import com.dtstack.chunjun.format.tika.config.TikaReadConfig;
2324

2425
import com.amazonaws.regions.Regions;
2526
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
@@ -104,4 +105,6 @@ public class S3Config extends CommonConfig implements Serializable {
104105
private boolean disableBucketNameInEndpoint = false;
105106

106107
private TikaReadConfig tikaReadConfig = new TikaReadConfig();
108+
109+
private ExcelFormatConfig excelFormatConfig = new ExcelFormatConfig();
107110
}

chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/sink/S3OutputFormat.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
import java.util.UUID;
4848
import java.util.stream.Collectors;
4949

50-
import static com.dtstack.chunjun.connector.format.base.config.TikaReadConfig.ORIGINAL_FILENAME;
50+
import static com.dtstack.chunjun.format.tika.config.TikaReadConfig.ORIGINAL_FILENAME;
5151

5252
/** The OutputFormat Implementation which write data to Amazon S3. */
5353
@Slf4j

chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/source/S3DynamicTableSource.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,13 @@ public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderCon
6666
field.setName(column.getName());
6767
field.setType(
6868
TypeConfig.fromString(column.getDataType().getLogicalType().asSummaryString()));
69-
field.setIndex(i);
69+
int index =
70+
s3Config.getExcelFormatConfig().getColumnIndex() != null
71+
? s3Config.getExcelFormatConfig()
72+
.getColumnIndex()
73+
.get(columns.indexOf(column))
74+
: columns.indexOf(column);
75+
field.setIndex(index);
7076
columnList.add(field);
7177
}
7278
s3Config.setColumn(columnList);

chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/source/S3InputFormat.java

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,17 @@
1818

1919
package com.dtstack.chunjun.connector.s3.source;
2020

21+
import com.dtstack.chunjun.config.FieldConfig;
2122
import com.dtstack.chunjun.config.RestoreConfig;
22-
import com.dtstack.chunjun.connector.format.base.common.TikaData;
23-
import com.dtstack.chunjun.connector.format.base.source.TikaInputFormat;
2423
import com.dtstack.chunjun.connector.s3.config.S3Config;
2524
import com.dtstack.chunjun.connector.s3.enums.CompressType;
2625
import com.dtstack.chunjun.connector.s3.util.ReaderUtil;
2726
import com.dtstack.chunjun.connector.s3.util.S3SimpleObject;
2827
import com.dtstack.chunjun.connector.s3.util.S3Util;
28+
import com.dtstack.chunjun.format.excel.common.ExcelData;
29+
import com.dtstack.chunjun.format.excel.source.ExcelInputFormat;
30+
import com.dtstack.chunjun.format.tika.common.TikaData;
31+
import com.dtstack.chunjun.format.tika.source.TikaInputFormat;
2932
import com.dtstack.chunjun.restore.FormatState;
3033
import com.dtstack.chunjun.source.format.BaseRichInputFormat;
3134
import com.dtstack.chunjun.throwable.ChunJunRuntimeException;
@@ -40,6 +43,7 @@
4043
import com.amazonaws.services.s3.model.S3Object;
4144
import com.amazonaws.services.s3.model.S3ObjectInputStream;
4245
import lombok.extern.slf4j.Slf4j;
46+
import org.apache.commons.collections.CollectionUtils;
4347
import org.apache.commons.io.FilenameUtils;
4448
import org.apache.commons.lang3.StringUtils;
4549

@@ -77,6 +81,9 @@ public class S3InputFormat extends BaseRichInputFormat {
7781
private transient TikaData tikaData;
7882
private TikaInputFormat tikaInputFormat;
7983

84+
private transient ExcelData excelData;
85+
private ExcelInputFormat excelInputFormat;
86+
8087
@Override
8188
public void openInputFormat() throws IOException {
8289
super.openInputFormat();
@@ -143,10 +150,31 @@ protected InputSplit[] createInputSplitsInternal(int minNumSplits) {
143150
protected RowData nextRecordInternal(RowData rowData) throws ReadRecordException {
144151
String[] fields;
145152
try {
146-
fields =
147-
s3Config.getTikaReadConfig().isUseExtract() && tikaData != null
148-
? tikaData.getData()
149-
: readerUtil.getValues();
153+
if (s3Config.getTikaReadConfig().isUseExtract() && tikaData != null) {
154+
fields = tikaData.getData();
155+
} else if (s3Config.getExcelFormatConfig().isUseExcelFormat() && excelData != null) {
156+
fields = excelData.getData();
157+
} else {
158+
fields = readerUtil.getValues();
159+
}
160+
// 处理字段配置了对应的列索引
161+
if (s3Config.getExcelFormatConfig().getColumnIndex() != null) {
162+
List<FieldConfig> columns = s3Config.getColumn();
163+
String[] fieldsData = new String[columns.size()];
164+
for (int i = 0; i < CollectionUtils.size(columns); i++) {
165+
FieldConfig fieldConfig = columns.get(i);
166+
if (fieldConfig.getIndex() >= fields.length) {
167+
String errorMessage =
168+
String.format(
169+
"The column index is greater than the data size."
170+
+ " The current column index is [%s], but the data size is [%s]. Data loss may occur.",
171+
fieldConfig.getIndex(), fields.length);
172+
throw new IllegalArgumentException(errorMessage);
173+
}
174+
fieldsData[i] = fields[fieldConfig.getIndex()];
175+
}
176+
fields = fieldsData;
177+
}
150178
rowData = rowConverter.toInternal(fields);
151179
} catch (IOException e) {
152180
throw new ChunJunRuntimeException(e);
@@ -176,10 +204,45 @@ public boolean reachedEnd() throws IOException {
176204
if (s3Config.getTikaReadConfig().isUseExtract()) {
177205
tikaData = getTikaData();
178206
return tikaData == null || tikaData.getData() == null;
207+
} else if (s3Config.getExcelFormatConfig().isUseExcelFormat()) {
208+
excelData = getExcelData();
209+
return excelData == null || excelData.getData() == null;
179210
}
180211
return reachedEndWithoutCheckState();
181212
}
182213

214+
public ExcelData getExcelData() {
215+
if (excelInputFormat == null) {
216+
nextExcelDataStream();
217+
}
218+
if (excelInputFormat != null) {
219+
if (!excelInputFormat.hasNext()) {
220+
excelInputFormat.close();
221+
excelInputFormat = null;
222+
return getExcelData();
223+
}
224+
String[] record = excelInputFormat.nextRecord();
225+
return new ExcelData(record);
226+
} else {
227+
return null;
228+
}
229+
}
230+
231+
private void nextExcelDataStream() {
232+
if (splits.hasNext()) {
233+
currentObject = splits.next();
234+
GetObjectRequest rangeObjectRequest =
235+
new GetObjectRequest(s3Config.getBucket(), currentObject);
236+
log.info("Current read file {}", currentObject);
237+
S3Object o = amazonS3.getObject(rangeObjectRequest);
238+
S3ObjectInputStream s3is = o.getObjectContent();
239+
excelInputFormat = new ExcelInputFormat();
240+
excelInputFormat.open(s3is, s3Config.getExcelFormatConfig());
241+
} else {
242+
excelInputFormat = null;
243+
}
244+
}
245+
183246
public TikaData getTikaData() {
184247
if (tikaInputFormat == null) {
185248
nextTikaDataStream();

chunjun-connectors/chunjun-connector-s3/src/main/java/com/dtstack/chunjun/connector/s3/table/S3DynamicTableFactory.java

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,27 +18,36 @@
1818

1919
package com.dtstack.chunjun.connector.s3.table;
2020

21-
import com.dtstack.chunjun.connector.format.base.config.TikaReadConfig;
22-
import com.dtstack.chunjun.connector.format.base.options.TikaOptions;
2321
import com.dtstack.chunjun.connector.s3.config.S3Config;
2422
import com.dtstack.chunjun.connector.s3.sink.S3DynamicTableSink;
2523
import com.dtstack.chunjun.connector.s3.source.S3DynamicTableSource;
2624
import com.dtstack.chunjun.connector.s3.table.options.S3Options;
25+
import com.dtstack.chunjun.format.excel.config.ExcelFormatConfig;
26+
import com.dtstack.chunjun.format.excel.options.ExcelFormatOptions;
27+
import com.dtstack.chunjun.format.tika.config.TikaReadConfig;
28+
import com.dtstack.chunjun.format.tika.options.TikaOptions;
2729
import com.dtstack.chunjun.table.options.SinkOptions;
2830
import com.dtstack.chunjun.util.GsonUtil;
2931

3032
import org.apache.flink.configuration.ConfigOption;
3133
import org.apache.flink.configuration.ReadableConfig;
34+
import org.apache.flink.table.catalog.Column;
35+
import org.apache.flink.table.catalog.ResolvedSchema;
3236
import org.apache.flink.table.connector.sink.DynamicTableSink;
3337
import org.apache.flink.table.connector.source.DynamicTableSource;
3438
import org.apache.flink.table.factories.DynamicTableSinkFactory;
3539
import org.apache.flink.table.factories.DynamicTableSourceFactory;
3640
import org.apache.flink.table.factories.FactoryUtil;
3741

3842
import com.google.common.collect.Sets;
43+
import org.apache.commons.lang3.StringUtils;
3944

4045
import java.util.ArrayList;
46+
import java.util.Arrays;
47+
import java.util.List;
4148
import java.util.Set;
49+
import java.util.stream.Collectors;
50+
import java.util.stream.IntStream;
4251

4352
public class S3DynamicTableFactory implements DynamicTableSourceFactory, DynamicTableSinkFactory {
4453
private static final String IDENTIFIER = "s3-x";
@@ -71,7 +80,38 @@ public DynamicTableSource createDynamicTableSource(Context context) {
7180
tikaReadConfig.setOverlapRatio(options.get(TikaOptions.OVERLAP_RATIO));
7281
tikaReadConfig.setChunkSize(options.get(TikaOptions.CHUNK_SIZE));
7382
s3Config.setTikaReadConfig(tikaReadConfig);
74-
return new S3DynamicTableSource(context.getCatalogTable().getResolvedSchema(), s3Config);
83+
ResolvedSchema resolvedSchema = context.getCatalogTable().getResolvedSchema();
84+
List<Column> columns = resolvedSchema.getColumns();
85+
ExcelFormatConfig excelFormatConfig = new ExcelFormatConfig();
86+
excelFormatConfig.setUseExcelFormat(options.get(ExcelFormatOptions.USE_EXCEL_FORMAT));
87+
excelFormatConfig.setFirstLineHeader(options.get(S3Options.IS_FIRST_LINE_HEADER));
88+
if (StringUtils.isNotBlank(options.get(ExcelFormatOptions.SHEET_NO))) {
89+
List<Integer> sheetNo =
90+
Arrays.stream(options.get(ExcelFormatOptions.SHEET_NO).split(","))
91+
.map(Integer::parseInt)
92+
.collect(Collectors.toList());
93+
excelFormatConfig.setSheetNo(sheetNo);
94+
}
95+
if (StringUtils.isNotBlank(options.get(ExcelFormatOptions.COLUMN_INDEX))) {
96+
List<Integer> columnIndex =
97+
Arrays.stream(options.get(ExcelFormatOptions.COLUMN_INDEX).split(","))
98+
.map(Integer::parseInt)
99+
.collect(Collectors.toList());
100+
excelFormatConfig.setColumnIndex(columnIndex);
101+
}
102+
final String[] fields = new String[columns.size()];
103+
IntStream.range(0, fields.length).forEach(i -> fields[i] = columns.get(i).getName());
104+
excelFormatConfig.setFields(fields);
105+
s3Config.setExcelFormatConfig(excelFormatConfig);
106+
if (s3Config.getExcelFormatConfig().getColumnIndex() != null
107+
&& columns.size() != s3Config.getExcelFormatConfig().getColumnIndex().size()) {
108+
throw new IllegalArgumentException(
109+
String.format(
110+
"The number of fields (%s) is inconsistent with the number of indexes (%s).",
111+
columns.size(),
112+
s3Config.getExcelFormatConfig().getColumnIndex().size()));
113+
}
114+
return new S3DynamicTableSource(resolvedSchema, s3Config);
75115
}
76116

77117
@Override
@@ -112,6 +152,9 @@ public Set<ConfigOption<?>> optionalOptions() {
112152
options.add(TikaOptions.USE_EXTRACT);
113153
options.add(TikaOptions.CHUNK_SIZE);
114154
options.add(TikaOptions.OVERLAP_RATIO);
155+
options.add(ExcelFormatOptions.SHEET_NO);
156+
options.add(ExcelFormatOptions.COLUMN_INDEX);
157+
options.add(ExcelFormatOptions.USE_EXCEL_FORMAT);
115158
return options;
116159
}
117160

chunjun-connectors/pom.xml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@
110110
<module>chunjun-connector-nebula</module>
111111
<module>chunjun-connector-kingbase</module>
112112
<module>chunjun-connector-hudi</module>
113-
<module>chunjun-connector-format-base</module>
114113
</modules>
115114

116115
<dependencies>
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!--
3+
Licensed to the Apache Software Foundation (ASF) under one
4+
or more contributor license agreements. See the NOTICE file
5+
distributed with this work for additional information
6+
regarding copyright ownership. The ASF licenses this file
7+
to you under the Apache License, Version 2.0 (the
8+
"License"); you may not use this file except in compliance
9+
with the License. You may obtain a copy of the License at
10+
11+
http://www.apache.org/licenses/LICENSE-2.0
12+
13+
Unless required by applicable law or agreed to in writing,
14+
software distributed under the License is distributed on an
15+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
KIND, either express or implied. See the License for the
17+
specific language governing permissions and limitations
18+
under the License.
19+
-->
20+
21+
<project xmlns="http://maven.apache.org/POM/4.0.0"
22+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
23+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
24+
<modelVersion>4.0.0</modelVersion>
25+
<parent>
26+
<groupId>com.dtstack.chunjun</groupId>
27+
<artifactId>chunjun-formats</artifactId>
28+
<version>${revision}</version>
29+
</parent>
30+
31+
<artifactId>chunjun-format-excel</artifactId>
32+
<name>ChunJun : Formats : Excel</name>
33+
34+
<properties>
35+
<format.dir>excel</format.dir>
36+
</properties>
37+
38+
<dependencies>
39+
<dependency>
40+
<groupId>com.alibaba</groupId>
41+
<artifactId>easyexcel</artifactId>
42+
<version>3.2.0</version>
43+
</dependency>
44+
</dependencies>
45+
46+
<build>
47+
<plugins>
48+
<plugin>
49+
<groupId>org.apache.maven.plugins</groupId>
50+
<artifactId>maven-shade-plugin</artifactId>
51+
</plugin>
52+
<plugin>
53+
<groupId>org.apache.maven.plugins</groupId>
54+
<artifactId>maven-antrun-plugin</artifactId>
55+
</plugin>
56+
</plugins>
57+
</build>
58+
59+
</project>

0 commit comments

Comments
 (0)