Skip to content

Commit 7480812

Browse files
feat: Added native support to read/write parquet files from GCS URIs (deephaven#6007)
Closes deephaven#5999
1 parent 483a72f commit 7480812

File tree

6 files changed

+213
-7
lines changed

6 files changed

+213
-7
lines changed

engine/table/src/main/java/io/deephaven/engine/table/impl/locations/local/URIStreamKeyValuePartitionLayout.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ protected final void findKeys(@NotNull final Stream<URI> uriStream,
9090
buildLocationKeys(locationTable, targetURIs, locationKeyObserver);
9191
}
9292

93-
private void getPartitions(@NotNull final URI relativePath,
93+
private void getPartitions(
94+
@NotNull final URI relativePath,
9495
@NotNull final Set<String> partitionKeys,
9596
@NotNull final Collection<String> partitionValues,
9697
@NotNull final TIntObjectMap<ColumnNameInfo> partitionColInfo,

extensions/parquet/table/src/test/java/io/deephaven/parquet/table/S3ParquetRemoteTest.java

+57
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import static io.deephaven.engine.testutil.TstUtils.assertTableEquals;
2323
import static io.deephaven.parquet.table.ParquetTools.readTable;
2424
import static org.junit.Assert.assertEquals;
25+
import static org.junit.Assert.assertTrue;
2526

2627
/**
2728
* These tests verify the behavior of Parquet implementation when reading against remote S3 servers.
@@ -100,6 +101,59 @@ public void readSampleParquetFilesFromPublicS3Part3() {
100101
readTable("s3://redshift-downloads/redset/serverless/full.parquet", readInstructions).head(10).select();
101102
}
102103

104+
@Test
105+
public void readSampleParquetFromPublicGCS() {
106+
Assume.assumeTrue("Skipping test because s3 testing disabled.", ENABLE_REMOTE_S3_TESTING);
107+
final Table tableWithEndpointOverride;
108+
{
109+
final ParquetInstructions readInstructions = new ParquetInstructions.Builder()
110+
.setSpecialInstructions(S3Instructions.builder()
111+
.readTimeout(Duration.ofSeconds(60))
112+
.credentials(Credentials.anonymous())
113+
.regionName("us-east-1")
114+
.endpointOverride("https://storage.googleapis.com")
115+
.build())
116+
.build();
117+
tableWithEndpointOverride = ParquetTools.readTable(
118+
"s3://cloud-samples-data/bigquery/us-states/us-states.parquet", readInstructions).select();
119+
assertEquals(2, tableWithEndpointOverride.numColumns());
120+
assertEquals(50, tableWithEndpointOverride.size());
121+
}
122+
123+
final Table tableWithoutEndpointOverride;
124+
{
125+
final ParquetInstructions readInstructions = new ParquetInstructions.Builder()
126+
.setSpecialInstructions(S3Instructions.builder()
127+
.readTimeout(Duration.ofSeconds(60))
128+
.regionName("us-east-1")
129+
.credentials(Credentials.anonymous())
130+
.build())
131+
.build();
132+
tableWithoutEndpointOverride = ParquetTools.readTable(
133+
"gs://cloud-samples-data/bigquery/us-states/us-states.parquet", readInstructions).select();
134+
assertEquals(2, tableWithoutEndpointOverride.numColumns());
135+
assertEquals(50, tableWithoutEndpointOverride.size());
136+
}
137+
assertTableEquals(tableWithEndpointOverride, tableWithoutEndpointOverride);
138+
}
139+
140+
@Test
141+
public void testReadFromGCSFailure() {
142+
final ParquetInstructions readInstructions = new ParquetInstructions.Builder()
143+
.setSpecialInstructions(S3Instructions.builder()
144+
.readTimeout(Duration.ofSeconds(60))
145+
.credentials(Credentials.anonymous())
146+
.endpointOverride("https://storage.com")
147+
.build())
148+
.build();
149+
try {
150+
ParquetTools.readTable(
151+
"gs://cloud-samples-data/bigquery/us-states/us-states.parquet", readInstructions).select();
152+
} catch (final IllegalArgumentException e) {
153+
assertTrue(e.toString().contains("endpoint override"));
154+
}
155+
}
156+
103157
@Test
104158
public void readKeyValuePartitionedParquetFromPublicS3() {
105159
Assume.assumeTrue("Skipping test because s3 testing disabled.", ENABLE_REMOTE_S3_TESTING);
@@ -120,6 +174,9 @@ public void readKeyValuePartitionedParquetFromPublicS3() {
120174
assertEquals(2, table.numColumns());
121175
}
122176

177+
/**
178+
* The following test reads from Deephaven's S3 bucket, and thus requires credentials to be set up.
179+
*/
123180
@Test
124181
public void readMetadataPartitionedParquetFromS3() {
125182
Assume.assumeTrue("Skipping test because s3 testing disabled.", ENABLE_REMOTE_S3_TESTING);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
//
2+
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
3+
//
4+
package io.deephaven.extensions.s3;
5+
6+
import io.deephaven.internal.log.LoggerFactory;
7+
import io.deephaven.io.logger.Logger;
8+
import io.deephaven.util.channel.CompletableOutputStream;
9+
import io.deephaven.util.channel.SeekableChannelContext;
10+
import org.jetbrains.annotations.NotNull;
11+
12+
import java.net.URI;
13+
import java.net.URISyntaxException;
14+
import java.nio.channels.SeekableByteChannel;
15+
import java.util.stream.Stream;
16+
17+
import static io.deephaven.extensions.s3.GCSSeekableChannelProviderPlugin.GCS_URI_SCHEME;
18+
import static io.deephaven.extensions.s3.S3SeekableChannelProviderPlugin.S3_URI_SCHEME;
19+
20+
final class GCSSeekableChannelProvider extends S3SeekableChannelProvider {
21+
22+
private static final Logger log = LoggerFactory.getLogger(GCSSeekableChannelProvider.class);
23+
24+
GCSSeekableChannelProvider(@NotNull final S3Instructions s3Instructions) {
25+
super(s3Instructions);
26+
}
27+
28+
@Override
29+
public boolean exists(@NotNull final URI uri) {
30+
return super.exists(gcsToS3Uri(uri));
31+
}
32+
33+
@Override
34+
public SeekableByteChannel getReadChannel(
35+
@NotNull final SeekableChannelContext channelContext,
36+
@NotNull final URI uri) {
37+
return super.getReadChannel(channelContext, gcsToS3Uri(uri));
38+
}
39+
40+
@Override
41+
public CompletableOutputStream getOutputStream(@NotNull final URI uri, final int bufferSizeHint) {
42+
return super.getOutputStream(gcsToS3Uri(uri), bufferSizeHint);
43+
}
44+
45+
@Override
46+
public Stream<URI> list(@NotNull final URI directory) {
47+
if (log.isDebugEnabled()) {
48+
log.debug().append("Fetching child URIs for directory: ").append(directory.toString()).endl();
49+
}
50+
return createStream(gcsToS3Uri(directory), false, GCS_URI_SCHEME);
51+
}
52+
53+
@Override
54+
public Stream<URI> walk(@NotNull final URI directory) {
55+
if (log.isDebugEnabled()) {
56+
log.debug().append("Performing recursive traversal from directory: ").append(directory.toString()).endl();
57+
}
58+
return createStream(gcsToS3Uri(directory), true, GCS_URI_SCHEME);
59+
}
60+
61+
private static URI gcsToS3Uri(@NotNull final URI uri) {
62+
try {
63+
if (S3_URI_SCHEME.equals(uri.getScheme())) {
64+
return uri;
65+
}
66+
return new URI(S3_URI_SCHEME, uri.getUserInfo(), uri.getHost(), uri.getPort(), uri.getPath(),
67+
uri.getQuery(), uri.getFragment());
68+
} catch (final URISyntaxException e) {
69+
throw new IllegalArgumentException("Failed to convert GCS URI " + uri + " to s3 URI", e);
70+
}
71+
}
72+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
//
2+
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
3+
//
4+
package io.deephaven.extensions.s3;
5+
6+
import com.google.auto.service.AutoService;
7+
import io.deephaven.util.channel.SeekableChannelsProvider;
8+
import io.deephaven.util.channel.SeekableChannelsProviderPlugin;
9+
import org.jetbrains.annotations.NotNull;
10+
import org.jetbrains.annotations.Nullable;
11+
12+
import java.net.URI;
13+
14+
/**
15+
* {@link SeekableChannelsProviderPlugin} implementation used for reading files from Google Cloud Storage.
16+
*/
17+
@AutoService(SeekableChannelsProviderPlugin.class)
18+
public final class GCSSeekableChannelProviderPlugin implements SeekableChannelsProviderPlugin {
19+
20+
static final String GCS_URI_SCHEME = "gs";
21+
22+
private static final String ENDPOINT_OVERRIDE_SUFFIX = ".googleapis.com";
23+
private static final URI DEFAULT_ENDPOINT_OVERRIDE = URI.create("https://storage.googleapis.com");
24+
private static final S3Instructions DEFAULT_INSTRUCTIONS =
25+
S3Instructions.builder().endpointOverride(DEFAULT_ENDPOINT_OVERRIDE).build();
26+
27+
@Override
28+
public boolean isCompatible(@NotNull final URI uri, @Nullable final Object config) {
29+
return GCS_URI_SCHEME.equals(uri.getScheme());
30+
}
31+
32+
@Override
33+
public SeekableChannelsProvider createProvider(@NotNull final URI uri, @Nullable final Object config) {
34+
if (!isCompatible(uri, config)) {
35+
throw new IllegalArgumentException("Arguments not compatible, provided uri " + uri);
36+
}
37+
return new GCSSeekableChannelProvider(s3Instructions(config));
38+
}
39+
40+
/**
41+
* Get the S3Instructions from the config object, or use the default if the config is null.
42+
*/
43+
private static S3Instructions s3Instructions(@Nullable final Object config) {
44+
if (config == null) {
45+
return DEFAULT_INSTRUCTIONS;
46+
}
47+
if (!(config instanceof S3Instructions)) {
48+
throw new IllegalArgumentException("Only S3Instructions are valid when reading GCS URIs, " +
49+
"provided config instance of class " + config.getClass().getName());
50+
}
51+
final S3Instructions s3Instructions = (S3Instructions) config;
52+
if (s3Instructions.endpointOverride().isEmpty()) {
53+
return s3Instructions.withEndpointOverride(DEFAULT_ENDPOINT_OVERRIDE);
54+
}
55+
if (!(s3Instructions.endpointOverride().get()).toString().endsWith(ENDPOINT_OVERRIDE_SUFFIX)) {
56+
throw new IllegalArgumentException("Provided endpoint override=(" +
57+
s3Instructions.endpointOverride().get() + " not supported when reading GCS URIs, must end with " +
58+
ENDPOINT_OVERRIDE_SUFFIX);
59+
}
60+
return s3Instructions;
61+
}
62+
}
63+

extensions/s3/src/main/java/io/deephaven/extensions/s3/S3Instructions.java

+2
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ public LogOutput append(final LogOutput logOutput) {
145145
*/
146146
public abstract Optional<URI> endpointOverride();
147147

148+
public abstract S3Instructions withEndpointOverride(final URI endpointOverride);
149+
148150
public interface Builder {
149151
Builder regionName(String regionName);
150152

extensions/s3/src/main/java/io/deephaven/extensions/s3/S3SeekableChannelProvider.java

+17-6
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
/**
5252
* {@link SeekableChannelsProvider} implementation that is used to fetch objects from an S3-compatible API.
5353
*/
54-
final class S3SeekableChannelProvider implements SeekableChannelsProvider {
54+
class S3SeekableChannelProvider implements SeekableChannelsProvider {
5555

5656
private static final int MAX_KEYS_PER_BATCH = 1000;
5757
private static final int UNKNOWN_SIZE = -1;
@@ -97,7 +97,8 @@ public boolean exists(@NotNull final URI uri) {
9797
}
9898

9999
@Override
100-
public SeekableByteChannel getReadChannel(@NotNull final SeekableChannelContext channelContext,
100+
public SeekableByteChannel getReadChannel(
101+
@NotNull final SeekableChannelContext channelContext,
101102
@NotNull final URI uri) {
102103
final S3Uri s3Uri = s3AsyncClient.utilities().parseUri(uri);
103104
// context is unused here, will be set before reading from the channel
@@ -140,18 +141,28 @@ public Stream<URI> list(@NotNull final URI directory) {
140141
if (log.isDebugEnabled()) {
141142
log.debug().append("Fetching child URIs for directory: ").append(directory.toString()).endl();
142143
}
143-
return createStream(directory, false);
144+
return createStream(directory, false, S3_URI_SCHEME);
144145
}
145146

146147
@Override
147148
public Stream<URI> walk(@NotNull final URI directory) {
148149
if (log.isDebugEnabled()) {
149150
log.debug().append("Performing recursive traversal from directory: ").append(directory.toString()).endl();
150151
}
151-
return createStream(directory, true);
152+
return createStream(directory, true, S3_URI_SCHEME);
152153
}
153154

154-
private Stream<URI> createStream(@NotNull final URI directory, final boolean isRecursive) {
155+
/**
156+
* Create a stream of URIs, the elements of which are the entries in the directory.
157+
*
158+
* @param directory The parent directory to list.
159+
* @param isRecursive Whether to list the entries recursively.
160+
* @param childScheme The scheme to apply to the children URIs in the returned stream.
161+
*/
162+
Stream<URI> createStream(
163+
@NotNull final URI directory,
164+
final boolean isRecursive,
165+
@NotNull final String childScheme) {
155166
// The following iterator fetches URIs from S3 in batches and creates a stream
156167
final Iterator<URI> iterator = new Iterator<>() {
157168
private final String bucketName;
@@ -222,7 +233,7 @@ private void fetchNextBatch() throws IOException {
222233
}
223234
final URI uri;
224235
try {
225-
uri = new URI(S3_URI_SCHEME, directory.getUserInfo(), directory.getHost(),
236+
uri = new URI(childScheme, directory.getUserInfo(), directory.getHost(),
226237
directory.getPort(), path, null, null);
227238
} catch (final URISyntaxException e) {
228239
throw new UncheckedDeephavenException("Failed to create URI for S3 object with key: "

0 commit comments

Comments
 (0)