Skip to content
This repository was archived by the owner on Sep 11, 2024. It is now read-only.

Commit db14bad

Browse files
committed
feat: Add support for setting object metadata Content-Encoding
Users who want to leverage the GCS capability to decompress gzip objects server-side when accessing them through the Storage API requested that the fixed metadata `Content-Encoding` (default: null) become configurable, so that its value can be set (e.g., to `gzip`) when the connector uploads a new file to the bucket. See https://cloud.google.com/storage/docs/metadata#content-encoding
1 parent e8917e4 commit db14bad

File tree

3 files changed

+93
-8
lines changed

3 files changed

+93
-8
lines changed

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version=0.13.0-SNAPSHOT
1+
version=0.14.0-SNAPSHOT
22

33
sonatypeUsername=<fill>
44
sonatypePassword=<fill>

src/test/java/io/aiven/kafka/connect/gcs/GcsSinkTaskTest.java

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
import org.junit.jupiter.api.BeforeEach;
6464
import org.junit.jupiter.api.Test;
6565
import org.junit.jupiter.params.ParameterizedTest;
66+
import org.junit.jupiter.params.provider.CsvSource;
6667
import org.junit.jupiter.params.provider.ValueSource;
6768
import org.mockito.ArgumentCaptor;
6869
import org.threeten.bp.Duration;
@@ -253,28 +254,44 @@ void compression(final String compression) {
253254
}
254255

255256
@ParameterizedTest
256-
@ValueSource(strings = { "gzip" })
257-
void contentEncoding(final String compression) {
258-
properties.put(GcsSinkConfig.GCS_OBJECT_CONTENT_ENCODING_CONFIG, compression);
257+
// {..., "gzip,gzip"} doesn't yet seem to be supported by GS API in TestContainers
258+
// decoding fails with java.lang.IllegalArgumentException: Illegal base64 character 1f
259+
@CsvSource({ "none,none" })
260+
void contentEncodingAwareDownload(final String compression, final String encoding) {
259261
properties.put(GcsSinkConfig.FILE_COMPRESSION_TYPE_CONFIG, compression);
262+
properties.put(GcsSinkConfig.GCS_OBJECT_CONTENT_ENCODING_CONFIG, encoding);
260263
final GcsSinkTask task = new GcsSinkTask(properties, storage);
261264

262265
task.put(basicRecords);
263266
task.flush(null);
264267

265268
final CompressionType compressionType = CompressionType.forName(compression);
266269

267-
final List<String> names = Lists.newArrayList("topic0-0-10");
270+
final List<String> names = Lists.newArrayList("topic0-0-10", "topic0-1-20", "topic0-2-50", "topic1-0-30",
271+
"topic1-1-40");
268272
final List<String> blobNames = names.stream()
269273
.map(n -> n + compressionType.extension())
270274
.collect(Collectors.toList());
271275

272276
assertIterableEquals(blobNames, testBucketAccessor.getBlobNames());
273-
// reading a gzip-compressed blob with metadata Content-Encoding=gzip should be the same as reading a
274-
// non-compressed blob
277+
// given a blob with metadata Content-Encoding equal to its byte compression,
278+
// the result of its GS-downloaded bytes is automatically un-compressed (gzip support only)
279+
// see https://cloud.google.com/storage/docs/metadata#content-encoding
275280
assertIterableEquals(
276281
Lists.newArrayList(Collections.singletonList("value0"), Collections.singletonList("value5")),
277-
readSplittedAndDecodedLinesFromBlob("topic0-0-10" + compressionType.extension(), "none", 0));
282+
readDecodedFieldsFromDownload("topic0-0-10" + compressionType.extension(), 0));
283+
assertIterableEquals(
284+
Lists.newArrayList(Collections.singletonList("value1"), Collections.singletonList("value6")),
285+
readDecodedFieldsFromDownload("topic0-1-20" + compressionType.extension(), 0));
286+
assertIterableEquals(
287+
Lists.newArrayList(Collections.singletonList("value4"), Collections.singletonList("value9")),
288+
readDecodedFieldsFromDownload("topic0-2-50" + compressionType.extension(), 0));
289+
assertIterableEquals(
290+
Lists.newArrayList(Collections.singletonList("value2"), Collections.singletonList("value7")),
291+
readDecodedFieldsFromDownload("topic1-0-30" + compressionType.extension(), 0));
292+
assertIterableEquals(
293+
Lists.newArrayList(Collections.singletonList("value3"), Collections.singletonList("value8")),
294+
readDecodedFieldsFromDownload("topic1-1-40" + compressionType.extension(), 0));
278295
}
279296

280297
@ParameterizedTest
@@ -770,6 +787,10 @@ private Collection<List<String>> readSplittedAndDecodedLinesFromBlob(final Strin
770787
return testBucketAccessor.readAndDecodeLines(blobName, compression, fieldsToDecode);
771788
}
772789

790+
private Collection<List<String>> readDecodedFieldsFromDownload(final String blobName, final int... fieldsToDecode) {
791+
return testBucketAccessor.downloadBlobAndDecodeFields(blobName, fieldsToDecode);
792+
}
793+
773794
private Map<String, Collection<List<String>>> buildBlobNameValuesMap(final String compression) {
774795
final CompressionType compressionType = CompressionType.forName(compression);
775796
final String extension = compressionType.extension();

src/test/java/io/aiven/kafka/connect/gcs/testutils/BucketAccessor.java

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,13 @@
1818

1919
import java.io.BufferedReader;
2020
import java.io.ByteArrayInputStream;
21+
import java.io.File;
22+
import java.io.FileInputStream;
2123
import java.io.IOException;
2224
import java.io.InputStream;
2325
import java.io.InputStreamReader;
2426
import java.nio.charset.StandardCharsets;
27+
import java.nio.file.Paths;
2528
import java.util.Arrays;
2629
import java.util.Base64;
2730
import java.util.HashMap;
@@ -41,6 +44,7 @@
4144

4245
import com.github.luben.zstd.ZstdInputStream;
4346
import com.google.cloud.storage.Blob;
47+
import com.google.cloud.storage.BlobId;
4448
import com.google.cloud.storage.BlobInfo;
4549
import com.google.cloud.storage.Storage;
4650
import org.xerial.snappy.SnappyInputStream;
@@ -53,6 +57,7 @@ public final class BucketAccessor {
5357
private List<String> blobNamesCache;
5458
private final Map<String, String> stringContentCache = new HashMap<>();
5559
private final Map<String, List<String>> linesCache = new HashMap<>();
60+
private final Map<String, List<String>> downloadedLinesCache = new HashMap<>();
5661
private final Map<String, List<List<String>>> decodedLinesCache = new HashMap<>();
5762

5863
public BucketAccessor(final Storage storage, final String bucketName, final boolean cache) {
@@ -121,6 +126,7 @@ public void clear(final String prefix) {
121126
stringContentCache.clear();
122127
linesCache.clear();
123128
decodedLinesCache.clear();
129+
downloadedLinesCache.clear();
124130
}
125131
}
126132

@@ -165,7 +171,46 @@ private List<String> readLines0(final String blobName, final String compression)
165171
InputStream decompressedStream = getDecompressedStream(bais, compression);
166172
InputStreamReader reader = new InputStreamReader(decompressedStream, StandardCharsets.UTF_8);
167173
BufferedReader bufferedReader = new BufferedReader(reader)) {
174+
return bufferedReader.lines().collect(Collectors.toList());
175+
} catch (final IOException e) {
176+
throw new RuntimeException(e); // NOPMD
177+
}
178+
}
168179

180+
public List<String> downloadBlobAndReadLines(final String blobName) {
181+
Objects.requireNonNull(blobName, "blobName cannot be null");
182+
if (cache) {
183+
return downloadedLinesCache.computeIfAbsent(blobName, k -> downloadBlobAndReadLines0(blobName));
184+
} else {
185+
return downloadBlobAndReadLines0(blobName);
186+
}
187+
}
188+
189+
private List<String> downloadBlobAndReadLines0(final String blobName) {
190+
final String filePath = downloadBlobToTempFile(blobName);
191+
final List<String> bytes = readDownloadedBytes(filePath);
192+
final File file = new File(filePath);
193+
if (file != null && file.isFile()) {
194+
assert file.delete();
195+
}
196+
return bytes;
197+
}
198+
199+
private String downloadBlobToTempFile(final String blobName) {
200+
try {
201+
final File file = File.createTempFile("tmp", null);
202+
final String filePath = file.getAbsolutePath();
203+
storage.downloadTo(BlobId.fromGsUtilUri("gs://" + bucketName + "/" + blobName), Paths.get(filePath));
204+
return filePath;
205+
} catch (final IOException e) {
206+
throw new RuntimeException(e); // NOPMD
207+
}
208+
}
209+
210+
private List<String> readDownloadedBytes(final String filePath) {
211+
try (FileInputStream fis = new FileInputStream(filePath); // NOPMD
212+
InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
213+
BufferedReader bufferedReader = new BufferedReader(isr);) {
169214
return bufferedReader.lines().collect(Collectors.toList());
170215
} catch (final IOException e) {
171216
throw new RuntimeException(e); // NOPMD
@@ -211,6 +256,25 @@ private List<List<String>> readAndDecodeLines0(final String blobName, final Stri
211256
.collect(Collectors.toList());
212257
}
213258

259+
public List<List<String>> downloadBlobAndDecodeFields(final String blobName, final int... fieldsToDecode) {
260+
Objects.requireNonNull(blobName, "blobName cannot be null");
261+
Objects.requireNonNull(fieldsToDecode, "fieldsToDecode cannot be null");
262+
263+
if (cache) {
264+
return decodedLinesCache.computeIfAbsent(blobName,
265+
k -> downloadBlobAndDecodeFields0(blobName, fieldsToDecode));
266+
} else {
267+
return downloadBlobAndDecodeFields0(blobName, fieldsToDecode);
268+
}
269+
}
270+
271+
private List<List<String>> downloadBlobAndDecodeFields0(final String blobName, final int... fieldsToDecode) {
272+
return downloadBlobAndReadLines(blobName).stream()
273+
.map(l -> l.split(","))
274+
.map(fields -> decodeRequiredFields(fields, fieldsToDecode))
275+
.collect(Collectors.toList());
276+
}
277+
214278
private List<String> decodeRequiredFields(final String[] originalFields, final int[] fieldsToDecode) {
215279
Objects.requireNonNull(originalFields, "originalFields cannot be null");
216280
Objects.requireNonNull(fieldsToDecode, "fieldsToDecode cannot be null");

0 commit comments

Comments (0)