18
18
19
19
import java .io .BufferedReader ;
20
20
import java .io .ByteArrayInputStream ;
21
+ import java .io .File ;
21
22
import java .io .IOException ;
22
23
import java .io .InputStream ;
23
24
import java .io .InputStreamReader ;
24
25
import java .nio .charset .StandardCharsets ;
26
+ import java .nio .file .Files ;
27
+ import java .nio .file .Path ;
28
+ import java .nio .file .Paths ;
25
29
import java .util .Arrays ;
26
30
import java .util .Base64 ;
27
31
import java .util .HashMap ;
41
45
42
46
import com .github .luben .zstd .ZstdInputStream ;
43
47
import com .google .cloud .storage .Blob ;
48
+ import com .google .cloud .storage .BlobId ;
44
49
import com .google .cloud .storage .BlobInfo ;
45
50
import com .google .cloud .storage .Storage ;
46
51
import org .xerial .snappy .SnappyInputStream ;
@@ -53,6 +58,7 @@ public final class BucketAccessor {
53
58
private List <String > blobNamesCache ;
54
59
private final Map <String , String > stringContentCache = new HashMap <>();
55
60
private final Map <String , List <String >> linesCache = new HashMap <>();
61
+ private final Map <String , List <String >> downloadedLinesCache = new HashMap <>();
56
62
private final Map <String , List <List <String >>> decodedLinesCache = new HashMap <>();
57
63
58
64
public BucketAccessor (final Storage storage , final String bucketName , final boolean cache ) {
@@ -121,6 +127,7 @@ public void clear(final String prefix) {
121
127
stringContentCache .clear ();
122
128
linesCache .clear ();
123
129
decodedLinesCache .clear ();
130
+ downloadedLinesCache .clear ();
124
131
}
125
132
}
126
133
@@ -165,13 +172,49 @@ private List<String> readLines0(final String blobName, final String compression)
165
172
InputStream decompressedStream = getDecompressedStream (bais , compression );
166
173
InputStreamReader reader = new InputStreamReader (decompressedStream , StandardCharsets .UTF_8 );
167
174
BufferedReader bufferedReader = new BufferedReader (reader )) {
168
-
169
175
return bufferedReader .lines ().collect (Collectors .toList ());
170
176
} catch (final IOException e ) {
171
177
throw new RuntimeException (e ); // NOPMD
172
178
}
173
179
}
174
180
181
+ public List <String > downloadBlobAndReadLines (final String blobName , final String compression ) {
182
+ Objects .requireNonNull (blobName , "blobName cannot be null" );
183
+ Objects .requireNonNull (compression , "compression cannot be null" );
184
+ if (cache ) {
185
+ return downloadedLinesCache .computeIfAbsent (blobName ,
186
+ k -> downloadBlobAndReadLines0 (blobName , compression ));
187
+ } else {
188
+ return downloadBlobAndReadLines0 (blobName , compression );
189
+ }
190
+ }
191
+
192
+ private List <String > downloadBlobAndReadLines0 (final String blobName , final String compression ) {
193
+ final String filePath = downloadBlobToTempFile (blobName );
194
+ try {
195
+ final byte [] bytes = Files .readAllBytes (Path .of (filePath ));
196
+ try (ByteArrayInputStream bais = new ByteArrayInputStream (bytes );
197
+ InputStream decompressedStream = getDecompressedStream (bais , compression );
198
+ InputStreamReader reader = new InputStreamReader (decompressedStream , StandardCharsets .UTF_8 );
199
+ BufferedReader bufferedReader = new BufferedReader (reader )) {
200
+ return bufferedReader .lines ().collect (Collectors .toList ());
201
+ }
202
+ } catch (IOException exception ) {
203
+ throw new RuntimeException (exception ); // NOPMD
204
+ }
205
+ }
206
+
207
+ private String downloadBlobToTempFile (final String blobName ) {
208
+ try {
209
+ final File file = File .createTempFile ("tmp" , null );
210
+ final String filePath = file .getAbsolutePath ();
211
+ storage .downloadTo (BlobId .fromGsUtilUri ("gs://" + bucketName + "/" + blobName ), Paths .get (filePath ));
212
+ return filePath ;
213
+ } catch (final IOException e ) {
214
+ throw new RuntimeException (e ); // NOPMD
215
+ }
216
+ }
217
+
175
218
private InputStream getDecompressedStream (final InputStream inputStream , final String compression )
176
219
throws IOException {
177
220
Objects .requireNonNull (inputStream , "inputStream cannot be null" );
@@ -211,6 +254,27 @@ private List<List<String>> readAndDecodeLines0(final String blobName, final Stri
211
254
.collect (Collectors .toList ());
212
255
}
213
256
257
+ public List <List <String >> downloadBlobAndDecodeFields (final String blobName , final String compression ,
258
+ final int ... fieldsToDecode ) {
259
+ Objects .requireNonNull (blobName , "blobName cannot be null" );
260
+ Objects .requireNonNull (fieldsToDecode , "fieldsToDecode cannot be null" );
261
+
262
+ if (cache ) {
263
+ return decodedLinesCache .computeIfAbsent (blobName ,
264
+ k -> downloadBlobAndDecodeFields0 (blobName , compression , fieldsToDecode ));
265
+ } else {
266
+ return downloadBlobAndDecodeFields0 (blobName , compression , fieldsToDecode );
267
+ }
268
+ }
269
+
270
+ private List <List <String >> downloadBlobAndDecodeFields0 (final String blobName , final String compression ,
271
+ final int ... fieldsToDecode ) {
272
+ return downloadBlobAndReadLines (blobName , compression ).stream ()
273
+ .map (l -> l .split ("," ))
274
+ .map (fields -> decodeRequiredFields (fields , fieldsToDecode ))
275
+ .collect (Collectors .toList ());
276
+ }
277
+
214
278
private List <String > decodeRequiredFields (final String [] originalFields , final int [] fieldsToDecode ) {
215
279
Objects .requireNonNull (originalFields , "originalFields cannot be null" );
216
280
Objects .requireNonNull (fieldsToDecode , "fieldsToDecode cannot be null" );
0 commit comments