18
18
19
19
import java .io .BufferedReader ;
20
20
import java .io .ByteArrayInputStream ;
21
+ import java .io .File ;
22
+ import java .io .FileInputStream ;
21
23
import java .io .IOException ;
22
24
import java .io .InputStream ;
23
25
import java .io .InputStreamReader ;
24
26
import java .nio .charset .StandardCharsets ;
27
+ import java .nio .file .Paths ;
25
28
import java .util .Arrays ;
26
29
import java .util .Base64 ;
27
30
import java .util .HashMap ;
41
44
42
45
import com .github .luben .zstd .ZstdInputStream ;
43
46
import com .google .cloud .storage .Blob ;
47
+ import com .google .cloud .storage .BlobId ;
44
48
import com .google .cloud .storage .BlobInfo ;
45
49
import com .google .cloud .storage .Storage ;
46
50
import org .xerial .snappy .SnappyInputStream ;
@@ -53,6 +57,7 @@ public final class BucketAccessor {
53
57
private List <String > blobNamesCache ;
54
58
private final Map <String , String > stringContentCache = new HashMap <>();
55
59
private final Map <String , List <String >> linesCache = new HashMap <>();
60
+ private final Map <String , List <String >> downloadedLinesCache = new HashMap <>();
56
61
private final Map <String , List <List <String >>> decodedLinesCache = new HashMap <>();
57
62
58
63
public BucketAccessor (final Storage storage , final String bucketName , final boolean cache ) {
@@ -121,6 +126,7 @@ public void clear(final String prefix) {
121
126
stringContentCache .clear ();
122
127
linesCache .clear ();
123
128
decodedLinesCache .clear ();
129
+ downloadedLinesCache .clear ();
124
130
}
125
131
}
126
132
@@ -165,7 +171,46 @@ private List<String> readLines0(final String blobName, final String compression)
165
171
InputStream decompressedStream = getDecompressedStream (bais , compression );
166
172
InputStreamReader reader = new InputStreamReader (decompressedStream , StandardCharsets .UTF_8 );
167
173
BufferedReader bufferedReader = new BufferedReader (reader )) {
174
+ return bufferedReader .lines ().collect (Collectors .toList ());
175
+ } catch (final IOException e ) {
176
+ throw new RuntimeException (e ); // NOPMD
177
+ }
178
+ }
168
179
180
+ public List <String > downloadBlobAndReadLines (final String blobName ) {
181
+ Objects .requireNonNull (blobName , "blobName cannot be null" );
182
+ if (cache ) {
183
+ return downloadedLinesCache .computeIfAbsent (blobName , k -> downloadBlobAndReadLines0 (blobName ));
184
+ } else {
185
+ return downloadBlobAndReadLines0 (blobName );
186
+ }
187
+ }
188
+
189
+ private List <String > downloadBlobAndReadLines0 (final String blobName ) {
190
+ final String filePath = downloadBlobToTempFile (blobName );
191
+ final List <String > bytes = readDownloadedBytes (filePath );
192
+ final File file = new File (filePath );
193
+ if (file != null && file .isFile ()) {
194
+ assert file .delete ();
195
+ }
196
+ return bytes ;
197
+ }
198
+
199
+ private String downloadBlobToTempFile (final String blobName ) {
200
+ try {
201
+ final File file = File .createTempFile ("tmp" , null );
202
+ final String filePath = file .getAbsolutePath ();
203
+ storage .downloadTo (BlobId .fromGsUtilUri ("gs://" + bucketName + "/" + blobName ), Paths .get (filePath ));
204
+ return filePath ;
205
+ } catch (final IOException e ) {
206
+ throw new RuntimeException (e ); // NOPMD
207
+ }
208
+ }
209
+
210
+ private List <String > readDownloadedBytes (final String filePath ) {
211
+ try (FileInputStream fis = new FileInputStream (filePath ); // NOPMD
212
+ InputStreamReader isr = new InputStreamReader (fis , "UTF-8" );
213
+ BufferedReader bufferedReader = new BufferedReader (isr );) {
169
214
return bufferedReader .lines ().collect (Collectors .toList ());
170
215
} catch (final IOException e ) {
171
216
throw new RuntimeException (e ); // NOPMD
@@ -211,6 +256,25 @@ private List<List<String>> readAndDecodeLines0(final String blobName, final Stri
211
256
.collect (Collectors .toList ());
212
257
}
213
258
259
+ public List <List <String >> downloadBlobAndDecodeFields (final String blobName , final int ... fieldsToDecode ) {
260
+ Objects .requireNonNull (blobName , "blobName cannot be null" );
261
+ Objects .requireNonNull (fieldsToDecode , "fieldsToDecode cannot be null" );
262
+
263
+ if (cache ) {
264
+ return decodedLinesCache .computeIfAbsent (blobName ,
265
+ k -> downloadBlobAndDecodeFields0 (blobName , fieldsToDecode ));
266
+ } else {
267
+ return downloadBlobAndDecodeFields0 (blobName , fieldsToDecode );
268
+ }
269
+ }
270
+
271
+ private List <List <String >> downloadBlobAndDecodeFields0 (final String blobName , final int ... fieldsToDecode ) {
272
+ return downloadBlobAndReadLines (blobName ).stream ()
273
+ .map (l -> l .split ("," ))
274
+ .map (fields -> decodeRequiredFields (fields , fieldsToDecode ))
275
+ .collect (Collectors .toList ());
276
+ }
277
+
214
278
private List <String > decodeRequiredFields (final String [] originalFields , final int [] fieldsToDecode ) {
215
279
Objects .requireNonNull (originalFields , "originalFields cannot be null" );
216
280
Objects .requireNonNull (fieldsToDecode , "fieldsToDecode cannot be null" );
0 commit comments