Skip to content

Commit bf6fcdb

Browse files
Added support to read partitioned parquet files from S3 (deephaven#5206)
Breaking Change: Renamed KeyValuePartitionLayout to FileKeyValuePartitionLayout.
1 parent 2043030 commit bf6fcdb

File tree

24 files changed

+1065
-254
lines changed

24 files changed

+1065
-254
lines changed

Base/src/main/java/io/deephaven/base/FileUtils.java

+13-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import java.net.URISyntaxException;
1313
import java.nio.file.Path;
1414
import java.util.ArrayList;
15+
import java.util.regex.Pattern;
1516

1617
public class FileUtils {
1718
private final static FileFilter DIRECTORY_FILE_FILTER = new FileFilter() {
@@ -29,6 +30,8 @@ public boolean accept(File dir, String name) {
2930
};
3031
private final static String[] EMPTY_STRING_ARRAY = new String[0];
3132

33+
public static final Pattern DUPLICATE_SLASH_PATTERN = Pattern.compile("//+");
34+
3235
/**
3336
* Cleans the specified path. All files and subdirectories in the path will be deleted. (ie you'll be left with an
3437
* empty directory).
@@ -254,7 +257,8 @@ public boolean accept(File pathname) {
254257
}
255258

256259
/**
257-
* Take the file source path or URI string and convert it to a URI object.
260+
* Take the file source path or URI string and convert it to a URI object. Any unnecessary path separators will be
261+
* removed.
258262
*
259263
* @param source The file source path or URI
260264
* @param isDirectory Whether the source is a directory
@@ -264,9 +268,16 @@ public static URI convertToURI(final String source, final boolean isDirectory) {
264268
if (source.isEmpty()) {
265269
throw new IllegalArgumentException("Cannot convert empty source to URI");
266270
}
267-
final URI uri;
271+
URI uri;
268272
try {
269273
uri = new URI(source);
274+
// Replace two or more consecutive slashes in the path with a single slash
275+
final String path = uri.getPath();
276+
if (path.contains("//")) {
277+
final String canonicalizedPath = DUPLICATE_SLASH_PATTERN.matcher(path).replaceAll("/");
278+
uri = new URI(uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(), canonicalizedPath,
279+
uri.getQuery(), uri.getFragment());
280+
}
270281
} catch (final URISyntaxException e) {
271282
// If the URI is invalid, assume it's a file path
272283
return convertToURI(new File(source), isDirectory);

Util/channel/src/main/java/io/deephaven/util/channel/CachedChannelProvider.java

+11
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import java.nio.channels.SeekableByteChannel;
2020
import java.nio.file.Path;
2121
import java.util.*;
22+
import java.util.stream.Stream;
2223

2324
/**
2425
* {@link SeekableChannelsProvider Channel provider} that will cache a bounded number of unused channels.
@@ -109,6 +110,16 @@ public SeekableByteChannel getWriteChannel(@NotNull final Path path, final boole
109110
// end no matter what.
110111
}
111112

113+
@Override
114+
public Stream<URI> list(@NotNull final URI directory) throws IOException {
115+
return wrappedProvider.list(directory);
116+
}
117+
118+
@Override
119+
public Stream<URI> walk(@NotNull final URI directory) throws IOException {
120+
return wrappedProvider.walk(directory);
121+
}
122+
112123
@Nullable
113124
private synchronized CachedChannel tryGetPooledChannel(@NotNull final String pathKey,
114125
@NotNull final KeyedObjectHashMap<String, PerPathPool> channelPool) {

Util/channel/src/main/java/io/deephaven/util/channel/LocalFSChannelProvider.java

+17
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
//
44
package io.deephaven.util.channel;
55

6+
import io.deephaven.base.FileUtils;
67
import org.jetbrains.annotations.NotNull;
78
import org.jetbrains.annotations.Nullable;
89

@@ -12,8 +13,10 @@
1213
import java.net.URI;
1314
import java.nio.channels.FileChannel;
1415
import java.nio.channels.SeekableByteChannel;
16+
import java.nio.file.Files;
1517
import java.nio.file.Path;
1618
import java.nio.file.StandardOpenOption;
19+
import java.util.stream.Stream;
1720

1821
public class LocalFSChannelProvider implements SeekableChannelsProvider {
1922
@Override
@@ -56,6 +59,20 @@ public SeekableByteChannel getWriteChannel(@NotNull final Path filePath, final b
5659
return result;
5760
}
5861

62+
@Override
63+
public final Stream<URI> list(@NotNull final URI directory) throws IOException {
64+
// Assuming that the URI is a file, not a directory. The caller should manage file vs. directory handling in
65+
// the processor.
66+
return Files.list(Path.of(directory)).map(path -> FileUtils.convertToURI(path, false));
67+
}
68+
69+
@Override
70+
public final Stream<URI> walk(@NotNull final URI directory) throws IOException {
71+
// Assuming that the URI is a file, not a directory. The caller should manage file vs. directory handling in
72+
// the processor.
73+
return Files.walk(Path.of(directory)).map(path -> FileUtils.convertToURI(path, false));
74+
}
75+
5976
@Override
6077
public void close() {}
6178
}

Util/channel/src/main/java/io/deephaven/util/channel/SeekableChannelsProvider.java

+25
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import java.nio.channels.SeekableByteChannel;
1313
import java.nio.file.Path;
1414
import java.nio.file.Paths;
15+
import java.util.stream.Stream;
1516

1617
import static io.deephaven.base.FileUtils.convertToURI;
1718

@@ -85,4 +86,28 @@ default SeekableByteChannel getWriteChannel(@NotNull final String path, final bo
8586
}
8687

8788
SeekableByteChannel getWriteChannel(@NotNull Path path, boolean append) throws IOException;
89+
90+
/**
91+
* Returns a stream of URIs, the elements of which are the entries in the directory. The listing is non-recursive.
92+
* The URIs supplied by the stream will not have any unnecessary slashes or path separators. Also, the URIs will be
93+
* file URIs (not ending with "/") irrespective of whether the URI corresponds to a file or a directory. The caller
94+
* should manage file vs. directory handling in the processor. The caller is also responsible for closing the
95+
* stream, preferably using a try-with-resources block.
96+
*
97+
* @param directory the URI of the directory to list
98+
* @return The {@link Stream} of {@link URI}s
99+
*/
100+
Stream<URI> list(@NotNull URI directory) throws IOException;
101+
102+
/**
103+
* Returns a stream of URIs, the elements of which are all the files in the file tree rooted at the given starting
104+
* directory. The URIs supplied by the stream will not have any unnecessary slashes or path separators. Also, the
105+
* URIs will be file URIs (not ending with "/") irrespective of whether the URI corresponds to a file or a
106+
* directory. The caller should manage file vs. directory handling in the processor. The caller is also responsible
107+
* for closing the stream, preferably using a try-with-resources block.
108+
*
109+
* @param directory the URI of the directory to walk
110+
* @return The {@link Stream} of {@link URI}s
111+
*/
112+
Stream<URI> walk(@NotNull URI directory) throws IOException;
88113
}

Util/channel/src/test/java/io/deephaven/util/channel/CachedChannelProviderTest.java

+14
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
//
44
package io.deephaven.util.channel;
55

6+
import io.deephaven.base.FileUtils;
67
import org.jetbrains.annotations.NotNull;
78
import org.jetbrains.annotations.Nullable;
89
import org.junit.jupiter.api.Test;
@@ -12,10 +13,13 @@
1213
import java.net.URI;
1314
import java.nio.ByteBuffer;
1415
import java.nio.channels.SeekableByteChannel;
16+
import java.nio.file.Files;
1517
import java.nio.file.Path;
1618
import java.util.ArrayList;
1719
import java.util.List;
1820
import java.util.concurrent.atomic.AtomicInteger;
21+
import java.util.function.Consumer;
22+
import java.util.stream.Stream;
1923

2024
import static org.junit.jupiter.api.Assertions.assertEquals;
2125
import static org.junit.jupiter.api.Assertions.assertNull;
@@ -215,6 +219,16 @@ public SeekableByteChannel getWriteChannel(@NotNull Path path, boolean append) {
215219
return new TestMockChannel(count.getAndIncrement(), path.toString());
216220
}
217221

222+
@Override
223+
public final Stream<URI> list(@NotNull final URI directory) {
224+
throw new UnsupportedOperationException("list");
225+
}
226+
227+
@Override
228+
public final Stream<URI> walk(@NotNull final URI directory) {
229+
throw new UnsupportedOperationException("walk");
230+
}
231+
218232
@Override
219233
public void close() {}
220234
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
//
2+
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
3+
//
4+
package io.deephaven.engine.table.impl.locations.local;
5+
6+
import gnu.trove.map.TIntObjectMap;
7+
import gnu.trove.map.hash.TIntObjectHashMap;
8+
import io.deephaven.base.verify.Require;
9+
import io.deephaven.engine.table.Table;
10+
import io.deephaven.api.util.NameValidator;
11+
import io.deephaven.engine.table.impl.locations.TableDataException;
12+
import io.deephaven.engine.table.impl.locations.TableLocationKey;
13+
import io.deephaven.engine.table.impl.locations.impl.TableLocationKeyFinder;
14+
import org.jetbrains.annotations.NotNull;
15+
import org.jetbrains.annotations.Nullable;
16+
17+
import java.io.File;
18+
import java.io.IOException;
19+
import java.nio.file.*;
20+
import java.nio.file.attribute.BasicFileAttributes;
21+
import java.util.*;
22+
import java.util.function.BiFunction;
23+
import java.util.function.Consumer;
24+
import java.util.function.Predicate;
25+
import java.util.function.Supplier;
26+
27+
28+
/**
29+
* {@link TableLocationKeyFinder Location finder} that will take a directory file, traverse the directory hierarchy and
30+
* infer partitions from key-value pairs in the directory names, for example:
31+
*
32+
* <pre>
33+
* tableRootDirectory/Country=France/City=Paris/parisData.parquet
34+
* </pre>
35+
*
36+
* Traversal is depth-first, and assumes that target files will only be found at a single depth. This class is
37+
* specialized for handling of files. For handling of URIs, see {@link URIStreamKeyValuePartitionLayout}.
38+
*
39+
* @implNote Column names will be legalized via {@link NameValidator#legalizeColumnName(String, Set)}.
40+
*/
41+
public class FileKeyValuePartitionLayout<TLK extends TableLocationKey>
42+
extends KeyValuePartitionLayout<TLK, Path>
43+
implements TableLocationKeyFinder<TLK> {
44+
45+
private final File tableRootDirectory;
46+
private final Predicate<Path> pathFilter;
47+
private final Supplier<LocationTableBuilder> locationTableBuilderFactory;
48+
private final int maxPartitioningLevels;
49+
50+
/**
51+
* @param tableRootDirectory The directory to traverse from
52+
* @param pathFilter Filter to determine whether a regular file should be used to create a key
53+
* @param locationTableBuilderFactory Factory for {@link LocationTableBuilder builders} used to organize partition
54+
* information; as builders are typically stateful, a new builder is created each time this
55+
* {@link KeyValuePartitionLayout} is used to {@link #findKeys(Consumer) find keys}
56+
* @param keyFactory Factory function used to generate table location keys from target files and partition values
57+
* @param maxPartitioningLevels Maximum partitioning levels to traverse. Must be {@code >= 0}. {@code 0} means only
58+
* look at files in {@code tableRootDirectory} and find no partitions.
59+
*/
60+
public FileKeyValuePartitionLayout(
61+
@NotNull final File tableRootDirectory,
62+
@NotNull final Predicate<Path> pathFilter,
63+
@NotNull final Supplier<LocationTableBuilder> locationTableBuilderFactory,
64+
@NotNull final BiFunction<Path, Map<String, Comparable<?>>, TLK> keyFactory,
65+
final int maxPartitioningLevels) {
66+
super(keyFactory);
67+
this.tableRootDirectory = tableRootDirectory;
68+
this.pathFilter = pathFilter;
69+
this.locationTableBuilderFactory = locationTableBuilderFactory;
70+
this.maxPartitioningLevels = Require.geqZero(maxPartitioningLevels, "maxPartitioningLevels");
71+
}
72+
73+
@Override
74+
public String toString() {
75+
return FileKeyValuePartitionLayout.class.getSimpleName() + '[' + tableRootDirectory + ']';
76+
}
77+
78+
@Override
79+
public void findKeys(@NotNull final Consumer<TLK> locationKeyObserver) {
80+
final Queue<Path> targetFiles = new ArrayDeque<>();
81+
final LocationTableBuilder locationTableBuilder = locationTableBuilderFactory.get();
82+
try {
83+
Files.walkFileTree(tableRootDirectory.toPath(), EnumSet.of(FileVisitOption.FOLLOW_LINKS),
84+
maxPartitioningLevels + 1, new SimpleFileVisitor<>() {
85+
final Set<String> partitionKeys = new LinkedHashSet<>(); // Preserve order of insertion
86+
final List<String> partitionValues = new ArrayList<>();
87+
final TIntObjectMap<ColumnNameInfo> partitionColInfo = new TIntObjectHashMap<>();
88+
boolean registered;
89+
int columnCount = -1;
90+
91+
@Override
92+
public FileVisitResult preVisitDirectory(
93+
@NotNull final Path dir,
94+
@NotNull final BasicFileAttributes attrs) {
95+
final String dirName = dir.getFileName().toString();
96+
// Skip dot directories
97+
if (!dirName.isEmpty() && dirName.charAt(0) == '.') {
98+
return FileVisitResult.SKIP_SUBTREE;
99+
}
100+
if (++columnCount > 0) {
101+
// We're descending and past the root
102+
final int columnIndex = columnCount - 1;
103+
processSubdirectoryInternal(dirName, dir.toString(), columnIndex, partitionKeys,
104+
partitionValues, partitionColInfo);
105+
}
106+
return FileVisitResult.CONTINUE;
107+
}
108+
109+
@Override
110+
public FileVisitResult visitFile(
111+
@NotNull final Path file,
112+
@NotNull final BasicFileAttributes attrs) {
113+
if (attrs.isRegularFile() && pathFilter.test(file)) {
114+
if (!registered) {
115+
locationTableBuilder.registerPartitionKeys(partitionKeys);
116+
registered = true;
117+
}
118+
locationTableBuilder.acceptLocation(partitionValues);
119+
targetFiles.add(file);
120+
}
121+
return FileVisitResult.CONTINUE;
122+
}
123+
124+
@Override
125+
public FileVisitResult postVisitDirectory(
126+
@NotNull final Path dir,
127+
@Nullable final IOException exc) throws IOException {
128+
if (--columnCount >= 0) {
129+
partitionValues.remove(columnCount);
130+
}
131+
return super.postVisitDirectory(dir, exc);
132+
}
133+
});
134+
} catch (IOException e) {
135+
throw new TableDataException("Error finding locations for under " + tableRootDirectory, e);
136+
}
137+
138+
final Table locationTable = locationTableBuilder.build();
139+
buildLocationKeys(locationTable, targetFiles, locationKeyObserver);
140+
}
141+
}

0 commit comments

Comments
 (0)