|
| 1 | +// |
| 2 | +// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending |
| 3 | +// |
| 4 | +package io.deephaven.engine.table.impl.locations.local; |
| 5 | + |
| 6 | +import gnu.trove.map.TIntObjectMap; |
| 7 | +import gnu.trove.map.hash.TIntObjectHashMap; |
| 8 | +import io.deephaven.base.verify.Require; |
| 9 | +import io.deephaven.engine.table.Table; |
| 10 | +import io.deephaven.api.util.NameValidator; |
| 11 | +import io.deephaven.engine.table.impl.locations.TableDataException; |
| 12 | +import io.deephaven.engine.table.impl.locations.TableLocationKey; |
| 13 | +import io.deephaven.engine.table.impl.locations.impl.TableLocationKeyFinder; |
| 14 | +import org.jetbrains.annotations.NotNull; |
| 15 | +import org.jetbrains.annotations.Nullable; |
| 16 | + |
| 17 | +import java.io.File; |
| 18 | +import java.io.IOException; |
| 19 | +import java.nio.file.*; |
| 20 | +import java.nio.file.attribute.BasicFileAttributes; |
| 21 | +import java.util.*; |
| 22 | +import java.util.function.BiFunction; |
| 23 | +import java.util.function.Consumer; |
| 24 | +import java.util.function.Predicate; |
| 25 | +import java.util.function.Supplier; |
| 26 | + |
| 27 | + |
| 28 | +/** |
| 29 | + * {@link TableLocationKeyFinder Location finder} that will take a directory file, traverse the directory hierarchy and |
| 30 | + * infer partitions from key-value pairs in the directory names, for example: |
| 31 | + * |
| 32 | + * <pre> |
| 33 | + * tableRootDirectory/Country=France/City=Paris/parisData.parquet |
| 34 | + * </pre> |
| 35 | + * |
| 36 | + * Traversal is depth-first, and assumes that target files will only be found at a single depth. This class is |
| 37 | + * specialized for handling of files. For handling of URIs, see {@link URIStreamKeyValuePartitionLayout}. |
| 38 | + * |
| 39 | + * @implNote Column names will be legalized via {@link NameValidator#legalizeColumnName(String, Set)}. |
| 40 | + */ |
| 41 | +public class FileKeyValuePartitionLayout<TLK extends TableLocationKey> |
| 42 | + extends KeyValuePartitionLayout<TLK, Path> |
| 43 | + implements TableLocationKeyFinder<TLK> { |
| 44 | + |
| 45 | + private final File tableRootDirectory; |
| 46 | + private final Predicate<Path> pathFilter; |
| 47 | + private final Supplier<LocationTableBuilder> locationTableBuilderFactory; |
| 48 | + private final int maxPartitioningLevels; |
| 49 | + |
| 50 | + /** |
| 51 | + * @param tableRootDirectory The directory to traverse from |
| 52 | + * @param pathFilter Filter to determine whether a regular file should be used to create a key |
| 53 | + * @param locationTableBuilderFactory Factory for {@link LocationTableBuilder builders} used to organize partition |
| 54 | + * information; as builders are typically stateful, a new builder is created each time this |
| 55 | + * {@link KeyValuePartitionLayout} is used to {@link #findKeys(Consumer) find keys} |
| 56 | + * @param keyFactory Factory function used to generate table location keys from target files and partition values |
| 57 | + * @param maxPartitioningLevels Maximum partitioning levels to traverse. Must be {@code >= 0}. {@code 0} means only |
| 58 | + * look at files in {@code tableRootDirectory} and find no partitions. |
| 59 | + */ |
| 60 | + public FileKeyValuePartitionLayout( |
| 61 | + @NotNull final File tableRootDirectory, |
| 62 | + @NotNull final Predicate<Path> pathFilter, |
| 63 | + @NotNull final Supplier<LocationTableBuilder> locationTableBuilderFactory, |
| 64 | + @NotNull final BiFunction<Path, Map<String, Comparable<?>>, TLK> keyFactory, |
| 65 | + final int maxPartitioningLevels) { |
| 66 | + super(keyFactory); |
| 67 | + this.tableRootDirectory = tableRootDirectory; |
| 68 | + this.pathFilter = pathFilter; |
| 69 | + this.locationTableBuilderFactory = locationTableBuilderFactory; |
| 70 | + this.maxPartitioningLevels = Require.geqZero(maxPartitioningLevels, "maxPartitioningLevels"); |
| 71 | + } |
| 72 | + |
| 73 | + @Override |
| 74 | + public String toString() { |
| 75 | + return FileKeyValuePartitionLayout.class.getSimpleName() + '[' + tableRootDirectory + ']'; |
| 76 | + } |
| 77 | + |
| 78 | + @Override |
| 79 | + public void findKeys(@NotNull final Consumer<TLK> locationKeyObserver) { |
| 80 | + final Queue<Path> targetFiles = new ArrayDeque<>(); |
| 81 | + final LocationTableBuilder locationTableBuilder = locationTableBuilderFactory.get(); |
| 82 | + try { |
| 83 | + Files.walkFileTree(tableRootDirectory.toPath(), EnumSet.of(FileVisitOption.FOLLOW_LINKS), |
| 84 | + maxPartitioningLevels + 1, new SimpleFileVisitor<>() { |
| 85 | + final Set<String> partitionKeys = new LinkedHashSet<>(); // Preserve order of insertion |
| 86 | + final List<String> partitionValues = new ArrayList<>(); |
| 87 | + final TIntObjectMap<ColumnNameInfo> partitionColInfo = new TIntObjectHashMap<>(); |
| 88 | + boolean registered; |
| 89 | + int columnCount = -1; |
| 90 | + |
| 91 | + @Override |
| 92 | + public FileVisitResult preVisitDirectory( |
| 93 | + @NotNull final Path dir, |
| 94 | + @NotNull final BasicFileAttributes attrs) { |
| 95 | + final String dirName = dir.getFileName().toString(); |
| 96 | + // Skip dot directories |
| 97 | + if (!dirName.isEmpty() && dirName.charAt(0) == '.') { |
| 98 | + return FileVisitResult.SKIP_SUBTREE; |
| 99 | + } |
| 100 | + if (++columnCount > 0) { |
| 101 | + // We're descending and past the root |
| 102 | + final int columnIndex = columnCount - 1; |
| 103 | + processSubdirectoryInternal(dirName, dir.toString(), columnIndex, partitionKeys, |
| 104 | + partitionValues, partitionColInfo); |
| 105 | + } |
| 106 | + return FileVisitResult.CONTINUE; |
| 107 | + } |
| 108 | + |
| 109 | + @Override |
| 110 | + public FileVisitResult visitFile( |
| 111 | + @NotNull final Path file, |
| 112 | + @NotNull final BasicFileAttributes attrs) { |
| 113 | + if (attrs.isRegularFile() && pathFilter.test(file)) { |
| 114 | + if (!registered) { |
| 115 | + locationTableBuilder.registerPartitionKeys(partitionKeys); |
| 116 | + registered = true; |
| 117 | + } |
| 118 | + locationTableBuilder.acceptLocation(partitionValues); |
| 119 | + targetFiles.add(file); |
| 120 | + } |
| 121 | + return FileVisitResult.CONTINUE; |
| 122 | + } |
| 123 | + |
| 124 | + @Override |
| 125 | + public FileVisitResult postVisitDirectory( |
| 126 | + @NotNull final Path dir, |
| 127 | + @Nullable final IOException exc) throws IOException { |
| 128 | + if (--columnCount >= 0) { |
| 129 | + partitionValues.remove(columnCount); |
| 130 | + } |
| 131 | + return super.postVisitDirectory(dir, exc); |
| 132 | + } |
| 133 | + }); |
| 134 | + } catch (IOException e) { |
| 135 | + throw new TableDataException("Error finding locations for under " + tableRootDirectory, e); |
| 136 | + } |
| 137 | + |
| 138 | + final Table locationTable = locationTableBuilder.build(); |
| 139 | + buildLocationKeys(locationTable, targetFiles, locationKeyObserver); |
| 140 | + } |
| 141 | +} |
0 commit comments