Skip to content

Commit 8840189

Browse files
authored
Fix quadtree spatial join OOMs on large numbers of input polygons (#1381)
Followup to #1346. * Fixes some typos/omissions in types and CMake. * Adds a new test that OOMs when quadtree_point_in_polygon is passed too many input polygons. * Fixes quadtree spatial join to handle overflow while counting and more conservatively allocate output buffers. Fixes #890. * [Failing test run](https://github.com/rapidsai/cuspatial/actions/runs/8979838628/job/24662981350#step:7:840) * [Passing test run](https://github.com/rapidsai/cuspatial/actions/runs/8981106226/job/24666403165#step:7:840) Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Mark Harris (https://github.com/harrism) - Michael Wang (https://github.com/isVoid) URL: #1381
1 parent eff6753 commit 8840189

File tree

8 files changed

+271
-47
lines changed

8 files changed

+271
-47
lines changed

cpp/benchmarks/CMakeLists.txt

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#=============================================================================
2-
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
2+
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
55
# you may not use this file except in compliance with the License.
@@ -23,6 +23,17 @@ add_library(cuspatial_benchmark_common OBJECT
2323

2424
target_compile_features(cuspatial_benchmark_common PUBLIC cxx_std_17 cuda_std_17)
2525

26+
set_target_properties(cuspatial_benchmark_common
27+
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUSPATIAL_BINARY_DIR}/benchmarks>"
28+
INSTALL_RPATH "\$ORIGIN/../../../lib"
29+
CXX_STANDARD 17
30+
CXX_STANDARD_REQUIRED ON
31+
CUDA_STANDARD 17
32+
CUDA_STANDARD_REQUIRED ON
33+
POSITION_INDEPENDENT_CODE ON
34+
INTERFACE_POSITION_INDEPENDENT_CODE ON
35+
)
36+
2637
target_link_libraries(cuspatial_benchmark_common
2738
PUBLIC benchmark::benchmark
2839
cudf::cudftestutil
@@ -43,6 +54,10 @@ function(ConfigureBench CMAKE_BENCH_NAME)
4354
set_target_properties(${CMAKE_BENCH_NAME}
4455
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUSPATIAL_BINARY_DIR}/benchmarks>"
4556
INSTALL_RPATH "\$ORIGIN/../../../lib"
57+
CXX_STANDARD 17
58+
CXX_STANDARD_REQUIRED ON
59+
CUDA_STANDARD 17
60+
CUDA_STANDARD_REQUIRED ON
4661
)
4762
target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE benchmark::benchmark_main cuspatial_benchmark_common)
4863
install(
@@ -61,7 +76,11 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
6176
${CMAKE_BENCH_NAME}
6277
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUSPATIAL_BINARY_DIR}/benchmarks>"
6378
INSTALL_RPATH "\$ORIGIN/../../../lib"
64-
)
79+
CXX_STANDARD 17
80+
CXX_STANDARD_REQUIRED ON
81+
CUDA_STANDARD 17
82+
CUDA_STANDARD_REQUIRED ON
83+
)
6584
target_link_libraries(
6685
${CMAKE_BENCH_NAME} PRIVATE cuspatial_benchmark_common nvbench::main
6786
)

cpp/include/cuspatial/detail/join/quadtree_point_in_polygon.cuh

Lines changed: 85 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,16 @@
2424
#include <cuspatial/range/multipolygon_range.cuh>
2525
#include <cuspatial/traits.hpp>
2626

27+
#include <rmm/cuda_device.hpp>
2728
#include <rmm/device_uvector.hpp>
2829
#include <rmm/exec_policy.hpp>
2930
#include <rmm/resource_ref.hpp>
3031

3132
#include <thrust/iterator/permutation_iterator.h>
33+
#include <thrust/iterator/transform_iterator.h>
3234
#include <thrust/scan.h>
3335

34-
#include <cstdint>
36+
#include <limits>
3537

3638
namespace cuspatial {
3739
namespace detail {
@@ -57,7 +59,7 @@ struct compute_poly_and_point_indices {
5759
using IndexType = iterator_value_type<QuadOffsetsIterator>;
5860

5961
inline thrust::tuple<IndexType, IndexType> __device__
60-
operator()(IndexType const global_index) const
62+
operator()(std::uint64_t const global_index) const
6163
{
6264
auto const [quad_poly_index, local_point_index] =
6365
get_quad_and_local_point_indices(global_index, point_offsets_begin, point_offsets_end);
@@ -118,16 +120,26 @@ std::pair<rmm::device_uvector<IndexType>, rmm::device_uvector<IndexType>> quadtr
118120

119121
auto num_poly_quad_pairs = std::distance(poly_indices_first, poly_indices_last);
120122

121-
auto quad_lengths_iter =
122-
thrust::make_permutation_iterator(quadtree.length_begin(), quad_indices_first);
123+
// The quadtree length is an iterator of uint32_t, but we have to transform into uint64_t values
124+
// so the thrust::inclusive_scan accumulates into uint64_t outputs. Changing the output iterator
125+
// to uint64_t isn't sufficient to achieve this behavior.
126+
auto quad_lengths_iter = thrust::make_transform_iterator(
127+
thrust::make_permutation_iterator(quadtree.length_begin(), quad_indices_first),
128+
cuda::proclaim_return_type<std::uint64_t>([] __device__(IndexType const& i) -> std::uint64_t {
129+
return static_cast<std::uint64_t>(i);
130+
}));
123131

124132
auto quad_offsets_iter =
125133
thrust::make_permutation_iterator(quadtree.offset_begin(), quad_indices_first);
126134

127-
// Compute a "local" set of zero-based point offsets from number of points in each quadrant
135+
// Compute a "local" set of zero-based point offsets from the number of points in each quadrant.
136+
//
128137
// Use `num_poly_quad_pairs + 1` as the length so that the last element produced by
129138
// `inclusive_scan` is the total number of points to be tested against any polygon.
130-
rmm::device_uvector<IndexType> local_point_offsets(num_poly_quad_pairs + 1, stream);
139+
//
140+
// Accumulate into uint64_t, because the prefix sums can overflow the size of uint32_t
141+
// when testing a large number of polygons against a large quadtree.
142+
rmm::device_uvector<std::uint64_t> local_point_offsets(num_poly_quad_pairs + 1, stream);
131143

132144
// inclusive scan of quad_lengths_iter
133145
thrust::inclusive_scan(rmm::exec_policy(stream),
@@ -136,21 +148,27 @@ std::pair<rmm::device_uvector<IndexType>, rmm::device_uvector<IndexType>> quadtr
136148
local_point_offsets.begin() + 1);
137149

138150
// Ensure local point offsets starts at 0
139-
IndexType init{0};
151+
std::uint64_t init{0};
140152
local_point_offsets.set_element_async(0, init, stream);
141153

142154
// The last element is the total number of points to test against any polygon.
143155
auto num_total_points = local_point_offsets.back_element(stream);
144156

145-
// Allocate the output polygon and point index pair vectors
146-
rmm::device_uvector<IndexType> poly_indices(num_total_points, stream);
147-
rmm::device_uvector<IndexType> point_indices(num_total_points, stream);
148-
149-
auto poly_and_point_indices =
150-
thrust::make_zip_iterator(poly_indices.begin(), point_indices.begin());
151-
152-
// Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction)
153-
auto point_xys_iter = thrust::make_permutation_iterator(points_first, point_indices_first);
157+
// The largest supported input size for thrust::count_if/copy_if is INT32_MAX.
158+
// This functor iterates over the input space and processes up to INT32_MAX elements at a time.
159+
std::uint64_t max_points_to_test = std::numeric_limits<std::int32_t>::max();
160+
auto count_in_chunks = [&](auto const& func) {
161+
std::uint64_t memo{};
162+
for (std::uint64_t offset{0}; offset < num_total_points; offset += max_points_to_test) {
163+
memo += func(memo, offset, std::min(max_points_to_test, num_total_points - offset));
164+
}
165+
return memo;
166+
};
167+
168+
detail::test_poly_point_intersection test_poly_point_pair{
169+
// Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction)
170+
thrust::make_permutation_iterator(points_first, point_indices_first),
171+
polygons};
154172

155173
// Compute the combination of polygon and point index pairs. For each polygon/quadrant pair,
156174
// enumerate pairs of (poly_index, point_index) for each point in each quadrant.
@@ -163,28 +181,57 @@ std::pair<rmm::device_uvector<IndexType>, rmm::device_uvector<IndexType>> quadtr
163181
// pp_pairs.append((polygon, point))
164182
// ```
165183
//
166-
auto global_to_poly_and_point_indices = detail::make_counting_transform_iterator(
167-
0,
168-
detail::compute_poly_and_point_indices{quad_offsets_iter,
169-
local_point_offsets.begin(),
170-
local_point_offsets.end(),
171-
poly_indices_first});
172-
173-
// Compute the number of intersections by removing (poly, point) pairs that don't intersect
174-
auto num_intersections = thrust::distance(
175-
poly_and_point_indices,
176-
thrust::copy_if(rmm::exec_policy(stream),
177-
global_to_poly_and_point_indices,
178-
global_to_poly_and_point_indices + num_total_points,
179-
poly_and_point_indices,
180-
detail::test_poly_point_intersection{point_xys_iter, polygons}));
181-
182-
poly_indices.resize(num_intersections, stream);
183-
poly_indices.shrink_to_fit(stream);
184-
point_indices.resize(num_intersections, stream);
185-
point_indices.shrink_to_fit(stream);
186-
187-
return std::pair{std::move(poly_indices), std::move(point_indices)};
184+
auto global_to_poly_and_point_indices = [&](auto offset = 0) {
185+
return detail::make_counting_transform_iterator(
186+
offset,
187+
detail::compute_poly_and_point_indices{quad_offsets_iter,
188+
local_point_offsets.begin(),
189+
local_point_offsets.end(),
190+
poly_indices_first});
191+
};
192+
193+
auto run_quadtree_point_in_polygon = [&](auto output_size) {
194+
// Allocate the output polygon and point index pair vectors
195+
rmm::device_uvector<IndexType> poly_indices(output_size, stream);
196+
rmm::device_uvector<IndexType> point_indices(output_size, stream);
197+
198+
auto num_intersections = count_in_chunks([&](auto memo, auto offset, auto size) {
199+
auto poly_and_point_indices =
200+
thrust::make_zip_iterator(poly_indices.begin(), point_indices.begin()) + memo;
201+
// Remove (poly, point) pairs that don't intersect
202+
return thrust::distance(poly_and_point_indices,
203+
thrust::copy_if(rmm::exec_policy(stream),
204+
global_to_poly_and_point_indices(offset),
205+
global_to_poly_and_point_indices(offset) + size,
206+
poly_and_point_indices,
207+
test_poly_point_pair));
208+
});
209+
210+
if (num_intersections < output_size) {
211+
poly_indices.resize(num_intersections, stream);
212+
point_indices.resize(num_intersections, stream);
213+
poly_indices.shrink_to_fit(stream);
214+
point_indices.shrink_to_fit(stream);
215+
}
216+
217+
return std::pair{std::move(poly_indices), std::move(point_indices)};
218+
};
219+
220+
try {
221+
// First attempt to run the hit test assuming allocating space for all possible intersections
222+
// fits into the available memory.
223+
return run_quadtree_point_in_polygon(num_total_points);
224+
} catch (rmm::out_of_memory const&) {
225+
// If we OOM the first time, pre-compute the number of hits and allocate only that amount of
226+
// space for the output buffers. This halves performance, but it should at least return valid
227+
// results.
228+
return run_quadtree_point_in_polygon(count_in_chunks([&](auto memo, auto offset, auto size) {
229+
return thrust::count_if(rmm::exec_policy(stream),
230+
global_to_poly_and_point_indices(offset),
231+
global_to_poly_and_point_indices(offset) + size,
232+
test_poly_point_pair);
233+
}));
234+
}
188235
}
189236

190237
} // namespace cuspatial

cpp/include/cuspatial/detail/range/multilinestring_range.cuh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
2+
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -74,6 +74,7 @@ template <typename GeometryIterator, typename PartIterator, typename VecIterator
7474
class multilinestring_range;
7575

7676
template <typename GeometryIterator, typename PartIterator, typename VecIterator>
77+
CUSPATIAL_HOST_DEVICE
7778
multilinestring_range<GeometryIterator, PartIterator, VecIterator>::multilinestring_range(
7879
GeometryIterator geometry_begin,
7980
GeometryIterator geometry_end,

cpp/include/cuspatial/geometry/box.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022, NVIDIA CORPORATION.
2+
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -40,9 +40,9 @@ class alignas(sizeof(Vertex)) box {
4040

4141
private:
4242
/**
43-
* @brief Output stream operator for `vec_2d<T>` for human-readable formatting
43+
* @brief Output stream operator for `box<T>` for human-readable formatting
4444
*/
45-
friend std::ostream& operator<<(std::ostream& os, cuspatial::box<T> const& b)
45+
friend std::ostream& operator<<(std::ostream& os, cuspatial::box<T, Vertex> const& b)
4646
{
4747
return os << "{" << b.v1 << ", " << b.v2 << "}";
4848
}

cpp/include/cuspatial_test/geometry_generator.cuh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333

3434
#include <ranger/ranger.hpp>
3535

36+
#include <cmath>
37+
3638
namespace cuspatial {
3739
namespace test {
3840

@@ -399,8 +401,9 @@ auto generate_multipoint_array(multipoint_generator_parameter<T> params,
399401
std::size_t{0},
400402
params.num_points_per_multipoints);
401403

402-
auto engine_x = deterministic_engine(params.num_points());
403-
auto engine_y = deterministic_engine(2 * params.num_points());
404+
auto golden_ratio = (1 + std::sqrt(T{5})) / 2;
405+
auto engine_x = deterministic_engine(golden_ratio * params.num_points());
406+
auto engine_y = deterministic_engine((1 / golden_ratio) * params.num_points());
404407

405408
auto x_dist = make_uniform_dist(params.lower_left.x, params.upper_right.x);
406409
auto y_dist = make_uniform_dist(params.lower_left.y, params.upper_right.y);

cpp/include/cuspatial_test/vector_factories.cuh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,17 @@ class multipoint_array {
363363
_geometry_offsets.begin(), _geometry_offsets.end(), _coordinates.begin(), _coordinates.end()};
364364
}
365365

366+
/**
367+
* @brief Copy the offset arrays to host.
368+
*/
369+
auto to_host() const
370+
{
371+
auto geometry_offsets = cuspatial::test::to_host<geometry_t>(_geometry_offsets);
372+
auto coordinate_offsets = cuspatial::test::to_host<coord_t>(_coordinates);
373+
374+
return std::tuple{geometry_offsets, coordinate_offsets};
375+
}
376+
366377
/// Release ownership
367378
auto release() { return std::pair{std::move(_geometry_offsets), std::move(_coordinates)}; }
368379

cpp/tests/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#=============================================================================
2-
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
2+
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
55
# you may not use this file except in compliance with the License.
@@ -158,6 +158,9 @@ ConfigureTest(JOIN_POINT_IN_POLYGON_SMALL_TEST_EXP
158158
ConfigureTest(JOIN_POINT_IN_POLYGON_LARGE_TEST_EXP
159159
join/quadtree_point_in_polygon_test_large.cu)
160160

161+
ConfigureTest(JOIN_POINT_IN_POLYGON_OOM_TEST_EXP
162+
join/quadtree_point_in_polygon_test_oom.cu)
163+
161164
ConfigureTest(JOIN_POINT_TO_LINESTRING_SMALL_TEST_EXP
162165
join/quadtree_point_to_nearest_linestring_test_small.cu)
163166

0 commit comments

Comments
 (0)