24
24
#include < cuspatial/range/multipolygon_range.cuh>
25
25
#include < cuspatial/traits.hpp>
26
26
27
+ #include < rmm/cuda_device.hpp>
27
28
#include < rmm/device_uvector.hpp>
28
29
#include < rmm/exec_policy.hpp>
29
30
#include < rmm/resource_ref.hpp>
30
31
31
32
#include < thrust/iterator/permutation_iterator.h>
33
+ #include < thrust/iterator/transform_iterator.h>
32
34
#include < thrust/scan.h>
33
35
34
- #include < cstdint >
36
+ #include < limits >
35
37
36
38
namespace cuspatial {
37
39
namespace detail {
@@ -57,7 +59,7 @@ struct compute_poly_and_point_indices {
57
59
using IndexType = iterator_value_type<QuadOffsetsIterator>;
58
60
59
61
inline thrust::tuple<IndexType, IndexType> __device__
60
- operator ()(IndexType const global_index) const
62
+ operator ()(std:: uint64_t const global_index) const
61
63
{
62
64
auto const [quad_poly_index, local_point_index] =
63
65
get_quad_and_local_point_indices (global_index, point_offsets_begin, point_offsets_end);
@@ -118,16 +120,26 @@ std::pair<rmm::device_uvector<IndexType>, rmm::device_uvector<IndexType>> quadtr
118
120
119
121
auto num_poly_quad_pairs = std::distance (poly_indices_first, poly_indices_last);
120
122
121
- auto quad_lengths_iter =
122
- thrust::make_permutation_iterator (quadtree.length_begin (), quad_indices_first);
123
+ // The quadtree length is an iterator of uint32_t, but we have to transform into uint64_t values
124
+ // so the thrust::inclusive_scan accumulates into uint64_t outputs. Changing the output iterator
125
+ // to uint64_t isn't sufficient to achieve this behavior.
126
+ auto quad_lengths_iter = thrust::make_transform_iterator (
127
+ thrust::make_permutation_iterator (quadtree.length_begin (), quad_indices_first),
128
+ cuda::proclaim_return_type<std::uint64_t >([] __device__ (IndexType const & i) -> std::uint64_t {
129
+ return static_cast <std::uint64_t >(i);
130
+ }));
123
131
124
132
auto quad_offsets_iter =
125
133
thrust::make_permutation_iterator (quadtree.offset_begin (), quad_indices_first);
126
134
127
- // Compute a "local" set of zero-based point offsets from number of points in each quadrant
135
+ // Compute a "local" set of zero-based point offsets from the number of points in each quadrant.
136
+ //
128
137
// Use `num_poly_quad_pairs + 1` as the length so that the last element produced by
129
138
// `inclusive_scan` is the total number of points to be tested against any polygon.
130
- rmm::device_uvector<IndexType> local_point_offsets (num_poly_quad_pairs + 1 , stream);
139
+ //
140
+ // Accumulate into uint64_t, because the prefix sums can overflow the size of uint32_t
141
+ // when testing a large number of polygons against a large quadtree.
142
+ rmm::device_uvector<std::uint64_t > local_point_offsets (num_poly_quad_pairs + 1 , stream);
131
143
132
144
// inclusive scan of quad_lengths_iter
133
145
thrust::inclusive_scan (rmm::exec_policy (stream),
@@ -136,21 +148,27 @@ std::pair<rmm::device_uvector<IndexType>, rmm::device_uvector<IndexType>> quadtr
136
148
local_point_offsets.begin () + 1 );
137
149
138
150
// Ensure local point offsets starts at 0
139
- IndexType init{0 };
151
+ std:: uint64_t init{0 };
140
152
local_point_offsets.set_element_async (0 , init, stream);
141
153
142
154
// The last element is the total number of points to test against any polygon.
143
155
auto num_total_points = local_point_offsets.back_element (stream);
144
156
145
- // Allocate the output polygon and point index pair vectors
146
- rmm::device_uvector<IndexType> poly_indices (num_total_points, stream);
147
- rmm::device_uvector<IndexType> point_indices (num_total_points, stream);
148
-
149
- auto poly_and_point_indices =
150
- thrust::make_zip_iterator (poly_indices.begin (), point_indices.begin ());
151
-
152
- // Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction)
153
- auto point_xys_iter = thrust::make_permutation_iterator (points_first, point_indices_first);
157
+ // The largest supported input size for thrust::count_if/copy_if is INT32_MAX.
158
+ // This functor iterates over the input space and processes up to INT32_MAX elements at a time.
159
+ std::uint64_t max_points_to_test = std::numeric_limits<std::int32_t >::max ();
160
+ auto count_in_chunks = [&](auto const & func) {
161
+ std::uint64_t memo{};
162
+ for (std::uint64_t offset{0 }; offset < num_total_points; offset += max_points_to_test) {
163
+ memo += func (memo, offset, std::min (max_points_to_test, num_total_points - offset));
164
+ }
165
+ return memo;
166
+ };
167
+
168
+ detail::test_poly_point_intersection test_poly_point_pair{
169
+ // Enumerate the point X/Ys using the sorted `point_indices` (from quadtree construction)
170
+ thrust::make_permutation_iterator (points_first, point_indices_first),
171
+ polygons};
154
172
155
173
// Compute the combination of polygon and point index pairs. For each polygon/quadrant pair,
156
174
// enumerate pairs of (poly_index, point_index) for each point in each quadrant.
@@ -163,28 +181,57 @@ std::pair<rmm::device_uvector<IndexType>, rmm::device_uvector<IndexType>> quadtr
163
181
// pp_pairs.append((polygon, point))
164
182
// ```
165
183
//
166
- auto global_to_poly_and_point_indices = detail::make_counting_transform_iterator (
167
- 0 ,
168
- detail::compute_poly_and_point_indices{quad_offsets_iter,
169
- local_point_offsets.begin (),
170
- local_point_offsets.end (),
171
- poly_indices_first});
172
-
173
- // Compute the number of intersections by removing (poly, point) pairs that don't intersect
174
- auto num_intersections = thrust::distance (
175
- poly_and_point_indices,
176
- thrust::copy_if (rmm::exec_policy (stream),
177
- global_to_poly_and_point_indices,
178
- global_to_poly_and_point_indices + num_total_points,
179
- poly_and_point_indices,
180
- detail::test_poly_point_intersection{point_xys_iter, polygons}));
181
-
182
- poly_indices.resize (num_intersections, stream);
183
- poly_indices.shrink_to_fit (stream);
184
- point_indices.resize (num_intersections, stream);
185
- point_indices.shrink_to_fit (stream);
186
-
187
- return std::pair{std::move (poly_indices), std::move (point_indices)};
184
+ auto global_to_poly_and_point_indices = [&](auto offset = 0 ) {
185
+ return detail::make_counting_transform_iterator (
186
+ offset,
187
+ detail::compute_poly_and_point_indices{quad_offsets_iter,
188
+ local_point_offsets.begin (),
189
+ local_point_offsets.end (),
190
+ poly_indices_first});
191
+ };
192
+
193
+ auto run_quadtree_point_in_polygon = [&](auto output_size) {
194
+ // Allocate the output polygon and point index pair vectors
195
+ rmm::device_uvector<IndexType> poly_indices (output_size, stream);
196
+ rmm::device_uvector<IndexType> point_indices (output_size, stream);
197
+
198
+ auto num_intersections = count_in_chunks ([&](auto memo, auto offset, auto size) {
199
+ auto poly_and_point_indices =
200
+ thrust::make_zip_iterator (poly_indices.begin (), point_indices.begin ()) + memo;
201
+ // Remove (poly, point) pairs that don't intersect
202
+ return thrust::distance (poly_and_point_indices,
203
+ thrust::copy_if (rmm::exec_policy (stream),
204
+ global_to_poly_and_point_indices (offset),
205
+ global_to_poly_and_point_indices (offset) + size,
206
+ poly_and_point_indices,
207
+ test_poly_point_pair));
208
+ });
209
+
210
+ if (num_intersections < output_size) {
211
+ poly_indices.resize (num_intersections, stream);
212
+ point_indices.resize (num_intersections, stream);
213
+ poly_indices.shrink_to_fit (stream);
214
+ point_indices.shrink_to_fit (stream);
215
+ }
216
+
217
+ return std::pair{std::move (poly_indices), std::move (point_indices)};
218
+ };
219
+
220
+ try {
221
+ // First attempt to run the hit test assuming allocating space for all possible intersections
222
+ // fits into the available memory.
223
+ return run_quadtree_point_in_polygon (num_total_points);
224
+ } catch (rmm::out_of_memory const &) {
225
+ // If we OOM the first time, pre-compute the number of hits and allocate only that amount of
226
+ // space for the output buffers. This halves performance, but it should at least return valid
227
+ // results.
228
+ return run_quadtree_point_in_polygon (count_in_chunks ([&](auto memo, auto offset, auto size) {
229
+ return thrust::count_if (rmm::exec_policy (stream),
230
+ global_to_poly_and_point_indices (offset),
231
+ global_to_poly_and_point_indices (offset) + size,
232
+ test_poly_point_pair);
233
+ }));
234
+ }
188
235
}
189
236
190
237
} // namespace cuspatial
0 commit comments