Skip to content

Commit aafbb67

Browse files
committed
separate memory chunks for gpu kernel map
1 parent 4868c3a commit aafbb67

File tree

3 files changed

+67
-35
lines changed

3 files changed

+67
-35
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
- Added `to_sparse()` that removes zeros. (issue #317)
1818
- Previous `to_sparse()` was renamed to `to_sparse_all()`
1919
- `MinkowskiToSparseTensor` takes an optional `remove_zeros` boolean argument.
20+
- Fix global max pool with batch size 1
21+
- Use separate memory chunks for the in map, out map, and kernel indices in `gpu_kernel_map`, fixing a GPU memory misalignment error
2022

2123

2224
## [0.5.1]

src/kernel_map.cuh

Lines changed: 62 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -122,17 +122,25 @@ public:
122122
LOG_DEBUG("Initialized gpu_kernel_map");
123123
}
124124
gpu_kernel_map(self_type const &other)
125-
: m_decomposed(other.m_decomposed),
126-
m_memory_size_byte(other.m_memory_size_byte),
127-
m_capacity{other.m_capacity}, m_memory{other.m_memory},
128-
m_allocator{other.m_allocator},
129-
m_kernel_size_map{other.m_kernel_size_map},
130-
m_kernel_offset_map{other.m_kernel_offset_map}, kernels{*this},
131-
in_maps{*this}, out_maps{*this} {
125+
: m_decomposed(other.m_decomposed), //
126+
m_requires_kernel_index(other.m_requires_kernel_index), //
127+
m_memory_size_byte(other.m_memory_size_byte), //
128+
m_capacity{other.m_capacity}, //
129+
m_in_map_memory{other.m_in_map_memory}, //
130+
m_out_map_memory{other.m_out_map_memory}, //
131+
m_allocator{other.m_allocator}, //
132+
m_kernel_size_map{other.m_kernel_size_map}, //
133+
m_kernel_offset_map{other.m_kernel_offset_map}, //
134+
kernels{*this}, //
135+
in_maps{*this}, //
136+
out_maps{*this} {
132137
LOG_DEBUG("gpu_kernel_map copy constructor");
133138
in_maps.data(other.in_maps.begin());
134139
out_maps.data(other.out_maps.begin());
135-
kernels.data(other.kernels.begin());
140+
if (m_requires_kernel_index) {
141+
m_kernel_index_memory = other.m_kernel_index_memory;
142+
kernels.data(other.kernels.begin());
143+
}
136144
}
137145

138146
gpu_kernel_map(size_type capacity,
@@ -141,27 +149,37 @@ public:
141149
: m_requires_kernel_index(requires_kernel_index), m_capacity{capacity},
142150
m_allocator{alloc}, kernels{*this}, in_maps{*this}, out_maps{*this} {
143151
// kernel map without kernel index
144-
m_memory_size_byte =
145-
(requires_kernel_index ? 3 : 2) * capacity * sizeof(index_type);
146-
index_type *ptr = reinterpret_cast<index_type *>(
152+
m_memory_size_byte = capacity * sizeof(index_type);
153+
index_type *ptr_in_map = reinterpret_cast<index_type *>(
154+
m_allocator.allocate(m_memory_size_byte));
155+
index_type *ptr_out_map = reinterpret_cast<index_type *>(
147156
m_allocator.allocate(m_memory_size_byte));
157+
index_type *ptr_kernel = nullptr;
148158

149159
auto deleter = [](index_type *p, byte_allocator_type alloc,
150160
size_type size) {
151161
alloc.deallocate(reinterpret_cast<char *>(p), size);
152162
LOG_DEBUG("Deallocate kernel map");
153163
};
154164

155-
m_memory = std::shared_ptr<index_type[]>{
156-
ptr, std::bind(deleter, std::placeholders::_1, m_allocator,
157-
m_memory_size_byte)};
158-
165+
m_in_map_memory = std::shared_ptr<index_type[]>{
166+
ptr_in_map, std::bind(deleter, std::placeholders::_1, m_allocator,
167+
m_memory_size_byte)};
168+
m_out_map_memory = std::shared_ptr<index_type[]>{
169+
ptr_out_map, std::bind(deleter, std::placeholders::_1, m_allocator,
170+
m_memory_size_byte)};
159171
// kernel maps
160-
in_maps.data(m_memory.get() + 0 * m_capacity);
161-
out_maps.data(m_memory.get() + 1 * m_capacity);
162-
kernels.data(m_memory.get() + 2 * m_capacity);
163-
164-
if (!requires_kernel_index) {
172+
in_maps.data(m_in_map_memory.get());
173+
out_maps.data(m_out_map_memory.get());
174+
175+
if (requires_kernel_index) {
176+
ptr_kernel = reinterpret_cast<index_type *>(
177+
m_allocator.allocate(m_memory_size_byte));
178+
m_kernel_index_memory = std::shared_ptr<index_type[]>{
179+
ptr_kernel, std::bind(deleter, std::placeholders::_1, m_allocator,
180+
m_memory_size_byte)};
181+
kernels.data(m_kernel_index_memory.get());
182+
} else {
165183
m_kernel_offset_map[0] = 0;
166184
m_kernel_size_map[0] = capacity;
167185
// Initialize the decomposed begins and sizes
@@ -171,10 +189,10 @@ public:
171189

172190
self_type swap() const {
173191
self_type swapped_gpu_kernel_map(*this);
174-
swapped_gpu_kernel_map.in_maps.data(swapped_gpu_kernel_map.m_memory.get() +
175-
1 * m_capacity);
176-
swapped_gpu_kernel_map.out_maps.data(swapped_gpu_kernel_map.m_memory.get() +
177-
0 * m_capacity);
192+
swapped_gpu_kernel_map.in_maps.data(
193+
swapped_gpu_kernel_map.m_out_map_memory.get());
194+
swapped_gpu_kernel_map.out_maps.data(
195+
swapped_gpu_kernel_map.m_in_map_memory.get());
178196

179197
#ifdef DEBUG
180198
size_type map_size = std::min<size_type>(in_maps.size(0), 100);
@@ -217,7 +235,6 @@ public:
217235
CUDA_CHECK(cudaDeviceSynchronize());
218236
std::free(p_kernel_map);
219237
#endif
220-
221238
return swapped_gpu_kernel_map;
222239
}
223240

@@ -228,7 +245,9 @@ public:
228245
m_memory_size_byte = other.m_memory_size_byte;
229246
m_capacity = other.m_capacity;
230247

231-
m_memory = other.m_memory;
248+
m_kernel_index_memory = other.m_kernel_index_memory;
249+
m_in_map_memory = other.m_in_map_memory;
250+
m_out_map_memory = other.m_out_map_memory;
232251
m_allocator = other.m_allocator;
233252

234253
m_kernel_size_map = other.m_kernel_size_map;
@@ -242,8 +261,6 @@ public:
242261
}
243262

244263
// functions
245-
inline index_type *data() { return m_memory.get(); }
246-
247264
inline typename std::map<index_type, index_type>::const_iterator
248265
key_cbegin() const {
249266
return m_kernel_offset_map.cbegin();
@@ -306,14 +323,23 @@ public:
306323
));
307324

308325
#ifdef DEBUG
326+
size_type map_size = std::min<size_type>(in_maps.size(0), 100);
309327
index_type *p_kernel_map =
310-
(index_type *)std::malloc(m_capacity * 3 * sizeof(index_type));
311-
CUDA_CHECK(cudaMemcpy(p_kernel_map, data(), m_memory_size_byte,
328+
(index_type *)std::malloc(map_size * 3 * sizeof(index_type));
329+
CUDA_CHECK(cudaMemcpy(p_kernel_map, m_kernel_index_memory.get(),
330+
map_size * sizeof(index_type),
331+
cudaMemcpyDeviceToHost));
332+
CUDA_CHECK(cudaMemcpy(p_kernel_map + map_size, m_in_map_memory.get(),
333+
map_size * sizeof(index_type),
334+
cudaMemcpyDeviceToHost));
335+
CUDA_CHECK(cudaMemcpy(p_kernel_map + 2 * map_size, m_out_map_memory.get(),
336+
map_size * sizeof(index_type),
312337
cudaMemcpyDeviceToHost));
338+
313339
for (index_type i = 0; i < std::min<size_type>(m_capacity, 100); ++i) {
314-
std::cout << p_kernel_map[i + 2 * m_capacity] << ":"
315-
<< p_kernel_map[i + 0 * m_capacity] << "->"
316-
<< p_kernel_map[i + 1 * m_capacity] << "\n";
340+
std::cout << p_kernel_map[i + 0 * map_size] << ":"
341+
<< p_kernel_map[i + 1 * map_size] << "->"
342+
<< p_kernel_map[i + 2 * map_size] << "\n";
317343
}
318344
std::free(p_kernel_map);
319345
#endif
@@ -376,7 +402,9 @@ private:
376402
bool m_decomposed{false};
377403
bool m_requires_kernel_index;
378404
size_type m_memory_size_byte, m_capacity;
379-
std::shared_ptr<index_type[]> m_memory;
405+
std::shared_ptr<index_type[]> m_kernel_index_memory;
406+
std::shared_ptr<index_type[]> m_in_map_memory;
407+
std::shared_ptr<index_type[]> m_out_map_memory;
380408
byte_allocator_type m_allocator;
381409

382410
std::map<index_type, index_type> m_kernel_size_map;

tests/python/convolution.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
from tests.python.common import data_loader, load_file
4646
from utils.gradcheck import gradcheck
4747

48-
LEAK_TEST_ITER = 10000000
48+
LEAK_TEST_ITER = 100000
4949

5050

5151
class TestConvolution(unittest.TestCase):
@@ -220,6 +220,8 @@ def test(self):
220220
for i in range(LEAK_TEST_ITER):
221221
input = SparseTensor(feats, coordinates=coords)
222222
conv(input).F.sum().backward()
223+
if i % 1000 == 0:
224+
print(i)
223225

224226
def test_analytic(self):
225227
print(f"{self.__class__.__name__}: test")

0 commit comments

Comments
 (0)