@@ -122,17 +122,25 @@ public:
122
122
LOG_DEBUG (" Initialized gpu_kernel_map" );
123
123
}
124
124
gpu_kernel_map (self_type const &other)
125
- : m_decomposed(other.m_decomposed),
126
- m_memory_size_byte (other.m_memory_size_byte),
127
- m_capacity{other.m_capacity }, m_memory{other.m_memory },
128
- m_allocator{other.m_allocator },
129
- m_kernel_size_map{other.m_kernel_size_map },
130
- m_kernel_offset_map{other.m_kernel_offset_map }, kernels{*this },
131
- in_maps{*this }, out_maps{*this } {
125
+ : m_decomposed(other.m_decomposed), //
126
+ m_requires_kernel_index (other.m_requires_kernel_index), //
127
+ m_memory_size_byte(other.m_memory_size_byte), //
128
+ m_capacity{other.m_capacity }, //
129
+ m_in_map_memory{other.m_in_map_memory }, //
130
+ m_out_map_memory{other.m_out_map_memory }, //
131
+ m_allocator{other.m_allocator }, //
132
+ m_kernel_size_map{other.m_kernel_size_map }, //
133
+ m_kernel_offset_map{other.m_kernel_offset_map }, //
134
+ kernels{*this }, //
135
+ in_maps{*this }, //
136
+ out_maps{*this } {
132
137
LOG_DEBUG (" gpu_kernel_map copy constructor" );
133
138
in_maps.data (other.in_maps .begin ());
134
139
out_maps.data (other.out_maps .begin ());
135
- kernels.data (other.kernels .begin ());
140
+ if (m_requires_kernel_index) {
141
+ m_kernel_index_memory = other.m_kernel_index_memory ;
142
+ kernels.data (other.kernels .begin ());
143
+ }
136
144
}
137
145
138
146
gpu_kernel_map (size_type capacity,
@@ -141,27 +149,37 @@ public:
141
149
: m_requires_kernel_index(requires_kernel_index), m_capacity{capacity},
142
150
m_allocator{alloc}, kernels{*this }, in_maps{*this }, out_maps{*this } {
143
151
// kernel map without kernel index
144
- m_memory_size_byte =
145
- (requires_kernel_index ? 3 : 2 ) * capacity * sizeof (index_type);
146
- index_type *ptr = reinterpret_cast <index_type *>(
152
+ m_memory_size_byte = capacity * sizeof (index_type);
153
+ index_type *ptr_in_map = reinterpret_cast <index_type *>(
154
+ m_allocator.allocate (m_memory_size_byte));
155
+ index_type *ptr_out_map = reinterpret_cast <index_type *>(
147
156
m_allocator.allocate (m_memory_size_byte));
157
+ index_type *ptr_kernel = nullptr ;
148
158
149
159
auto deleter = [](index_type *p, byte_allocator_type alloc,
150
160
size_type size) {
151
161
alloc.deallocate (reinterpret_cast <char *>(p), size);
152
162
LOG_DEBUG (" Deallocate kernel map" );
153
163
};
154
164
155
- m_memory = std::shared_ptr<index_type[]>{
156
- ptr, std::bind (deleter, std::placeholders::_1, m_allocator,
157
- m_memory_size_byte)};
158
-
165
+ m_in_map_memory = std::shared_ptr<index_type[]>{
166
+ ptr_in_map, std::bind (deleter, std::placeholders::_1, m_allocator,
167
+ m_memory_size_byte)};
168
+ m_out_map_memory = std::shared_ptr<index_type[]>{
169
+ ptr_out_map, std::bind (deleter, std::placeholders::_1, m_allocator,
170
+ m_memory_size_byte)};
159
171
// kernel maps
160
- in_maps.data (m_memory.get () + 0 * m_capacity);
161
- out_maps.data (m_memory.get () + 1 * m_capacity);
162
- kernels.data (m_memory.get () + 2 * m_capacity);
163
-
164
- if (!requires_kernel_index) {
172
+ in_maps.data (m_in_map_memory.get ());
173
+ out_maps.data (m_out_map_memory.get ());
174
+
175
+ if (requires_kernel_index) {
176
+ ptr_kernel = reinterpret_cast <index_type *>(
177
+ m_allocator.allocate (m_memory_size_byte));
178
+ m_kernel_index_memory = std::shared_ptr<index_type[]>{
179
+ ptr_kernel, std::bind (deleter, std::placeholders::_1, m_allocator,
180
+ m_memory_size_byte)};
181
+ kernels.data (m_kernel_index_memory.get ());
182
+ } else {
165
183
m_kernel_offset_map[0 ] = 0 ;
166
184
m_kernel_size_map[0 ] = capacity;
167
185
// Initialize the decomposed begins and sizes
@@ -171,10 +189,10 @@ public:
171
189
172
190
self_type swap () const {
173
191
self_type swapped_gpu_kernel_map (*this );
174
- swapped_gpu_kernel_map.in_maps .data (swapped_gpu_kernel_map. m_memory . get () +
175
- 1 * m_capacity );
176
- swapped_gpu_kernel_map.out_maps .data (swapped_gpu_kernel_map. m_memory . get () +
177
- 0 * m_capacity );
192
+ swapped_gpu_kernel_map.in_maps .data (
193
+ swapped_gpu_kernel_map. m_out_map_memory . get () );
194
+ swapped_gpu_kernel_map.out_maps .data (
195
+ swapped_gpu_kernel_map. m_in_map_memory . get () );
178
196
179
197
#ifdef DEBUG
180
198
size_type map_size = std::min<size_type>(in_maps.size (0 ), 100 );
@@ -217,7 +235,6 @@ public:
217
235
CUDA_CHECK (cudaDeviceSynchronize ());
218
236
std::free (p_kernel_map);
219
237
#endif
220
-
221
238
return swapped_gpu_kernel_map;
222
239
}
223
240
@@ -228,7 +245,9 @@ public:
228
245
m_memory_size_byte = other.m_memory_size_byte ;
229
246
m_capacity = other.m_capacity ;
230
247
231
- m_memory = other.m_memory ;
248
+ m_kernel_index_memory = other.m_kernel_index_memory ;
249
+ m_in_map_memory = other.m_in_map_memory ;
250
+ m_out_map_memory = other.m_out_map_memory ;
232
251
m_allocator = other.m_allocator ;
233
252
234
253
m_kernel_size_map = other.m_kernel_size_map ;
@@ -242,8 +261,6 @@ public:
242
261
}
243
262
244
263
// functions
245
- inline index_type *data () { return m_memory.get (); }
246
-
247
264
inline typename std::map<index_type, index_type>::const_iterator
248
265
key_cbegin () const {
249
266
return m_kernel_offset_map.cbegin ();
@@ -306,14 +323,23 @@ public:
306
323
));
307
324
308
325
#ifdef DEBUG
326
+ size_type map_size = std::min<size_type>(in_maps.size (0 ), 100 );
309
327
index_type *p_kernel_map =
310
- (index_type *)std::malloc (m_capacity * 3 * sizeof (index_type));
311
- CUDA_CHECK (cudaMemcpy (p_kernel_map, data (), m_memory_size_byte,
328
+ (index_type *)std::malloc (map_size * 3 * sizeof (index_type));
329
+ CUDA_CHECK (cudaMemcpy (p_kernel_map, m_kernel_index_memory.get (),
330
+ map_size * sizeof (index_type),
331
+ cudaMemcpyDeviceToHost));
332
+ CUDA_CHECK (cudaMemcpy (p_kernel_map + map_size, m_in_map_memory.get (),
333
+ map_size * sizeof (index_type),
334
+ cudaMemcpyDeviceToHost));
335
+ CUDA_CHECK (cudaMemcpy (p_kernel_map + 2 * map_size, m_out_map_memory.get (),
336
+ map_size * sizeof (index_type),
312
337
cudaMemcpyDeviceToHost));
338
+
313
339
for (index_type i = 0 ; i < std::min<size_type>(m_capacity, 100 ); ++i) {
314
- std::cout << p_kernel_map[i + 2 * m_capacity ] << " :"
315
- << p_kernel_map[i + 0 * m_capacity ] << " ->"
316
- << p_kernel_map[i + 1 * m_capacity ] << " \n " ;
340
+ std::cout << p_kernel_map[i + 0 * map_size ] << " :"
341
+ << p_kernel_map[i + 1 * map_size ] << " ->"
342
+ << p_kernel_map[i + 2 * map_size ] << " \n " ;
317
343
}
318
344
std::free (p_kernel_map);
319
345
#endif
@@ -376,7 +402,9 @@ private:
376
402
bool m_decomposed{false };
377
403
bool m_requires_kernel_index;
378
404
size_type m_memory_size_byte, m_capacity;
379
- std::shared_ptr<index_type[]> m_memory;
405
+ std::shared_ptr<index_type[]> m_kernel_index_memory;
406
+ std::shared_ptr<index_type[]> m_in_map_memory;
407
+ std::shared_ptr<index_type[]> m_out_map_memory;
380
408
byte_allocator_type m_allocator;
381
409
382
410
std::map<index_type, index_type> m_kernel_size_map;
0 commit comments