@@ -87,10 +87,6 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::WriteChunk(const
   char uuid_string[36];
   uuid_unparse(uuid_binary, uuid_string);
   std::string chunk_filename = std::string(uuid_string) + ".bin";
-
-  // if(chunk_filename.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_-.") != std::string::npos) {
-  //   throw std::runtime_error("Error: non-compliant chunk filename detected!");
-  // }
 
   IndexVector stop_ind = start_ind + chunk.shape();
 
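For reference, here is a minimal standalone sketch of the filename scheme used in WriteChunk above, assuming libuuid (`uuid_generate` / `uuid_unparse`); the helper name is hypothetical and not part of the diff. Note that `uuid_unparse` writes 36 characters plus a terminating '\0', so the sketch uses a 37-byte buffer.

    // Illustrative sketch only (not part of the diff); assumes libuuid is linked (-luuid).
    #include <uuid/uuid.h>
    #include <string>

    std::string random_chunk_filename() {
      uuid_t uuid_binary;
      uuid_generate(uuid_binary);         // fill with a random 128-bit UUID
      char uuid_string[37];               // uuid_unparse writes 36 chars + terminating '\0'
      uuid_unparse(uuid_binary, uuid_string);
      return std::string(uuid_string) + ".bin";
    }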
@@ -111,22 +107,17 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::WriteChunk(const
   // std::size_t sparse_vs_dense_expense_ratio = 3; // when only counting storage space: sparse storage is approximately 3x as expensive as dense storage per nonzero element
   std::size_t sparse_vs_dense_expense_ratio = 20; // when also counting complexity of deserializing + rebuilding a dense chunk
   if(sparse_vs_dense_expense_ratio * num_nonzero_elems < num_elems) {
-
-    // std::cout << "going to sparsify" << std::endl;
-
+
     auto to_keep = [](scalar_t value) -> bool {
       return value != 0.0;
     };
     sparse_t sparse_chunk = sparse_t::From(chunk, to_keep, 0.0);
     meta.chunk_type = ChunkType::sparse;
-
-    // std::cout << "after sparsification, " << sparse_chunk.NumEntries() << " entries remain" << std::endl;
 
     m_ser.template serialize<ChunkMetadata>(ofs, meta);
     m_ser.template serialize<sparse_t>(ofs, sparse_chunk);
   }
-  else {
-    // std::cout << "store as dense" << std::endl;
+  else {
     m_ser.template serialize<ChunkMetadata>(ofs, meta);
     m_ser.template serialize<dense_t>(ofs, chunk);
   }
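The branch above encodes a simple cost model: with `sparse_vs_dense_expense_ratio = 20`, a chunk is serialized sparsely only when fewer than 1/20 of its elements (5%) are nonzero. A minimal sketch of that decision as a free function follows; the function name and the example numbers are hypothetical, only the threshold logic is taken from the diff.

    // Illustrative sketch only; store_as_sparse is a hypothetical helper.
    #include <cstddef>

    bool store_as_sparse(std::size_t num_nonzero_elems, std::size_t num_elems) {
      constexpr std::size_t sparse_vs_dense_expense_ratio = 20;
      return sparse_vs_dense_expense_ratio * num_nonzero_elems < num_elems;
    }
    // e.g. a chunk with 1'000'000 elements is stored sparsely only if it has
    // fewer than 50'000 nonzero entries, i.e. below 5% occupancy.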
@@ -168,65 +159,30 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::rebuildIndex() {
     ifs.open(dir_entry.path(), std::ios::in | std::ios::binary);
     ChunkMetadata meta = m_ser.template deserialize<ChunkMetadata>(ifs);
     ifs.close();
-
-    // std::cout << "found metadata containing filename: '" + meta.filename + "' from path '" + std::string(dir_entry.path()) + "'!" << std::endl;
-
-    // if(meta.filename.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_-.") != std::string::npos) {
-    //   throw std::runtime_error("Error: found metadata containing non-compliant filename: '" + meta.filename + "' from path '" + std::string(dir_entry.path()) + "'!");
-    // }
 
     m_chunk_index.push_back(meta);
   }
 }
 
 template <class T, std::size_t dims, template <class, std::size_t> class DenseT, template <class, std::size_t> class SparseT, class SerializerT>
 void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::RebuildChunks(const IndexVector& requested_chunk_size) {
-
-  // How to make sure own index doesn't get messed up when new chunks are generated?
-
-  // 0) factor out most things of currently existing RegisterChunk into new private method:
-  // -> WriteChunk(chunk, start_ind, add_to_index)
-  // (keep the overlap checking etc. in the original RegisterChunk)
-
-  // Then, this function will look like
-  // 0) rebuild index (to make sure everything is taken into account)
-  // 1) then loop over chunks like in `WeightingFieldUtils`
-  // -> get chunk that has the starting indices
-  // -> if it conforms to the requested size, don't touch it and continue (will guarantee this function is idempotent)
-  // -> if it doesn't, build a chunk of correct size (using NDArray::range)
-  // 2) write the individual chunks, but don't touch the index (using `WriteChunk`)
-  // 3) remove all chunks in the index (-> this will remove all the old ones), BUT not those that haven't been touched because they are already conforming to the correct size
-
-  // std::cout << "in RebuildChunks" << std::endl;
 
   if(requested_chunk_size.size() != dims) {
     throw std::runtime_error("Error: requested chunk size has wrong dimensionality!");
   }
-
-  // std::cout << "rebuilding index" << std::endl;
 
   // Make sure we start from a clean index
   rebuildIndex();
-
-  // std::cout << "rebuilt index" << std::endl;
 
   if(!isGloballyContiguous(getGlobalStartInd(), getGlobalStopInd())) {
     throw std::runtime_error("Error: refusing to rebuild chunks for a non-contiguous array!");
   }
 
   calculateShape();
 
-  IndexVector global_shape(this->m_shape);
-
-  // std::cout << "global shape" << std::endl;
-  // global_shape.print();
-
+  IndexVector global_shape(this->m_shape);
   IndexVector number_required_chunks = (global_shape + requested_chunk_size - 1) / requested_chunk_size;
 
-  // std::cout << "will have " << std::endl;
-  // number_required_chunks.print();
-  // std::cout << " chunks after rebuilding" << std::endl;
-
   index_t chunks_to_keep;
 
   IndexVector buf_inds_start(dims, 0);
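The expression `(global_shape + requested_chunk_size - 1) / requested_chunk_size` is an element-wise integer ceiling division that counts how many chunks are needed along each axis. A scalar sketch with hypothetical names and example values:

    // Illustrative sketch only.
    #include <cstddef>

    std::size_t num_chunks_along_axis(std::size_t extent, std::size_t chunk_size) {
      return (extent + chunk_size - 1) / chunk_size;   // integer ceil(extent / chunk_size)
    }
    // e.g. num_chunks_along_axis(10, 4) == 3 and num_chunks_along_axis(25, 8) == 4,
    // matching the element-wise IndexVector computation above.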
@@ -244,15 +200,11 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::RebuildChunks(co
     dense_t current_chunk = retrieveChunk(chunk_index);
 
     if(actual_chunk_shape == current_chunk.shape()) {
-      // std::cout << "chunk already has the correct size, keep it" << std::endl;
       chunks_to_keep.push_back(m_chunk_index[chunk_index]);
       continue;
     }
 
-    // std::cout << "now working on rebuild chunk with inds" << std::endl;
-    // std::cout << "chunk_inds_start = " << std::endl;
     chunk_inds_start.print();
-    // std::cout << "chunk_inds_end = " << std::endl;
     chunk_inds_end.print();
 
     dense_t chunk = range(chunk_inds_start, chunk_inds_end);
@@ -278,30 +230,13 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::MergeChunks(std:
 template <class T, std::size_t dims, template <class, std::size_t> class DenseT, template <class, std::size_t> class SparseT, class SerializerT>
 void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::MergeChunks(std::size_t dim_to_merge, std::size_t max_dimsize) {
 
-  // 0) Rebuild index (to make sure we have the full picture)
-
-  // 2) Then, start with chunk that has the global start inds
-
-  // --> operate entirely on the index and prepare a full list of chunk mergings that need to happen (without actually doing anything)
-  // --> remove any elements from the index that are already taken care of
-  // --> check if there are any problems; if not, go ahead and implement the chunk mergings
-  // --> go to the chunk that neighbours the current main one and continue
-
-  // 3) again, have the to_keep mechanism
-
-  // std::cout << "in MergeNeighbouringChunks" << std::endl;
-
   rebuildIndex();
-
-  // std::cout << "rebuilt index" << std::endl;
 
   if(!isGloballyContiguous(getGlobalStartInd(), getGlobalStopInd())) {
     throw std::runtime_error("Error: refusing to merge chunks for a non-contiguous array!");
   }
 
   calculateShape();
-
-  // std::cout << "have a total of " << m_chunk_index.size() << " chunks before merging" << std::endl;
 
   // put chunks in order along the merging axis
   std::vector<std::size_t> chunk_order(m_chunk_index.size());
@@ -338,14 +273,10 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::MergeChunks(std:
     chunks_to_merge.push_back(retrieveChunk(cur_chunk_index));
     std::size_t output_chunk_shape = chunks_to_merge.back().shape(dim_to_merge);
     std::size_t neighbour_chunk_index = cur_chunk_index;
-
-    // std::cout << "start merging tracer from chunk with index = " << neighbour_chunk_index << std::endl;
 
     while(true) {
       try {
         neighbour_chunk_index = getNeighbouringChunkIndex(neighbour_chunk_index, dim_to_merge);
-
-        // std::cout << "have neighbouring chunk with index = " << neighbour_chunk_index << " / " << m_chunk_index.size() << std::endl;
 
         chunks_to_merge.push_back(retrieveChunk(neighbour_chunk_index));
         std::erase(chunk_order, neighbour_chunk_index);
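`std::erase(chunk_order, neighbour_chunk_index)` in the context line above is the C++20 free function that removes all elements equal to a value from a container; a small self-contained usage sketch (the demo function and values are hypothetical), with the pre-C++20 equivalent shown as a comment:

    // Illustrative sketch only.
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void erase_demo() {
      std::vector<std::size_t> chunk_order{0, 3, 1, 3, 2};
      std::erase(chunk_order, std::size_t{3});   // chunk_order is now {0, 1, 2}
      // pre-C++20 equivalent (erase-remove idiom):
      // chunk_order.erase(std::remove(chunk_order.begin(), chunk_order.end(), 3), chunk_order.end());
    }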
@@ -364,31 +295,7 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::MergeChunks(std:
 
     // perform the merge and write the result to disk
     dense_t output_chunk = NDArrayOps::concatenate(chunks_to_merge, dim_to_merge);
-    WriteChunk(output_chunk, cur_chunk_meta.start_ind, false);
-
-    // dense_t output_chunk = retrieveChunk(cur_chunk_index);
-    // std::size_t neighbour_chunk_index = cur_chunk_index;
-    // while(true) {
-    //   try {
-    //     neighbour_chunk_index = getNeighbouringChunkIndex(neighbour_chunk_index, dim_to_merge);
-    //     std::erase(chunk_order, neighbour_chunk_index);
-
-    //     std::cout << "merging chunk " << cur_chunk_index << " with chunk " << neighbour_chunk_index << std::endl;
-
-    //     output_chunk = NDArrayOps::concatenate(output_chunk, retrieveChunk(neighbour_chunk_index), dim_to_merge);
-
-    //     // chunk is now large enough
-    //     if(output_chunk.shape(dim_to_merge) >= max_dimsize) {
-    //       break;
-    //     }
-    //   } catch(const ChunkNotFoundError& e) {
-    //     // there are no more neighbours in this direction; stop here
-    //     break;
-    //   }
-    // }
-
-    // // write it to disk
-    // WriteChunk(output_chunk, cur_chunk_meta.start_ind, false);
+    WriteChunk(output_chunk, cur_chunk_meta.start_ind, false);
   }
 
   // remove all the chunks in the old index that should not be kept around
@@ -438,12 +345,6 @@ std::size_t DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::getNeighb
 
   std::size_t neighbour_chunk_ind = getChunkIndex(neighbour_chunk_start_ind);
 
-  // if(neighbour_chunk_ind > m_chunk_index.size()) {
-  //   std::cout << "MMMM Found out-of-bounds neighbouring chunk: " + std::to_string(neighbour_chunk_ind) + "/" + std::to_string(m_chunk_index.size()) << std::endl;
-  //   std::cout << "MMMM From request with neighbour_chunk_start_ind = " << std::endl;
-  //   neighbour_chunk_start_ind.print();
-  // }
-
   return neighbour_chunk_ind;
 }
 
@@ -502,18 +403,15 @@ std::size_t DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::getChunkI
 
   if(chunkContainsInds(m_chunk_index[m_chunk_last_accessed], inds)) {
     [[likely]];
-    // std::cout << "NNN m_chunk_last_accessed gave " << m_chunk_last_accessed << std::endl;
     return m_chunk_last_accessed;
   }
   else {
     // Trigger a full chunk lookup
     // TODO: have a search tree here with logarithmic instead of linear complexity
-    // std::cout << "NNN trigger full chunk lookup" << std::endl;
     std::size_t chunk_ind = 0;
     for(chunk_ind = 0; chunk_ind < m_chunk_index.size(); chunk_ind++) {
       if(chunkContainsInds(m_chunk_index[chunk_ind], inds)) {
         m_chunk_last_accessed = chunk_ind;
-        // std::cout << "NNN full chunk lookup gave " << chunk_ind << std::endl;
         return chunk_ind;
       }
     }
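The lookup above caches the index of the last matching chunk (`m_chunk_last_accessed`) and only falls back to a linear scan on a miss; the TODO notes the scan could eventually become a logarithmic search structure. A generic sketch of this cache-then-scan pattern follows; the names are hypothetical stand-ins for `ChunkMetadata` / `chunkContainsInds`, and the sketch throws on a miss purely for illustration, which the real method does not necessarily do.

    // Illustrative sketch only.
    #include <cstddef>
    #include <stdexcept>
    #include <vector>

    template <class Meta, class Contains>
    std::size_t lookup_with_cache(const std::vector<Meta>& index, std::size_t& last_accessed, Contains contains) {
      if(last_accessed < index.size() && contains(index[last_accessed])) {
        return last_accessed;                            // fast path: repeated accesses to the same chunk
      }
      for(std::size_t i = 0; i < index.size(); i++) {    // slow path: O(N) scan over the chunk index
        if(contains(index[i])) {
          last_accessed = i;
          return i;
        }
      }
      throw std::out_of_range("no chunk contains the requested indices");
    }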
@@ -549,10 +447,6 @@ DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::dense_t& DistributedN
   std::fstream ifs;
   std::string chunk_path = m_dirpath + "/" + chunk_meta.filename;
   std::cout << "Loading chunk from " + chunk_path + " ... ";
-
-  // if(chunk_meta.filename.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_-.") != std::string::npos) {
-  //   throw std::runtime_error("Error: trying to open chunk " + std::to_string(chunk_ind) + "/" + std::to_string(m_chunk_index.size()) + " from file with non-compliant name: '" + chunk_meta.filename + "'!");
-  // }
 
   if(!std::filesystem::exists(chunk_path)) {
     throw std::runtime_error("Error: trying to access non-existant file '" + chunk_path + "'!");
@@ -566,7 +460,6 @@ DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::dense_t& DistributedN
   }
   else if(meta.chunk_type == ChunkType::sparse) {
     m_chunk_cache.insert({chunk_ind, dense_t::FromSparseFile(ifs)});
-    // m_chunk_cache.insert({chunk_ind, dense_t::From(m_ser.template deserialize<sparse_t>(ifs))});
   }
   else {
     throw std::runtime_error("Error: unknown chunk type encountered!");