@@ -87,6 +87,10 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::WriteChunk(const
87
87
char uuid_string[36 ];
88
88
uuid_unparse (uuid_binary, uuid_string);
89
89
std::string chunk_filename = std::string (uuid_string) + " .bin" ;
90
+
91
+ // if(chunk_filename.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_-.") != std::string::npos) {
92
+ // throw std::runtime_error("Error: non-compliant chunk filename detected!");
93
+ // }
90
94
91
95
IndexVector stop_ind = start_ind + chunk.shape ();
92
96
@@ -108,21 +112,21 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::WriteChunk(const
108
112
std::size_t sparse_vs_dense_expense_ratio = 20 ; // when also counting complexity of deserializing + rebuilding a dense chunk
109
113
if (sparse_vs_dense_expense_ratio * num_nonzero_elems < num_elems) {
110
114
111
- std::cout << " going to sparsify" << std::endl;
115
+ // std::cout << "going to sparsify" << std::endl;
112
116
113
117
auto to_keep = [](scalar_t value) -> bool {
114
118
return value != 0.0 ;
115
119
};
116
120
sparse_t sparse_chunk = sparse_t::From (chunk, to_keep, 0.0 );
117
121
meta.chunk_type = ChunkType::sparse;
118
122
119
- std::cout << " after sparsification, " << sparse_chunk.NumEntries () << " entries remain" << std::endl;
123
+ // std::cout << "after sparsification, " << sparse_chunk.NumEntries() << " entries remain" << std::endl;
120
124
121
125
m_ser.template serialize <ChunkMetadata>(ofs, meta);
122
126
m_ser.template serialize <sparse_t >(ofs, sparse_chunk);
123
127
}
124
128
else {
125
- std::cout << " store as dense" << std::endl;
129
+ // std::cout << "store as dense" << std::endl;
126
130
m_ser.template serialize <ChunkMetadata>(ofs, meta);
127
131
m_ser.template serialize <dense_t >(ofs, chunk);
128
132
}
@@ -164,6 +168,12 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::rebuildIndex() {
164
168
ifs.open (dir_entry.path (), std::ios::in | std::ios::binary);
165
169
ChunkMetadata meta = m_ser.template deserialize <ChunkMetadata>(ifs);
166
170
ifs.close ();
171
+
172
+ // std::cout << "found metadata containing filename: '" + meta.filename + "' from path '" + std::string(dir_entry.path()) + "'!" << std::endl;
173
+
174
+ // if(meta.filename.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_-.") != std::string::npos) {
175
+ // throw std::runtime_error("Error: found metadata containing non-compliant filename: '" + meta.filename + "' from path '" + std::string(dir_entry.path()) + "'!");
176
+ // }
167
177
168
178
m_chunk_index.push_back (meta);
169
179
}
@@ -187,18 +197,18 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::RebuildChunks(co
187
197
// 2) write the individual chunks, but don't touch the index (using `WriteChunk`)
188
198
// 3) remove all chunks in the index (-> this will remove all the old ones), BUT not those that haven't been touched because they are already conforming to the correct size
189
199
190
- std::cout << " in RebuildChunks" << std::endl;
200
+ // std::cout << "in RebuildChunks" << std::endl;
191
201
192
202
if (requested_chunk_size.size () != dims) {
193
203
throw std::runtime_error (" Error: requested chunk size has wrong dimensionality!" );
194
204
}
195
205
196
- std::cout << " rebuilding index" << std::endl;
206
+ // std::cout << "rebuilding index" << std::endl;
197
207
198
208
// Make sure we start from a clean index
199
209
rebuildIndex ();
200
210
201
- std::cout << " rebuilt index" << std::endl;
211
+ // std::cout << "rebuilt index" << std::endl;
202
212
203
213
if (!isGloballyContiguous (getGlobalStartInd (), getGlobalStopInd ())) {
204
214
throw std::runtime_error (" Error: refusing to rebuild chunks for a non-contiguous array!" );
@@ -208,14 +218,14 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::RebuildChunks(co
208
218
209
219
IndexVector global_shape (this -> m_shape);
210
220
211
- std::cout << " global shape" << std::endl;
212
- global_shape.print ();
221
+ // std::cout << "global shape" << std::endl;
222
+ // global_shape.print();
213
223
214
224
IndexVector number_required_chunks = (global_shape + requested_chunk_size - 1 ) / requested_chunk_size;
215
225
216
- std::cout << " will have " << std::endl;
217
- number_required_chunks.print ();
218
- std::cout << " chunks after rebuilding" << std::endl;
226
+ // std::cout << "will have " << std::endl;
227
+ // number_required_chunks.print();
228
+ // std::cout << " chunks after rebuilding" << std::endl;
219
229
220
230
index_t chunks_to_keep;
221
231
@@ -279,18 +289,20 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::MergeChunks(std:
279
289
280
290
// 3) again, have the to_keep mechanism
281
291
282
- std::cout << " in MergeNeighbouringChunks" << std::endl;
292
+ // std::cout << "in MergeNeighbouringChunks" << std::endl;
283
293
284
294
rebuildIndex ();
285
295
286
- std::cout << " rebuilt index" << std::endl;
296
+ // std::cout << "rebuilt index" << std::endl;
287
297
288
298
if (!isGloballyContiguous (getGlobalStartInd (), getGlobalStopInd ())) {
289
299
throw std::runtime_error (" Error: refusing to merge chunks for a non-contiguous array!" );
290
300
}
291
301
292
302
calculateShape ();
293
303
304
+ // std::cout << "have a total of " << m_chunk_index.size() << " chunks before merging" << std::endl;
305
+
294
306
// put chunks in order along the merging axis
295
307
std::vector<std::size_t > chunk_order (m_chunk_index.size ());
296
308
std::iota (chunk_order.begin (), chunk_order.end (), 0 );
@@ -326,9 +338,15 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::MergeChunks(std:
326
338
chunks_to_merge.push_back (retrieveChunk (cur_chunk_index));
327
339
std::size_t output_chunk_shape = chunks_to_merge.back ().shape (dim_to_merge);
328
340
std::size_t neighbour_chunk_index = cur_chunk_index;
341
+
342
+ // std::cout << "start merging tracer from chunk with index = " << neighbour_chunk_index << std::endl;
343
+
329
344
while (true ) {
330
345
try {
331
346
neighbour_chunk_index = getNeighbouringChunkIndex (neighbour_chunk_index, dim_to_merge);
347
+
348
+ // std::cout << "have neighbouring chunk with index = " << neighbour_chunk_index << " / " << m_chunk_index.size() << std::endl;
349
+
332
350
chunks_to_merge.push_back (retrieveChunk (neighbour_chunk_index));
333
351
std::erase (chunk_order, neighbour_chunk_index);
334
352
@@ -409,12 +427,24 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::printChunks() {
409
427
template <class T , std::size_t dims, template <class , std::size_t > class DenseT , template <class , std::size_t > class SparseT , class SerializerT >
410
428
std::size_t DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::getNeighbouringChunkIndex(std::size_t chunk_index, std::size_t dim) {
411
429
430
+ if (chunk_index > m_chunk_index.size ()) {
431
+ throw std::runtime_error (" Trying to look for neighbours of out-of-bounds chunk: " + std::to_string (chunk_index) + " /" + std::to_string (m_chunk_index.size ()));
432
+ }
433
+
412
434
ChunkMetadata& chunk_meta = m_chunk_index[chunk_index];
413
435
IndexVector chunk_size_along_dim (dims, 0 );
414
436
chunk_size_along_dim (dim) = chunk_meta.stop_ind (dim) - chunk_meta.start_ind (dim);
415
437
IndexVector neighbour_chunk_start_ind = chunk_meta.start_ind + chunk_size_along_dim;
416
438
417
- return getChunkIndex (neighbour_chunk_start_ind);
439
+ std::size_t neighbour_chunk_ind = getChunkIndex (neighbour_chunk_start_ind);
440
+
441
+ // if(neighbour_chunk_ind > m_chunk_index.size()) {
442
+ // std::cout << "MMMM Found out-of-bounds neighbouring chunk: " + std::to_string(neighbour_chunk_ind) + "/" + std::to_string(m_chunk_index.size()) << std::endl;
443
+ // std::cout << "MMMM From request with neighbour_chunk_start_ind = " << std::endl;
444
+ // neighbour_chunk_start_ind.print();
445
+ // }
446
+
447
+ return neighbour_chunk_ind;
418
448
}
419
449
420
450
template <class T , std::size_t dims, template <class , std::size_t > class DenseT , template <class , std::size_t > class SparseT , class SerializerT >
@@ -472,15 +502,18 @@ std::size_t DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::getChunkI
472
502
473
503
if (chunkContainsInds (m_chunk_index[m_chunk_last_accessed], inds)) {
474
504
[[likely]];
505
+ // std::cout << "NNN m_chunk_last_accessed gave " << m_chunk_last_accessed << std::endl;
475
506
return m_chunk_last_accessed;
476
507
}
477
508
else {
478
509
// Trigger a full chunk lookup
479
510
// TODO: have a search tree here with logarithmic instead of linear complexity
511
+ // std::cout << "NNN trigger full chunk lookup" << std::endl;
480
512
std::size_t chunk_ind = 0 ;
481
513
for (chunk_ind = 0 ; chunk_ind < m_chunk_index.size (); chunk_ind++) {
482
514
if (chunkContainsInds (m_chunk_index[chunk_ind], inds)) {
483
- m_chunk_last_accessed = chunk_ind;
515
+ m_chunk_last_accessed = chunk_ind;
516
+ // std::cout << "NNN full chunk lookup gave " << chunk_ind << std::endl;
484
517
return chunk_ind;
485
518
}
486
519
}
@@ -496,6 +529,10 @@ std::size_t DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::getChunkI
496
529
497
530
template <class T , std::size_t dims, template <class , std::size_t > class DenseT , template <class , std::size_t > class SparseT , class SerializerT >
498
531
DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::dense_t & DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::retrieveChunk(std::size_t chunk_ind) {
532
+
533
+ if (chunk_ind > m_chunk_index.size ()) {
534
+ throw std::runtime_error (" Trying to retrieve out-of-bounds chunk: " + std::to_string (chunk_ind) + " /" + std::to_string (m_chunk_index.size ()));
535
+ }
499
536
500
537
ChunkMetadata& chunk_meta = m_chunk_index[chunk_ind];
501
538
@@ -512,6 +549,15 @@ DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::dense_t& DistributedN
512
549
std::fstream ifs;
513
550
std::string chunk_path = m_dirpath + " /" + chunk_meta.filename ;
514
551
std::cout << " Loading chunk from " + chunk_path + " ... " ;
552
+
553
+ // if(chunk_meta.filename.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_-.") != std::string::npos) {
554
+ // throw std::runtime_error("Error: trying to open chunk " + std::to_string(chunk_ind) + "/" + std::to_string(m_chunk_index.size()) + " from file with non-compliant name: '" + chunk_meta.filename + "'!");
555
+ // }
556
+
557
+ if (!std::filesystem::exists (chunk_path)) {
558
+ throw std::runtime_error (" Error: trying to access non-existant file '" + chunk_path + " '!" );
559
+ }
560
+
515
561
ifs.open (chunk_path, std::ios::in | std::ios::binary);
516
562
ChunkMetadata meta = m_ser.template deserialize <ChunkMetadata>(ifs);
517
563
0 commit comments