@@ -30,7 +30,7 @@ namespace stor {
 template <class T, std::size_t dims, template <class, std::size_t> class DenseT, template <class, std::size_t> class SparseT, class SerializerT>
 DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::DistributedNDArray(std::string dirpath, std::size_t max_cache_size, SerializerT& ser) :
   NDArray<T, dims>(), m_dirpath(dirpath), m_indexpath(dirpath + "/index.bin"), m_max_cache_size(max_cache_size),
-  m_global_start_ind(dims, 0), m_ser(ser) {
+  m_chunk_last_accessed(0), m_global_start_ind(dims, 0), m_ser(ser) {

   // Create directory if it does not already exist
   if (!std::filesystem::exists(m_dirpath)) {
@@ -104,7 +104,8 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::WriteChunk(const
   std::size_t num_elems = chunk.volume();

   // pays off to store as sparse chunk
-  std::size_t sparse_vs_dense_expense_ratio = 3; // sparse storage is approximately 3x as expensive as dense storage per nonzero element
+  // std::size_t sparse_vs_dense_expense_ratio = 3; // when only counting storage space: sparse storage is approximately 3x as expensive as dense storage per nonzero element
+  std::size_t sparse_vs_dense_expense_ratio = 20; // when also counting complexity of deserializing + rebuilding a dense chunk
   if (sparse_vs_dense_expense_ratio * num_nonzero_elems < num_elems) {

     std::cout << "going to sparsify" << std::endl;
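The threshold change above encodes a simple cost model: a chunk is only stored sparsely when its nonzero entries are rare enough that the roughly 20x per-nonzero-element cost of the sparse path (storage plus deserializing and rebuilding a dense chunk) still undercuts writing the dense block directly. A minimal standalone sketch of that decision rule; should_sparsify is an illustrative name, not part of the library:

    #include <cstddef>

    // Sketch of the sparsification heuristic: prefer the sparse representation only
    // when its estimated per-nonzero-element cost stays below the dense cost.
    bool should_sparsify(std::size_t num_nonzero_elems, std::size_t num_elems,
                         std::size_t expense_ratio = 20) {
      return expense_ratio * num_nonzero_elems < num_elems;
    }

    // Example: a 64x64x64 chunk (262144 elements) is sparsified only if it holds
    // fewer than ~13108 nonzero entries, i.e. if it is more than about 95% empty.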
@@ -232,15 +233,15 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::RebuildChunks(co
     dense_t current_chunk = retrieveChunk(chunk_index);

     if (actual_chunk_shape == current_chunk.shape()) {
-      std::cout << "chunk already has the correct size, keep it" << std::endl;
+      // std::cout << "chunk already has the correct size, keep it" << std::endl;
       chunks_to_keep.push_back(m_chunk_index[chunk_index]);
       continue;
     }

-    std::cout << "now working on rebuild chunk with inds" << std::endl;
-    std::cout << "chunk_inds_start = " << std::endl;
+    // std::cout << "now working on rebuild chunk with inds" << std::endl;
+    // std::cout << "chunk_inds_start = " << std::endl;
     chunk_inds_start.print();
-    std::cout << "chunk_inds_end = " << std::endl;
+    // std::cout << "chunk_inds_end = " << std::endl;
     chunk_inds_end.print();

     dense_t chunk = range(chunk_inds_start, chunk_inds_end);
@@ -436,11 +437,26 @@ bool DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::chunkContainsInd

 template <class T, std::size_t dims, template <class, std::size_t> class DenseT, template <class, std::size_t> class SparseT, class SerializerT>
 std::size_t DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::getChunkIndex(const IndexVector& inds) {
-  std::size_t chunk_ind = 0;
-  for (chunk_ind = 0; chunk_ind < m_chunk_index.size(); chunk_ind++) {
-    if (chunkContainsInds(m_chunk_index[chunk_ind], inds)) {
-      return chunk_ind;
-    }
+
+  if (m_chunk_index.size() == 0) {
+    [[unlikely]];
+    throw ChunkNotFoundError();
+  }
+
+  if (chunkContainsInds(m_chunk_index[m_chunk_last_accessed], inds)) {
+    [[likely]];
+    return m_chunk_last_accessed;
+  }
+  else {
+    // Trigger a full chunk lookup
+    // TODO: have a search tree here with logarithmic instead of linear complexity
+    std::size_t chunk_ind = 0;
+    for (chunk_ind = 0; chunk_ind < m_chunk_index.size(); chunk_ind++) {
+      if (chunkContainsInds(m_chunk_index[chunk_ind], inds)) {
+        m_chunk_last_accessed = chunk_ind;
+        return chunk_ind;
+      }
+    }
   }

   std::cout << "HHHHHHH" << std::endl;
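The rewritten getChunkIndex first checks the chunk that satisfied the previous query, so repeated accesses into the same chunk skip the linear scan entirely; only on a miss does it fall back to the full loop and update m_chunk_last_accessed. The TODO mentions replacing that fallback with a logarithmic search. A sketch of that idea for the simplified case of non-overlapping chunks keyed by a flattened start index (the real index is multi-dimensional, and ChunkEntry / find_chunk are hypothetical names, not the library's API):

    #include <cstddef>
    #include <map>
    #include <stdexcept>

    struct ChunkEntry {
      std::size_t start_ind;  // first flattened element index covered by the chunk
      std::size_t end_ind;    // one past the last covered flattened index
      std::size_t chunk_ind;  // position of this chunk in the chunk index
    };

    // O(log N) lookup: the containing chunk, if any, is the last one whose
    // start index is <= flat_ind.
    std::size_t find_chunk(const std::map<std::size_t, ChunkEntry>& chunks_by_start,
                           std::size_t flat_ind) {
      auto it = chunks_by_start.upper_bound(flat_ind);
      if (it == chunks_by_start.begin()) {
        throw std::runtime_error("index lies before the first chunk");
      }
      --it;  // step back to the candidate chunk starting at or before flat_ind
      if (flat_ind < it->second.end_ind) {
        return it->second.chunk_ind;
      }
      throw std::runtime_error("index not covered by any chunk");
    }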
@@ -476,7 +492,8 @@ DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::dense_t& DistributedN
       m_chunk_cache.insert({chunk_ind, m_ser.template deserialize<dense_t>(ifs)});
     }
     else if (meta.chunk_type == ChunkType::sparse) {
-      m_chunk_cache.insert({chunk_ind, dense_t::From(m_ser.template deserialize<sparse_t>(ifs))});
+      m_chunk_cache.insert({chunk_ind, dense_t::FromSparseFile(ifs)});
+      // m_chunk_cache.insert({chunk_ind, dense_t::From(m_ser.template deserialize<sparse_t>(ifs))});
     }
     else {
       throw std::runtime_error("Error: unknown chunk type encountered!");
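The deserialization change above lets the dense chunk type populate itself directly from the stream holding the sparse data, instead of first materializing a sparse_t object and converting it. The actual on-disk layout and the FromSparseFile interface are not shown in this diff; the following is only an illustrative sketch under the assumption of a flat format consisting of an entry count followed by (flattened index, value) pairs:

    #include <cstddef>
    #include <istream>
    #include <vector>

    // Hypothetical densification helper: scatter serialized (index, value) pairs
    // straight into a zero-initialized dense buffer, skipping any intermediate
    // sparse-array object. Assumes T is trivially copyable.
    template <class T>
    std::vector<T> densify_from_stream(std::istream& ifs, std::size_t num_elems) {
      std::vector<T> dense(num_elems, T());

      std::size_t num_entries = 0;
      ifs.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));

      for (std::size_t i = 0; i < num_entries; i++) {
        std::size_t flat_ind = 0;
        T value{};
        ifs.read(reinterpret_cast<char*>(&flat_ind), sizeof(flat_ind));
        ifs.read(reinterpret_cast<char*>(&value), sizeof(value));
        dense[flat_ind] = value;  // place the nonzero element at its flattened position
      }
      return dense;
    }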