
Commit 78e316c

clean up
1 parent 088a4fa · commit 78e316c

1 file changed: +4 -111 lines


src/DistributedNDArray.hxx (+4 -111)
@@ -87,10 +87,6 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::WriteChunk(const
   char uuid_string[36];
   uuid_unparse(uuid_binary, uuid_string);
   std::string chunk_filename = std::string(uuid_string) + ".bin";
-
-  // if(chunk_filename.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_-.") != std::string::npos) {
-  // throw std::runtime_error("Error: non-compliant chunk filename detected!");
-  // }

   IndexVector stop_ind = start_ind + chunk.shape();

@@ -111,22 +107,17 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::WriteChunk(const
   // std::size_t sparse_vs_dense_expense_ratio = 3; // when only counting storage space: sparse storage is approximately 3x as expensive as dense storage per nonzero element
   std::size_t sparse_vs_dense_expense_ratio = 20; // when also counting complexity of deserializing + rebuilding a dense chunk
   if(sparse_vs_dense_expense_ratio * num_nonzero_elems < num_elems) {
-
-    // std::cout << "going to sparsify" << std::endl;
-
+
     auto to_keep = [](scalar_t value) -> bool {
       return value != 0.0;
     };
     sparse_t sparse_chunk = sparse_t::From(chunk, to_keep, 0.0);
     meta.chunk_type = ChunkType::sparse;
-
-    // std::cout << "after sparsification, " << sparse_chunk.NumEntries() << " entries remain" << std::endl;

     m_ser.template serialize<ChunkMetadata>(ofs, meta);
     m_ser.template serialize<sparse_t>(ofs, sparse_chunk);
   }
-  else {
-    // std::cout << "store as dense" << std::endl;
+  else {
     m_ser.template serialize<ChunkMetadata>(ofs, meta);
     m_ser.template serialize<dense_t>(ofs, chunk);
   }
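
For context on the hunk above: the heuristic that survives the clean-up stores a chunk sparsely only when its nonzero entries make up less than 1/20 of all elements, i.e. when `sparse_vs_dense_expense_ratio * num_nonzero_elems < num_elems`. A minimal standalone sketch of that arithmetic (the helper `should_store_sparse` and the `main` driver are illustrative and not part of DistributedNDArray.hxx):

#include <cstddef>
#include <iostream>

// Illustrative helper mirroring the decision kept in WriteChunk:
// sparsify only if ratio * nonzero < total, i.e. the chunk is
// sufficiently empty to pay for the sparse-format overhead.
bool should_store_sparse(std::size_t num_nonzero_elems, std::size_t num_elems,
                         std::size_t sparse_vs_dense_expense_ratio = 20) {
  return sparse_vs_dense_expense_ratio * num_nonzero_elems < num_elems;
}

int main() {
  // 100x100 chunk with 30 nonzero entries: 20 * 30 = 600 < 10000 -> sparse
  std::cout << should_store_sparse(30, 100 * 100) << "\n";

  // same chunk with 600 nonzero entries: 20 * 600 = 12000 >= 10000 -> dense
  std::cout << should_store_sparse(600, 100 * 100) << "\n";
  return 0;
}
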
@@ -168,65 +159,30 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::rebuildIndex() {
     ifs.open(dir_entry.path(), std::ios::in | std::ios::binary);
     ChunkMetadata meta = m_ser.template deserialize<ChunkMetadata>(ifs);
     ifs.close();
-
-    // std::cout << "found metadata containing filename: '" + meta.filename + "' from path '" + std::string(dir_entry.path()) + "'!" << std::endl;
-
-    // if(meta.filename.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_-.") != std::string::npos) {
-    // throw std::runtime_error("Error: found metadata containing non-compliant filename: '" + meta.filename + "' from path '" + std::string(dir_entry.path()) + "'!");
-    // }

     m_chunk_index.push_back(meta);
   }
 }

 template <class T, std::size_t dims, template<class, std::size_t> class DenseT, template<class, std::size_t> class SparseT, class SerializerT>
 void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::RebuildChunks(const IndexVector& requested_chunk_size) {
-
-  // How to make sure own index doesn't get messed up when new chunks are generated?
-
-  // 0) factor out most things of currently existing RegisterChunk into new private method:
-  // -> WriteChunk(chunk, start_ind, add_to_index)
-  // (keep the overlap checking etc. in the original RegisterChunk)
-
-  // Then, this function will look like
-  // 0) rebuild index (to make sure everything is taken into account)
-  // 1) then loop over chunks like in `WeightingFieldUtils`
-  // -> get chunk that has the starting indices
-  // -> if it conforms to the requested size, don't touch it and continue (will guarantee this function is idempotent)
-  // -> if it doesn't, build a chunk of correct size (using NDArray::range)
-  // 2) write the individual chunks, but don't touch the index (using `WriteChunk`)
-  // 3) remove all chunks in the index (-> this will remove all the old ones), BUT not those that haven't been touched because they are already conforming to the correct size
-
-  // std::cout << "in RebuildChunks" << std::endl;

   if(requested_chunk_size.size() != dims) {
     throw std::runtime_error("Error: requested chunk size has wrong dimensionality!");
   }
-
-  // std::cout << "rebuilding index" << std::endl;

   // Make sure we start from a clean index
   rebuildIndex();
-
-  // std::cout << "rebuilt index" << std::endl;

   if(!isGloballyContiguous(getGlobalStartInd(), getGlobalStopInd())) {
     throw std::runtime_error("Error: refusing to rebuild chunks for a non-contiguous array!");
   }

   calculateShape();

-  IndexVector global_shape(this -> m_shape);
-
-  // std::cout << "global shape" << std::endl;
-  // global_shape.print();
-
+  IndexVector global_shape(this -> m_shape);
   IndexVector number_required_chunks = (global_shape + requested_chunk_size - 1) / requested_chunk_size;

-  // std::cout << "will have " << std::endl;
-  // number_required_chunks.print();
-  // std::cout << " chunks after rebuilding" << std::endl;
-
   index_t chunks_to_keep;

   IndexVector buf_inds_start(dims, 0);
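
A side note on the chunk-count line retained in the hunk above: `(global_shape + requested_chunk_size - 1) / requested_chunk_size` is a component-wise integer ceiling division, i.e. the number of chunks of the requested size needed to cover the global shape along each dimension. A scalar sketch of the same arithmetic (the function name `num_chunks` is illustrative, not part of the file):

#include <cstddef>
#include <iostream>

// (global + chunk - 1) / chunk is the integer ceiling of global / chunk:
// the number of chunks of extent chunk_extent needed to cover global_extent.
std::size_t num_chunks(std::size_t global_extent, std::size_t chunk_extent) {
  return (global_extent + chunk_extent - 1) / chunk_extent;
}

int main() {
  std::cout << num_chunks(100, 25) << "\n";  // 4 chunks, exact fit
  std::cout << num_chunks(101, 25) << "\n";  // 5 chunks, the last one partially filled
  return 0;
}
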
@@ -244,15 +200,11 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::RebuildChunks(co
     dense_t current_chunk = retrieveChunk(chunk_index);

     if(actual_chunk_shape == current_chunk.shape()) {
-      // std::cout << "chunk already has the correct size, keep it" << std::endl;
       chunks_to_keep.push_back(m_chunk_index[chunk_index]);
       continue;
     }

-    // std::cout << "now working on rebuild chunk with inds" << std::endl;
-    // std::cout << "chunk_inds_start = " << std::endl;
     chunk_inds_start.print();
-    // std::cout << "chunk_inds_end = " << std::endl;
     chunk_inds_end.print();

     dense_t chunk = range(chunk_inds_start, chunk_inds_end);
@@ -278,30 +230,13 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::RebuildChunks(co
 template <class T, std::size_t dims, template<class, std::size_t> class DenseT, template<class, std::size_t> class SparseT, class SerializerT>
 void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::MergeChunks(std::size_t dim_to_merge, std::size_t max_dimsize) {

-  // 0) Rebuild index (to make sure we have the full picture)
-
-  // 2) Then, start with chunk that has the global start inds
-
-  // --> operate entirely on the index and prepare a full list of chunk mergings that need to happen (without actually doing anything)
-  // --> remove any elements from the index that are already taken care of
-  // --> check if there are any problems; if not, go ahead and implement the chunk mergings
-  // --> go to the chunk that neighbours the current main one and continue
-
-  // 3) again, have the to_keep mechanism
-
-  // std::cout << "in MergeNeighbouringChunks" << std::endl;
-
   rebuildIndex();
-
-  // std::cout << "rebuilt index" << std::endl;

   if(!isGloballyContiguous(getGlobalStartInd(), getGlobalStopInd())) {
     throw std::runtime_error("Error: refusing to merge chunks for a non-contiguous array!");
   }

   calculateShape();
-
-  // std::cout << "have a total of " << m_chunk_index.size() << " chunks before merging" << std::endl;

   // put chunks in order along the merging axis
   std::vector<std::size_t> chunk_order(m_chunk_index.size());
@@ -338,14 +273,10 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::MergeChunks(std:
     chunks_to_merge.push_back(retrieveChunk(cur_chunk_index));
     std::size_t output_chunk_shape = chunks_to_merge.back().shape(dim_to_merge);
     std::size_t neighbour_chunk_index = cur_chunk_index;
-
-    // std::cout << "start merging tracer from chunk with index = " << neighbour_chunk_index << std::endl;

     while(true) {
       try {
         neighbour_chunk_index = getNeighbouringChunkIndex(neighbour_chunk_index, dim_to_merge);
-
-        // std::cout << "have neighbouring chunk with index = " << neighbour_chunk_index << " / " << m_chunk_index.size() << std::endl;

         chunks_to_merge.push_back(retrieveChunk(neighbour_chunk_index));
         std::erase(chunk_order, neighbour_chunk_index);
@@ -364,31 +295,7 @@ void DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::MergeChunks(std:

     // perform the merge and write the result to disk
     dense_t output_chunk = NDArrayOps::concatenate(chunks_to_merge, dim_to_merge);
-    WriteChunk(output_chunk, cur_chunk_meta.start_ind, false);
-
-    // dense_t output_chunk = retrieveChunk(cur_chunk_index);
-    // std::size_t neighbour_chunk_index = cur_chunk_index;
-    // while(true) {
-    //   try {
-    //     neighbour_chunk_index = getNeighbouringChunkIndex(neighbour_chunk_index, dim_to_merge);
-    //     std::erase(chunk_order, neighbour_chunk_index);
-
-    //     std::cout << "merging chunk " << cur_chunk_index << " with chunk " << neighbour_chunk_index << std::endl;
-
-    //     output_chunk = NDArrayOps::concatenate(output_chunk, retrieveChunk(neighbour_chunk_index), dim_to_merge);
-
-    //     // chunk is now large enough
-    //     if(output_chunk.shape(dim_to_merge) >= max_dimsize) {
-    //       break;
-    //     }
-    //   } catch(const ChunkNotFoundError& e) {
-    //     // there are no more neighbours in this direction; stop here
-    //     break;
-    //   }
-    // }
-
-    // // write it to disk
-    // WriteChunk(output_chunk, cur_chunk_meta.start_ind, false);
+    WriteChunk(output_chunk, cur_chunk_meta.start_ind, false);
   }

   // remove all the chunks in the old index that should not be kept around
@@ -438,12 +345,6 @@ std::size_t DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::getNeighb

   std::size_t neighbour_chunk_ind = getChunkIndex(neighbour_chunk_start_ind);

-  // if(neighbour_chunk_ind > m_chunk_index.size()) {
-  //   std::cout << "MMMM Found out-of-bounds neighbouring chunk: " + std::to_string(neighbour_chunk_ind) + "/" + std::to_string(m_chunk_index.size()) << std::endl;
-  //   std::cout << "MMMM From request with neighbour_chunk_start_ind = " << std::endl;
-  //   neighbour_chunk_start_ind.print();
-  // }
-
   return neighbour_chunk_ind;
 }

@@ -502,18 +403,15 @@ std::size_t DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::getChunkI

   if(chunkContainsInds(m_chunk_index[m_chunk_last_accessed], inds)) {
     [[likely]];
-    // std::cout << "NNN m_chunk_last_accessed gave " << m_chunk_last_accessed << std::endl;
     return m_chunk_last_accessed;
   }
   else {
     // Trigger a full chunk lookup
     // TODO: have a search tree here with logarithmic instead of linear complexity
-    // std::cout << "NNN trigger full chunk lookup" << std::endl;
     std::size_t chunk_ind = 0;
     for(chunk_ind = 0; chunk_ind < m_chunk_index.size(); chunk_ind++) {
       if(chunkContainsInds(m_chunk_index[chunk_ind], inds)) {
         m_chunk_last_accessed = chunk_ind;
-        // std::cout << "NNN full chunk lookup gave " << chunk_ind << std::endl;
         return chunk_ind;
       }
     }
@@ -549,10 +447,6 @@ DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::dense_t& DistributedN
   std::fstream ifs;
   std::string chunk_path = m_dirpath + "/" + chunk_meta.filename;
   std::cout << "Loading chunk from " + chunk_path + " ... ";
-
-  // if(chunk_meta.filename.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_-.") != std::string::npos) {
-  // throw std::runtime_error("Error: trying to open chunk " + std::to_string(chunk_ind) + "/" + std::to_string(m_chunk_index.size()) + " from file with non-compliant name: '" + chunk_meta.filename + "'!");
-  // }

   if(!std::filesystem::exists(chunk_path)) {
     throw std::runtime_error("Error: trying to access non-existant file '" + chunk_path + "'!");
@@ -566,7 +460,6 @@ DistributedNDArray<T, dims, DenseT, SparseT, SerializerT>::dense_t& DistributedN
   }
   else if(meta.chunk_type == ChunkType::sparse) {
     m_chunk_cache.insert({chunk_ind, dense_t::FromSparseFile(ifs)});
-    // m_chunk_cache.insert({chunk_ind, dense_t::From(m_ser.template deserialize<sparse_t>(ifs))});
   }
   else {
     throw std::runtime_error("Error: unknown chunk type encountered!");
