Skip to content

Commit 0f3722d

Browse files
committed
add documentation
1 parent 2fb3f02 commit 0f3722d

File tree

14 files changed

+1562
-1989
lines changed

14 files changed

+1562
-1989
lines changed

.DS_Store

-8 KB
Binary file not shown.

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
8989
IF (USE_CUDA)
9090
include(ExternalProject)
9191
ExternalProject_Add(cnmem
92-
GIT_REPOSITORY "https://github.com/junzhezhang/cnmem.git"
92+
GIT_REPOSITORY "https://github.com/nusdbsystem/cnmem.git"
9393
GIT_TAG "master"
9494
SOURCE_DIR "cnmem/"
9595
CONFIGURE_COMMAND "${CMAKE_COMMAND}"
File renamed without changes.

examples/cifar10/train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
from singa.proto import core_pb2
4040
from caffe import caffe_net
4141

42-
import alexnet
42+
import cnn
4343
import vgg
4444
import resnet
4545

include/singa/core/common.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ class Device;
5656
/// Block represents a chunk of memory (on device or host).
5757
class Block {
5858
public:
59-
Block(void* ptr, size_t size, size_t offset = 0, Device* ptrDevice = nullptr)
60-
: data_(ptr), size_(size), offset_(offset), ptrDevice_(ptrDevice) {
59+
Block(void* ptr, size_t size, size_t offset = 0, Device* ptr_device = nullptr)
60+
: data_(ptr), size_(size), offset_(offset), ptr_device_(ptr_device) {
6161
ref_count_ = 1; // std::make_shared<std::atomic<int>>(1);
6262
}
6363
// Disabled as it is not used currently.
@@ -90,7 +90,7 @@ class Block {
9090
void* data_ = nullptr;
9191
size_t size_ = 0;
9292
size_t offset_ = 0;
93-
Device* ptrDevice_;
93+
Device* ptr_device_;
9494
bool initialized_ = false;
9595
// Disabled as it is not used currently.
9696
// std::shared_ptr<std::atomic<int>> ref_count_ = nullptr;

include/singa/core/device.h

Lines changed: 102 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,8 @@ class Device {
6666
/// Called by Tensor.
6767
void FreeBlock(Block* block);
6868

69-
void AppendInfo(string blockInfo);
70-
void* GetRealGpuPtrInfo(const Block* block_);
71-
void SwapOutInfo(const Block* block_);
72-
void SwapInInfo(const Block* block_);
69+
void AppendInfo(string block_info);
70+
void* UpdateGpuPtrInfo(const Block* block_ptr);
7371

7472
/// Return the size (bytes) of memory in use
7573
/// TODO(wangwei) override this function for all devices.
@@ -108,7 +106,7 @@ class Device {
108106

109107
int id() const { return id_; }
110108

111-
virtual void* GetRealGpuPtr(const Block* block_) = 0;
109+
virtual void* UpdateGpuPtr(const Block* block_ptr) = 0;
112110

113111
private:
114112
Device() {};
@@ -125,11 +123,8 @@ class Device {
125123

126124
/// Free device memory.
127125
virtual void Free(void* ptr) = 0;
128-
virtual void MakeMetaTable(Block* block,void* data_,int size) = 0;
129-
virtual void Append(string blockInfo) = 0;
130-
131-
virtual void SwapOut(const Block* block_) = 0;
132-
virtual void SwapIn(const Block* block_) = 0;
126+
virtual void AppendAfterMalloc(Block* block,void* data_ptr,int size) = 0;
127+
virtual void Append(string block_info) = 0;
133128

134129
protected:
135130
int id_ = 0;
@@ -171,11 +166,10 @@ class CppCPU : public Device {
171166

172167
/// Free cpu memory.
173168
void Free(void* ptr) override;
174-
void MakeMetaTable(Block* block,void* data_,int size) override {}
175-
void Append(string blockInfo) override {}
176-
void* GetRealGpuPtr(const Block* block_) override {}
177-
void SwapOut(const Block* block_) override {}
178-
void SwapIn(const Block* block_) override {}
169+
void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
170+
void Append(string block_info) override {}
171+
void* UpdateGpuPtr(const Block* block_ptr) override {}
172+
179173
};
180174

181175

@@ -206,11 +200,9 @@ class CudaGPU : public Device {
206200

207201
/// Free device memory.
208202
void Free(void* ptr) override;
209-
void MakeMetaTable(Block* block,void* data_,int size) override {}
210-
void Append(string blockInfo) override;
211-
void* GetRealGpuPtr(const Block* block_) override;
212-
void SwapOut(const Block* block_) override;
213-
void SwapIn(const Block* block_) override;
203+
void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
204+
void Append(string block_info) override;
205+
void* UpdateGpuPtr(const Block* block_ptr) override;
214206

215207
private:
216208
void Setup();
@@ -222,21 +214,21 @@ class CudaGPU : public Device {
222214
/// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
223215

224216
///SwapGPU
225-
struct onePieceMsg{
217+
struct DeviceOptInfo{
226218
/*
227-
members: [ptr, size, MallocFree, idx]
219+
members: [ptr, size, operation_type, idx]
228220
*/
229221
string ptr;
230222
size_t size;
231-
int MallocFree;
223+
int operation_type;
232224
int idx;
233225
double t;
234-
onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
226+
DeviceOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){}
235227
};
236228

237229
struct BlockMeta{
238230
/*
239-
block Meta.
231+
meta of swapping memory blocks
240232
*/
241233
Block* block_ = nullptr;
242234
void* data_ = nullptr;
@@ -249,34 +241,39 @@ struct BlockMeta{
249241
};
250242

251243
struct SwapBlock{
252-
244+
/*
245+
meta of candidate blocks
246+
*/
253247
string ptr;
254-
string cat; //A1, A2, A3...
248+
string cat; //sub category of the candidate blocks, read-read, write-read, etc.
255249
int name;
256250
size_t size;
251+
//index of last read/write before swap out, and first read/write after swap in
257252
int r_idx; //out idx
258253
int d_idx; //in idx
254+
//time of last read/write before swap out, and first read/write after swap in
259255
double r_time; // out time
260256
double d_time; //in time
261-
double dt; //delta t: t2'-t1'
262-
double pri; //look at here if big enough TODO(junzhe)
263-
double dto; //t2-t1
264-
double wdto = 0; //t2-t1 weighted by swap_load
265-
double r_idx_ready; //r_idx + buffer, could be set during selection.
266-
//int free = -1; //when it is freed
267-
//below as per planned.
268-
int i1 = 0;
269-
int i1p = 0;
270-
int i2 = 0;
271-
int i2p = 0;
272-
double t1 = 0;
273-
double t2 = 0;
274-
double t1p = 0;
275-
double t2p = 0;
276-
SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2):
277-
ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
257+
double DOA; //Duration of Absence
258+
double AOA; //Area of Absence
259+
double DOA_origin; //t2-t1, DOA without taking out time spent
260+
double WDOA = 0; //weighted DOA
261+
double majority_voting = 0;
262+
int r_idx_ready; //r_idx + buffer
263+
264+
//below are index and time for scheduling
265+
int idx_out_start = 0;
266+
int idx_out_end = 0;
267+
int idx_in_end = 0;
268+
int idx_in_start = 0;
269+
double t_out_start = 0;
270+
double t_out_end = 0;
271+
double t_in_end = 0;
272+
double t_in_start = 0;
273+
SwapBlock(string p, size_t s, int idx_out_start, int idx_in_end, double t_out_start, double t_in_end):
274+
ptr(p), size(s), r_idx(idx_out_start),d_idx(idx_in_end),r_time(t_out_start), d_time(t_in_end) {}
278275
};
279-
/// Device able to Swap memory between Nvidia GPU and Swap
276+
/// Device able to Swap memory between Nvidia GPU and CPU
280277
class SwapGPU : public Device {
281278
public:
282279
~SwapGPU();
@@ -300,98 +297,92 @@ class SwapGPU : public Device {
300297
/// Free device memory.
301298
void Free(void* ptr) override;
302299

303-
//Append at every index: malloc, free, read, mutable
304-
void Append(string blockInfo) override;
300+
//Append at every index: free, read, mutable
301+
void Append(string block_info) override;
305302

306-
//append info after Malloc, pair.
307-
void MakeMetaTable(Block* block,void* data_,int size) override;
303+
//append info after Malloc, as Block* is not available till Malloc() done.
304+
void AppendAfterMalloc(Block* block,void* data_ptr,int size) override;
308305

309-
//all the testing, without swap, during Append()
310-
void Test_sched_switch_swap();
306+
//Detection and Plan
307+
void DetectionPlan();
311308

312309
//test iteration, return GC
313-
int swap_test(vector<string>vec_block,int &maxLen, int &location);
310+
int Detection(vector<string>vec_block,int &iteration_length, int &location_of_2nd_iteration);
314311

315-
//entire plan, from swap_select() to swap_sched(), swap_deploy_tables()
316-
void swap_plan();
312+
//entire plan, from SelectBlock() to Scheduling(), BuildMetaTables()
313+
void Plan();
317314

318-
//selection algo
319-
vector<SwapBlock> swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode);
315+
//block selection algo
316+
vector<SwapBlock> SelectBlock(vector<SwapBlock>vec_swap,vector<double> temp_load,double mem_limit,string mode);
320317

321318
//schedule algo
322-
void swap_sched(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode);
319+
void Scheduling(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double mem_limit,string mode);
323320

324-
//make tables Table_sched and Table_meta
325-
void swap_construct_tables(vector<SwapBlock>vec_swap_selct);
321+
//make tables table_sched and table_meta
322+
void BuildMetaTables(vector<SwapBlock>vec_swap_selct);
326323

327-
//update Table_meta, during Append()
328-
void swap_update_tables(Block* tempBlock_);
324+
//update table_meta, during Append()
325+
void UpdateMetaTables(Block* block_ptr);
329326

330327
//swap/sync during Append()
331328
void DeploySwap();
332329

333330
//exec DeploySwap
334-
void DeploySwap_exec(int r_gc);
335-
336-
331+
void DeploySwapExec(int relative_counter);
337332

338333
//load profile as per synchronous swap.
339-
vector<double> swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
334+
vector<double> GetIdealLoad(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
340335

341-
//in case gpu ptr wrong. TODO(junzhe) to verify if needed.
342-
void* GetRealGpuPtr(const Block* block_) override;
336+
//in case the gpu ptr is wrong, update it ad hoc after swap-in
337+
void* UpdateGpuPtr(const Block* block_ptr) override;
343338

344-
void SwapOut(const Block* block_) override;
345-
void SwapIn(const Block* block_) override;
339+
//Swap Synchronous, for early iterations
340+
void SwapOutSynchronous(const Block* block_ptr);
341+
void SwapInSynchronous(const Block* block_ptr);
346342

347-
//changed to intake data_ instead
348-
void SwapOut_idx(const int r_idx);
349-
void SwapIn_idx(const int r_idx);
343+
//Swap asynchronous, for middle iterations
344+
void SwapOut(const int idx);
345+
void SwapIn(const int idx);
350346

351347
private:
352348
void Setup();
353-
///Tables needed
354-
//r_idx->BlockMeta
355-
map<int,BlockMeta>Table_meta;
356-
map<const Block*,BlockMeta>Table_block_meta; //TODO(junzhe) for measure speed only.
357-
map<const Block*, int>Table_not_at_device; //int refers to its r_idx of the block/meta
358-
//map<const Block*, size_t>Table_block_size; //Table block_ -> size TODO(junzhe) no need, can call block_->size()
359-
360-
//schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D.
361-
map<int,std::tuple<int,int,int,int>>Table_sched; // changed to with sync_r_idx
362349

363-
// vector<SwapBlock>vec_swap_selct_global;
350+
map<int,BlockMeta>table_meta;
351+
map<const Block*,BlockMeta>table_block_meta; //for measure speed only.
352+
map<const Block*, int>table_not_at_device; //int refers to its r_idx of the block/meta
353+
map<int,std::tuple<int,int,int,int>>table_sched; // changed to with sync_r_idx
364354

365355
//vec_block
366-
vector<string>vec_block; //iteration 0-3
367-
vector<string>vec_block_fresh; //iteration 4 5 6
368-
vector<string>vec_block_mf; //itr 8 9 10
369-
vector<double>global_load; // from begining
370-
vector<double>origin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use.
371-
vector<onePieceMsg>vec_run;
372-
vector<int>opsSequence; //sequence of operations of one middle iteration
373-
vector<size_t>sizeSequence; //size of all operations of one middle iteration
374-
int asyncSwapFlag = 0; //0 for sync, 1 for async.
375-
int testFlag = 0; //0 means open for test, 1 means no need test anymore.
376-
int gc = 0; //global counter, index, add 1 after each Malloc/Free/read/write.
377-
int globeCounter = -1;
378-
int maxLen = 0;
379-
int location = 0;
380-
int three_more_location = 0; //location at 3 more iterations later.
381-
int three_more_globeCounter = -1; //
382-
//design requirement TODO(junzhe)
383-
float memLimit_ratio = 0.70;
356+
vector<string>vec_block; //iterations for Detection, i.e. detect iterations.
357+
vector<string>vec_block_fresh; //iterations that are used for Planning.
358+
vector<string>vec_block_mf; //iterations used to construct pool
359+
vector<double>global_load; // load from beginning
360+
vector<double>origin_load; //3 iteration load, for planning.
361+
vector<DeviceOptInfo>vec_run;
362+
vector<int>operation_sequence; //sequence of operations of one middle iteration
363+
vector<size_t>size_sequence; //size of all operations of one middle iteration
364+
365+
int async_swap_flag = 0; //0 for sync, 1 for async.
366+
int past_test_flag = 0; //0 means need to test, 1 means no need to test anymore.
367+
int global_index = 0; //global counter, index, add 1 after each Malloc/Free/read/write.
368+
int global_index_threshold = -1;
369+
int iteration_length = 0;
370+
int location_of_2nd_iteration = 0; //index of start of 2nd iteration
371+
int location_of_5th_iteration = 0; //index of start of 5th iteration
372+
int three_more_iteration_global_index_threshold = -1;
373+
374+
//design specs
375+
float mem_limit_ratio = 0.70;
384376
size_t smallest_block = 1<<20; //1 MB
385377
int data_buffer = 4; // used to control readyIdx
386378
int mutable_data_buffer = 6;
387-
double maxLoad;
388-
int maxIdx;
389-
double total_swapInTime = 0;
390-
double total_swapOutTime = 0;
391-
double tempTime = 0;
392-
double tempTime2 = 0;
393-
double tempTime_baseline; //vec_run[0] time
394-
int maxLen_threshold = 1000;
379+
double max_load;
380+
int max_idx;
381+
double total_swap_in_time = 0;
382+
double total_swap_out_time = 0;
383+
double temp_time = 0;
384+
double temp_time_baseline; //vec_run[0] time
385+
int iteration_length_threshold = 1000;
395386

396387
private:
397388
shared_ptr<DeviceMemPool> pool_;
@@ -447,11 +438,9 @@ class OpenclDevice : public singa::Device {
447438
/// Converts the void pointer into a Buffer object, then deletes the object.
448439
/// This has the effect of freeing up device memory.
449440
void Free(void* ptr) override;
450-
void MakeMetaTable(Block* block,void* data_,int size) override {}
451-
void Append(string blockInfo) override {}
452-
void* GetRealGpuPtr(const Block* block_) override {}
453-
void SwapOut(const Block* block_) override {}
454-
void SwapIn(const Block* block_) override {}
441+
void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
442+
void Append(string block_info) override {}
443+
void* UpdateGpuPtr(const Block* block_ptr) override {}
455444

456445

457446
private:

0 commit comments

Comments
 (0)