@@ -66,10 +66,8 @@ class Device {
66
66
// / Called by Tensor.
67
67
void FreeBlock (Block* block);
68
68
69
- void AppendInfo (string blockInfo);
70
- void * GetRealGpuPtrInfo (const Block* block_);
71
- void SwapOutInfo (const Block* block_);
72
- void SwapInInfo (const Block* block_);
69
+ void AppendInfo (string block_info);
70
+ void * UpdateGpuPtrInfo (const Block* block_ptr);
73
71
74
72
// / Return the size (bytes) of memory in use
75
73
// / TODO(wangwei) override this function for all devices.
@@ -108,7 +106,7 @@ class Device {
108
106
109
107
int id () const { return id_; }
110
108
111
- virtual void * GetRealGpuPtr (const Block* block_ ) = 0;
109
+ virtual void * UpdateGpuPtr (const Block* block_ptr ) = 0;
112
110
113
111
private:
114
112
Device () {};
@@ -125,11 +123,8 @@ class Device {
125
123
126
124
// / Free device memory.
127
125
virtual void Free (void * ptr) = 0;
128
- virtual void MakeMetaTable (Block* block,void * data_,int size) = 0;
129
- virtual void Append (string blockInfo) = 0;
130
-
131
- virtual void SwapOut (const Block* block_) = 0;
132
- virtual void SwapIn (const Block* block_) = 0;
126
+ virtual void AppendAfterMalloc (Block* block,void * data_ptr,int size) = 0;
127
+ virtual void Append (string block_info) = 0;
133
128
134
129
protected:
135
130
int id_ = 0 ;
@@ -171,11 +166,10 @@ class CppCPU : public Device {
171
166
172
167
// / Free cpu memory.
173
168
void Free (void * ptr) override ;
174
- void MakeMetaTable (Block* block,void * data_,int size) override {}
175
- void Append (string blockInfo) override {}
176
- void * GetRealGpuPtr (const Block* block_) override {}
177
- void SwapOut (const Block* block_) override {}
178
- void SwapIn (const Block* block_) override {}
169
+ void AppendAfterMalloc (Block* block,void * data_ptr,int size) override {}
170
+ void Append (string block_info) override {}
171
+ void * UpdateGpuPtr (const Block* block_ptr) override {}
172
+
179
173
};
180
174
181
175
@@ -206,11 +200,9 @@ class CudaGPU : public Device {
206
200
207
201
// / Free cpu memory.
208
202
void Free (void * ptr) override ;
209
- void MakeMetaTable (Block* block,void * data_,int size) override {}
210
- void Append (string blockInfo) override ;
211
- void * GetRealGpuPtr (const Block* block_) override ;
212
- void SwapOut (const Block* block_) override ;
213
- void SwapIn (const Block* block_) override ;
203
+ void AppendAfterMalloc (Block* block,void * data_ptr,int size) override {}
204
+ void Append (string block_info) override ;
205
+ void * UpdateGpuPtr (const Block* block_ptr) override ;
214
206
215
207
private:
216
208
void Setup ();
@@ -222,21 +214,21 @@ class CudaGPU : public Device {
222
214
// / CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
223
215
224
216
// /SwapGPU
225
- struct onePieceMsg {
217
+ struct DeviceOptInfo {
226
218
/*
227
- members: [ptr, size, MallocFree , idx]
219
+ members: [ptr, size, operation_type , idx]
228
220
*/
229
221
string ptr;
230
222
size_t size;
231
- int MallocFree ;
223
+ int operation_type ;
232
224
int idx;
233
225
double t;
234
- onePieceMsg (string p, size_t s, int M, int i):ptr(p),size(s),MallocFree (M),idx(i){}
226
+ DeviceOptInfo (string p, size_t s, int M, int i):ptr(p),size(s),operation_type (M),idx(i){}
235
227
};
236
228
237
229
struct BlockMeta {
238
230
/*
239
- block Meta.
231
+ meta of swapping memory blocks
240
232
*/
241
233
Block* block_ = nullptr ;
242
234
void * data_ = nullptr ;
@@ -249,34 +241,39 @@ struct BlockMeta{
249
241
};
250
242
251
243
struct SwapBlock {
252
-
244
+ /*
245
+ meta of candidate blocks
246
+ */
253
247
string ptr;
254
- string cat; // A1, A2, A3.. .
248
+ string cat; // sub category of the candidate blocks, read-read, write-read, etc .
255
249
int name;
256
250
size_t size;
251
+ // index of last read/write before swap out, and first read/write after swap in
257
252
int r_idx; // out idx
258
253
int d_idx; // in idx
254
+ // time of last read/write before swap out, and first read/write after swap in
259
255
double r_time; // out time
260
256
double d_time; // in time
261
- double dt; // delta t: t2'-t1'
262
- double pri; // look at here if big enough TODO(junzhe)
263
- double dto; // t2-t1
264
- double wdto = 0 ; // t2-t1 weighted by swap_load
265
- double r_idx_ready; // r_idx + buffer, could be set during selection.
266
- // int free = -1; //when it is freed
267
- // below as per planned.
268
- int i1 = 0 ;
269
- int i1p = 0 ;
270
- int i2 = 0 ;
271
- int i2p = 0 ;
272
- double t1 = 0 ;
273
- double t2 = 0 ;
274
- double t1p = 0 ;
275
- double t2p = 0 ;
276
- SwapBlock (string p, size_t s, int i1, int i2, double t1, double t2):
277
- ptr (p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
257
+ double DOA; // Duration of Absence
258
+ double AOA; // Area of Absence
259
+ double DOA_origin; // t2-t1, DOA without taking out time spent
260
+ double WDOA = 0 ; // weighted DOA
261
+ double majority_voting = 0 ;
262
+ int r_idx_ready; // r_idx + buffer
263
+
264
+ // below are index and time for scheduling
265
+ int idx_out_start = 0 ;
266
+ int idx_out_end = 0 ;
267
+ int idx_in_end = 0 ;
268
+ int idx_in_start = 0 ;
269
+ double t_out_start = 0 ;
270
+ double t_out_end = 0 ;
271
+ double t_in_end = 0 ;
272
+ double t_in_start = 0 ;
273
+ SwapBlock (string p, size_t s, int idx_out_start, int idx_in_end, double t_out_start, double t_in_end):
274
+ ptr (p), size(s), r_idx(idx_out_start),d_idx(idx_in_end),r_time(t_out_start), d_time(t_in_end) {}
278
275
};
279
- // / Device able to Swap memory between Nvidia GPU and Swap
276
+ // / Device able to Swap memory between Nvidia GPU and CPU
280
277
class SwapGPU : public Device {
281
278
public:
282
279
~SwapGPU ();
@@ -300,98 +297,92 @@ class SwapGPU : public Device {
300
297
// / Free cpu memory.
301
298
void Free (void * ptr) override ;
302
299
303
- // Append at every index: malloc, free, read, mutable
304
- void Append (string blockInfo ) override ;
300
+ // Append at every index: free, read, mutable
301
+ void Append (string block_info ) override ;
305
302
306
- // append info after Malloc, pair .
307
- void MakeMetaTable (Block* block,void * data_ ,int size) override ;
303
+ // append info after Malloc, as Block* is not available till Malloc() done .
304
+ void AppendAfterMalloc (Block* block,void * data_ptr ,int size) override ;
308
305
309
- // all the testing, without swap, during Append()
310
- void Test_sched_switch_swap ();
306
+ // Detection and Plan
307
+ void DetectionPlan ();
311
308
312
309
// test iteration, return GC
313
- int swap_test (vector<string>vec_block,int &maxLen , int &location );
310
+ int Detection (vector<string>vec_block,int &iteration_length , int &location_of_2nd_iteration );
314
311
315
- // entire plan, from swap_select () to swap_sched (), swap_deploy_tables ()
316
- void swap_plan ();
312
+ // entire plan, from SelectBlock () to Scheduling (), BuildMetaTables ()
313
+ void Plan ();
317
314
318
- // selection algo
319
- vector<SwapBlock> swap_select (vector<SwapBlock>vec_swap,vector<double > tempLoad ,double memLimit ,string mode);
315
+ // block selection algo
316
+ vector<SwapBlock> SelectBlock (vector<SwapBlock>vec_swap,vector<double > temp_load ,double mem_limit ,string mode);
320
317
321
318
// schedule algo
322
- void swap_sched (vector<SwapBlock>&vec_swap_selct, vector<double >&vec_load_temp,double &overhead,double memLimit ,string mode);
319
+ void Scheduling (vector<SwapBlock>&vec_swap_selct, vector<double >&vec_load_temp,double &overhead,double mem_limit ,string mode);
323
320
324
- // make tables Table_sched and Table_meta
325
- void swap_construct_tables (vector<SwapBlock>vec_swap_selct);
321
+ // make tables table_sched and table_meta
322
+ void BuildMetaTables (vector<SwapBlock>vec_swap_selct);
326
323
327
- // update Table_meta , during Append()
328
- void swap_update_tables (Block* tempBlock_ );
324
+ // update table_meta , during Append()
325
+ void UpdateMetaTables (Block* block_ptr );
329
326
330
327
// swap/sync during Append()
331
328
void DeploySwap ();
332
329
333
330
// exec DeploySwap
334
- void DeploySwap_exec (int r_gc);
335
-
336
-
331
+ void DeploySwapExec (int relative_counter);
337
332
338
333
// load profile as per synchronous swap.
339
- vector<double > swap_load_ideal (vector<double >vec_load,vector<SwapBlock> vec_swap_selct);
334
+ vector<double > GetIdealLoad (vector<double >vec_load,vector<SwapBlock> vec_swap_selct);
340
335
341
- // in case gpu ptr wrong. TODO(junzhe) to verify if needed.
342
- void * GetRealGpuPtr (const Block* block_ ) override ;
336
+ // in case the gpu ptr is wrong, update it after swap_in ad hoc
337
+ void * UpdateGpuPtr (const Block* block_ptr ) override ;
343
338
344
- void SwapOut (const Block* block_) override ;
345
- void SwapIn (const Block* block_) override ;
339
+ // Swap Synchronous, for early iterations
340
+ void SwapOutSynchronous (const Block* block_ptr);
341
+ void SwapInSynchronous (const Block* block_ptr);
346
342
347
- // changed to intake data_ instead
348
- void SwapOut_idx (const int r_idx );
349
- void SwapIn_idx (const int r_idx );
343
+ // Swap asynchronous, for middle iterations
344
+ void SwapOut (const int idx );
345
+ void SwapIn (const int idx );
350
346
351
347
private:
352
348
void Setup ();
353
- // /Tables needed
354
- // r_idx->BlockMeta
355
- map<int ,BlockMeta>Table_meta;
356
- map<const Block*,BlockMeta>Table_block_meta; // TODO(junzhe) for measure speed only.
357
- map<const Block*, int >Table_not_at_device; // int refers to its r_idx of the block/meta
358
- // map<const Block*, size_t>Table_block_size; //Table block_ -> size TODO(junzhe) no need, can call block_->size()
359
-
360
- // schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D.
361
- map<int ,std::tuple<int ,int ,int ,int >>Table_sched; // changed to with sync_r_idx
362
349
363
- // vector<SwapBlock>vec_swap_selct_global;
350
+ map<int ,BlockMeta>table_meta;
351
+ map<const Block*,BlockMeta>table_block_meta; // for measure speed only.
352
+ map<const Block*, int >table_not_at_device; // int refers to its r_idx of the block/meta
353
+ map<int ,std::tuple<int ,int ,int ,int >>table_sched; // changed to with sync_r_idx
364
354
365
355
// vec_block
366
- vector<string>vec_block; // iteration 0-3
367
- vector<string>vec_block_fresh; // iteration 4 5 6
368
- vector<string>vec_block_mf; // itr 8 9 10
369
- vector<double >global_load; // from begining
370
- vector<double >origin_load; // vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use.
371
- vector<onePieceMsg>vec_run;
372
- vector<int >opsSequence; // sequence of operations of one middle iteration
373
- vector<size_t >sizeSequence; // size of all operations of one middle iteration
374
- int asyncSwapFlag = 0 ; // 0 for sync, 1 for async.
375
- int testFlag = 0 ; // 0 means open for test, 1 means no need test anymore.
376
- int gc = 0 ; // global counter, index, add 1 after each Malloc/Free/read/write.
377
- int globeCounter = -1 ;
378
- int maxLen = 0 ;
379
- int location = 0 ;
380
- int three_more_location = 0 ; // location at 3 more iterations later.
381
- int three_more_globeCounter = -1 ; //
382
- // design requirement TODO(junzhe)
383
- float memLimit_ratio = 0.70 ;
356
+ vector<string>vec_block; // iterations for Detection, i.e. detect iterations.
357
+ vector<string>vec_block_fresh; // iterations that are used for Planning,
358
+ vector<string>vec_block_mf; // iterations used to construct pool
359
+ vector<double >global_load; // load from beginning
360
+ vector<double >origin_load; // 3 iteration load, for planning.
361
+ vector<DeviceOptInfo>vec_run;
362
+ vector<int >operation_sequence; // sequence of operations of one middle iteration
363
+ vector<size_t >size_sequence; // size of all operations of one middle iteration
364
+
365
+ int async_swap_flag = 0 ; // 0 for sync, 1 for async.
366
+ int past_test_flag = 0 ; // 0 means need to test, 1 means no need test anymore.
367
+ int global_index = 0 ; // global counter, index, add 1 after each Malloc/Free/read/write.
368
+ int global_index_threshold = -1 ;
369
+ int iteration_length = 0 ;
370
+ int location_of_2nd_iteration = 0 ; // index of start of 2nd iteration
371
+ int location_of_5th_iteration = 0 ; // index of start of 5th iteration
372
+ int three_more_iteration_global_index_threshold = -1 ;
373
+
374
+ // design specs
375
+ float mem_limit_ratio = 0.70 ;
384
376
size_t smallest_block = 1 <<20 ; // 1 MB
385
377
int data_buffer = 4 ; // used to control readyIdx
386
378
int mutable_data_buffer = 6 ;
387
- double maxLoad;
388
- int maxIdx;
389
- double total_swapInTime = 0 ;
390
- double total_swapOutTime = 0 ;
391
- double tempTime = 0 ;
392
- double tempTime2 = 0 ;
393
- double tempTime_baseline; // vec_run[0] time
394
- int maxLen_threshold = 1000 ;
379
+ double max_load;
380
+ int max_idx;
381
+ double total_swap_in_time = 0 ;
382
+ double total_swap_out_time = 0 ;
383
+ double temp_time = 0 ;
384
+ double temp_time_baseline; // vec_run[0] time
385
+ int iteration_length_threshold = 1000 ;
395
386
396
387
private:
397
388
shared_ptr<DeviceMemPool> pool_;
@@ -447,11 +438,9 @@ class OpenclDevice : public singa::Device {
447
438
// / Converts the void pointer into a Buffer object, then deletes the object.
448
439
// / This has the effect of freeing up device memory.
449
440
void Free (void * ptr) override ;
450
- void MakeMetaTable (Block* block,void * data_,int size) override {}
451
- void Append (string blockInfo) override {}
452
- void * GetRealGpuPtr (const Block* block_) override {}
453
- void SwapOut (const Block* block_) override {}
454
- void SwapIn (const Block* block_) override {}
441
+ void AppendAfterMalloc (Block* block,void * data_ptr,int size) override {}
442
+ void Append (string block_info) override {}
443
+ void * UpdateGpuPtr (const Block* block_ptr) override {}
455
444
456
445
457
446
private:
0 commit comments