@@ -90,14 +90,16 @@ void compute_loop(odla_computation comp) {
 }

 odla_status _odla_computation::compile_and_export() {
+  odla_status ret_value = ODLA_SUCCESS;
   popart::logging::warn("Start compile and export");
   const std::string& cache_file_name =
       PopartConfig::instance()->get_cache_path();
   std::string file_suffix(".popart");
   int file_prefix = cache_file_name.rfind(file_suffix);
   if (file_prefix == std::string::npos ||
       file_prefix + file_suffix.size() < cache_file_name.size()) {
-    popart::logging::err("Bad cache file name");
+    popart::logging::err(
+        "Bad cache file name. File name should end with '.popart'");
     return ODLA_FAILURE;
   }
   if (file_prefix == std::string::npos) {
@@ -117,7 +119,7 @@ odla_status _odla_computation::compile_and_export() {
   config_fs.open(config_file_name, std::ios_base::in | std::ifstream::binary);
   if (!config_fs.is_open()) {
     popart::logging::warn(
-        "invalid config file name :[ {} ] will use default config",
+        "Open config file failed :[ {} ] will use default config",
         config_file_name);
     PopartConfig::instance()->use_default();
     config_string = PopartConfig::instance()->get_default_config_string();
@@ -134,18 +136,28 @@ odla_status _odla_computation::compile_and_export() {
   cache_fs.write((char*)&config_size, sizeof(config_size));
   cache_fs.write(config_string.c_str(), config_string.size());

-  _odla_computation::instance()->session->compileAndExport(cache_fs.flush());
-
+  try {
+    _odla_computation::instance()->session->compileAndExport(cache_fs.flush());
+  } catch (std::exception& e) {
+    popart::logging::err("compileAndExport failed: {}", e.what());
+    ret_value = ODLA_FAILURE;
+  }
   cache_fs.flush();
   cache_fs.close();
   config_fs.close();
+
+  return ret_value;
 }

-void _odla_computation::init(bool is_compile) {
+odla_status _odla_computation::init(bool is_compile) {
   if (!session) {
     std::lock_guard<std::mutex> guard(init_mutex_);
     if (!session) {
-      set_opts();
+      odla_status status = set_opts();
+      if (status != ODLA_SUCCESS) {
+        popart::logging::err("set computation option failed");
+        return status;
+      }
       // Create the dataflow
       std::vector<popart::TensorId> ids;
       for (const auto& output : outputs_map)
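Note: with compile_and_export() and init() now reporting failures through odla_status instead of throwing, callers are expected to check and propagate the returned status. A minimal caller-side sketch of that pattern (build_and_cache_model is a hypothetical helper, not part of this patch; it only uses names introduced above):

  // Hypothetical helper illustrating the new error-propagation pattern:
  // check each odla_status and return early on failure.
  odla_status build_and_cache_model() {
    odla_status s = _odla_computation::instance()->init(/*is_compile=*/true);
    if (s != ODLA_SUCCESS) return s; // init() no longer throws on failure
    return _odla_computation::instance()->compile_and_export();
  }
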
@@ -168,8 +180,14 @@ void _odla_computation::init(bool is_compile) {

       // Create and config SessionOptions
       set_session_opts();
-      if (use_pipeline())
-        builder = popart::Builder::createFromOnnxModel(set_pipeline_stage());
+      if (use_pipeline()) {
+        try {
+          builder = popart::Builder::createFromOnnxModel(set_pipeline_stage());
+        } catch (std::exception& e) {
+          popart::logging::err("create builder from onnx model failed.");
+          return ODLA_FAILURE;
+        }
+      }
       auto proto = builder->getModelProto(); // So, the init must be called at
                                              // odla_ExecuteCompute

@@ -185,12 +203,19 @@ void _odla_computation::init(bool is_compile) {
           PopartConfig::instance()->save_model_path());
       }

-      // Create InferenceSession
-      auto new_session = popart::InferenceSession::createFromOnnxModel(
-          proto, data_flow, device, popart::InputShapeInfo(), session_opts_);
+      std::unique_ptr<popart::InferenceSession> new_session;
+      try {
+        // Create InferenceSession
+        new_session = std::move(popart::InferenceSession::createFromOnnxModel(
+            proto, data_flow, device, popart::InputShapeInfo(), session_opts_));
+      } catch (std::exception& e) {
+        popart::logging::err("Session::createFromOnnxModel failed:{}",
+                             e.what());
+        return ODLA_FAILURE;
+      }

       if (!is_compile) {
-        if (PopartConfig::instance()->load_cache()) {
+        if (PopartConfig::instance()->load_or_save_cache()) {
           popart::logging::info("Load cachefile from existing stream");
           auto cache_fs = PopartConfig::instance()->get_cache_fs();
           if (cache_fs->is_open()) {
@@ -202,10 +227,14 @@ void _odla_computation::init(bool is_compile) {
           }
         }

-        new_session->prepareDevice();
-        new_session->setRandomSeed(0);  // Init seed
-        new_session->weightsFromHost(); // Copy weights from host to IPU
-
+        try {
+          new_session->prepareDevice();
+          new_session->setRandomSeed(0);  // Init seed
+          new_session->weightsFromHost(); // Copy weights from host to IPU
+        } catch (std::exception& e) {
+          popart::logging::err("session init failed: {}", e.what());
+          return ODLA_FAILURE;
+        }
         // If in parallel mode, start the thread
         ExecutionMode mode = PopartConfig::instance()->execution_mode();
         if (PIPELINE == mode || PARALLEL == mode) {
@@ -214,33 +243,42 @@ void _odla_computation::init(bool is_compile) {
           popart::logging::warn("Parallel loop has been started");
           parallel_thread.detach();
         }
+      } else {
+        is_compile_only_ = true;
       }
+
       session =
           std::move(new_session); // set session after all initialization done.
     }
   }
 }

 // Now we set this by config file, should set by the caller?
-void _odla_computation::set_opts() {
+odla_status _odla_computation::set_opts() {
   if (PopartConfig::instance()->debug()) {
     opts.ipu_num = PopartConfig::instance()->ipu_num();
     opts.batches_per_step = PopartConfig::instance()->batches_per_step();
   } else if (use_pipeline()) { // Only check when use pipeline
-    if (opts.ipu_num != PopartConfig::instance()->ipu_num())
-      throw std::invalid_argument(
+    if (opts.ipu_num != PopartConfig::instance()->ipu_num()) {
+      popart::logging::err(
          "number of ipus in pipeline configuration:" +
          std::to_string(PopartConfig::instance()->ipu_num()) +
          " must same with options: " + std::to_string(opts.ipu_num));
-    if (opts.batches_per_step != PopartConfig::instance()->batches_per_step())
-      throw std::invalid_argument(
+      return ODLA_FAILURE;
+    }
+    if (opts.batches_per_step != PopartConfig::instance()->batches_per_step()) {
+      popart::logging::err(
          "batches per step in pipeline configuration:" +
          std::to_string(PopartConfig::instance()->batches_per_step()) +
          " must same with options: " + std::to_string(opts.batches_per_step));
+      return ODLA_FAILURE;
+    }
   }
+  return ODLA_SUCCESS;
 }

-void _odla_computation::set_executor() {
+odla_status _odla_computation::set_executor() {
+  odla_status ret_value = ODLA_SUCCESS;
   ExecutionMode mode = PopartConfig::instance()->execution_mode();
   if (PIPELINE == mode || PARALLEL == mode) {
     popart::logging::info("set the executor as parallel");
@@ -249,10 +287,13 @@ void _odla_computation::set_executor() {
     popart::logging::info("set the executor as sequence");
     executor_ = new Sequence();
   } else {
-    throw std::invalid_argument(
-        "*** FATAL *** unknown execution mode: {}" + std::to_string(mode) +
-        ". Should be one of pipeline, parallel or sequence");
+    popart::logging::err(
+        "unknown execution mode: {}, should be one of pipeline, parallel or "
+        "sequence",
+        std::to_string(mode));
+    ret_value = ODLA_FAILURE;
   }
+  return ret_value;
 }

 void _odla_computation::set_session_opts() {
@@ -270,9 +311,9 @@ void _odla_computation::set_session_opts() {
     session_opts_.cachePath =
         opts.enable_engine_cache ? opts.cache_dir : envEngineCachePath;
   }
-  // session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true";
-  // session_opts_.matmulOptions["enableMultiStageReduce"] = "false";
-  // session_opts_.matmulOptions["enableFastReduce"] = "true";
+  session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true";
+  session_opts_.matmulOptions["enableMultiStageReduce"] = "false";
+  session_opts_.matmulOptions["enableFastReduce"] = "true";
   session_opts_.enableFloatingPointChecks = false;
   session_opts_.enableStochasticRounding = false;
   session_opts_.enablePrefetchDatastreams = false; // true;
@@ -392,9 +433,12 @@ bool _odla_context::hold(const std::string& function_name) {
     ss_holder << thread_id_of_holder;
     popart::logging::err(
         "[{}] odla_context {} has been held by thread: {}"
-        ", when try to hold it in function {}.",
+        ", when try to hold it in function {}. Multiple threads try to hold "
+        "the same context.",
         this_thread_id, this, thread_id_of_holder, function_name);
-    throw std::runtime_error("Multiple threads try to hold the same context");
+    return false;
+    // throw std::runtime_error("Multiple threads try to hold the same
+    // context");
   }
   return false;
 }
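Note: hold() now reports contention by returning false instead of throwing. A hypothetical caller-side sketch of how that boolean would be handled (the ctx variable and the function name string below are illustrative, not part of this patch):

  // Callers must now check the result of hold() and bail out themselves
  // instead of relying on an exception to unwind the call.
  if (!ctx->hold("odla_ExecuteComputation")) {
    return ODLA_FAILURE; // the context is already held by another thread
  }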