Skip to content

Commit 6bc7b45

Browse files
yanwei-gryanwei
and
yanwei
authored
catch potential exception and handle (#677)
* disable floating point checks in popart * create pipeline resources when the computation is created, rather than when the cache computation item is set; fixes the pixel bert detect application runtime error * add weiming build script fix * add implementation of CreateExecutable: move compile_and_run to createExecute, and add more error handling logic * add potential exception handling & enable session options to optimize * add more exception handling * update odla_computation::init() to return an odla_status value * fix popart_config load error * add bool to make sure a computation can't be executed when it is used to compile an executable * improve load config logic * update notice strings * fix load_config logic bug Co-authored-by: yanwei <yw01041751@alibaba-inc.com>
1 parent e0f07fa commit 6bc7b45

File tree

5 files changed

+204
-81
lines changed

5 files changed

+204
-81
lines changed

ODLA/platforms/odla_popart/odla_compute.cc

+39-15
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,9 @@ odla_status odla_SetComputationItem(odla_computation comp, odla_item_type type,
6262
comp->opts.cache_dir = (reinterpret_cast<char*>(value));
6363
break;
6464
case 1001: // load cache directly, need set path of cache file
65-
PopartConfig::instance()->set_load_cache(true);
66-
PopartConfig::instance()->set_cache_path(reinterpret_cast<char*>(value));
65+
PopartConfig::instance()->set_load_or_save_cache(true);
66+
PopartConfig::instance()->set_cache_path(
67+
(std::string) reinterpret_cast<char*>(value));
6768
break;
6869
default:
6970
std::cerr << "Unsupported property type: " << type << std::endl;
@@ -81,8 +82,10 @@ odla_status odla_CreateExecutable(odla_executable* executable,
8182
return ODLA_FAILURE;
8283
} else {
8384
if (comp->session) {
85+
popart::logging::info("Create cache file from exist session");
8486
return comp->compile_and_export();
8587
} else {
88+
popart::logging::info("Computation is not initialized. init it first");
8689
_odla_computation::instance()->init(true); // set is_compile to true
8790
// this comp init will create
8891
// executable
@@ -107,6 +110,7 @@ odla_status odla_LoadExecutable(const odla_char* file_name,
107110
odla_status odla_CreateComputation(odla_computation* comp) {
108111
static void* custom_op_handle = nullptr;
109112
*comp = _odla_computation::instance();
113+
popart::logging::info("computation created");
110114
if (custom_op_handle == nullptr) {
111115
custom_op_handle = dlopen("libcustom_ops.so", RTLD_NOW | RTLD_GLOBAL);
112116
if (custom_op_handle == nullptr) {
@@ -116,16 +120,18 @@ odla_status odla_CreateComputation(odla_computation* comp) {
116120
}
117121
// Read the config file
118122
if (!PopartConfig::instance()->inited()) {
119-
if (PopartConfig::instance()->load_cache()) {
120-
odla_status ret = PopartConfig::instance()->extract_config_from_cache();
121-
if (ret == ODLA_FAILURE) {
122-
popart::logging::err("load config from cache failed");
123-
return ret;
124-
}
123+
auto ret = PopartConfig::instance()->load_config(
124+
std::getenv("ODLA_POPART_CONFIG"));
125+
if (ret != ODLA_SUCCESS) {
126+
popart::logging::err("error load config");
127+
return ret;
125128
}
126-
PopartConfig::instance()->load_config(std::getenv("ODLA_POPART_CONFIG"));
127129
}
128-
_odla_computation::instance()->set_executor();
130+
odla_status status = _odla_computation::instance()->set_executor();
131+
if (status != ODLA_SUCCESS) {
132+
popart::logging::err("set_executor failed");
133+
return ODLA_FAILURE;
134+
}
129135
if (PopartConfig::instance()->execution_mode() == PARALLEL ||
130136
PopartConfig::instance()->execution_mode() == PIPELINE) {
131137
QManager::instance()->createQ(PopartConfig::instance()->queue_type());
@@ -137,8 +143,14 @@ odla_status odla_CreateComputation(odla_computation* comp) {
137143
}
138144

139145
odla_status odla_CreateContext(odla_context* context) {
140-
_odla_computation::instance(false)
141-
->init(); // Place the init here to avoid long execution problem
146+
odla_status status =
147+
_odla_computation::instance(false)
148+
->init(); // Place the init here to avoid long execution problem
149+
if (status != ODLA_SUCCESS &&
150+
_odla_computation::instance()->session == nullptr) {
151+
popart::logging::err("init computation item in CreateContext failed.");
152+
return ODLA_FAILURE;
153+
}
142154
*context = new _odla_pipeline_context(_odla_computation::instance());
143155
return ODLA_SUCCESS;
144156
}
@@ -149,15 +161,27 @@ odla_status odla_DestroyContext(odla_context ctx) {
149161
}
150162

151163
odla_status odla_DestroyComputation(odla_computation comp) {
152-
comp->mark_done();
153-
_odla_computation::destruct();
154-
QManager::instance()->deleteQ(); // delete current queue
164+
if (comp != nullptr) {
165+
if (!comp->is_compile_only()) {
166+
comp->mark_done();
167+
QManager::instance()->deleteQ(); // delete current queue
168+
}
169+
comp->release_session();
170+
_odla_computation::destruct(); // release the real computation
171+
}
172+
155173
return ODLA_SUCCESS;
156174
}
157175

158176
odla_status odla_ExecuteComputation(odla_computation comp, odla_context context,
159177
odla_compute_mode mode,
160178
odla_device device) {
179+
if (_odla_computation::instance()->is_compile_only()) {
180+
popart::logging::err(
181+
"This computation is created for compile executable, please re-create "
182+
"another computation for computing");
183+
return ODLA_FAILURE;
184+
}
161185
if (!context->hold("odla_ExecuteComputation")) return ODLA_FAILURE;
162186
return comp->executor()->compute(comp, context, mode, device);
163187
}

ODLA/platforms/odla_popart/odla_popart.cc

+74-30
Original file line numberDiff line numberDiff line change
@@ -90,14 +90,16 @@ void compute_loop(odla_computation comp) {
9090
}
9191

9292
odla_status _odla_computation::compile_and_export() {
93+
odla_status ret_value = ODLA_SUCCESS;
9394
popart::logging::warn("Start compile and export");
9495
const std::string& cache_file_name =
9596
PopartConfig::instance()->get_cache_path();
9697
std::string file_suffix(".popart");
9798
int file_prefix = cache_file_name.rfind(file_suffix);
9899
if (file_prefix == std::string::npos ||
99100
file_prefix + file_suffix.size() < cache_file_name.size()) {
100-
popart::logging::err("Bad cache file name");
101+
popart::logging::err(
102+
"Bad cache file name. File name should end with '.popart'");
101103
return ODLA_FAILURE;
102104
}
103105
if (file_prefix == std::string::npos) {
@@ -117,7 +119,7 @@ odla_status _odla_computation::compile_and_export() {
117119
config_fs.open(config_file_name, std::ios_base::in | std::ifstream::binary);
118120
if (!config_fs.is_open()) {
119121
popart::logging::warn(
120-
"invalid config file name:[ {} ] will use default config",
122+
"Open config file failed:[ {} ] will use default config",
121123
config_file_name);
122124
PopartConfig::instance()->use_default();
123125
config_string = PopartConfig::instance()->get_default_config_string();
@@ -134,18 +136,28 @@ odla_status _odla_computation::compile_and_export() {
134136
cache_fs.write((char*)&config_size, sizeof(config_size));
135137
cache_fs.write(config_string.c_str(), config_string.size());
136138

137-
_odla_computation::instance()->session->compileAndExport(cache_fs.flush());
138-
139+
try {
140+
_odla_computation::instance()->session->compileAndExport(cache_fs.flush());
141+
} catch (std::exception& e) {
142+
popart::logging::err("compileAndExport Falied: {}", e.what());
143+
ret_value = ODLA_FAILURE;
144+
}
139145
cache_fs.flush();
140146
cache_fs.close();
141147
config_fs.close();
148+
149+
return ret_value;
142150
}
143151

144-
void _odla_computation::init(bool is_compile) {
152+
odla_status _odla_computation::init(bool is_compile) {
145153
if (!session) {
146154
std::lock_guard<std::mutex> guard(init_mutex_);
147155
if (!session) {
148-
set_opts();
156+
odla_status status = set_opts();
157+
if (status != ODLA_SUCCESS) {
158+
popart::logging::err("set computation option failed");
159+
return status;
160+
}
149161
// Cretate the dataflow
150162
std::vector<popart::TensorId> ids;
151163
for (const auto& output : outputs_map)
@@ -168,8 +180,14 @@ void _odla_computation::init(bool is_compile) {
168180

169181
// Create and config SessionOptions
170182
set_session_opts();
171-
if (use_pipeline())
172-
builder = popart::Builder::createFromOnnxModel(set_pipeline_stage());
183+
if (use_pipeline()) {
184+
try {
185+
builder = popart::Builder::createFromOnnxModel(set_pipeline_stage());
186+
} catch (std::exception& e) {
187+
popart::logging::err("create builder from onnx model failed.");
188+
return ODLA_FAILURE;
189+
}
190+
}
173191
auto proto = builder->getModelProto(); // So, the init must be called at
174192
// odla_ExecuteCompute
175193

@@ -185,12 +203,19 @@ void _odla_computation::init(bool is_compile) {
185203
PopartConfig::instance()->save_model_path());
186204
}
187205

188-
// Create InferenceSession
189-
auto new_session = popart::InferenceSession::createFromOnnxModel(
190-
proto, data_flow, device, popart::InputShapeInfo(), session_opts_);
206+
std::unique_ptr<popart::InferenceSession> new_session;
207+
try {
208+
// Create InferenceSession
209+
new_session = std::move(popart::InferenceSession::createFromOnnxModel(
210+
proto, data_flow, device, popart::InputShapeInfo(), session_opts_));
211+
} catch (std::exception& e) {
212+
popart::logging::err("Session::createFromOnnxModel failed:{}",
213+
e.what());
214+
return ODLA_FAILURE;
215+
}
191216

192217
if (!is_compile) {
193-
if (PopartConfig::instance()->load_cache()) {
218+
if (PopartConfig::instance()->load_or_save_cache()) {
194219
popart::logging::info("Load cachefile from existing stream");
195220
auto cache_fs = PopartConfig::instance()->get_cache_fs();
196221
if (cache_fs->is_open()) {
@@ -202,10 +227,14 @@ void _odla_computation::init(bool is_compile) {
202227
}
203228
}
204229

205-
new_session->prepareDevice();
206-
new_session->setRandomSeed(0); // Init seed
207-
new_session->weightsFromHost(); // Copy weights from host to IPU
208-
230+
try {
231+
new_session->prepareDevice();
232+
new_session->setRandomSeed(0); // Init seed
233+
new_session->weightsFromHost(); // Copy weights from host to IPU
234+
} catch (std::exception& e) {
235+
popart::logging::err("session init failed: {}", e.what());
236+
return ODLA_FAILURE;
237+
}
209238
// If in parallel mode, start the thread
210239
ExecutionMode mode = PopartConfig::instance()->execution_mode();
211240
if (PIPELINE == mode || PARALLEL == mode) {
@@ -214,33 +243,42 @@ void _odla_computation::init(bool is_compile) {
214243
popart::logging::warn("Parallel loop has been started");
215244
parallel_thread.detach();
216245
}
246+
} else {
247+
is_compile_only_ = true;
217248
}
249+
218250
session =
219251
std::move(new_session); // set session after all initialization done.
220252
}
221253
}
222254
}
223255

224256
// Now we set this by config file, should set by the caller?
225-
void _odla_computation::set_opts() {
257+
odla_status _odla_computation::set_opts() {
226258
if (PopartConfig::instance()->debug()) {
227259
opts.ipu_num = PopartConfig::instance()->ipu_num();
228260
opts.batches_per_step = PopartConfig::instance()->batches_per_step();
229261
} else if (use_pipeline()) { // Only check when use pipeline
230-
if (opts.ipu_num != PopartConfig::instance()->ipu_num())
231-
throw std::invalid_argument(
262+
if (opts.ipu_num != PopartConfig::instance()->ipu_num()) {
263+
popart::logging::err(
232264
"number of ipus in pipeline configuration:" +
233265
std::to_string(PopartConfig::instance()->ipu_num()) +
234266
" must same with options: " + std::to_string(opts.ipu_num));
235-
if (opts.batches_per_step != PopartConfig::instance()->batches_per_step())
236-
throw std::invalid_argument(
267+
return ODLA_FAILURE;
268+
}
269+
if (opts.batches_per_step != PopartConfig::instance()->batches_per_step()) {
270+
popart::logging::err(
237271
"batches per step in pipeline configuration:" +
238272
std::to_string(PopartConfig::instance()->batches_per_step()) +
239273
" must same with options: " + std::to_string(opts.batches_per_step));
274+
return ODLA_FAILURE;
275+
}
240276
}
277+
return ODLA_SUCCESS;
241278
}
242279

243-
void _odla_computation::set_executor() {
280+
odla_status _odla_computation::set_executor() {
281+
odla_status ret_value = ODLA_SUCCESS;
244282
ExecutionMode mode = PopartConfig::instance()->execution_mode();
245283
if (PIPELINE == mode || PARALLEL == mode) {
246284
popart::logging::info("set the executor as parallel");
@@ -249,10 +287,13 @@ void _odla_computation::set_executor() {
249287
popart::logging::info("set the executor as sequence");
250288
executor_ = new Sequence();
251289
} else {
252-
throw std::invalid_argument(
253-
"*** FATAL *** unknown execution mode: {}" + std::to_string(mode) +
254-
". Should be one of pipeline, parallel or sequence");
290+
popart::logging::err(
291+
"unknown excution mode: {}, Should be one of pipeline, parallel or "
292+
"sequence",
293+
std::to_string(mode));
294+
ret_value = ODLA_FAILURE;
255295
}
296+
return ret_value;
256297
}
257298

258299
void _odla_computation::set_session_opts() {
@@ -270,9 +311,9 @@ void _odla_computation::set_session_opts() {
270311
session_opts_.cachePath =
271312
opts.enable_engine_cache ? opts.cache_dir : envEngineCachePath;
272313
}
273-
// session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true";
274-
// session_opts_.matmulOptions["enableMultiStageReduce"] = "false";
275-
// session_opts_.matmulOptions["enableFastReduce"] = "true";
314+
session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true";
315+
session_opts_.matmulOptions["enableMultiStageReduce"] = "false";
316+
session_opts_.matmulOptions["enableFastReduce"] = "true";
276317
session_opts_.enableFloatingPointChecks = false;
277318
session_opts_.enableStochasticRounding = false;
278319
session_opts_.enablePrefetchDatastreams = false; // true;
@@ -392,9 +433,12 @@ bool _odla_context::hold(const std::string& function_name) {
392433
ss_holder << thread_id_of_holder;
393434
popart::logging::err(
394435
"[{}] odla_context {} has been held by thread: {}"
395-
", when try to hold it in function {}.",
436+
", when try to hold it in function {}. multi threads try to hold the "
437+
"same context.",
396438
this_thread_id, this, thread_id_of_holder, function_name);
397-
throw std::runtime_error("Multiple threads try to hold the same context");
439+
return false;
440+
// throw std::runtime_error("Multiple threads try to hold the same
441+
// context");
398442
}
399443
return false;
400444
}

ODLA/platforms/odla_popart/odla_popart.h

+18-8
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ struct _odla_computation {
115115
}
116116
}
117117
}
118+
bool is_compile_only_;
118119
bool done_;
119120
bool thread_complete_;
120121
std::mutex init_mutex_;
@@ -127,21 +128,34 @@ struct _odla_computation {
127128
device(nullptr),
128129
opts({false, 1, 1}),
129130
done_(false),
131+
is_compile_only_(false),
130132
executor_(nullptr),
131133
thread_state_(DONE) {
132134
builder->setAttribute(popart::sVirtualGraphAttribute, 0);
133135
}
134-
void init(bool is_compile = false);
135136
std::string set_pipeline_stage();
136137
void set_session_opts();
137-
void set_executor();
138-
void set_opts();
138+
139139
bool use_pipeline();
140140
bool hold();
141+
142+
odla_status init_working_thread();
143+
odla_status init(bool is_compile = false);
144+
odla_status set_executor();
145+
odla_status set_opts();
141146
odla_status compile_and_export();
142147

143148
inline Execution* executor() { return executor_; }
144149
inline bool is_done() { return thread_state_ != RUNNING; }
150+
inline bool is_compile_only() { return is_compile_only_; }
151+
inline void release_session() {
152+
if (session != nullptr) {
153+
session->getDevice().getDeviceInfo()->detach();
154+
session.reset();
155+
assert(session == nullptr);
156+
}
157+
}
158+
145159
inline void mark_done() {
146160
while (thread_state_ != DONE) {
147161
std::unique_lock<std::mutex> lock(thread_done_mutex_);
@@ -150,11 +164,7 @@ struct _odla_computation {
150164
}
151165
// Once get notified, only detach the device once
152166
std::lock_guard<std::mutex> guard(init_mutex_);
153-
if (session != nullptr) {
154-
session->getDevice().getDeviceInfo()->detach();
155-
session.reset();
156-
assert(session == nullptr);
157-
}
167+
release_session();
158168
}
159169
inline void thread_done() {
160170
std::unique_lock<std::mutex> lock(thread_done_mutex_);

0 commit comments

Comments
 (0)