Skip to content

Commit 3be81f0

Browse files
jackzipuweimingzha0
authored and committed
Make all resources used in the thread initialized before starting the thread
1 parent 7630bbf commit 3be81f0

File tree

3 files changed

+48
-16
lines changed

3 files changed

+48
-16
lines changed

ODLA/platforms/odla_popart/odla_compute.cc

+4-3
Original file line number | Diff line number | Diff line change
@@ -167,7 +167,8 @@ odla_status odla_CreateComputation(odla_computation* comp) {
167167

168168
static void* custom_op_handle = nullptr;
169169
*comp = _odla_computation::instance();
170-
popart::logging::info("computation created");
170+
popart::logging::warn("computation created: {}",
171+
_odla_computation::instance());
171172
if (custom_op_handle == nullptr) {
172173
custom_op_handle = dlopen("libcustom_ops.so", RTLD_NOW | RTLD_GLOBAL);
173174
if (custom_op_handle == nullptr) {
@@ -228,7 +229,7 @@ odla_status odla_DestroyContext(odla_context ctx) {
228229

229230
odla_status odla_DestroyComputation(odla_computation comp) {
230231
std::lock_guard<std::mutex> guard(g_computation_mutex);
231-
popart::logging::info("call odla_destroyComputation");
232+
popart::logging::warn("call odla_destroyComputation comp: {}", comp);
232233
if (comp != nullptr) {
233234
if (!comp->is_compile_only()) {
234235
comp->mark_done();
@@ -237,7 +238,7 @@ odla_status odla_DestroyComputation(odla_computation comp) {
237238
comp->release_session();
238239
_odla_computation::destruct(); // release the real computation
239240
}
240-
popart::logging::info("reset config state");
241+
popart::logging::warn("reset config state, comp: {}", comp);
241242
PopartConfig::instance()->reset_init_state();
242243

243244
return ODLA_SUCCESS;

ODLA/platforms/odla_popart/odla_popart.cc

+25-11
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,23 @@ void compute_loop(odla_computation comp) {
8181
popart::StepIOCallback stepio(input_callback, input_complete_callback,
8282
output_callback, output_complete_callback);
8383
int i = 0;
84+
bool info_printed = false;
8485
POPLAR_TRY
86+
comp->set_thread_run(); // set the state to RUNNING
8587
while (!comp->is_done()) {
8688
auto start = std::chrono::steady_clock::now();
8789
popart::logging::info("This is the {} time for the inference", i++);
8890
if (i == INT_MAX) i = 0;
91+
if (!info_printed) {
92+
popart::logging::warn(
93+
"Start to run the stepio with comp:{}, session:{}, device:{}", comp,
94+
comp->session.get(), comp->session->getDevice().getDeviceInfo());
95+
info_printed = true;
96+
}
8997
comp->session->run(stepio);
9098
auto end = std::chrono::steady_clock::now();
9199
std::chrono::duration<double> elapsed_seconds = end - start;
92-
popart::logging::warn(
100+
popart::logging::info(
93101
"[ {} ] ONE_STEP takes {} s. Check whether more inference tasks "
94102
"wating.",
95103
i, elapsed_seconds.count());
@@ -99,11 +107,12 @@ void compute_loop(odla_computation comp) {
99107
std::this_thread::sleep_for(std::chrono::milliseconds(1));
100108
end = std::chrono::steady_clock::now();
101109
std::chrono::duration<double, std::milli> elapsed_ms = end - start;
102-
popart::logging::warn("Found new tasks in {} ms.", elapsed_ms.count());
110+
popart::logging::info("Found new tasks in {} ms.", elapsed_ms.count());
103111
}
104112
POPLAR_CATCH
105113

106-
popart::logging::info("The pipeline loop finished");
114+
popart::logging::warn(
115+
"The computation: {} pipeline loop finished after {} steps run", comp, i);
107116
comp->thread_done();
108117
}
109118

@@ -184,6 +193,7 @@ odla_status _odla_computation::compile_and_export() {
184193

185194
odla_status _odla_computation::init(bool is_compile) {
186195
if (!session) {
196+
popart::logging::warn("The computation:{} start to init", this);
187197
std::lock_guard<std::mutex> guard(init_mutex_);
188198
if (!session) {
189199
POPLAR_TRY
@@ -281,22 +291,26 @@ odla_status _odla_computation::init(bool is_compile) {
281291
new_session->prepareDevice();
282292
new_session->setRandomSeed(0); // Init seed
283293
new_session->weightsFromHost(); // Copy weights from host to IPU
284-
// If in parallel mode, start the thread
294+
} else {
295+
is_compile_only_ = true;
296+
}
297+
// set session after all initialization done.
298+
session = std::move(new_session);
299+
// Thread must be started after all initialization done
300+
if (!is_compile) {
285301
ExecutionMode mode = PopartConfig::instance()->execution_mode();
286302
if (PIPELINE == mode || PARALLEL == mode || PIPELINE_ASYNC == mode) {
287303
std::thread parallel_thread(compute_loop, this);
288-
thread_state_ = RUNNING;
289-
popart::logging::warn("Parallel loop has been started");
304+
popart::logging::warn(
305+
"The computation: {}, parallel loop has been started", this);
290306
parallel_thread.detach();
291307
}
292-
} else {
293-
is_compile_only_ = true;
294308
}
295-
296-
session =
297-
std::move(new_session); // set session after all initialization done.
298309
POPLAR_CATCH
299310
}
311+
popart::logging::warn(
312+
"The computation:{} has been initialised with session:{}", this,
313+
session.get());
300314
}
301315
return ODLA_SUCCESS;
302316
}

ODLA/platforms/odla_popart/odla_popart.h

+19-2
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ struct _odla_computation {
101101
if (instance_ == nullptr) {
102102
std::lock_guard<std::mutex> guard(comp_mutex_);
103103
if (instance_ == nullptr) instance_ = new _odla_computation();
104+
popart::logging::warn("The computation:{} has been firstly created",
105+
instance_);
104106
}
105107
if (hold_it) instance_->hold();
106108
return instance_;
@@ -110,6 +112,8 @@ struct _odla_computation {
110112
std::lock_guard<std::mutex> guard(comp_mutex_);
111113
if (instance_ != nullptr) {
112114
delete instance_;
115+
popart::logging::warn("The computation:{} has been destructed",
116+
instance_);
113117
instance_ = nullptr;
114118
}
115119
}
@@ -150,19 +154,31 @@ struct _odla_computation {
150154
// Releases the session held by this computation, if there is one: detaches
// the session's device, then resets the `session` pointer. Does nothing when
// `session` is already null. Each step is logged at warn level together with
// the computation (and session) address so the teardown order can be traced.
inline void release_session() {
151155
if (session != nullptr) {
152156
// The device info is reached through the session, so the detach has to
// happen before the session itself is reset below.
session->getDevice().getDeviceInfo()->detach();
157+
popart::logging::warn(
158+
"The computation:{} session:{} detached from device", this,
159+
session.get());
153160
// Drop the session object now that its device has been detached.
session.reset();
154161
assert(session == nullptr);
162+
popart::logging::warn("The computation:{} session has been reset", this);
155163
}
156164
}
157-
165+
// Marks the worker thread state as RUNNING. The write is done while holding
// thread_done_mutex_, the same mutex mark_done() and thread_done() take when
// they change thread_state_.
// In this diff it is called from compute_loop() just before the loop starts,
// replacing the assignment that init() used to make right after creating
// the thread.
inline void set_thread_run() {
166+
std::unique_lock<std::mutex> lock(thread_done_mutex_);
167+
thread_state_ = RUNNING;
168+
}
158169
inline void mark_done() {
159170
while (thread_state_ != DONE) {
160171
std::unique_lock<std::mutex> lock(thread_done_mutex_);
161172
if (thread_state_ != DONE) {
162173
thread_state_ = MARK_DONE;
174+
popart::logging::warn(
175+
"The computation:{} thread now is MARK_DONE, waiting for DONE",
176+
this);
163177
thread_done_cv_.wait_for(lock, std::chrono::milliseconds(5));
164178
} else
165-
popart::logging::warn("Alread DONE when try to mark_done");
179+
popart::logging::warn(
180+
"The computation {} thread already DONE when try to mark_done",
181+
this);
166182
}
167183
// Once get notified, only detach the device once
168184
std::lock_guard<std::mutex> guard(init_mutex_);
@@ -171,6 +187,7 @@ struct _odla_computation {
171187
// Records that the compute loop has finished: sets thread_state_ to DONE
// under thread_done_mutex_ and wakes every waiter on thread_done_cv_.
// mark_done() waits on that condition variable (with a 5 ms timeout) until
// it observes the DONE state.
inline void thread_done() {
172188
std::unique_lock<std::mutex> lock(thread_done_mutex_);
173189
thread_state_ = DONE;
190+
popart::logging::warn("The computation:{} thread is DONE.", this);
174191
// Notify while still holding the lock; waiters re-check thread_state_
// once they reacquire the mutex.
thread_done_cv_.notify_all();
175192
}
176193
};

0 commit comments

Comments
 (0)