improve costmodel on 4 models #826

Open · wants to merge 2 commits into base: cmb_vodla_demo_0.2.1
4 changes: 3 additions & 1 deletion include/halo/halo.h
@@ -54,6 +54,7 @@ struct AnalyzerOpts {
bool print_details = false;
int batch_size = 1;
int qps = 0; // images per second
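// Selects a fitted per-model cost model; values mirror the mapping in
// python/halo/halo.py: 0 = generic, 1 = resnet50, 2 = dbnet, 3 = crnn,
// 4 = bert.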
int model_type = 0;
};

struct CXXCodeGenOpts {
@@ -150,7 +151,8 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
const char* const input_shapes[], unsigned num_inputs,
const char* const inputs[], unsigned num_outputs,
const char* const outputs[], const HaloCodeGenOpts* cg_opts,
const char* main_output_file, HaloModelInfo* model_info);
const char* main_output_file, HaloModelInfo* model_info,
const int model_type);
}

#endif // HALO_HALO_H_
32 changes: 29 additions & 3 deletions lib/interface/interface.cc
@@ -53,7 +53,8 @@ static int InvokeCompiler(Module* m, const std::string& target, int batch,
ModelFormat model_format,
const CXXCodeGenOpts& cg_opts,
const std::string& main_output_file_name,
ModelInfo* model_info, bool is_compile_model = true) {
ModelInfo* model_info, bool is_compile_model = true,
const int model_type = 0) {
auto& ctx = m->GetGlobalContext();
ctx.SetVerbosity(1);
ctx.SetBasePath(GetBaseDir());
@@ -107,6 +108,7 @@ static int InvokeCompiler(Module* m, const std::string& target, int batch,
alz_opts.batch_size = model_info->adaptive_bsz;
alz_opts.print_details = false;
alz_opts.qps = model_info->input_qps;
alz_opts.model_type = model_type;
Analyzer* analyzer =
static_cast<Analyzer*>(pm.AddAnalyzerPass(&std::cout, alz_opts));
pm.Run(m);
@@ -149,6 +151,29 @@ int Compile(ModelFormat format, const std::vector<const void*>& model_defs,
format, cg_opts, main_output_file_name, model_info);
}

HL_API_EXPORT
int Compile(halo::ModelFormat format, const std::vector<const char*>& models,
const std::vector<size_t>& model_sizes, const std::string& target,
int batch, const std::vector<std::string>& input_shapes,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const CXXCodeGenOpts& cg_opts,
const std::string& main_output_file_name, ModelInfo* model_info,
bool is_compile_model, const int model_type) {
GlobalContext ctx;
Function* func;
std::unique_ptr<Module> m;
std::tie(m, func) = CreateModule(&ctx, target);
if (auto status = Parser::Parse(func, models, model_sizes, format);
status != Status::SUCCESS) {
return 1;
}

return InvokeCompiler(m.get(), target, batch, input_shapes, inputs, outputs,
format, cg_opts, main_output_file_name, model_info,
is_compile_model, model_type);
}

HL_API_EXPORT
int Compile(halo::ModelFormat format, const std::vector<const char*>& models,
const std::vector<size_t>& model_sizes, const std::string& target,
@@ -250,7 +275,8 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
const char* const input_shapes[], unsigned num_inputs,
const char* const inputs[], unsigned num_outputs,
const char* const outputs[], const HaloCodeGenOpts* cg_opts,
const char* main_output_file, HaloModelInfo* model_info) {
const char* main_output_file, HaloModelInfo* model_info,
const int model_type) {
const halo::CXXCodeGenOpts& opts =
*reinterpret_cast<const halo::CXXCodeGenOpts*>(cg_opts);
std::vector<const char*> models_data(num_models);
@@ -263,5 +289,5 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
model_format, models_data, models_sizes, std::string(target), batch,
ToStrings(num_input_shapes, input_shapes), ToStrings(num_inputs, inputs),
ToStrings(num_outputs, outputs), opts, std::string(main_output_file),
model_info, false);
model_info, false, model_type);
}
171 changes: 168 additions & 3 deletions lib/transforms/analyzer.cc
@@ -565,6 +565,66 @@ bool Analyzer::RunOnModule(Module* m) {
// return l;
// }

// Newton's-method solver for a cubic y = a*x^3 + b*x^2 + c*x + d;
// `func` holds the coefficients lowest degree first: {d, c, b, a}.
static float NewtonSolver(const std::array<double, 4> func, int iteration,
float error) {
const std::array<double, 3> func_de{func[1], func[2] * 2, func[3] * 3};
const float init = 5;
const float max_per = 100;
const float min_per = 5;
float per = init;

for (int i = 0; i < iteration; i++) {
if (fabs(func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per) < error) {
break;
}
per = per - (func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per) /
(func_de[0] + func_de[1] * per + func_de[2] * per * per);
}
if (per > max_per) {
per = max_per;
} else if (per < min_per) {
per = min_per;
}

return per;
}
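
// For reference: the loop above is the plain Newton iteration
//   x_{k+1} = x_k - f(x_k) / f'(x_k),   f(x) = d + c*x + b*x^2 + a*x^3,
// started at x_0 = 5, with the root clamped afterwards to the valid
// GPU-share range [5, 100] percent. A hand check with made-up
// coefficients, f(x) = x^3 - 8:
//   NewtonSolver({-8.0, 0.0, 0.0, 1.0}, 10, 1e-3F)
// converges to the real root x = 2 within the iteration budget, and the
// final clamp then raises it to the 5-percent floor.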

// Newton's-method solver for y = d + c/x + b/x^2; `func` holds {d, c, b}.
static float NewtonSolverV2(const std::array<double, 3> func, int iteration,
float error) {
const std::array<double, 2> func_de{
func[1] * (-1),
func[2] * (-2),
};
const float init = 50;
const float max_per = 100;
const float min_per = 5;
float per = init;

for (int i = 0; i < iteration; i++) {
if (fabs(func[0] + func[1] / per + func[2] / (per * per)) < error) {
break;
}
per = per - (func[0] + func[1] / per + func[2] / (per * per)) /
(func_de[0] / (per * per) + func_de[1] / (per * per * per));
if (per > max_per) {
per = max_per;
} else if (per < min_per) {
per = min_per;
}
}
if (per > max_per) {
per = max_per;
} else if (per < min_per) {
per = min_per;
}

return per;
}
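
// NewtonSolverV2 applies the same update to f(x) = d + c/x + b/x^2
// (func = {d, c, b}), whose derivative f'(x) = -c/x^2 - 2*b/x^3 is what
// the func_de terms evaluate. Clamping inside the loop as well makes this
// a projected iteration: each step is pulled back into [5, 100] before
// the next residual test, so the iterate can never leave the valid range
// (the trailing clamp after the loop is then a no-op, kept for symmetry
// with NewtonSolver).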

void Analyzer::GenerateRscInfo(std::ostream& os) {
static constexpr float mflops = 1000000.0F;
static constexpr float gflops = 1000 * mflops;
@@ -688,8 +748,8 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {
hw_paras_step["GPU_t4"].other_time *
(total_flops - conv_flops - conv_act_flops - matmul_flops);
// knl_latency_temp *= static_cast<float>(adaptive_bsz_);
cur_qps = opts_.batch_size * ms2s * 4 /
(init_latency + knl_latency_temp * opts_.batch_size);
cur_qps = float(opts_.batch_size) * ms2s * 4 /
(init_latency + knl_latency_temp * float(opts_.batch_size));
// os << "step:" << step << " ,cur_qps:" << cur_qps << "\n";
// os << "hw_paras_step[GPU_t4].conv_time" <<
// hw_paras_step["GPU_t4"].conv_time
@@ -742,11 +802,116 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {

float est_latency = init_latency + knl_latency;
const float t4 = t4_flops / 100;
const double u_sec = 1e+6;
const int iteration = 10;
const float error_rate = 0.001;
const float max_percent = 100;
const float fixed_latency = 0;
if (opts_.model_type == 1) {
// const std::array<double, 10> model{64073.283167584894, -88.91731411,
// 12.78189374, 26.05789414,
// 8533.30914793, -2900.88985761};
// const std::array<double, 4> func{
// model[0] +
// model[2] * float(opts_.batch_size) * float(opts_.batch_size) +
// model[4] * float(opts_.batch_size) -
// u_sec * float(opts_.batch_size) / opts_.qps,
// model[1] * float(opts_.batch_size) + model[5], model[3]};
const int resnet_max_batch = 64;
if (opts_.batch_size > resnet_max_batch) {
opts_.batch_size = resnet_max_batch;
}
const std::array<double, 9> model{
3178.584323243631, 1.04834476e+03, -1.79959003e-01,
4.22275381e+05, 4.01474040e+04, 1.03502481e+03,
-3.83377305e+06, 6.07300852e+05, -1.00743898e+04};
const std::array<double, 3> func{
model[0] + model[1] * float(opts_.batch_size) +
model[2] * float(opts_.batch_size) * float(opts_.batch_size) -
fixed_latency - u_sec * float(opts_.batch_size) / opts_.qps,
model[3] + model[4] * float(opts_.batch_size) +
model[5] * float(opts_.batch_size) * float(opts_.batch_size),
model[6] + model[7] * float(opts_.batch_size) +
model[8] * float(opts_.batch_size) * float(opts_.batch_size)};
float per = NewtonSolverV2(
func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: resnet50"
<< "\n";
os << "est latency: "
<< func[0] + func[1] / per + func[2] / (per * per) + fixed_latency +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 2) {
const std::array<double, 3> func{
15172.983994027589 - u_sec * float(opts_.batch_size) / opts_.qps -
fixed_latency,
632927.68536315, -1369248.3065591};
float per = NewtonSolverV2(
func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: dbnet"
<< "\n";
os << "est latency: "
<< func[0] + func[1] / per + func[2] / (per * per) + fixed_latency +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 3) {
const std::array<double, 4> func{
31525.584310580438 - u_sec * float(opts_.batch_size) / opts_.qps -
fixed_latency,
-475.78524037, 2.58107976, 0.0};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: crnn"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per + fixed_latency +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 4) {
const int bert_max_batch = 64;
if (opts_.batch_size > bert_max_batch) {
opts_.batch_size = bert_max_batch;
}
const std::array<double, 9> model{
393.0349355327198, 8.03122552e+03, 2.68919145e+01,
-2.54569898e+05, 4.60746406e+05, 1.46693913e+02,
2.42762903e+06, 1.65355925e+06, -5.13532294e+03};
const std::array<double, 3> func{
model[0] + model[1] * float(opts_.batch_size) +
model[2] * float(opts_.batch_size) * float(opts_.batch_size) -
fixed_latency - u_sec * float(opts_.batch_size) / opts_.qps,
model[3] + model[4] * float(opts_.batch_size) +
model[5] * float(opts_.batch_size) * float(opts_.batch_size),
model[6] + model[7] * float(opts_.batch_size) +
model[8] * float(opts_.batch_size) * float(opts_.batch_size)};
float per = NewtonSolverV2(
func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: bert"
<< "\n";
os << "est latency: "
<< func[0] + func[1] / per + func[2] / (per * per) + fixed_latency +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else {
os << "Model: other"
<< "\n";
}
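
// How the branches above fit together: resnet50, dbnet and bert each
// carry a fitted latency surface of the form
//   lat(b, p) = c0(b) + c1(b)/p + c2(b)/p^2,
//   ci(b) = model[3*i] + model[3*i+1]*b + model[3*i+2]*b^2,
// where b is the batch size and p is the T4 share in percent (dbnet's
// coefficients are batch-independent, and crnn uses NewtonSolver's
// polynomial-in-p form instead). Folding the latency budget
// u_sec * b / qps (microseconds per batch at the requested QPS) into the
// constant term turns "meet the QPS target" into root finding: the
// solver returns the share p at which the estimated latency meets the
// budget, and floatsrate = p * t4_flops / 100 converts that share into
// the absolute FLOPs figure used in the resource request emitted below.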

os << "Device: GPU T4"
<< "\n";
os << "batch size: " << adaptive_bsz_ << "\n";
os << "est FLOPs: " << floatsrate << " gFlops\n";
os << "est split: " << floatsrate / t4 << "% T4\n";
os << "model FLOPs: " << floatsrate / t4 << "% T4\n";
os << "est latency: " << est_latency << " ms\n";
os << "est mem: " << trt_mem << " MB\n";
/*-----Generated T4 parameters-----------------*/
@@ -768,7 +933,7 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {
rsc_req_.append("\"size\":1,");
rsc_req_.append("\"flops\":\"");
// std::string s = std::to_string(total_flops * gflops);
std::string s = std::to_string(ceil(floatsrate));
std::string s = std::to_string(static_cast<int>(ceil(floatsrate)));
rsc_req_.append(s.substr(0, s.find('.')));
rsc_req_.append("\",");
rsc_req_.append("\"precision\":\"Fp32\",");
15 changes: 13 additions & 2 deletions python/halo/halo.py
@@ -118,6 +118,7 @@ class ModelInfo(Structure):
c_void_p, # cg_opts
c_char_p, # filename
c_void_p, # model_info
c_int, # model_type
]


@@ -192,7 +193,7 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
return [output_file, output_bin]


def AnalyzeModel(model_file, input_shapes, batch, format, model_info):
def AnalyzeModel(model_file, input_shapes, batch, format, model_info, model_type=""):
output_file = ""
odla_lib = cast(create_string_buffer(b""), c_char_p)
opts = CXXCodeGenOpts()
@@ -228,7 +229,16 @@ def AnalyzeModel(model_file, input_shapes, batch, format, model_info):

target = "cxx".encode("utf-8")
output_filename = output_file.encode("utf-8")

if("resnet50" in model_type):
model_type = 1
elif("dbnet" in model_type):
model_type = 2
elif("crnn" in model_type):
model_type = 3
elif("bert" in model_type):
model_type = 4
else:
model_type = 0
Analyze(
format_val,
model_num,
@@ -245,6 +255,7 @@ def AnalyzeModel(model_file, input_shapes, batch, format, model_info):
pointer(opts),
output_filename,
pointer(model_info),
model_type,
)
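
# Minimal usage sketch (the file name, shape syntax, and Format enum are
# assumptions for illustration): a model_type string containing one of the
# four known names selects its fitted cost model; anything else falls back
# to the generic analyzer path (model_type = 0).
#
#   info = ModelInfo()
#   AnalyzeModel("resnet50_v1.onnx", ["input:1x3x224x224"], batch=8,
#                format=Format.ONNX, model_info=info, model_type="resnet50")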

def CompileODLAModel(files, device, debug=False):
7 changes: 5 additions & 2 deletions python/halo/inference.py
@@ -35,6 +35,7 @@ def __init__(
qps,
debug,
log_level,
model_type=""
):
self.debug = debug
logging.getLogger("halo").setLevel(log_level)
@@ -56,6 +57,7 @@ def __init__(
self.device = device
self.batch = batch
self.qps = qps
self.model_type = model_type
self.model = None
self.so_file = None

@@ -64,15 +66,16 @@ def __del__(self):

def Initialize(self):
self.logger.info(f"Begin initialization;{self.model_file}")
self.so_file = "libvodla.so"
self.so_file = "/usr/local/lib/libvodla.so"
self.model = odla.ODLAModel(self.so_file)
self.model.Load(
self.model_file,
self.input_shapes,
self.output_names,
self.format,
self.batch,
self.qps)
self.qps,
self.model_type)
self.logger.info("Done initialization")
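
# Sketch of how the new argument threads through the stack (class name and
# elided constructor arguments are assumptions here):
#   inf = Inference(..., qps=100, debug=False, log_level=logging.INFO,
#                   model_type="bert")
#   inf.Initialize()  # ODLAModel.Load forwards model_type on to
#                     # halo.AnalyzeModel whenever qps > 0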

def Run(self, data):
4 changes: 2 additions & 2 deletions python/halo/odla.py
@@ -52,7 +52,7 @@ def __del__(self):
self.h.odla_DestroyComputation(self.comp)
self.h.odla_DestroyDevice(self.device)

def Load(self,model,input_shapes,output_names,format,batch,qps):
def Load(self, model, input_shapes, output_names, format, batch, qps, model_type=""):
if self.h is None:
self.h = CDLL(self.so_file)
self.comp = c_void_p(0)
@@ -62,7 +62,7 @@ def Load(self,model,input_shapes,output_names,format,batch,qps):
model_info.adaptive_bsz = batch
rsc_est = c_void_p(0)
if qps>0:
halo.AnalyzeModel(model,input_shapes,batch,format,model_info)
halo.AnalyzeModel(model, input_shapes, batch, format, model_info, model_type)
rsc_est = (c_char_p)(model_info.output_rsc_est)
self.h.odla_AllocateDevice(c_void_p(0), 0, pointer(self.device), rsc_est)
self.files = halo.CompileModel(