improve costmodel on 4 models #826

Open · wants to merge 2 commits into base: cmb_vodla_demo_0.2.1
4 changes: 3 additions & 1 deletion include/halo/halo.h
@@ -54,6 +54,7 @@ struct AnalyzerOpts {
bool print_details = false;
int batch_size = 1;
int qps = 0; // images per second
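// Selects a fitted per-model cost model; values mirror the mapping in
// python/halo/halo.py: 0 = generic, 1 = resnet50, 2 = dbnet, 3 = crnn,
// 4 = bert.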
int model_type = 0;
};

struct CXXCodeGenOpts {
@@ -150,7 +151,8 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
const char* const input_shapes[], unsigned num_inputs,
const char* const inputs[], unsigned num_outputs,
const char* const outputs[], const HaloCodeGenOpts* cg_opts,
const char* main_output_file, HaloModelInfo* model_info);
const char* main_output_file, HaloModelInfo* model_info,
const int model_type);
}

#endif // HALO_HALO_H_
32 changes: 29 additions & 3 deletions lib/interface/interface.cc
@@ -53,7 +53,8 @@ static int InvokeCompiler(Module* m, const std::string& target, int batch,
ModelFormat model_format,
const CXXCodeGenOpts& cg_opts,
const std::string& main_output_file_name,
ModelInfo* model_info, bool is_compile_model = true) {
ModelInfo* model_info, bool is_compile_model = true,
const int model_type = 0) {
auto& ctx = m->GetGlobalContext();
ctx.SetVerbosity(1);
ctx.SetBasePath(GetBaseDir());
@@ -107,6 +108,7 @@ static int InvokeCompiler(Module* m, const std::string& target, int batch,
alz_opts.batch_size = model_info->adaptive_bsz;
alz_opts.print_details = false;
alz_opts.qps = model_info->input_qps;
alz_opts.model_type = model_type;
Analyzer* analyzer =
static_cast<Analyzer*>(pm.AddAnalyzerPass(&std::cout, alz_opts));
pm.Run(m);
@@ -149,6 +151,29 @@ int Compile(ModelFormat format, const std::vector<const void*>& model_defs,
format, cg_opts, main_output_file_name, model_info);
}

HL_API_EXPORT
int Compile(halo::ModelFormat format, const std::vector<const char*>& models,
const std::vector<size_t>& model_sizes, const std::string& target,
int batch, const std::vector<std::string>& input_shapes,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const CXXCodeGenOpts& cg_opts,
const std::string& main_output_file_name, ModelInfo* model_info,
bool is_compile_model, const int model_type) {
GlobalContext ctx;
Function* func;
std::unique_ptr<Module> m;
std::tie(m, func) = CreateModule(&ctx, target);
if (auto status = Parser::Parse(func, models, model_sizes, format);
status != Status::SUCCESS) {
return 1;
}

return InvokeCompiler(m.get(), target, batch, input_shapes, inputs, outputs,
format, cg_opts, main_output_file_name, model_info,
is_compile_model, model_type);
}

HL_API_EXPORT
int Compile(halo::ModelFormat format, const std::vector<const char*>& models,
const std::vector<size_t>& model_sizes, const std::string& target,
@@ -250,7 +275,8 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
const char* const input_shapes[], unsigned num_inputs,
const char* const inputs[], unsigned num_outputs,
const char* const outputs[], const HaloCodeGenOpts* cg_opts,
const char* main_output_file, HaloModelInfo* model_info) {
const char* main_output_file, HaloModelInfo* model_info,
const int model_type) {
const halo::CXXCodeGenOpts& opts =
*reinterpret_cast<const halo::CXXCodeGenOpts*>(cg_opts);
std::vector<const char*> models_data(num_models);
@@ -263,5 +289,5 @@ int halo_Analyze(halo::ModelFormat model_format, unsigned num_models,
model_format, models_data, models_sizes, std::string(target), batch,
ToStrings(num_input_shapes, input_shapes), ToStrings(num_inputs, inputs),
ToStrings(num_outputs, outputs), opts, std::string(main_output_file),
model_info, false);
model_info, false, model_type);
}
171 changes: 168 additions & 3 deletions lib/transforms/analyzer.cc
@@ -565,6 +565,66 @@ bool Analyzer::RunOnModule(Module* m) {
// return l;
// }

// Newton's-method solver for a cubic y = a*x^3 + b*x^2 + c*x + d;
// `func` holds the coefficients lowest degree first: {d, c, b, a}.
static float NewtonSolver(const std::array<double, 4> func, int iteration,
float error) {
const std::array<double, 3> func_de{func[1], func[2] * 2, func[3] * 3};
const float init = 5;
const float max_per = 100;
const float min_per = 5;
float per = init;

for (int i = 0; i < iteration; i++) {
if (fabs(func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per) < error) {
break;
}
per = per - (func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per) /
(func_de[0] + func_de[1] * per + func_de[2] * per * per);
}
if (per > max_per) {
per = max_per;
} else if (per < min_per) {
per = min_per;
}

return per;
}
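
// For reference: the loop above is the plain Newton iteration
//   x_{k+1} = x_k - f(x_k) / f'(x_k),   f(x) = d + c*x + b*x^2 + a*x^3,
// started at x_0 = 5, with the root clamped afterwards to the valid
// GPU-share range [5, 100] percent. A hand check with made-up
// coefficients, f(x) = x^3 - 8:
//   NewtonSolver({-8.0, 0.0, 0.0, 1.0}, 10, 1e-3F)
// converges to the real root x = 2 within the iteration budget, and the
// final clamp then raises it to the 5-percent floor.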

// Newton's-method solver for y = d + c/x + b/x^2; `func` holds {d, c, b}.
static float NewtonSolverV2(const std::array<double, 3> func, int iteration,
float error) {
const std::array<double, 2> func_de{
func[1] * (-1),
func[2] * (-2),
};
const float init = 50;
const float max_per = 100;
const float min_per = 5;
float per = init;

for (int i = 0; i < iteration; i++) {
if (fabs(func[0] + func[1] / per + func[2] / (per * per)) < error) {
break;
}
per = per - (func[0] + func[1] / per + func[2] / (per * per)) /
(func_de[0] / (per * per) + func_de[1] / (per * per * per));
if (per > max_per) {
per = max_per;
} else if (per < min_per) {
per = min_per;
}
}
if (per > max_per) {
per = max_per;
} else if (per < min_per) {
per = min_per;
}

return per;
}
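
// NewtonSolverV2 applies the same update to f(x) = d + c/x + b/x^2
// (func = {d, c, b}), whose derivative f'(x) = -c/x^2 - 2*b/x^3 is what
// the func_de terms evaluate. Clamping inside the loop as well makes this
// a projected iteration: each step is pulled back into [5, 100] before
// the next residual test, so the iterate can never leave the valid range
// (the trailing clamp after the loop is then a no-op, kept for symmetry
// with NewtonSolver).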

void Analyzer::GenerateRscInfo(std::ostream& os) {
static constexpr float mflops = 1000000.0F;
static constexpr float gflops = 1000 * mflops;
@@ -688,8 +748,8 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {
hw_paras_step["GPU_t4"].other_time *
(total_flops - conv_flops - conv_act_flops - matmul_flops);
// knl_latency_temp *= static_cast<float>(adaptive_bsz_);
cur_qps = opts_.batch_size * ms2s * 4 /
(init_latency + knl_latency_temp * opts_.batch_size);
cur_qps = float(opts_.batch_size) * ms2s * 4 /
(init_latency + knl_latency_temp * float(opts_.batch_size));
// os << "step:" << step << " ,cur_qps:" << cur_qps << "\n";
// os << "hw_paras_step[GPU_t4].conv_time" <<
// hw_paras_step["GPU_t4"].conv_time
@@ -742,11 +802,116 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {

float est_latency = init_latency + knl_latency;
const float t4 = t4_flops / 100;
const double u_sec = 1e+6;
const int iteration = 10;
const float error_rate = 0.001;
const float max_percent = 100;
const float fixed_latency = 0;
if (opts_.model_type == 1) {
// const std::array<double, 10> model{64073.283167584894, -88.91731411,
// 12.78189374, 26.05789414,
// 8533.30914793, -2900.88985761};
// const std::array<double, 4> func{
// model[0] +
// model[2] * float(opts_.batch_size) * float(opts_.batch_size) +
// model[4] * float(opts_.batch_size) -
// u_sec * float(opts_.batch_size) / opts_.qps,
// model[1] * float(opts_.batch_size) + model[5], model[3]};
const int resnet_max_batch = 64;
if (opts_.batch_size > resnet_max_batch) {
opts_.batch_size = resnet_max_batch;
}
const std::array<double, 9> model{
3178.584323243631, 1.04834476e+03, -1.79959003e-01,
4.22275381e+05, 4.01474040e+04, 1.03502481e+03,
-3.83377305e+06, 6.07300852e+05, -1.00743898e+04};
const std::array<double, 3> func{
model[0] + model[1] * float(opts_.batch_size) +
model[2] * float(opts_.batch_size) * float(opts_.batch_size) -
fixed_latency - u_sec * float(opts_.batch_size) / opts_.qps,
model[3] + model[4] * float(opts_.batch_size) +
model[5] * float(opts_.batch_size) * float(opts_.batch_size),
model[6] + model[7] * float(opts_.batch_size) +
model[8] * float(opts_.batch_size) * float(opts_.batch_size)};
float per = NewtonSolverV2(
func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: resnet50"
<< "\n";
os << "est latency: "
<< func[0] + func[1] / per + func[2] / (per * per) + fixed_latency +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 2) {
const std::array<double, 3> func{
15172.983994027589 - u_sec * float(opts_.batch_size) / opts_.qps -
fixed_latency,
632927.68536315, -1369248.3065591};
float per = NewtonSolverV2(
func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: dbnet"
<< "\n";
os << "est latency: "
<< func[0] + func[1] / per + func[2] / (per * per) + fixed_latency +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 3) {
const std::array<double, 4> func{
31525.584310580438 - u_sec * float(opts_.batch_size) / opts_.qps -
fixed_latency,
-475.78524037, 2.58107976, 0.0};
float per =
NewtonSolver(func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: crnn"
<< "\n";
os << "est latency: "
<< func[0] + func[1] * per + func[2] * per * per +
func[3] * per * per * per + fixed_latency +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else if (opts_.model_type == 4) {
const int bert_max_batch = 64;
if (opts_.batch_size > bert_max_batch) {
opts_.batch_size = bert_max_batch;
}
const std::array<double, 9> model{
393.0349355327198, 8.03122552e+03, 2.68919145e+01,
-2.54569898e+05, 4.60746406e+05, 1.46693913e+02,
2.42762903e+06, 1.65355925e+06, -5.13532294e+03};
const std::array<double, 3> func{
model[0] + model[1] * float(opts_.batch_size) +
model[2] * float(opts_.batch_size) * float(opts_.batch_size) -
fixed_latency - u_sec * float(opts_.batch_size) / opts_.qps,
model[3] + model[4] * float(opts_.batch_size) +
model[5] * float(opts_.batch_size) * float(opts_.batch_size),
model[6] + model[7] * float(opts_.batch_size) +
model[8] * float(opts_.batch_size) * float(opts_.batch_size)};
float per = NewtonSolverV2(
func, iteration,
error_rate * u_sec * float(opts_.batch_size) / opts_.qps);
floatsrate = per * t4_flops / max_percent;
os << "Model: bert"
<< "\n";
os << "est latency: "
<< func[0] + func[1] / per + func[2] / (per * per) + fixed_latency +
u_sec * float(opts_.batch_size) / opts_.qps
<< "\n";
} else {
os << "Model: other"
<< "\n";
}
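
// How the branches above fit together: resnet50, dbnet and bert each
// carry a fitted latency surface of the form
//   lat(b, p) = c0(b) + c1(b)/p + c2(b)/p^2,
//   ci(b) = model[3*i] + model[3*i+1]*b + model[3*i+2]*b^2,
// where b is the batch size and p is the T4 share in percent (dbnet's
// coefficients are batch-independent, and crnn uses NewtonSolver's
// polynomial-in-p form instead). Folding the latency budget
// u_sec * b / qps (microseconds per batch at the requested QPS) into the
// constant term turns "meet the QPS target" into root finding: the
// solver returns the share p at which the estimated latency meets the
// budget, and floatsrate = p * t4_flops / 100 converts that share into
// the absolute FLOPs figure used in the resource request emitted below.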

os << "Device: GPU T4"
<< "\n";
os << "batch size: " << adaptive_bsz_ << "\n";
os << "est FLOPs: " << floatsrate << " gFlops\n";
os << "est split: " << floatsrate / t4 << "% T4\n";
os << "model FLOPs: " << floatsrate / t4 << "% T4\n";
os << "est latency: " << est_latency << " ms\n";
os << "est mem: " << trt_mem << " MB\n";
/*-----Generated T4 parameters-----------------*/
@@ -768,7 +933,7 @@ void Analyzer::GenerateRscInfo(std::ostream& os) {
rsc_req_.append("\"size\":1,");
rsc_req_.append("\"flops\":\"");
// std::string s = std::to_string(total_flops * gflops);
std::string s = std::to_string(ceil(floatsrate));
std::string s = std::to_string(static_cast<int>(ceil(floatsrate)));
rsc_req_.append(s.substr(0, s.find('.')));
rsc_req_.append("\",");
rsc_req_.append("\"precision\":\"Fp32\",");
15 changes: 13 additions & 2 deletions python/halo/halo.py
@@ -118,6 +118,7 @@ class ModelInfo(Structure):
c_void_p, # cg_opts
c_char_p, # filename
c_void_p, # model_info
c_int, # model_type
]


@@ -192,7 +193,7 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
return [output_file, output_bin]


def AnalyzeModel(model_file, input_shapes, batch, format, model_info):
def AnalyzeModel(model_file, input_shapes, batch, format, model_info, model_type=""):
output_file = ""
odla_lib = cast(create_string_buffer(b""), c_char_p)
opts = CXXCodeGenOpts()
@@ -228,7 +229,16 @@ def AnalyzeModel(model_file, input_shapes, batch, format, model_info):

target = "cxx".encode("utf-8")
output_filename = output_file.encode("utf-8")

if("resnet50" in model_type):
model_type = 1
elif("dbnet" in model_type):
model_type = 2
elif("crnn" in model_type):
model_type = 3
elif("bert" in model_type):
model_type = 4
else:
model_type = 0
Analyze(
format_val,
model_num,
@@ -245,6 +255,7 @@ def AnalyzeModel(model_file, input_shapes, batch, format, model_info):
pointer(opts),
output_filename,
pointer(model_info),
model_type,
)
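
# Minimal usage sketch (the file name, shape syntax, and Format enum are
# assumptions for illustration): a model_type string containing one of the
# four known names selects its fitted cost model; anything else falls back
# to the generic analyzer path (model_type = 0).
#
#   info = ModelInfo()
#   AnalyzeModel("resnet50_v1.onnx", ["input:1x3x224x224"], batch=8,
#                format=Format.ONNX, model_info=info, model_type="resnet50")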

def CompileODLAModel(files, device, debug=False):
7 changes: 5 additions & 2 deletions python/halo/inference.py
@@ -35,6 +35,7 @@ def __init__(
qps,
debug,
log_level,
model_type=""
):
self.debug = debug
logging.getLogger("halo").setLevel(log_level)
@@ -56,6 +57,7 @@ def __init__(
self.device = device
self.batch = batch
self.qps = qps
self.model_type = model_type
self.model = None
self.so_file = None

@@ -64,15 +66,16 @@ def __del__(self):

def Initialize(self):
self.logger.info(f"Begin initialization;{self.model_file}")
self.so_file = "libvodla.so"
self.so_file = "/usr/local/lib/libvodla.so"
self.model = odla.ODLAModel(self.so_file)
self.model.Load(
self.model_file,
self.input_shapes,
self.output_names,
self.format,
self.batch,
self.qps)
self.qps,
self.model_type)
self.logger.info("Done initialization")
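
# Sketch of how the new argument threads through the stack (class name and
# elided constructor arguments are assumptions here):
#   inf = Inference(..., qps=100, debug=False, log_level=logging.INFO,
#                   model_type="bert")
#   inf.Initialize()  # ODLAModel.Load forwards model_type on to
#                     # halo.AnalyzeModel whenever qps > 0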

def Run(self, data):
4 changes: 2 additions & 2 deletions python/halo/odla.py
@@ -52,7 +52,7 @@ def __del__(self):
self.h.odla_DestroyComputation(self.comp)
self.h.odla_DestroyDevice(self.device)

def Load(self,model,input_shapes,output_names,format,batch,qps):
def Load(self, model, input_shapes, output_names, format, batch, qps, model_type=""):
if self.h is None:
self.h = CDLL(self.so_file)
self.comp = c_void_p(0)
@@ -62,7 +62,7 @@ def Load(self,model,input_shapes,output_names,format,batch,qps):
model_info.adaptive_bsz = batch
rsc_est = c_void_p(0)
if qps>0:
halo.AnalyzeModel(model,input_shapes,batch,format,model_info)
halo.AnalyzeModel(model, input_shapes, batch, format, model_info, model_type)
rsc_est = (c_char_p)(model_info.output_rsc_est)
self.h.odla_AllocateDevice(c_void_p(0), 0, pointer(self.device), rsc_est)
self.files = halo.CompileModel(