From 3ca4c75748eed1d33089158e6092c6000b477c7a Mon Sep 17 00:00:00 2001
From: chunyuan-w <chunyuan.wu@intel.com>
Date: Fri, 11 Dec 2020 11:51:59 +0800
Subject: [PATCH 1/2] optimize nms in IPEX

---
 .../single_stage_detector/pytorch/infer.py    |  4 ++--
 .../single_stage_detector/pytorch/utils.py    | 24 ++++++++++++++++++-
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/others/cloud/single_stage_detector/pytorch/infer.py b/others/cloud/single_stage_detector/pytorch/infer.py
index 1bb622457a..ac1ca69c29 100644
--- a/others/cloud/single_stage_detector/pytorch/infer.py
+++ b/others/cloud/single_stage_detector/pytorch/infer.py
@@ -198,7 +198,7 @@ def coco_eval(model, val_dataloader, cocoGt, encoder, inv_map, args):
                                 inference_time.update(time.time() - start_time)
                                 end_time = time.time()
                             try:
-                                results = encoder.decode_batch(ploc.to('cpu'), plabel.to('cpu'), 0.50, 200,device=device)
+                                results = encoder.decode_batch(ploc, plabel, 0.50, 200,device=device)
                             except:
                                 print("No object detected in idx: {}".format(idx))
                                 continue
@@ -255,7 +255,7 @@ def coco_eval(model, val_dataloader, cocoGt, encoder, inv_map, args):
                         inference_time.update(time.time() - start_time)
                         end_time = time.time()
                     try:
-                        results = encoder.decode_batch(ploc.to('cpu'), plabel.to('cpu'), 0.50, 200,device=device)
+                        results = encoder.decode_batch(ploc, plabel, 0.50, 200,device=device)
                     except:
                         print("No object detected in idx: {}".format(idx))
                         continue
diff --git a/others/cloud/single_stage_detector/pytorch/utils.py b/others/cloud/single_stage_detector/pytorch/utils.py
index 83b9d5bf55..5993e61f37 100644
--- a/others/cloud/single_stage_detector/pytorch/utils.py
+++ b/others/cloud/single_stage_detector/pytorch/utils.py
@@ -17,6 +17,8 @@
 import pickle
 from math import sqrt, ceil
 
+from intel_pytorch_extension import batch_score_nms
+
 # This function is from https://github.com/kuangliu/pytorch-ssd.
 def calc_iou_tensor(box1, box2):
     """ Calculation of IoU based on two boxes tensor,
@@ -122,6 +124,9 @@ def scale_back_batch(self, bboxes_in, scores_in,device):
         if bboxes_in.device == torch.device("cpu"):
             self.dboxes = self.dboxes.cpu()
             self.dboxes_xywh = self.dboxes_xywh.cpu()
+        elif bboxes_in.device == torch.device("dpcpp"):
+            self.dboxes = self.dboxes.to("dpcpp")
+            self.dboxes_xywh = self.dboxes_xywh.to("dpcpp")
         else:
             self.dboxes = self.dboxes.cuda(device)
             self.dboxes_xywh = self.dboxes_xywh.cuda(device)
@@ -155,10 +160,27 @@ def decode_batch(self, bboxes_in, scores_in,  criteria = 0.45, max_output=200,de
         for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
             bbox = bbox.squeeze(0)
             prob = prob.squeeze(0)
-            output.append(self.decode_single(bbox, prob, criteria, max_output))
+            if bbox.device == torch.device("dpcpp"):
+                output.append(self.decode_single_dpcpp(bbox, prob, criteria, max_output))
+            else:
+                output.append(self.decode_single(bbox, prob, criteria, max_output))
             #print(output[-1])
         return output
 
+    # perform non-maximum suppression for dpcpp tensor
+    def decode_single_dpcpp(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
+        # Reference to https://github.com/amdegroot/ssd.pytorch
+
+        bboxes_out = []
+        scores_out = []
+        labels_out = []
+        # The empty lists above are placeholders: the call below rebinds all three names. max_num is unused here.
+        bboxes_out, labels_out, scores_out = batch_score_nms(bboxes_in, scores_in, criteria)
+        # Keep at most max_output entries, taken from the tail of the sort order (ascending if scores_out is a torch tensor - TODO confirm).
+        _, max_ids = scores_out.sort(dim=0)
+        max_ids = max_ids[-max_output:]
+        return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
+
     # perform non-maximum suppression
     def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
         # Reference to https://github.com/amdegroot/ssd.pytorch

From 8fcccda9d1d0650e67d6620a4114ca1eff949808 Mon Sep 17 00:00:00 2001
From: chunyuan-w <chunyuan.wu@intel.com>
Date: Fri, 29 Jan 2021 15:45:04 +0800
Subject: [PATCH 2/2] update device name to ipex.DEVICE

---
 .../single_stage_detector/pytorch/utils.py      | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/others/cloud/single_stage_detector/pytorch/utils.py b/others/cloud/single_stage_detector/pytorch/utils.py
index 5993e61f37..5ac707a3c3 100644
--- a/others/cloud/single_stage_detector/pytorch/utils.py
+++ b/others/cloud/single_stage_detector/pytorch/utils.py
@@ -17,6 +17,9 @@
 import pickle
 from math import sqrt, ceil
 
+# Bind ipex unconditionally: ipex.DEVICE is read on every decode_batch call, and the package is already a hard dependency (batch_score_nms import below).
+import intel_pytorch_extension as ipex
+
 from intel_pytorch_extension import batch_score_nms
 
 # This function is from https://github.com/kuangliu/pytorch-ssd.
@@ -124,9 +127,9 @@ def scale_back_batch(self, bboxes_in, scores_in,device):
         if bboxes_in.device == torch.device("cpu"):
             self.dboxes = self.dboxes.cpu()
             self.dboxes_xywh = self.dboxes_xywh.cpu()
-        elif bboxes_in.device == torch.device("dpcpp"):
-            self.dboxes = self.dboxes.to("dpcpp")
-            self.dboxes_xywh = self.dboxes_xywh.to("dpcpp")
+        elif bboxes_in.device == torch.device(ipex.DEVICE):
+            self.dboxes = self.dboxes.to(ipex.DEVICE)
+            self.dboxes_xywh = self.dboxes_xywh.to(ipex.DEVICE)
         else:
             self.dboxes = self.dboxes.cuda(device)
             self.dboxes_xywh = self.dboxes_xywh.cuda(device)
@@ -160,15 +163,15 @@ def decode_batch(self, bboxes_in, scores_in,  criteria = 0.45, max_output=200,de
         for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
             bbox = bbox.squeeze(0)
             prob = prob.squeeze(0)
-            if bbox.device == torch.device("dpcpp"):
-                output.append(self.decode_single_dpcpp(bbox, prob, criteria, max_output))
+            if bbox.device == torch.device(ipex.DEVICE):
+                output.append(self.decode_single_ipex(bbox, prob, criteria, max_output))
             else:
                 output.append(self.decode_single(bbox, prob, criteria, max_output))
             #print(output[-1])
         return output
 
-    # perform non-maximum suppression for dpcpp tensor
-    def decode_single_dpcpp(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
+    # perform non-maximum suppression for IPEX tensor
+    def decode_single_ipex(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
         # Reference to https://github.com/amdegroot/ssd.pytorch
 
         bboxes_out = []