diff --git a/others/cloud/single_stage_detector/pytorch/infer.py b/others/cloud/single_stage_detector/pytorch/infer.py
index 1bb622457a..ac1ca69c29 100644
--- a/others/cloud/single_stage_detector/pytorch/infer.py
+++ b/others/cloud/single_stage_detector/pytorch/infer.py
@@ -198,7 +198,7 @@ def coco_eval(model, val_dataloader, cocoGt, encoder, inv_map, args):
                                 inference_time.update(time.time() - start_time)
                                 end_time = time.time()
                             try:
-                                results = encoder.decode_batch(ploc.to('cpu'), plabel.to('cpu'), 0.50, 200,device=device)
+                                results = encoder.decode_batch(ploc, plabel, 0.50, 200,device=device)
                             except:
                                 print("No object detected in idx: {}".format(idx))
                                 continue
@@ -255,7 +255,7 @@ def coco_eval(model, val_dataloader, cocoGt, encoder, inv_map, args):
                         inference_time.update(time.time() - start_time)
                         end_time = time.time()
                     try:
-                        results = encoder.decode_batch(ploc.to('cpu'), plabel.to('cpu'), 0.50, 200,device=device)
+                        results = encoder.decode_batch(ploc, plabel, 0.50, 200,device=device)
                     except:
                         print("No object detected in idx: {}".format(idx))
                         continue
diff --git a/others/cloud/single_stage_detector/pytorch/utils.py b/others/cloud/single_stage_detector/pytorch/utils.py
index 83b9d5bf55..5ac707a3c3 100644
--- a/others/cloud/single_stage_detector/pytorch/utils.py
+++ b/others/cloud/single_stage_detector/pytorch/utils.py
@@ -17,6 +17,11 @@
 import pickle
 from math import sqrt, ceil
 
+# `ipex.DEVICE` is read unconditionally further down, so `ipex` must always be
+# bound; the package is already imported by the line below regardless of USE_IPEX.
+import intel_pytorch_extension as ipex
+from intel_pytorch_extension import batch_score_nms
+
 # This function is from https://github.com/kuangliu/pytorch-ssd.
 def calc_iou_tensor(box1, box2):
     """ Calculation of IoU based on two boxes tensor,
@@ -122,6 +127,9 @@ def scale_back_batch(self, bboxes_in, scores_in,device):
         if bboxes_in.device == torch.device("cpu"):
             self.dboxes = self.dboxes.cpu()
             self.dboxes_xywh = self.dboxes_xywh.cpu()
+        elif bboxes_in.device == torch.device(ipex.DEVICE):
+            self.dboxes = self.dboxes.to(ipex.DEVICE)
+            self.dboxes_xywh = self.dboxes_xywh.to(ipex.DEVICE)
         else:
             self.dboxes = self.dboxes.cuda(device)
             self.dboxes_xywh = self.dboxes_xywh.cuda(device)
@@ -155,10 +163,27 @@ def decode_batch(self, bboxes_in, scores_in,  criteria = 0.45, max_output=200,de
         for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
             bbox = bbox.squeeze(0)
             prob = prob.squeeze(0)
-            output.append(self.decode_single(bbox, prob, criteria, max_output))
+            if bbox.device == torch.device(ipex.DEVICE):
+                output.append(self.decode_single_ipex(bbox, prob, criteria, max_output))
+            else:
+                output.append(self.decode_single(bbox, prob, criteria, max_output))
             #print(output[-1])
         return output
 
+    # perform non-maximum suppression for IPEX tensor
+    def decode_single_ipex(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
+        """Run NMS via ``batch_score_nms`` and keep the top ``max_output`` detections.
+
+        Returns ``(bboxes, labels, scores)`` in ascending score order; ``max_num``
+        is unused and only mirrors the ``decode_single`` signature.
+        """
+        # Reference to https://github.com/amdegroot/ssd.pytorch
+        bboxes_out, labels_out, scores_out = batch_score_nms(bboxes_in, scores_in, criteria)
+        # sort() is ascending, so the best-scoring entries are the trailing ones.
+        _, max_ids = scores_out.sort(dim=0)
+        max_ids = max_ids[-max_output:]
+        return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
+
     # perform non-maximum suppression
     def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
         # Reference to https://github.com/amdegroot/ssd.pytorch