Commit 25ee163

fix lint
Signed-off-by: depeng1994 <depengzhang@foxmail.com>
1 parent: cd4885d

5 files changed, 24 insertions(+), 15 deletions(-)

docs/source/developer_guide/evaluation/index.md

Lines changed: 1 addition & 0 deletions

@@ -12,4 +12,5 @@ using_evalscope
 :caption: Performance
 :maxdepth: 1
 performance_benchmark
+profile_execute_duration
 :::

docs/source/developer_guide/profile_execute_duration_observation.md renamed to docs/source/developer_guide/evaluation/profile_execute_duration.md

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-# Profile Execute Duration Observation
+# Profile Execute Duration
 
 The execution duration of each stage (including pre/post-processing, model forward, etc.) usually needs to be captured during a complete inference process. Typically, this is done by using `torch.npu.synchronize()` and obtaining CPU timestamps, which increases the performance overhead of host/device synchronization.
 
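To make that overhead concrete, here is a minimal sketch of the synchronous pattern the paragraph describes, mirroring the test below; it assumes a torch_npu environment where `.npu()` tensors and `torch.npu.synchronize()` are available:

```python
import time

import torch
import torch_npu  # noqa: F401  (assumption: provides the torch.npu backend)

a = torch.randn(100, 100).npu()
b = torch.randn(100, 100).npu()

# Synchronous timing: block the host until the device finishes, then
# read a CPU timestamp. The synchronize() call is exactly the
# host/device overhead the doc refers to.
cpu_start = time.perf_counter()
torch.matmul(a, b)
torch.npu.synchronize()
cpu_duration_ms = (time.perf_counter() - cpu_start) * 1000
print(f"forward: {cpu_duration_ms:.2f}ms")
```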
tests/singlecard/test_profile_execute_duration.py

Lines changed: 14 additions & 10 deletions

@@ -17,9 +17,10 @@
 # limitations under the License.
 #
 import time
-import torch
 
+import torch
 import vllm  # noqa: F401
+
 import vllm_ascend.envs as envs
 from vllm_ascend.utils import ProfileExecuteDuration
 
@@ -31,24 +32,27 @@ def test_execue_duration_enabled_discrepancy():
     envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE = True
     cpu_start = time.perf_counter()
     with ProfileExecuteDuration().capture_async("forward"):
-        result = torch.matmul(a, b)
+        torch.matmul(a, b)
     torch.npu.synchronize()
-    cpu_duration = (time.perf_counter() - cpu_start) * 1000
-    npu_durations= ProfileExecuteDuration().pop_captured_sync()
+    cpu_duration = (time.perf_counter() - cpu_start) * 1000
+    npu_durations = ProfileExecuteDuration().pop_captured_sync()
     assert npu_durations and 'forward' in npu_durations
     assert not ProfileExecuteDuration._observations
-
+
     # Assert discrepancy between CPU and NPU duration is within 10% tolerance
-    diff = abs(cpu_duration - npu_durations['forward']) / max(cpu_duration, npu_durations['forward'])
-    assert diff <= 0.1, (f"CPU={cpu_duration:.2f}ms, NPU={npu_durations['forward']:.2f}ms")
+    diff = abs(cpu_duration - npu_durations['forward']) / max(
+        cpu_duration, npu_durations['forward'])
+    assert diff <= 0.1, (
+        f"CPU={cpu_duration:.2f}ms, NPU={npu_durations['forward']:.2f}ms")
+
 
 def test_execue_duration_disabled():
     a = torch.randn(100, 100).npu()
     b = torch.randn(100, 100).npu()
 
     envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE = False
     with ProfileExecuteDuration().capture_async("forward"):
-        result = torch.matmul(a, b)
+        torch.matmul(a, b)
     torch.npu.synchronize()
-    npu_durations= ProfileExecuteDuration().pop_captured_sync()
-    assert not npu_durations
+    npu_durations = ProfileExecuteDuration().pop_captured_sync()
+    assert not npu_durations
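The enabled-path test wraps the work in `capture_async`, defers the host sync to `pop_captured_sync()`, and then checks that the CPU and NPU measurements agree within 10%. A minimal sketch of the event-based timing this relies on, assuming `torch.npu.Event` mirrors the `torch.cuda.Event` API (as torch_npu generally does):

```python
import torch
import torch_npu  # noqa: F401  (assumption: provides the torch.npu backend)

a = torch.randn(100, 100).npu()
b = torch.randn(100, 100).npu()

# Event-based timing: record device-side events around the work and
# defer synchronization until the measurement is actually read.
observe_start = torch.npu.Event(enable_timing=True)
observe_end = torch.npu.Event(enable_timing=True)

observe_start.record()
torch.matmul(a, b)
observe_end.record()

# More work can be enqueued here without blocking the host.

observe_end.synchronize()  # one sync, only when the number is needed
print(f"forward: {observe_start.elapsed_time(observe_end):.2f}ms")
```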

vllm_ascend/utils.py

Lines changed: 2 additions & 2 deletions

@@ -216,7 +216,7 @@ def capture_async(self, duration_tag: str):
 
     def pop_captured_sync(self) -> dict:
         """Pop and synchronize all events in the observation list"""
-        durations = {}
+        durations: dict[str, float] = {}
         if not envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE:
             return durations
 
@@ -225,5 +225,5 @@ def pop_captured_sync(self) -> dict:
             tag, observe_start, observe_end = self._observations.pop()
             observe_end.synchronize()
             durations[tag] = observe_start.elapsed_time(observe_end)
-
+
         return durations
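Only `pop_captured_sync` is touched here; for orientation, the following is a plausible reconstruction of the surrounding class, stitched together from what the diff shows (the `capture_async` signature in the hunk header, the class-level `_observations` list the test asserts on, and the env gate). The event recording and the `while` loop are assumptions, not the repository's exact code:

```python
from contextlib import contextmanager

import torch

import vllm_ascend.envs as envs


class ProfileExecuteDuration:
    # Class-level list, matching the test's ProfileExecuteDuration._observations.
    _observations: list = []

    @contextmanager
    def capture_async(self, duration_tag: str):
        """Record device events around a block without blocking the host.

        Hypothetical reconstruction; the real implementation may differ.
        """
        if not envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE:
            yield
            return
        observe_start = torch.npu.Event(enable_timing=True)
        observe_start.record()
        try:
            yield
        finally:
            observe_end = torch.npu.Event(enable_timing=True)
            observe_end.record()
            self._observations.append(
                (duration_tag, observe_start, observe_end))

    def pop_captured_sync(self) -> dict:
        """Pop and synchronize all events in the observation list"""
        durations: dict[str, float] = {}
        if not envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE:
            return durations

        while self._observations:  # loop assumed; the hunk shows only its body
            tag, observe_start, observe_end = self._observations.pop()
            observe_end.synchronize()
            durations[tag] = observe_start.elapsed_time(observe_end)

        return durations
```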

vllm_ascend/worker/model_runner_v1.py

Lines changed: 6 additions & 2 deletions

@@ -1016,9 +1016,13 @@ def execute_model(
 
         durations = ProfileExecuteDuration().pop_captured_sync()
         if durations:
-            durations_str = [f"[{tag}]:{duration:.2f}ms" for tag, duration in durations.items()]
+            dr_str = [
+                f"[{tag}]:{duration:.2f}ms"
+                for tag, duration in durations.items()
+            ]
             captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill"
-            print(f"Profile execute duration [{captured_name}]:", " ".join(durations_str))
+            print(f"Profile execute duration [{captured_name}]:",
+                  " ".join(dr_str))
 
         return model_runner_output
 
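With `VLLM_MODEL_EXECUTE_TIME_OBSERVE` enabled, the printed line takes a form like the following (tag and timing illustrative; the doc above names pre/post-processing as other candidate stages):

Profile execute duration [Decode]: [forward]:1.23ms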
