Commit 25ee163

fix lint
Signed-off-by: depeng1994 <depengzhang@foxmail.com>
1 parent: cd4885d

5 files changed, 24 insertions(+), 15 deletions(-)

docs/source/developer_guide/evaluation/index.md

Lines changed: 1 addition & 0 deletions

@@ -12,4 +12,5 @@ using_evalscope
 :caption: Performance
 :maxdepth: 1
 performance_benchmark
+profile_execute_duration
 :::

docs/source/developer_guide/profile_execute_duration_observation.md renamed to docs/source/developer_guide/evaluation/profile_execute_duration.md

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-# Profile Execute Duration Observation
+# Profile Execute Duration
 
 The execution duration of each stage (including pre/post-processing, model forward, etc.) usually needs to be captured during a complete inference process. Typically, this is done by using `torch.npu.synchronize()` and obtaining CPU timestamps, which increases the performance overhead of host/device synchronization.
 
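To make that overhead concrete, here is a minimal sketch of the synchronous pattern the paragraph describes, mirroring the test below; it assumes a torch_npu environment where `.npu()` tensors and `torch.npu.synchronize()` are available:

```python
import time

import torch
import torch_npu  # noqa: F401  (assumption: provides the torch.npu backend)

a = torch.randn(100, 100).npu()
b = torch.randn(100, 100).npu()

# Synchronous timing: block the host until the device finishes, then
# read a CPU timestamp. The synchronize() call is exactly the
# host/device overhead the doc refers to.
cpu_start = time.perf_counter()
torch.matmul(a, b)
torch.npu.synchronize()
cpu_duration_ms = (time.perf_counter() - cpu_start) * 1000
print(f"forward: {cpu_duration_ms:.2f}ms")
```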
tests/singlecard/test_profile_execute_duration.py

Lines changed: 14 additions & 10 deletions

@@ -17,9 +17,10 @@
 # limitations under the License.
 #
 import time
-import torch
 
+import torch
 import vllm  # noqa: F401
+
 import vllm_ascend.envs as envs
 from vllm_ascend.utils import ProfileExecuteDuration
 
@@ -31,24 +32,27 @@ def test_execue_duration_enabled_discrepancy():
     envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE = True
     cpu_start = time.perf_counter()
     with ProfileExecuteDuration().capture_async("forward"):
-        result = torch.matmul(a, b)
+        torch.matmul(a, b)
     torch.npu.synchronize()
-    cpu_duration = (time.perf_counter() - cpu_start) * 1000
-    npu_durations= ProfileExecuteDuration().pop_captured_sync()
+    cpu_duration = (time.perf_counter() - cpu_start) * 1000
+    npu_durations = ProfileExecuteDuration().pop_captured_sync()
     assert npu_durations and 'forward' in npu_durations
     assert not ProfileExecuteDuration._observations
-
+
     # Assert discrepancy between CPU and NPU duration is within 10% tolerance
-    diff = abs(cpu_duration - npu_durations['forward']) / max(cpu_duration, npu_durations['forward'])
-    assert diff <= 0.1, (f"CPU={cpu_duration:.2f}ms, NPU={npu_durations['forward']:.2f}ms")
+    diff = abs(cpu_duration - npu_durations['forward']) / max(
+        cpu_duration, npu_durations['forward'])
+    assert diff <= 0.1, (
+        f"CPU={cpu_duration:.2f}ms, NPU={npu_durations['forward']:.2f}ms")
+
 
 def test_execue_duration_disabled():
     a = torch.randn(100, 100).npu()
     b = torch.randn(100, 100).npu()
 
     envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE = False
     with ProfileExecuteDuration().capture_async("forward"):
-        result = torch.matmul(a, b)
+        torch.matmul(a, b)
     torch.npu.synchronize()
-    npu_durations= ProfileExecuteDuration().pop_captured_sync()
-    assert not npu_durations
+    npu_durations = ProfileExecuteDuration().pop_captured_sync()
+    assert not npu_durations
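The enabled-path test wraps the work in `capture_async`, defers the host sync to `pop_captured_sync()`, and then checks that the CPU and NPU measurements agree within 10%. A minimal sketch of the event-based timing this relies on, assuming `torch.npu.Event` mirrors the `torch.cuda.Event` API (as torch_npu generally does):

```python
import torch
import torch_npu  # noqa: F401  (assumption: provides the torch.npu backend)

a = torch.randn(100, 100).npu()
b = torch.randn(100, 100).npu()

# Event-based timing: record device-side events around the work and
# defer synchronization until the measurement is actually read.
observe_start = torch.npu.Event(enable_timing=True)
observe_end = torch.npu.Event(enable_timing=True)

observe_start.record()
torch.matmul(a, b)
observe_end.record()

# More work can be enqueued here without blocking the host.

observe_end.synchronize()  # one sync, only when the number is needed
print(f"forward: {observe_start.elapsed_time(observe_end):.2f}ms")
```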

vllm_ascend/utils.py

Lines changed: 2 additions & 2 deletions

@@ -216,7 +216,7 @@ def capture_async(self, duration_tag: str):
 
     def pop_captured_sync(self) -> dict:
         """Pop and synchronize all events in the observation list"""
-        durations = {}
+        durations: dict[str, float] = {}
         if not envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE:
             return durations
 
@@ -225,5 +225,5 @@ def pop_captured_sync(self) -> dict:
             tag, observe_start, observe_end = self._observations.pop()
             observe_end.synchronize()
             durations[tag] = observe_start.elapsed_time(observe_end)
-
+
         return durations
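Only `pop_captured_sync` is touched here; for orientation, the following is a plausible reconstruction of the surrounding class, stitched together from what the diff shows (the `capture_async` signature in the hunk header, the class-level `_observations` list the test asserts on, and the env gate). The event recording and the `while` loop are assumptions, not the repository's exact code:

```python
from contextlib import contextmanager

import torch

import vllm_ascend.envs as envs


class ProfileExecuteDuration:
    # Class-level list, matching the test's ProfileExecuteDuration._observations.
    _observations: list = []

    @contextmanager
    def capture_async(self, duration_tag: str):
        """Record device events around a block without blocking the host.

        Hypothetical reconstruction; the real implementation may differ.
        """
        if not envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE:
            yield
            return
        observe_start = torch.npu.Event(enable_timing=True)
        observe_start.record()
        try:
            yield
        finally:
            observe_end = torch.npu.Event(enable_timing=True)
            observe_end.record()
            self._observations.append(
                (duration_tag, observe_start, observe_end))

    def pop_captured_sync(self) -> dict:
        """Pop and synchronize all events in the observation list"""
        durations: dict[str, float] = {}
        if not envs.VLLM_MODEL_EXECUTE_TIME_OBSERVE:
            return durations

        while self._observations:  # loop assumed; the hunk shows only its body
            tag, observe_start, observe_end = self._observations.pop()
            observe_end.synchronize()
            durations[tag] = observe_start.elapsed_time(observe_end)

        return durations
```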

vllm_ascend/worker/model_runner_v1.py

Lines changed: 6 additions & 2 deletions

@@ -1016,9 +1016,13 @@ def execute_model(
 
         durations = ProfileExecuteDuration().pop_captured_sync()
         if durations:
-            durations_str = [f"[{tag}]:{duration:.2f}ms" for tag, duration in durations.items()]
+            dr_str = [
+                f"[{tag}]:{duration:.2f}ms"
+                for tag, duration in durations.items()
+            ]
             captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill"
-            print(f"Profile execute duration [{captured_name}]:", " ".join(durations_str))
+            print(f"Profile execute duration [{captured_name}]:",
+                  " ".join(dr_str))
 
         return model_runner_output
 
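With `VLLM_MODEL_EXECUTE_TIME_OBSERVE` enabled, the printed line takes a form like the following (tag and timing illustrative; the doc above names pre/post-processing as other candidate stages):

Profile execute duration [Decode]: [forward]:1.23ms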
