
Commit ed6cfb9

[Hardware][Intel GPU] Upgrade to torch 2.7 (#17444)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Qiming Zhang <qiming1.zhang@intel.com>
1 parent: 6ed9f60

File tree: 5 files changed, +18 -35 lines


docker/Dockerfile.xpu

Lines changed: 0 additions & 6 deletions
@@ -40,12 +40,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     python3 setup.py install
 
-# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
-# FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install intel-extension-for-pytorch==2.6.10+xpu \
-    --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
 CMD ["/bin/bash"]
 
 FROM vllm-base AS vllm-openai

docs/source/getting_started/installation/gpu/xpu.inc.md

Lines changed: 0 additions & 9 deletions
@@ -35,13 +35,6 @@ pip install -v -r requirements/xpu.txt
 VLLM_TARGET_DEVICE=xpu python setup.py install
 ```
 
-- Finally, due to a known issue of conflict dependency(oneapi related) in torch-xpu 2.6 and ipex-xpu 2.6, we install ipex here. This will be fixed in the ipex-xpu 2.7.
-
-```console
-pip install intel-extension-for-pytorch==2.6.10+xpu \
-  --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-```
-
 :::{note}
 - FP16 is the default data type in the current XPU backend. The BF16 data
   type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
@@ -81,5 +74,3 @@ python -m vllm.entrypoints.openai.api_server \
 ```
 
 By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
-
-There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.

requirements/xpu.txt

Lines changed: 3 additions & 3 deletions
@@ -10,14 +10,14 @@ wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
 
-torch==2.6.0+xpu
+torch==2.7.0+xpu
 torchaudio
 torchvision
 pytorch-triton-xpu
 --extra-index-url=https://download.pytorch.org/whl/xpu
 
 # Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
 # FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
-# intel-extension-for-pytorch==2.6.10+xpu
-oneccl_bind_pt==2.6.0+xpu
+intel-extension-for-pytorch==2.7.10+xpu
+oneccl_bind_pt==2.7.0+xpu
 --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
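
After installing these pins (for example via pip install -v -r requirements/xpu.txt, as the XPU installation doc above describes), a quick sanity check of the upgraded stack could look like the sketch below; the expected version strings come from the pins in this file, and the check assumes an Intel GPU is visible to the runtime.

# Minimal sanity-check sketch, assuming the pins above are installed in the
# current environment and an Intel GPU is present.
import torch
import intel_extension_for_pytorch as ipex

print(torch.__version__)         # expected to report a 2.7.0+xpu build
print(ipex.__version__)          # expected to report 2.7.10+xpu
print(torch.xpu.is_available())  # True when an XPU device is usable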

vllm/_ipex_ops.py

Lines changed: 9 additions & 9 deletions
@@ -177,6 +177,7 @@ def varlen_attention(
         out: torch.Tensor,
         seqlen_q: torch.Tensor,
         seqlen_k: torch.Tensor,
+        alibi_slopes: torch.Tensor,
         max_seqlen_q: int,
         max_seqlen_k: int,
         pdropout: float,
@@ -185,6 +186,8 @@ def varlen_attention(
         is_causal: bool,
         return_softmax: bool,
         gen_: torch.Generator,
+        window_size_left: float,
+        window_size_right: float,
         logits_soft_cap: float,
     ) -> None:
         if ipex.__version__.endswith("cpu"):
@@ -200,15 +203,12 @@ def varlen_attention(
                                               is_causal, return_softmax,
                                               gen_)
         else:  # XPU build
-            ipex.llm.functional.varlen_attention(query.contiguous(),
-                                                 key.contiguous(),
-                                                 value.contiguous(), out,
-                                                 seqlen_q.int(),
-                                                 seqlen_k.int(), max_seqlen_q,
-                                                 max_seqlen_k, pdropout,
-                                                 softmax_scale, zero_tensors,
-                                                 is_causal, return_softmax,
-                                                 gen_, logits_soft_cap)
+            ipex.llm.functional.varlen_attention(
+                query.contiguous(), key.contiguous(), value.contiguous(), out,
+                seqlen_q.int(), seqlen_k.int(), alibi_slopes, max_seqlen_q,
+                max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal,
+                return_softmax, gen_, window_size_left, window_size_right,
+                logits_soft_cap)
 
     @staticmethod
     def reshape_and_cache(
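
For orientation, here is a hedged usage sketch of the updated ipex_ops.varlen_attention wrapper. The argument order follows the new signature above; the tensor shapes, the float16 dtype, the "xpu" device placement, and the cumulative-sequence-length layout are illustrative assumptions rather than documented vLLM API details, and the snippet only runs on a machine with the torch 2.7 / IPEX 2.7 XPU stack.

# Hedged sketch: calling the updated ipex_ops.varlen_attention wrapper.
# Shapes, dtype and the cu_seqlens layout are assumptions for illustration.
import torch
from vllm._ipex_ops import ipex_ops

num_tokens, num_heads, head_size = 8, 4, 64
query = torch.randn(num_tokens, num_heads, head_size,
                    dtype=torch.float16, device="xpu")
key = torch.randn_like(query)
value = torch.randn_like(query)
out = torch.empty_like(query)
# Cumulative sequence lengths for two packed prompts of 3 and 5 tokens.
cu_seqlens = torch.tensor([0, 3, 8], dtype=torch.int32, device="xpu")

ipex_ops.varlen_attention(
    query, key, value, out,
    cu_seqlens, cu_seqlens,
    None,                 # alibi_slopes: pass a per-head tensor to enable ALiBi
    5, 5,                 # max_seqlen_q, max_seqlen_k
    pdropout=0.0,
    softmax_scale=head_size**-0.5,
    zero_tensors=False,
    is_causal=True,
    return_softmax=False,
    gen_=None,
    window_size_left=-1,  # -1: no sliding-window limit
    window_size_right=-1,
    logits_soft_cap=-1,   # -1: soft capping disabled
)
# The attention result for all packed tokens is written into `out` in place.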

vllm/attention/backends/ipex_attn.py

Lines changed: 6 additions & 8 deletions
@@ -143,10 +143,9 @@ def __init__(
 
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        self.need_mask = (self.alibi_slopes is not None
-                          or self.sliding_window is not None)
+        self.need_mask = (self.sliding_window is not None)
         if logits_soft_cap is None:
-            logits_soft_cap = 0
+            logits_soft_cap = -1
         self.logits_soft_cap = logits_soft_cap
 
         supported_head_sizes = PagedAttention.get_supported_head_sizes()
@@ -234,11 +233,7 @@ def forward(
                                                 dim=1)
 
                 if attn_metadata.attn_bias is None:
-                    if self.alibi_slopes is not None:
-                        att_masks = _make_alibi_bias(
-                            self.alibi_slopes, query.dtype,
-                            attn_metadata.seq_lens)  # type: ignore
-                    elif self.sliding_window is not None:
+                    if self.sliding_window is not None:
                         att_masks = _make_sliding_window_bias(
                             attn_metadata.seq_lens, self.sliding_window,
                             query.dtype)  # type: ignore
@@ -258,6 +253,7 @@ def forward(
                     output,
                     attn_metadata.seqlen_q,
                     attn_metadata.seqlen_q,
+                    self.alibi_slopes,
                     attn_metadata.max_seqlen,
                     attn_metadata.max_seqlen,
                     pdropout=0.0,
@@ -266,6 +262,8 @@ def forward(
                     is_causal=True,
                     return_softmax=False,
                     gen_=None,
+                    window_size_left=-1,
+                    window_size_right=-1,
                     logits_soft_cap=self.logits_soft_cap,
                 )
             else:
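
A short reading of the backend change above: ALiBi biases are no longer materialized as explicit attention masks on the Python side; the per-head slopes are handed directly to the IPEX kernel, while the sliding window is still applied through an explicit bias mask. The value -1 appears to act as the "disabled" sentinel for the new window-size arguments and for logits_soft_cap (an inference from the diff, not a documented IPEX contract); the sketch below summarizes this, and _default_kernel_args is a hypothetical helper, not part of vLLM.

# Illustrative sketch only: how the new sentinel defaults fit together,
# assuming -1 means "feature disabled" for these kernel arguments.
def _default_kernel_args(logits_soft_cap=None):
    # A missing soft cap is now encoded as -1 rather than 0.
    soft_cap = -1 if logits_soft_cap is None else logits_soft_cap
    # Sliding-window limits stay disabled at the kernel level; the window
    # is still enforced via the mask built by _make_sliding_window_bias.
    window_size_left, window_size_right = -1, -1
    return soft_cap, window_size_left, window_size_right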
