@@ -150,10 +150,12 @@ yield switch (avxSize) {
150
150
}
151
151
case DWORD , SINGLE -> new PermuteOp (result , source , indices , encoding );
152
152
case QWORD , DOUBLE -> {
153
- if (encoding == AMD64SIMDInstructionEncoding .EVEX || avxSize != YMM ) {
153
+ if (encoding == AMD64SIMDInstructionEncoding .EVEX && avxSize != XMM ) {
154
154
yield new PermuteOp (result , source , indices , encoding );
155
- } else {
155
+ } else if ( avxSize == YMM ) {
156
156
yield new PermuteOpWithTemps (gen , result , source , indices , encoding , 2 , false );
157
+ } else {
158
+ yield new PermuteOpWithTemps (gen , result , source , indices , encoding , 1 , false );
157
159
}
158
160
}
159
161
default -> throw GraalError .shouldNotReachHereUnexpectedValue (eKind );
@@ -255,21 +257,35 @@ public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
255
257
}
256
258
case DWORD , SINGLE -> throw GraalError .shouldNotReachHere ("should be PermuteOp" );
257
259
case QWORD , DOUBLE -> {
258
- GraalError .guarantee (encoding == AMD64SIMDInstructionEncoding .VEX && avxSize == YMM , "should be PermuteOp" );
259
- Register indexReg = asRegister (indices );
260
- Register xtmp1Reg = asRegister (xtmps [0 ]);
261
- Register xtmp2Reg = asRegister (xtmps [1 ]);
262
-
263
- // Transform into an int permute by transforming a 64-bit index with value v
264
- // into a pair of 32-bit indices v + 2, v * 2 + 1
265
- VexShiftOp .VPSLLQ .encoding (encoding ).emit (masm , YMM , xtmp1Reg , indexReg , Integer .SIZE + 1 );
266
- AMD64Address inc = (AMD64Address ) crb .asLongConstRef (JavaConstant .forLong (1L << Integer .SIZE ));
267
- VexRMOp .VPBROADCASTQ .encoding (encoding ).emit (masm , YMM , xtmp2Reg , inc );
268
- VexRVMOp .VPOR .encoding (encoding ).emit (masm , YMM , xtmp2Reg , xtmp1Reg , xtmp2Reg );
269
- VexShiftOp .VPSLLQ .encoding (encoding ).emit (masm , YMM , xtmp1Reg , indexReg , 1 );
270
- VexRVMOp .VPOR .encoding (encoding ).emit (masm , YMM , xtmp1Reg , xtmp1Reg , xtmp2Reg );
271
- VexRVMOp op = eKind == AMD64Kind .QWORD ? VexRVMOp .VPERMD : VexRVMOp .VPERMPS ;
272
- op .encoding (encoding ).emit (masm , YMM , asRegister (result ), xtmp1Reg , asRegister (source ));
260
+ if (avxSize == YMM ) {
261
+ GraalError .guarantee (encoding == AMD64SIMDInstructionEncoding .VEX , "should be PermuteOp" );
262
+ Register indexReg = asRegister (indices );
263
+ Register xtmp1Reg = asRegister (xtmps [0 ]);
264
+ Register xtmp2Reg = asRegister (xtmps [1 ]);
265
+
266
+ // Transform into an int permute by transforming a 64-bit index with value v
267
+ // into a pair of 32-bit indices v * 2, v * 2 + 1
268
+ VexShiftOp .VPSLLQ .encoding (encoding ).emit (masm , YMM , xtmp1Reg , indexReg , Integer .SIZE + 1 );
269
+ AMD64Address inc = (AMD64Address ) crb .asLongConstRef (JavaConstant .forLong (1L << Integer .SIZE ));
270
+ VexRMOp .VPBROADCASTQ .encoding (encoding ).emit (masm , YMM , xtmp2Reg , inc );
271
+ VexRVMOp .VPOR .encoding (encoding ).emit (masm , YMM , xtmp2Reg , xtmp1Reg , xtmp2Reg );
272
+ VexShiftOp .VPSLLQ .encoding (encoding ).emit (masm , YMM , xtmp1Reg , indexReg , 1 );
273
+ VexRVMOp .VPOR .encoding (encoding ).emit (masm , YMM , xtmp1Reg , xtmp1Reg , xtmp2Reg );
274
+ VexRVMOp op = eKind == AMD64Kind .QWORD ? VexRVMOp .VPERMD : VexRVMOp .VPERMPS ;
275
+ op .encoding (encoding ).emit (masm , YMM , asRegister (result ), xtmp1Reg , asRegister (source ));
276
+ } else {
277
+ GraalError .guarantee (avxSize == XMM , "should be PermuteOp" );
278
+ Register xtmpReg = asRegister (xtmps [0 ]);
279
+ /*
280
+ * VPERMILPD uses the SECOND bit in each element as the index. Note that
281
+ * although the textual description of the instruction in the Intel SDM
282
+ * (March 2025) says that "The control bits are located at bit 0 of each
283
+ * quadword element", the pseudocode in the same manual as well as
284
+ * experiments show that it is actually bit 1 that is the control bit.
285
+ */
286
+ VexShiftOp .VPSLLQ .encoding (encoding ).emit (masm , XMM , xtmpReg , asRegister (indices ), 1 );
287
+ VexRVMOp .VPERMILPD .encoding (encoding ).emit (masm , XMM , asRegister (result ), asRegister (source ), xtmpReg );
288
+ }
273
289
}
274
290
default -> throw GraalError .shouldNotReachHereUnexpectedValue (eKind );
275
291
}
0 commit comments