@@ -111,7 +111,7 @@ public CompareAndSwapOp(AArch64Kind accessKind, MemoryOrderMode memoryOrder, boo
         }
 
         /**
-         * Both cas and ld(a)xr produce a zero-extended value. Since comparisons must be at minimum
+         * Both cas and ldxr produce a zero-extended value. Since comparisons must be at minimum
          * 32-bits, the expected value must also be zero-extended to produce an accurate comparison.
          */
         private static void emitCompare(AArch64MacroAssembler masm, int memAccessSize, Register result, Register expected) {
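
To make the javadoc's point concrete: cas and ldxr leave a sub-32-bit result zero-extended in the register, while naively widening a smaller Java expected value sign-extends it. A minimal sketch of the mismatch (the locals are hypothetical, purely illustrative):

    // Hypothetical 8-bit CAS comparison performed in 32 bits.
    int loaded = 0xFF;           // ldxrb leaves the byte zero-extended
    byte expected = (byte) -1;   // logically the same byte value, 0xFF
    // Widening sign-extends expected to 0xFFFFFFFF, so a naive compare fails:
    assert loaded != expected;
    // Zero-extending first, as emitCompare arranges, gives an accurate match:
    assert loaded == (expected & 0xFF);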
@@ -168,53 +168,64 @@ public void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
                     throw GraalError.shouldNotReachHereUnexpectedValue(memoryOrder); // ExcludeFromJacocoGeneratedReport
             }
 
+            int moveSize = Math.max(memAccessSize, 32);
             if (AArch64LIRFlags.useLSE(masm)) {
-                masm.mov(Math.max(memAccessSize, 32), result, expected);
+                masm.mov(moveSize, result, expected);
                 moveSPAndEmitCode(masm, asRegister(newValue), newVal -> {
                     masm.cas(memAccessSize, result, newVal, address, acquire, release);
                 });
                 if (setConditionFlags) {
                     emitCompare(masm, memAccessSize, result, expected);
                 }
             } else {
+
+                try (ScratchRegister scratchRegister1 = masm.getScratchRegister(); ScratchRegister scratchRegister2 = masm.getScratchRegister()) {
+                    Label retry = new Label();
+                    masm.bind(retry);
+                    Register scratch2 = scratchRegister2.getRegister();
+                    Register newValueReg = asRegister(newValue);
+                    if (newValueReg.equals(sp)) {
+                        /*
+                         * SP cannot be used in csel or stl(x)r.
+                         *
+                         * Since csel overwrites scratch2, newValue must be newly loaded each loop
+                         * iteration. However, unless under heavy contention, the storeExclusive
+                         * should rarely fail.
+                         */
+                        masm.mov(moveSize, scratch2, newValueReg);
+                        newValueReg = scratch2;
+                    }
+                    masm.loadExclusive(memAccessSize, result, address, false);
+
+                    emitCompare(masm, memAccessSize, result, expected);
+                    masm.csel(moveSize, scratch2, newValueReg, result, AArch64Assembler.ConditionFlag.EQ);
+
+                    /*
+                     * STLXR must be used also if acquire is set to ensure prior ldaxr/stlxr
+                     * instructions are not reordered after it.
+                     */
+                    Register scratch1 = scratchRegister1.getRegister();
+                    masm.storeExclusive(memAccessSize, scratch1, scratch2, address, acquire || release);
+                    // if scratch1 == 0 then write successful, else retry.
+                    masm.cbnz(32, scratch1, retry);
+                }
+
                 /*
-                 * Because the store is only conditionally emitted, a dmb is needed for performing a
-                 * release.
-                 *
-                 * Furthermore, even if the stlxr is emitted, if both acquire and release semantics
-                 * are required, then a dmb is anyways needed to ensure that the instruction
-                 * sequence:
+                 * From the Java perspective, the (ldxr, cmp, csel, stl(x)r) is a single atomic
+                 * operation which must abide by all requested semantics. Therefore, when acquire
+                 * semantics are needed, we use a full barrier after the store to guarantee that
+                 * instructions following the store cannot execute before it and violate acquire
+                 * semantics.
                  *
-                 * A -> ldaxr -> stlxr -> B
-                 *
-                 * cannot be executed as:
-                 *
-                 * ldaxr -> B -> A -> stlxr
+                 * Note we could instead perform a conditional branch and when the comparison fails
+                 * skip the store, but this introduces an opportunity for branch mispredictions, and
+                 * also, when release semantics are needed, requires a barrier to be inserted before
+                 * the operation.
                  */
-                if (release) {
+
+                if (acquire) {
                     masm.dmb(AArch64Assembler.BarrierKind.ANY_ANY);
                 }
-
-                moveSPAndEmitCode(masm, asRegister(newValue), newVal -> {
-                    try (ScratchRegister scratchRegister = masm.getScratchRegister()) {
-                        Register scratch = scratchRegister.getRegister();
-                        Label retry = new Label();
-                        Label fail = new Label();
-                        masm.bind(retry);
-                        masm.loadExclusive(memAccessSize, result, address, acquire);
-                        emitCompare(masm, memAccessSize, result, expected);
-                        masm.branchConditionally(AArch64Assembler.ConditionFlag.NE, fail);
-                        /*
-                         * Even with the prior dmb, for releases it is still necessary to use stlxr
-                         * instead of stxr to guarantee subsequent lda(x)r/stl(x)r cannot be hoisted
-                         * above this instruction and thereby violate volatile semantics.
-                         */
-                        masm.storeExclusive(memAccessSize, scratch, newVal, address, release);
-                        // if scratch == 0 then write successful, else retry.
-                        masm.cbnz(32, scratch, retry);
-                        masm.bind(fail);
-                    }
-                });
             }
         }
     }
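
Read as a whole, the new non-LSE branch emits the sequence (ldxr, cmp, csel, stlxr, cbnz), plus a trailing dmb when acquire semantics are requested. Below is a plain-Java model of that control flow; it is a sketch only, with an AtomicInteger standing in for the exclusive monitor and compareAndSet for the stl(x)r that can fail under contention:

    import java.util.concurrent.atomic.AtomicInteger;

    // A plain-Java model of the new LL/SC CAS loop (a sketch, not the emitted code).
    final class LlScCasModel {
        private final AtomicInteger memory = new AtomicInteger();

        int compareAndExchange(int expected, int newValue) {
            int result;
            int toStore;
            do {
                result = memory.get();                              // ldxr
                toStore = (result == expected) ? newValue : result; // cmp + csel
                // The store always executes: on mismatch it writes back the
                // value just loaded, releasing the exclusive monitor either way.
            } while (!memory.compareAndSet(result, toStore));       // stl(x)r + cbnz
            // The real emission appends dmb ANY_ANY here when acquire semantics
            // are requested; AtomicInteger is already sequentially consistent.
            return result;
        }
    }

The unconditional store is what removes the old fail label, and with it the branch-misprediction opportunity the new comment describes.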
@@ -290,14 +301,10 @@ public void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
                 }
             }
             /*
-             * Use a full barrier for the acquire semantics instead of ldaxr to guarantee that the
-             * instruction sequence:
-             *
-             * A -> ldaxr -> stlxr -> B
-             *
-             * cannot be executed as:
-             *
-             * ldaxr -> B -> A -> stlxr
+             * From the Java perspective, the (ldxr, add, stlxr) is a single atomic operation which
+             * must abide by both acquire and release semantics. Therefore, we use a full barrier
+             * after the store to guarantee that instructions following the store cannot execute
+             * before it and violate acquire semantics.
              */
             masm.dmb(AArch64Assembler.BarrierKind.ANY_ANY);
         }
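
The same modeling applies to the read-and-add this hunk's comment covers; a hedged sketch of the (ldxr, add, stlxr) sequence, again with AtomicInteger as a stand-in:

    import java.util.concurrent.atomic.AtomicInteger;

    // Sketch of the (ldxr, add, stlxr) read-modify-write in plain Java.
    final class AtomicAddModel {
        private final AtomicInteger memory = new AtomicInteger();

        int getAndAdd(int delta) {
            int oldValue;
            do {
                oldValue = memory.get();                                 // ldxr
            } while (!memory.compareAndSet(oldValue, oldValue + delta)); // add + stlxr + retry
            // The real emission ends with dmb ANY_ANY so instructions after the
            // store cannot execute before it and violate acquire semantics.
            return oldValue;
        }
    }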
@@ -405,14 +412,10 @@ public void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
                         // if scratch == 0 then write successful, else retry
                         masm.cbnz(32, scratch, retry);
                         /*
-                         * Use a full barrier for the acquire semantics instead of ldaxr to
-                         * guarantee that the instruction sequence:
-                         *
-                         * A -> ldaxr -> stlxr -> B
-                         *
-                         * cannot be executed as:
-                         *
-                         * ldaxr -> B -> A -> stlxr
+                         * From the Java perspective, the (ldxr, stlxr) is a single atomic operation
+                         * which must abide by both acquire and release semantics. Therefore, we use
+                         * a full barrier after the store to guarantee that instructions following
+                         * the store cannot execute before it and violate acquire semantics.
                          */
                         masm.dmb(AArch64Assembler.BarrierKind.ANY_ANY);
                     }
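
Finally, the exchange case in this last hunk reduces to a bare (ldxr, stlxr) loop; in the same hypothetical model it is getAndSet:

    import java.util.concurrent.atomic.AtomicInteger;

    // Sketch of the (ldxr, stlxr) atomic exchange in plain Java.
    final class AtomicSwapModel {
        private final AtomicInteger memory = new AtomicInteger();

        int getAndSet(int newValue) {
            int oldValue;
            do {
                oldValue = memory.get();                         // ldxr
            } while (!memory.compareAndSet(oldValue, newValue)); // stlxr + cbnz
            // Followed by dmb ANY_ANY in the real emission, supplying the
            // acquire half of the required semantics after the store.
            return oldValue;
        }
    }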