@@ -154,11 +154,13 @@ func upper(in []uint64, out []uint64, size int) {
154
154
}
155
155
}
156
156
157
- func variableTime (sps []uint64 , p1 []uint64 , p2 []uint64 , p3 []uint64 , s []uint8 ) {
157
+ // The variable time technique is described in the "Nibbling" paper (https://eprint.iacr.org/2023/1683.pdf)
158
+ // Section 4 (and Figure 2).
159
+ func calculatePStVarTime (sps []uint64 , p1 []uint64 , p2 []uint64 , p3 []uint64 , s []uint8 ) {
158
160
var accumulator [K * N ][P * 16 ]uint64
159
161
160
- // compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2]
161
- // [ 0 P3 ] [S2] [ P3*S2]
162
+ // compute P * S^t = [ P1 P2 ] * [S1^t ] = [P1*S1^t + P2*S2^t ]
163
+ // [ 0 P3 ] [S2^t ] [ P3*S2^t ]
162
164
163
165
// Note that S = S1||S2 is strided at N=V+O
164
166
@@ -167,7 +169,7 @@ func variableTime(sps []uint64, p1 []uint64, p2 []uint64, p3 []uint64, s []uint8
167
169
for r := 0 ; r < V ; r ++ {
168
170
for c := r ; c < V ; c ++ {
169
171
for k := 0 ; k < K ; k ++ {
170
- vecAddPacked (P , p1 [P * pos :], accumulator [r * K + k ][P * int (s [k * N + c ]):])
172
+ vecAddPacked (p1 [P * pos :], accumulator [r * K + k ][P * int (s [k * N + c ]):])
171
173
}
172
174
pos ++
173
175
}
@@ -178,7 +180,7 @@ func variableTime(sps []uint64, p1 []uint64, p2 []uint64, p3 []uint64, s []uint8
178
180
for r := 0 ; r < V ; r ++ {
179
181
for c := 0 ; c < O ; c ++ {
180
182
for k := 0 ; k < K ; k ++ {
181
- vecAddPacked (P , p2 [P * pos :], accumulator [r * K + k ][P * int (s [k * N + V + c ]):])
183
+ vecAddPacked (p2 [P * pos :], accumulator [r * K + k ][P * int (s [k * N + V + c ]):])
182
184
}
183
185
pos ++
184
186
}
@@ -189,50 +191,51 @@ func variableTime(sps []uint64, p1 []uint64, p2 []uint64, p3 []uint64, s []uint8
189
191
for r := 0 ; r < O ; r ++ {
190
192
for c := r ; c < O ; c ++ {
191
193
for k := 0 ; k < K ; k ++ {
192
- vecAddPacked (P , p3 [P * pos :], accumulator [(r + V )* K + k ][P * int (s [k * N + V + c ]):])
194
+ vecAddPacked (p3 [P * pos :], accumulator [(r + V )* K + k ][P * int (s [k * N + V + c ]):])
193
195
}
194
196
pos ++
195
197
}
196
198
}
197
199
198
200
for i := 0 ; i < K * N ; i ++ {
199
- aggregate (P , accumulator [i ], sps [P * i :])
201
+ accumulate (P , accumulator [i ], sps [P * i :])
200
202
}
201
203
}
202
204
203
- func variableTime2 (sps []uint64 , s []uint8 , pst []uint64 ) {
205
+ func calculateSPstVarTime (sps []uint64 , s []uint8 , pst []uint64 ) {
204
206
var accumulator [K * K ][P * 16 ]uint64
205
207
206
208
// S * PST : KxN * N*K
207
209
for r := 0 ; r < K ; r ++ {
208
210
for c := 0 ; c < N ; c ++ {
209
211
for k := 0 ; k < K ; k ++ {
210
- vecAddPacked (P , pst [P * (c * K + k ):], accumulator [r * K + k ][P * int (s [r * N + c ]):])
212
+ vecAddPacked (pst [P * (c * K + k ):], accumulator [r * K + k ][P * int (s [r * N + c ]):])
211
213
}
212
214
}
213
215
}
214
216
215
217
for i := 0 ; i < K * K ; i ++ {
216
- aggregate (P , accumulator [i ], sps [P * i :])
218
+ accumulate (P , accumulator [i ], sps [P * i :])
217
219
}
218
220
}
219
221
220
- // p is always P, but is still kept to be consistent with other functions
221
- //
222
- //nolint:unparam
223
- func vecAddPacked (p int , in []uint64 , acc []uint64 ) {
224
- for i := 0 ; i < p ; i ++ {
222
+ func vecAddPacked (in []uint64 , acc []uint64 ) {
223
+ for i := 0 ; i < P ; i ++ {
225
224
acc [i ] ^= in [i ]
226
225
}
227
226
}
228
227
229
- func aggregate (p int , bins [P * 16 ]uint64 , out []uint64 ) {
230
- // The following two methods are mathematically equivalent, but the second one is slightly faster.
228
+ func accumulate (p int , bins [P * 16 ]uint64 , out []uint64 ) {
229
+ // The following two approaches are mathematically equivalent, but the second one is slightly faster.
231
230
232
- // the powers of x mod x^4+x+1, represented as integers, are 1,2,4,8,3,..,13,9
233
- // out = bins[9]*x^14 + bins[13]*x^13 + bins[15]*x^12 ... + bin[4]*x^2 + bins[2]*x + bins[1]
234
- // = ((bins[9]x+bins[13])x+bins[15])x + ... bins[4])x+bins[2])x+bins[1]
231
+ // Here we chose to multiply by x^-1 all the way through,
232
+ // unlike Method 3 in Figure 2 (see paper) which interleaves *x and *x^-1
233
+ // which probably gives more parallelism on more complex CPUs.
234
+ //
235
+ // Also, on M1 Pro, Method 2 in Figure 2 is not faster than Approach 2 coded here.
235
236
237
+ // Approach 1. Multiplying by x all the way through:
238
+ // the powers of x mod x^4+x+1, represented as integers, are 1,2,4,8,3,..,13,9
236
239
// vecMulAddPackedByX(p, bins[P*9:], bins[P*13:])
237
240
// vecMulAddPackedByX(p, bins[P*13:], bins[P*15:])
238
241
// vecMulAddPackedByX(p, bins[P*15:], bins[P*14:])
@@ -249,8 +252,8 @@ func aggregate(p int, bins [P * 16]uint64, out []uint64) {
249
252
// vecMulAddPackedByX(p, bins[P*2:], bins[P*1:])
250
253
// copy(out[:P], bins[P*1:])
251
254
252
- // In the reversed order of the above, because /x turns out to be slightly faster than *x.
253
- // out = ((bins[2]x^-1+bins[4])x^-1+bins[8])x^-1 + ... bins[13])x^-1+bins[9])x^-1+bins[1]
255
+ // Approach 2. Multiplying by x^-1 all the way through:
256
+ // In the reversed order of the first approach, because /x turns out to be slightly faster than *x.
254
257
vecMulAddPackedByInvX (p , bins [P * 2 :], bins [P * 4 :])
255
258
vecMulAddPackedByInvX (p , bins [P * 4 :], bins [P * 8 :])
256
259
vecMulAddPackedByInvX (p , bins [P * 8 :], bins [P * 3 :])
@@ -278,7 +281,9 @@ func aggregate(p int, bins [P * 16]uint64, out []uint64) {
278
281
// }
279
282
// }
280
283
284
+ // It can be seen by comparison to the commented code above that this requires fewer instructions.
281
285
func vecMulAddPackedByInvX (p int , in []uint64 , acc []uint64 ) {
286
+ // Equivalently:
282
287
// vecMulAddPacked(p, in, 9, acc)
283
288
284
289
lsb := uint64 (0x1111111111111111 )
0 commit comments