]>
git.saurik.com Git - apple/security.git/blob - libsecurity_apple_csp/lib/rijndael-alg-ref.c
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
4 * The contents of this file constitute Original Code as defined in and are
5 * subject to the Apple Public Source License Version 1.2 (the 'License').
6 * You may not use this file except in compliance with the License. Please obtain
7 * a copy of the License at http://www.apple.com/publicsource and read it before
10 * This Original Code and all software distributed under the License are
11 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
12 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
13 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
14 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
15 * specific language governing rights and limitations under the License.
19 /* rijndael-alg-ref.c v2.0 August '99
20 * Reference ANSI C code
21 * authors: Paulo Barreto
24 * PPC and 128-bit block optimization by Doug Mitchell May 2001.
31 #include "rijndael-alg-ref.h"
32 #include <cspdebugging.h>
34 #define SC ((BC - 4) >> 1)
36 #include "boxes-ref.h"
38 static const word8 shifts
[3][4][2] = {
56 #if !GLADMAN_AES_128_ENABLE
58 /* 128 bit key/word shift table in bits */
59 static const word8 shifts128
[4][2] = {
66 #endif /* GLADMAN_AES_128_ENABLE */
68 #if !AES_MUL_BY_LOOKUP
70 * Profiling measurements showed that the mul routine is where a large propertion of
71 * the time is spent. Since the first argument to mul is always one of six
72 * constants (2, 3, 0xe, etc.), we implement six 256x256 byte lookup tables to
73 * do the multiplies. This eliminates the need for the log/antilog tables, so
74 * it's only adding one kilobyte of const data. Throughput improvement for this
75 * mod is a factor of 3.3 for encrypt and 4.1 for decrypt in the 128-bit optimized
76 * case. Improvement for the general case (with a 256 bit key) is 1.46 for encrypt
77 * and 1.88 for decrypt. (Decrypt wins more for this enhancement because the
78 * InvMixColumn does four muls, vs. 2 muls for MixColumn). Measurements taken
79 * on a 500 MHz G4 with 1 MB of L2 cache.
83 * The mod 255 op in mul is really expensive...
85 * We know that b <= (254 * 2), so there are only two cases. Either return b,
88 * On a G4 this single optimization results in a 24% speedup for encrypt and
89 * a 25% speedup for decrypt.
91 static inline word8
mod255(word32 b
)
99 word8
mul(word8 a
, word8 b
) {
100 /* multiply two elements of GF(2^m)
101 * needed for MixColumn and InvMixColumn
103 if (a
&& b
) return Alogtable
[mod255(Logtable
[a
] + Logtable
[b
])];
106 #endif /* !AES_MUL_BY_LOOKUP */
109 void KeyAddition(word8 a
[4][MAXBC
], word8 rk
[4][MAXBC
], word8 BC
) {
110 /* Exor corresponding text input and round key input bytes
114 for(i
= 0; i
< 4; i
++)
115 for(j
= 0; j
< BC
; j
++) a
[i
][j
] ^= rk
[i
][j
];
119 void ShiftRow(word8 a
[4][MAXBC
], word8 d
, word8 BC
) {
120 /* Row 0 remains unchanged
121 * The other three rows are shifted a variable amount
126 for(i
= 1; i
< 4; i
++) {
127 for(j
= 0; j
< BC
; j
++) tmp
[j
] = a
[i
][(j
+ shifts
[SC
][i
][d
]) % BC
];
128 for(j
= 0; j
< BC
; j
++) a
[i
][j
] = tmp
[j
];
133 void Substitution(word8 a
[4][MAXBC
], const word8 box
[256], word8 BC
) {
134 /* Replace every byte of the input by the byte at that place
135 * in the nonlinear S-box
139 for(i
= 0; i
< 4; i
++)
140 for(j
= 0; j
< BC
; j
++) a
[i
][j
] = box
[a
[i
][j
]] ;
144 void MixColumn(word8 a
[4][MAXBC
], word8 BC
) {
145 /* Mix the four bytes of every column in a linear way
150 for(j
= 0; j
< BC
; j
++) {
151 for(i
= 0; i
< 4; i
++) {
152 #if AES_MUL_BY_LOOKUP
153 b
[i
][j
] = mulBy0x02
[a
[i
][j
]]
154 ^ mulBy0x03
[a
[(i
+ 1) % 4][j
]]
158 b
[i
][j
] = mul(2,a
[i
][j
])
159 ^ mul(3,a
[(i
+ 1) % 4][j
])
165 for(i
= 0; i
< 4; i
++) {
166 for(j
= 0; j
< BC
; j
++) a
[i
][j
] = b
[i
][j
];
171 void InvMixColumn(word8 a
[4][MAXBC
], word8 BC
) {
172 /* Mix the four bytes of every column in a linear way
173 * This is the opposite operation of Mixcolumn
178 for(j
= 0; j
< BC
; j
++) {
179 for(i
= 0; i
< 4; i
++) {
180 #if AES_MUL_BY_LOOKUP
181 b
[i
][j
] = mulBy0x0e
[a
[i
][j
]]
182 ^ mulBy0x0b
[a
[(i
+ 1) % 4][j
]]
183 ^ mulBy0x0d
[a
[(i
+ 2) % 4][j
]]
184 ^ mulBy0x09
[a
[(i
+ 3) % 4][j
]];
186 b
[i
][j
] = mul(0xe,a
[i
][j
])
187 ^ mul(0xb,a
[(i
+ 1) % 4][j
])
188 ^ mul(0xd,a
[(i
+ 2) % 4][j
])
189 ^ mul(0x9,a
[(i
+ 3) % 4][j
]);
193 for(i
= 0; i
< 4; i
++) {
194 for(j
= 0; j
< BC
; j
++) a
[i
][j
] = b
[i
][j
];
198 int rijndaelKeySched (
202 word8 W
[MAXROUNDS
+1][4][MAXBC
]) {
204 /* Calculate the necessary round keys
205 * The number of calculations depends on keyBits and blockBits
208 int i
, j
, t
, rconpointer
= 0;
212 case 128: KC
= 4; break;
213 case 192: KC
= 6; break;
214 case 256: KC
= 8; break;
215 default : return (-1);
219 case 128: BC
= 4; break;
220 case 192: BC
= 6; break;
221 case 256: BC
= 8; break;
222 default : return (-2);
225 switch (keyBits
>= blockBits
? keyBits
: blockBits
) {
226 case 128: ROUNDS
= 10; break;
227 case 192: ROUNDS
= 12; break;
228 case 256: ROUNDS
= 14; break;
229 default : return (-3); /* this cannot happen */
233 for(j
= 0; j
< KC
; j
++)
234 for(i
= 0; i
< 4; i
++)
237 /* copy values into round key array */
238 for(j
= 0; (j
< KC
) && (t
< (ROUNDS
+1)*BC
); j
++, t
++)
239 for(i
= 0; i
< 4; i
++) W
[t
/ BC
][i
][t
% BC
] = tk
[i
][j
];
241 while (t
< (ROUNDS
+1)*BC
) { /* while not enough round key material calculated */
242 /* calculate new values */
243 for(i
= 0; i
< 4; i
++)
244 tk
[i
][0] ^= S
[tk
[(i
+1)%4
][KC
-1]];
245 tk
[0][0] ^= rcon
[rconpointer
++];
248 for(j
= 1; j
< KC
; j
++)
249 for(i
= 0; i
< 4; i
++) tk
[i
][j
] ^= tk
[i
][j
-1];
251 for(j
= 1; j
< KC
/2; j
++)
252 for(i
= 0; i
< 4; i
++) tk
[i
][j
] ^= tk
[i
][j
-1];
253 for(i
= 0; i
< 4; i
++) tk
[i
][KC
/2] ^= S
[tk
[i
][KC
/2 - 1]];
254 for(j
= KC
/2 + 1; j
< KC
; j
++)
255 for(i
= 0; i
< 4; i
++) tk
[i
][j
] ^= tk
[i
][j
-1];
257 /* copy values into round key array */
258 for(j
= 0; (j
< KC
) && (t
< (ROUNDS
+1)*BC
); j
++, t
++)
259 for(i
= 0; i
< 4; i
++) W
[t
/ BC
][i
][t
% BC
] = tk
[i
][j
];
265 int rijndaelEncrypt (
269 word8 rk
[MAXROUNDS
+1][4][MAXBC
])
271 /* Encryption of one block, general case.
276 case 128: BC
= 4; break;
277 case 192: BC
= 6; break;
278 case 256: BC
= 8; break;
279 default : return (-2);
282 switch (keyBits
>= blockBits
? keyBits
: blockBits
) {
283 case 128: ROUNDS
= 10; break;
284 case 192: ROUNDS
= 12; break;
285 case 256: ROUNDS
= 14; break;
286 default : return (-3); /* this cannot happen */
289 /* begin with a key addition
291 KeyAddition(a
,rk
[0],BC
);
293 /* ROUNDS-1 ordinary rounds
295 for(r
= 1; r
< ROUNDS
; r
++) {
296 Substitution(a
,S
,BC
);
299 KeyAddition(a
,rk
[r
],BC
);
302 /* Last round is special: there is no MixColumn
304 Substitution(a
,S
,BC
);
306 KeyAddition(a
,rk
[ROUNDS
],BC
);
311 int rijndaelDecrypt (
315 word8 rk
[MAXROUNDS
+1][4][MAXBC
])
320 case 128: BC
= 4; break;
321 case 192: BC
= 6; break;
322 case 256: BC
= 8; break;
323 default : return (-2);
326 switch (keyBits
>= blockBits
? keyBits
: blockBits
) {
327 case 128: ROUNDS
= 10; break;
328 case 192: ROUNDS
= 12; break;
329 case 256: ROUNDS
= 14; break;
330 default : return (-3); /* this cannot happen */
333 /* To decrypt: apply the inverse operations of the encrypt routine,
336 * (KeyAddition is an involution: it 's equal to its inverse)
337 * (the inverse of Substitution with table S is Substitution with the
338 * inverse table of S)
339 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
342 /* First the special round:
343 * without InvMixColumn
344 * with extra KeyAddition
346 KeyAddition(a
,rk
[ROUNDS
],BC
);
347 Substitution(a
,Si
,BC
);
350 /* ROUNDS-1 ordinary rounds
352 for(r
= ROUNDS
-1; r
> 0; r
--) {
353 KeyAddition(a
,rk
[r
],BC
);
355 Substitution(a
,Si
,BC
);
359 /* End with the extra key addition
362 KeyAddition(a
,rk
[0],BC
);
367 #if !GLADMAN_AES_128_ENABLE
370 * All of these 128-bit-key-and-block routines require 32-bit word-aligned
371 * char array pointers. The key schedule arrays are easy; they come from
372 * keyInstance which has a 4-byte-aligned element preceeding the key schedule.
373 * Others require manual alignment of a local variable by the caller.
376 static inline void KeyAddition128(
377 word8 a
[4][BC_128_OPT
],
378 word8 rk
[4][MAXBC
]) {
380 /* these casts are endian-independent */
381 ((word32
*)a
)[0] ^= *((word32
*)(&rk
[0]));
382 ((word32
*)a
)[1] ^= *((word32
*)(&rk
[1]));
383 ((word32
*)a
)[2] ^= *((word32
*)(&rk
[2]));
384 ((word32
*)a
)[3] ^= *((word32
*)(&rk
[3]));
387 static void Substitution128(
388 word8 a
[4][BC_128_OPT
],
389 const word8 box
[256]) {
390 /* Replace every byte of the input by the byte at that place
391 * in the nonlinear S-box
395 /* still to be optimized - larger S boxes? */
396 for(i
= 0; i
< 4; i
++) {
397 for(j
= 0; j
< BC_128_OPT
; j
++) {
398 a
[i
][j
] = box
[a
[i
][j
]];
403 #if defined(__ppc__) && defined(__GNUC__)
405 static inline void rotateWordLeft(
406 word8
*word
, // known to be word aligned
407 unsigned rotCount
) // in bits
409 word32 lword
= *((word32
*)word
);
410 asm("rlwnm %0,%1,%2,0,31" : "=r"(lword
) : "0"(lword
), "r"(rotCount
));
411 *((word32
*)word
) = lword
;
417 * Insert your machine/compiler dependent code here,
418 * or just use this, which works on any platform and compiler
419 * which supports the __attribute__((aligned(4))) directive.
421 static void rotateWordLeft(
422 word8
*word
, // known to be word aligned
423 unsigned rotCount
) // in bits
425 word8 tmp
[BC_128_OPT
] __attribute__((aligned(4)));
426 unsigned bytes
= rotCount
/ 8;
428 tmp
[0] = word
[bytes
& (BC_128_OPT
-1)];
429 tmp
[1] = word
[(1+bytes
) & (BC_128_OPT
-1)];
430 tmp
[2] = word
[(2+bytes
) & (BC_128_OPT
-1)];
431 tmp
[3] = word
[(3+bytes
) & (BC_128_OPT
-1)];
432 *((word32
*)word
) = *((word32
*)tmp
);
436 static inline void ShiftRow128(
437 word8 a
[4][BC_128_OPT
],
439 /* Row 0 remains unchanged
440 * The other three rows are shifted (actually rotated) a variable amount
444 for(i
= 1; i
< 4; i
++) {
445 rotateWordLeft(a
[i
], shifts128
[i
][d
]);
450 * The following two routines are where most of the time is spent in this
451 * module. Further optimization would have to focus here.
453 static void MixColumn128(word8 a
[4][BC_128_OPT
]) {
454 /* Mix the four bytes of every column in a linear way
456 word8 b
[4][BC_128_OPT
];
459 for(j
= 0; j
< BC_128_OPT
; j
++) {
460 for(i
= 0; i
< 4; i
++) {
461 #if AES_MUL_BY_LOOKUP
462 b
[i
][j
] = mulBy0x02
[a
[i
][j
]]
463 ^ mulBy0x03
[a
[(i
+ 1) % 4][j
]]
467 b
[i
][j
] = mul(2,a
[i
][j
])
468 ^ mul(3,a
[(i
+ 1) % 4][j
])
474 memmove(a
, b
, 4 * BC_128_OPT
);
477 static void InvMixColumn128(word8 a
[4][BC_128_OPT
]) {
478 /* Mix the four bytes of every column in a linear way
479 * This is the opposite operation of Mixcolumn
481 word8 b
[4][BC_128_OPT
];
484 for(j
= 0; j
< BC_128_OPT
; j
++) {
485 for(i
= 0; i
< 4; i
++) {
486 #if AES_MUL_BY_LOOKUP
487 b
[i
][j
] = mulBy0x0e
[a
[i
][j
]]
488 ^ mulBy0x0b
[a
[(i
+ 1) % 4][j
]]
489 ^ mulBy0x0d
[a
[(i
+ 2) % 4][j
]]
490 ^ mulBy0x09
[a
[(i
+ 3) % 4][j
]];
492 b
[i
][j
] = mul(0xe,a
[i
][j
])
493 ^ mul(0xb,a
[(i
+ 1) % 4][j
])
494 ^ mul(0xd,a
[(i
+ 2) % 4][j
])
495 ^ mul(0x9,a
[(i
+ 3) % 4][j
]);
499 memmove(a
, b
, 4 * BC_128_OPT
);
502 int rijndaelKeySched128 (
503 word8 k
[4][KC_128_OPT
],
504 word8 W
[MAXROUNDS
+1][4][MAXBC
]) {
506 /* Calculate the necessary round keys
507 * The number of calculations depends on keyBits and blockBits
509 int i
, j
, t
, rconpointer
= 0;
510 word8 tk
[4][KC_128_OPT
];
511 unsigned numSchedRows
= (ROUNDS_128_OPT
+ 1) * BC_128_OPT
;
513 for(j
= 0; j
< KC_128_OPT
; j
++)
514 for(i
= 0; i
< 4; i
++)
517 /* copy values into round key array */
518 for(j
= 0; (j
< KC_128_OPT
) && (t
< numSchedRows
); j
++, t
++) {
519 for(i
= 0; i
< 4; i
++) {
520 W
[t
/ BC_128_OPT
][i
][t
% BC_128_OPT
] = tk
[i
][j
];
524 while (t
< numSchedRows
) {
525 /* while not enough round key material calculated */
526 /* calculate new values */
527 for(i
= 0; i
< 4; i
++) {
528 tk
[i
][0] ^= S
[tk
[(i
+1)%4
][KC_128_OPT
-1]];
530 tk
[0][0] ^= rcon
[rconpointer
++];
532 for(j
= 1; j
< KC_128_OPT
; j
++) {
533 for(i
= 0; i
< 4; i
++) {
534 tk
[i
][j
] ^= tk
[i
][j
-1];
538 /* copy values into round key array */
539 for(j
= 0; (j
< KC_128_OPT
) && (t
< numSchedRows
); j
++, t
++) {
540 for(i
= 0; i
< 4; i
++) {
541 W
[t
/ BC_128_OPT
][i
][t
% BC_128_OPT
] = tk
[i
][j
];
549 int rijndaelEncrypt128 (
550 word8 a
[4][BC_128_OPT
],
551 word8 rk
[MAXROUNDS
+1][4][MAXBC
])
553 /* Encryption of one block.
557 /* begin with a key addition
559 KeyAddition128(a
,rk
[0]);
561 /* ROUNDS-1 ordinary rounds
563 for(r
= 1; r
< ROUNDS_128_OPT
; r
++) {
564 Substitution128(a
,S
);
567 KeyAddition128(a
,rk
[r
]);
570 /* Last round is special: there is no MixColumn
572 Substitution128(a
,S
);
574 KeyAddition128(a
,rk
[ROUNDS_128_OPT
]);
579 int rijndaelDecrypt128 (
580 word8 a
[4][BC_128_OPT
],
581 word8 rk
[MAXROUNDS
+1][4][MAXBC
])
585 /* To decrypt: apply the inverse operations of the encrypt routine,
588 * (KeyAddition is an involution: it 's equal to its inverse)
589 * (the inverse of Substitution with table S is Substitution with the
590 * inverse table of S)
591 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
594 /* First the special round:
595 * without InvMixColumn
596 * with extra KeyAddition
598 KeyAddition128(a
,rk
[ROUNDS_128_OPT
]);
599 Substitution128(a
,Si
);
602 /* ROUNDS-1 ordinary rounds
604 for(r
= ROUNDS_128_OPT
-1; r
> 0; r
--) {
605 KeyAddition128(a
,rk
[r
]);
607 Substitution128(a
,Si
);
611 /* End with the extra key addition
614 KeyAddition128(a
,rk
[0]);
619 #endif /* !GLADMAN_AES_128_ENABLE */