/*
 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
 *
 * The contents of this file constitute Original Code as defined in and are
 * subject to the Apple Public Source License Version 1.2 (the 'License').
 * You may not use this file except in compliance with the License. Please obtain
 * a copy of the License at http://www.apple.com/publicsource and read it before
 * using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
 * specific language governing rights and limitations under the License.
 */
/* rijndael-alg-ref.c   v2.0   August '99
 * Reference ANSI C code
 * authors: Paulo Barreto
 *
 * PPC and 128-bit block optimization by Doug Mitchell May 2001.
 */
31 #include "rijndael-alg-ref.h"
32 #include <AppleCSP/cspdebugging.h>
34 #define SC ((BC - 4) >> 1)
36 #include "boxes-ref.h"
38 static const word8 shifts
[3][4][2] = {
56 /* 128 bit key/word shift table in bits */
57 static const word8 shifts128
[4][2] = {
64 #if !AES_MUL_BY_LOOKUP
/*
 * Profiling measurements showed that the mul routine is where a large proportion of
 * the time is spent. Since the first argument to mul is always one of six
 * constants (2, 3, 0xe, etc.), we implement six 256-byte lookup tables to
 * do the multiplies. This eliminates the need for the log/antilog tables, so
 * it's only adding one kilobyte of const data. Throughput improvement for this
 * mod is a factor of 3.3 for encrypt and 4.1 for decrypt in the 128-bit optimized
 * case. Improvement for the general case (with a 256 bit key) is 1.46 for encrypt
 * and 1.88 for decrypt. (Decrypt wins more for this enhancement because the
 * InvMixColumn does four muls, vs. 2 muls for MixColumn). Measurements taken
 * on a 500 MHz G4 with 1 MB of L2 cache.
 */

/*
 * The mod 255 op in mul is really expensive.
 *
 * We know that b <= (254 * 2), so there are only two cases: either return b,
 * or return (b - 255).
 *
 * On a G4 this single optimization results in a 24% speedup for encrypt and
 * a 25% speedup for decrypt.
 */
86 static inline word8
mod255(word32 b
)
94 word8
mul(word8 a
, word8 b
) {
95 /* multiply two elements of GF(2^m)
96 * needed for MixColumn and InvMixColumn
98 if (a
&& b
) return Alogtable
[mod255(Logtable
[a
] + Logtable
[b
])];
101 #endif /* !AES_MUL_BY_LOOKUP */
103 void KeyAddition(word8 a
[4][MAXBC
], word8 rk
[4][MAXBC
], word8 BC
) {
104 /* Exor corresponding text input and round key input bytes
108 for(i
= 0; i
< 4; i
++)
109 for(j
= 0; j
< BC
; j
++) a
[i
][j
] ^= rk
[i
][j
];
112 void ShiftRow(word8 a
[4][MAXBC
], word8 d
, word8 BC
) {
113 /* Row 0 remains unchanged
114 * The other three rows are shifted a variable amount
119 for(i
= 1; i
< 4; i
++) {
120 for(j
= 0; j
< BC
; j
++) tmp
[j
] = a
[i
][(j
+ shifts
[SC
][i
][d
]) % BC
];
121 for(j
= 0; j
< BC
; j
++) a
[i
][j
] = tmp
[j
];
125 void Substitution(word8 a
[4][MAXBC
], const word8 box
[256], word8 BC
) {
126 /* Replace every byte of the input by the byte at that place
127 * in the nonlinear S-box
131 for(i
= 0; i
< 4; i
++)
132 for(j
= 0; j
< BC
; j
++) a
[i
][j
] = box
[a
[i
][j
]] ;
135 void MixColumn(word8 a
[4][MAXBC
], word8 BC
) {
136 /* Mix the four bytes of every column in a linear way
141 for(j
= 0; j
< BC
; j
++) {
142 for(i
= 0; i
< 4; i
++) {
143 #if AES_MUL_BY_LOOKUP
144 b
[i
][j
] = mulBy0x02
[a
[i
][j
]]
145 ^ mulBy0x03
[a
[(i
+ 1) % 4][j
]]
149 b
[i
][j
] = mul(2,a
[i
][j
])
150 ^ mul(3,a
[(i
+ 1) % 4][j
])
156 for(i
= 0; i
< 4; i
++) {
157 for(j
= 0; j
< BC
; j
++) a
[i
][j
] = b
[i
][j
];
161 void InvMixColumn(word8 a
[4][MAXBC
], word8 BC
) {
162 /* Mix the four bytes of every column in a linear way
163 * This is the opposite operation of Mixcolumn
168 for(j
= 0; j
< BC
; j
++) {
169 for(i
= 0; i
< 4; i
++) {
170 #if AES_MUL_BY_LOOKUP
171 b
[i
][j
] = mulBy0x0e
[a
[i
][j
]]
172 ^ mulBy0x0b
[a
[(i
+ 1) % 4][j
]]
173 ^ mulBy0x0d
[a
[(i
+ 2) % 4][j
]]
174 ^ mulBy0x09
[a
[(i
+ 3) % 4][j
]];
176 b
[i
][j
] = mul(0xe,a
[i
][j
])
177 ^ mul(0xb,a
[(i
+ 1) % 4][j
])
178 ^ mul(0xd,a
[(i
+ 2) % 4][j
])
179 ^ mul(0x9,a
[(i
+ 3) % 4][j
]);
183 for(i
= 0; i
< 4; i
++) {
184 for(j
= 0; j
< BC
; j
++) a
[i
][j
] = b
[i
][j
];
188 int rijndaelKeySched (
192 word8 W
[MAXROUNDS
+1][4][MAXBC
]) {
194 /* Calculate the necessary round keys
195 * The number of calculations depends on keyBits and blockBits
198 int i
, j
, t
, rconpointer
= 0;
202 case 128: KC
= 4; break;
203 case 192: KC
= 6; break;
204 case 256: KC
= 8; break;
205 default : return (-1);
209 case 128: BC
= 4; break;
210 case 192: BC
= 6; break;
211 case 256: BC
= 8; break;
212 default : return (-2);
215 switch (keyBits
>= blockBits
? keyBits
: blockBits
) {
216 case 128: ROUNDS
= 10; break;
217 case 192: ROUNDS
= 12; break;
218 case 256: ROUNDS
= 14; break;
219 default : return (-3); /* this cannot happen */
223 for(j
= 0; j
< KC
; j
++)
224 for(i
= 0; i
< 4; i
++)
227 /* copy values into round key array */
228 for(j
= 0; (j
< KC
) && (t
< (ROUNDS
+1)*BC
); j
++, t
++)
229 for(i
= 0; i
< 4; i
++) W
[t
/ BC
][i
][t
% BC
] = tk
[i
][j
];
231 while (t
< (ROUNDS
+1)*BC
) { /* while not enough round key material calculated */
232 /* calculate new values */
233 for(i
= 0; i
< 4; i
++)
234 tk
[i
][0] ^= S
[tk
[(i
+1)%4
][KC
-1]];
235 tk
[0][0] ^= rcon
[rconpointer
++];
238 for(j
= 1; j
< KC
; j
++)
239 for(i
= 0; i
< 4; i
++) tk
[i
][j
] ^= tk
[i
][j
-1];
241 for(j
= 1; j
< KC
/2; j
++)
242 for(i
= 0; i
< 4; i
++) tk
[i
][j
] ^= tk
[i
][j
-1];
243 for(i
= 0; i
< 4; i
++) tk
[i
][KC
/2] ^= S
[tk
[i
][KC
/2 - 1]];
244 for(j
= KC
/2 + 1; j
< KC
; j
++)
245 for(i
= 0; i
< 4; i
++) tk
[i
][j
] ^= tk
[i
][j
-1];
247 /* copy values into round key array */
248 for(j
= 0; (j
< KC
) && (t
< (ROUNDS
+1)*BC
); j
++, t
++)
249 for(i
= 0; i
< 4; i
++) W
[t
/ BC
][i
][t
% BC
] = tk
[i
][j
];
255 int rijndaelEncrypt (
259 word8 rk
[MAXROUNDS
+1][4][MAXBC
])
261 /* Encryption of one block, general case.
266 case 128: BC
= 4; break;
267 case 192: BC
= 6; break;
268 case 256: BC
= 8; break;
269 default : return (-2);
272 switch (keyBits
>= blockBits
? keyBits
: blockBits
) {
273 case 128: ROUNDS
= 10; break;
274 case 192: ROUNDS
= 12; break;
275 case 256: ROUNDS
= 14; break;
276 default : return (-3); /* this cannot happen */
279 /* begin with a key addition
281 KeyAddition(a
,rk
[0],BC
);
283 /* ROUNDS-1 ordinary rounds
285 for(r
= 1; r
< ROUNDS
; r
++) {
286 Substitution(a
,S
,BC
);
289 KeyAddition(a
,rk
[r
],BC
);
292 /* Last round is special: there is no MixColumn
294 Substitution(a
,S
,BC
);
296 KeyAddition(a
,rk
[ROUNDS
],BC
);
301 int rijndaelDecrypt (
305 word8 rk
[MAXROUNDS
+1][4][MAXBC
])
310 case 128: BC
= 4; break;
311 case 192: BC
= 6; break;
312 case 256: BC
= 8; break;
313 default : return (-2);
316 switch (keyBits
>= blockBits
? keyBits
: blockBits
) {
317 case 128: ROUNDS
= 10; break;
318 case 192: ROUNDS
= 12; break;
319 case 256: ROUNDS
= 14; break;
320 default : return (-3); /* this cannot happen */
323 /* To decrypt: apply the inverse operations of the encrypt routine,
326 * (KeyAddition is an involution: it 's equal to its inverse)
327 * (the inverse of Substitution with table S is Substitution with the
328 * inverse table of S)
329 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
332 /* First the special round:
333 * without InvMixColumn
334 * with extra KeyAddition
336 KeyAddition(a
,rk
[ROUNDS
],BC
);
337 Substitution(a
,Si
,BC
);
340 /* ROUNDS-1 ordinary rounds
342 for(r
= ROUNDS
-1; r
> 0; r
--) {
343 KeyAddition(a
,rk
[r
],BC
);
345 Substitution(a
,Si
,BC
);
349 /* End with the extra key addition
352 KeyAddition(a
,rk
[0],BC
);
/*
 * All of these 128-bit-key-and-block routines require 32-bit word-aligned
 * char array pointers. The key schedule arrays are easy; they come from
 * keyInstance which has a 4-byte-aligned element preceding the key schedule.
 * Others require manual alignment of a local variable by the caller.
 */
364 static inline void KeyAddition128(
365 word8 a
[4][BC_128_OPT
],
366 word8 rk
[4][MAXBC
]) {
368 /* these casts are endian-independent */
369 ((word32
*)a
)[0] ^= *((word32
*)(&rk
[0]));
370 ((word32
*)a
)[1] ^= *((word32
*)(&rk
[1]));
371 ((word32
*)a
)[2] ^= *((word32
*)(&rk
[2]));
372 ((word32
*)a
)[3] ^= *((word32
*)(&rk
[3]));
375 static void Substitution128(
376 word8 a
[4][BC_128_OPT
],
377 const word8 box
[256]) {
378 /* Replace every byte of the input by the byte at that place
379 * in the nonlinear S-box
383 /* still to be optimized - larger S boxes? */
384 for(i
= 0; i
< 4; i
++) {
385 for(j
= 0; j
< BC_128_OPT
; j
++) {
386 a
[i
][j
] = box
[a
[i
][j
]];
391 #if defined(__ppc__) && defined(__GNUC__)
393 static inline void rotateWordLeft(
394 word8
*word
, // known to be word aligned
395 unsigned rotCount
) // in bits
397 word32 lword
= *((word32
*)word
);
398 asm("rlwnm %0,%1,%2,0,31" : "=r"(lword
) : "0"(lword
), "r"(rotCount
));
399 *((word32
*)word
) = lword
;
405 * Insert your machine/compiler dependent code here,
406 * or just use this, which works on any platform and compiler
407 * which supports the __attribute__((aligned(4))) directive.
409 static void rotateWordLeft(
410 word8
*word
, // known to be word aligned
411 unsigned rotCount
) // in bits
413 word8 tmp
[BC_128_OPT
] __attribute__((aligned(4)));
414 unsigned bytes
= rotCount
/ 8;
416 tmp
[0] = word
[bytes
& (BC_128_OPT
-1)];
417 tmp
[1] = word
[(1+bytes
) & (BC_128_OPT
-1)];
418 tmp
[2] = word
[(2+bytes
) & (BC_128_OPT
-1)];
419 tmp
[3] = word
[(3+bytes
) & (BC_128_OPT
-1)];
420 *((word32
*)word
) = *((word32
*)tmp
);
424 static inline void ShiftRow128(
425 word8 a
[4][BC_128_OPT
],
427 /* Row 0 remains unchanged
428 * The other three rows are shifted (actually rotated) a variable amount
432 for(i
= 1; i
< 4; i
++) {
433 rotateWordLeft(a
[i
], shifts128
[i
][d
]);
/*
 * The following two routines are where most of the time is spent in this
 * module. Further optimization would have to focus here.
 */
441 static void MixColumn128(word8 a
[4][BC_128_OPT
]) {
442 /* Mix the four bytes of every column in a linear way
444 word8 b
[4][BC_128_OPT
];
447 for(j
= 0; j
< BC_128_OPT
; j
++) {
448 for(i
= 0; i
< 4; i
++) {
449 #if AES_MUL_BY_LOOKUP
450 b
[i
][j
] = mulBy0x02
[a
[i
][j
]]
451 ^ mulBy0x03
[a
[(i
+ 1) % 4][j
]]
455 b
[i
][j
] = mul(2,a
[i
][j
])
456 ^ mul(3,a
[(i
+ 1) % 4][j
])
462 memmove(a
, b
, 4 * BC_128_OPT
);
465 static void InvMixColumn128(word8 a
[4][BC_128_OPT
]) {
466 /* Mix the four bytes of every column in a linear way
467 * This is the opposite operation of Mixcolumn
469 word8 b
[4][BC_128_OPT
];
472 for(j
= 0; j
< BC_128_OPT
; j
++) {
473 for(i
= 0; i
< 4; i
++) {
474 #if AES_MUL_BY_LOOKUP
475 b
[i
][j
] = mulBy0x0e
[a
[i
][j
]]
476 ^ mulBy0x0b
[a
[(i
+ 1) % 4][j
]]
477 ^ mulBy0x0d
[a
[(i
+ 2) % 4][j
]]
478 ^ mulBy0x09
[a
[(i
+ 3) % 4][j
]];
480 b
[i
][j
] = mul(0xe,a
[i
][j
])
481 ^ mul(0xb,a
[(i
+ 1) % 4][j
])
482 ^ mul(0xd,a
[(i
+ 2) % 4][j
])
483 ^ mul(0x9,a
[(i
+ 3) % 4][j
]);
487 memmove(a
, b
, 4 * BC_128_OPT
);
490 int rijndaelKeySched128 (
491 word8 k
[4][KC_128_OPT
],
492 word8 W
[MAXROUNDS
+1][4][MAXBC
]) {
494 /* Calculate the necessary round keys
495 * The number of calculations depends on keyBits and blockBits
497 int i
, j
, t
, rconpointer
= 0;
498 word8 tk
[4][KC_128_OPT
];
499 unsigned numSchedRows
= (ROUNDS_128_OPT
+ 1) * BC_128_OPT
;
501 for(j
= 0; j
< KC_128_OPT
; j
++)
502 for(i
= 0; i
< 4; i
++)
505 /* copy values into round key array */
506 for(j
= 0; (j
< KC_128_OPT
) && (t
< numSchedRows
); j
++, t
++) {
507 for(i
= 0; i
< 4; i
++) {
508 W
[t
/ BC_128_OPT
][i
][t
% BC_128_OPT
] = tk
[i
][j
];
512 while (t
< numSchedRows
) {
513 /* while not enough round key material calculated */
514 /* calculate new values */
515 for(i
= 0; i
< 4; i
++) {
516 tk
[i
][0] ^= S
[tk
[(i
+1)%4
][KC_128_OPT
-1]];
518 tk
[0][0] ^= rcon
[rconpointer
++];
520 for(j
= 1; j
< KC_128_OPT
; j
++) {
521 for(i
= 0; i
< 4; i
++) {
522 tk
[i
][j
] ^= tk
[i
][j
-1];
526 /* copy values into round key array */
527 for(j
= 0; (j
< KC_128_OPT
) && (t
< numSchedRows
); j
++, t
++) {
528 for(i
= 0; i
< 4; i
++) {
529 W
[t
/ BC_128_OPT
][i
][t
% BC_128_OPT
] = tk
[i
][j
];
537 int rijndaelEncrypt128 (
538 word8 a
[4][BC_128_OPT
],
539 word8 rk
[MAXROUNDS
+1][4][MAXBC
])
541 /* Encryption of one block.
545 /* begin with a key addition
547 KeyAddition128(a
,rk
[0]);
549 /* ROUNDS-1 ordinary rounds
551 for(r
= 1; r
< ROUNDS_128_OPT
; r
++) {
552 Substitution128(a
,S
);
555 KeyAddition128(a
,rk
[r
]);
558 /* Last round is special: there is no MixColumn
560 Substitution128(a
,S
);
562 KeyAddition128(a
,rk
[ROUNDS_128_OPT
]);
567 int rijndaelDecrypt128 (
568 word8 a
[4][BC_128_OPT
],
569 word8 rk
[MAXROUNDS
+1][4][MAXBC
])
573 /* To decrypt: apply the inverse operations of the encrypt routine,
576 * (KeyAddition is an involution: it 's equal to its inverse)
577 * (the inverse of Substitution with table S is Substitution with the
578 * inverse table of S)
579 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
582 /* First the special round:
583 * without InvMixColumn
584 * with extra KeyAddition
586 KeyAddition128(a
,rk
[ROUNDS_128_OPT
]);
587 Substitution128(a
,Si
);
590 /* ROUNDS-1 ordinary rounds
592 for(r
= ROUNDS_128_OPT
-1; r
> 0; r
--) {
593 KeyAddition128(a
,rk
[r
]);
595 Substitution128(a
,Si
);
599 /* End with the extra key addition
602 KeyAddition128(a
,rk
[0]);