/*
 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
 *
 * The contents of this file constitute Original Code as defined in and are
 * subject to the Apple Public Source License Version 1.2 (the 'License').
 * You may not use this file except in compliance with the License. Please obtain
 * a copy of the License at http://www.apple.com/publicsource and read it before
 * using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
 * specific language governing rights and limitations under the License.
 */
/* rijndael-alg-ref.c   v2.0   August '99
 * Reference ANSI C code
 * authors: Paulo Barreto
 *          Vincent Rijmen
 *
 * PPC and 128-bit block optimization by Doug Mitchell May 2001.
 */
#include <string.h>		/* memmove, memcmp */

#include "rijndael-alg-ref.h"
#include <AppleCSP/cspdebugging.h>

#define SC	((BC - 4) >> 1)

#include "boxes-ref.h"
/* per-row shift amounts in bytes, indexed by [SC][row][encrypt/decrypt] */
static const word8 shifts[3][4][2] = {
	{ { 0, 0 },
	  { 1, 3 },
	  { 2, 2 },
	  { 3, 1 } },

	{ { 0, 0 },
	  { 1, 5 },
	  { 2, 4 },
	  { 3, 3 } },

	{ { 0, 0 },
	  { 1, 7 },
	  { 3, 5 },
	  { 4, 4 } }
};

/* 128 bit key/word shift table in bits */
static const word8 shifts128[4][2] = {
	{  0,  0 },
	{  8, 24 },
	{ 16, 16 },
	{ 24,  8 }
};
#if	!AES_MUL_BY_LOOKUP

/*
 * Profiling measurements showed that the mul routine is where a large proportion of
 * the time is spent. Since the first argument to mul is always one of six
 * constants (2, 3, 0xe, etc.), we implement six 256-byte lookup tables to
 * do the multiplies. This eliminates the need for the log/antilog tables, so
 * it's only adding one kilobyte of const data. Throughput improvement for this
 * mod is a factor of 3.3 for encrypt and 4.1 for decrypt in the 128-bit optimized
 * case. Improvement for the general case (with a 256-bit key) is 1.46 for encrypt
 * and 1.88 for decrypt. (Decrypt gains more from this enhancement because
 * InvMixColumn does four muls, vs. two muls for MixColumn.) Measurements taken
 * on a 500 MHz G4 with 1 MB of L2 cache.
 */

/*
 * The mod 255 op in mul is really expensive...
 *
 * We know that b <= (254 * 2), so there are only two cases: either return b,
 * or return b - 255.
 *
 * On a G4 this single optimization results in a 24% speedup for encrypt and
 * a 25% speedup for decrypt.
 */
static inline word8 mod255(word32 b)
{
	if(b >= 255) {
		b -= 255;
	}
	return b;
}
word8 mul(word8 a, word8 b) {
	/* multiply two elements of GF(2^m)
	 * needed for MixColumn and InvMixColumn
	 */
	if (a && b) return Alogtable[mod255(Logtable[a] + Logtable[b])];
	else return 0;
}
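
/*
 * Illustrative sketch, not part of the original source: the mulBy0x02,
 * mulBy0x03, mulBy0x0e, mulBy0x0b, mulBy0x0d and mulBy0x09 tables used by
 * the AES_MUL_BY_LOOKUP build could be generated offline with mul() above.
 * genMulTable is a hypothetical helper name; the block is left disabled.
 */
#if 0
static void genMulTable(word8 c, word8 table[256])
{
	unsigned i;

	for (i = 0; i < 256; i++) {
		/* table[i] = c * i in GF(2^8) */
		table[i] = mul(c, (word8)i);
	}
}
#endif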
#endif	/* !AES_MUL_BY_LOOKUP */
void KeyAddition(word8 a[4][MAXBC], word8 rk[4][MAXBC], word8 BC) {
	/* Exor corresponding text input and round key input bytes
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++) a[i][j] ^= rk[i][j];
}
void ShiftRow(word8 a[4][MAXBC], word8 d, word8 BC) {
	/* Row 0 remains unchanged
	 * The other three rows are shifted a variable amount
	 */
	word8 tmp[MAXBC];
	int i, j;

	for(i = 1; i < 4; i++) {
		for(j = 0; j < BC; j++) tmp[j] = a[i][(j + shifts[SC][i][d]) % BC];
		for(j = 0; j < BC; j++) a[i][j] = tmp[j];
	}
}
void Substitution(word8 a[4][MAXBC], const word8 box[256], word8 BC) {
	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++) a[i][j] = box[a[i][j]];
}
void MixColumn(word8 a[4][MAXBC], word8 BC) {
	/* Mix the four bytes of every column in a linear way
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
			#if	AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
			#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
			#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
	}
}
void InvMixColumn(word8 a[4][MAXBC], word8 BC) {
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
			#if	AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
			#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
			#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
	}
}
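
/*
 * Illustrative self-check sketch, not part of the original source: since
 * InvMixColumn is the inverse of MixColumn, a round trip must restore the
 * state. mixColumnInverseCheck is a hypothetical name; the block is left
 * disabled.
 */
#if 0
static int mixColumnInverseCheck(void)
{
	word8 s[4][MAXBC];
	word8 orig[4][MAXBC];
	word8 BC = 4;				/* 128-bit block */
	int i;

	for (i = 0; i < 4 * MAXBC; i++)
		((word8 *)s)[i] = (word8)i;		/* arbitrary test pattern */
	memmove(orig, s, sizeof(orig));

	MixColumn(s, BC);
	InvMixColumn(s, BC);
	return memcmp(orig, s, sizeof(orig));	/* 0 on success */
}
#endif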
int rijndaelKeySched (
	word8 k[4][MAXKC],
	int keyBits,
	int blockBits,
	word8 W[MAXROUNDS+1][4][MAXBC]) {

	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int KC, BC, ROUNDS;
	int i, j, t, rconpointer = 0;
	word8 tk[4][MAXKC];

	switch (keyBits) {
	case 128: KC = 4; break;
	case 192: KC = 6; break;
	case 256: KC = 8; break;
	default : return (-1);
	}

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}
	for(j = 0; j < KC; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;

	/* copy values into round key array */
	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
		for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];

	while (t < (ROUNDS+1)*BC) { /* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++)
			tk[i][0] ^= S[tk[(i+1)%4][KC-1]];
		tk[0][0] ^= rcon[rconpointer++];

		if (KC != 8)
			for(j = 1; j < KC; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
		else {
			for(j = 1; j < KC/2; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
			for(i = 0; i < 4; i++) tk[i][KC/2] ^= S[tk[i][KC/2 - 1]];
			for(j = KC/2 + 1; j < KC; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
		}

		/* copy values into round key array */
		for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
			for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];
	}

	return 0;
}
int rijndaelEncrypt (
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])

	/* Encryption of one block, general case.
	 */
{
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* begin with a key addition
	 */
	KeyAddition(a,rk[0],BC);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = 1; r < ROUNDS; r++) {
		Substitution(a,S,BC);
		ShiftRow(a,0,BC);
		MixColumn(a,BC);
		KeyAddition(a,rk[r],BC);
	}

	/* Last round is special: there is no MixColumn
	 */
	Substitution(a,S,BC);
	ShiftRow(a,0,BC);
	KeyAddition(a,rk[ROUNDS],BC);

	return 0;
}
int rijndaelDecrypt (
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 *             in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 *  inverse table of S)
	 * (the inverse of ShiftRow is ShiftRow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition(a,rk[ROUNDS],BC);
	Substitution(a,Si,BC);
	ShiftRow(a,1,BC);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = ROUNDS-1; r > 0; r--) {
		KeyAddition(a,rk[r],BC);
		InvMixColumn(a,BC);
		Substitution(a,Si,BC);
		ShiftRow(a,1,BC);
	}

	/* End with the extra key addition
	 */
	KeyAddition(a,rk[0],BC);

	return 0;
}
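
/*
 * Illustrative usage sketch, not part of the original source: schedule a
 * 128-bit key, encrypt one block in place, then decrypt it back. Key and
 * state values are arbitrary; roundTripExample is a hypothetical name and
 * the block is left disabled.
 */
#if 0
static int roundTripExample(void)
{
	word8 k[4][MAXKC] = {{ 0 }};		/* all-zero test key */
	word8 a[4][MAXBC] = {{ 0 }};		/* state, one block */
	word8 rk[MAXROUNDS+1][4][MAXBC];	/* round key schedule */

	a[0][0] = 0x32;						/* arbitrary plaintext byte */
	if (rijndaelKeySched(k, 128, 128, rk)) return -1;
	if (rijndaelEncrypt(a, 128, 128, rk)) return -1;
	if (rijndaelDecrypt(a, 128, 128, rk)) return -1;
	return (a[0][0] == 0x32) ? 0 : -1;	/* expect original plaintext */
}
#endif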
#if	!GLADMAN_AES_128_ENABLE

/*
 * All of these 128-bit-key-and-block routines require 32-bit word-aligned
 * char array pointers. The key schedule arrays are easy; they come from
 * keyInstance, which has a 4-byte-aligned element preceding the key schedule.
 * Others require manual alignment of a local variable by the caller.
 */
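
/*
 * Illustrative caller-side sketch, not part of the original source: one way
 * to get a word-aligned local state for the *128() routines, using the same
 * __attribute__((aligned(4))) directive as rotateWordLeft() below.
 * alignedCallExample is a hypothetical name; the block is left disabled.
 */
#if 0
static void alignedCallExample(
	const word8 *block,					/* arbitrary alignment */
	word8 rk[MAXROUNDS+1][4][MAXBC])	/* 4-byte aligned, per above */
{
	word8 state[4][BC_128_OPT] __attribute__((aligned(4)));
	int i;

	/* bytes enter the state column by column */
	for (i = 0; i < 4 * BC_128_OPT; i++)
		state[i % 4][i / 4] = block[i];
	rijndaelEncrypt128(state, rk);
}
#endif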
static inline void KeyAddition128(
	word8 a[4][BC_128_OPT],
	word8 rk[4][MAXBC]) {

	/* these casts are endian-independent */
	((word32 *)a)[0] ^= *((word32 *)(&rk[0]));
	((word32 *)a)[1] ^= *((word32 *)(&rk[1]));
	((word32 *)a)[2] ^= *((word32 *)(&rk[2]));
	((word32 *)a)[3] ^= *((word32 *)(&rk[3]));
}
static void Substitution128(
	word8 a[4][BC_128_OPT],
	const word8 box[256]) {

	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	/* still to be optimized - larger S boxes? */
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC_128_OPT; j++) {
			a[i][j] = box[a[i][j]];
		}
	}
}
#if	defined(__ppc__) && defined(__GNUC__)

static inline void rotateWordLeft(
	word8 *word,		// known to be word aligned
	unsigned rotCount)	// in bits
{
	word32 lword = *((word32 *)word);
	asm("rlwnm %0,%1,%2,0,31" : "=r"(lword) : "0"(lword), "r"(rotCount));
	*((word32 *)word) = lword;
}

#else
/*
 * Insert your machine/compiler dependent code here,
 * or just use this, which works on any platform and compiler
 * which supports the __attribute__((aligned(4))) directive.
 */
static void rotateWordLeft(
	word8 *word,		// known to be word aligned
	unsigned rotCount)	// in bits
{
	word8 tmp[BC_128_OPT] __attribute__((aligned(4)));
	unsigned bytes = rotCount / 8;

	tmp[0] = word[bytes     & (BC_128_OPT-1)];
	tmp[1] = word[(1+bytes) & (BC_128_OPT-1)];
	tmp[2] = word[(2+bytes) & (BC_128_OPT-1)];
	tmp[3] = word[(3+bytes) & (BC_128_OPT-1)];
	*((word32 *)word) = *((word32 *)tmp);
}
#endif
static inline void ShiftRow128(
	word8 a[4][BC_128_OPT],
	word8 d) {
	/* Row 0 remains unchanged
	 * The other three rows are shifted (actually rotated) a variable amount
	 */
	int i;

	for(i = 1; i < 4; i++) {
		rotateWordLeft(a[i], shifts128[i][d]);
	}
}
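
/*
 * Illustrative equivalence check, not part of the original source: for a
 * 128-bit block (BC = 4, so SC = 0), ShiftRow128 must agree with the general
 * ShiftRow above. shiftRow128Check is a hypothetical name; the block is
 * left disabled.
 */
#if 0
static int shiftRow128Check(word8 d)	/* d = 0 encrypt, 1 decrypt */
{
	word8 g[4][MAXBC] = {{ 0 }};
	word8 o[4][BC_128_OPT] __attribute__((aligned(4)));
	int i, j;

	for (i = 0; i < 4; i++)
		for (j = 0; j < BC_128_OPT; j++)
			g[i][j] = o[i][j] = (word8)(4*i + j);	/* arbitrary pattern */

	ShiftRow(g, d, 4);		/* general case */
	ShiftRow128(o, d);		/* optimized case */

	for (i = 0; i < 4; i++)
		for (j = 0; j < BC_128_OPT; j++)
			if (g[i][j] != o[i][j]) return -1;
	return 0;
}
#endif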
/*
 * The following two routines are where most of the time is spent in this
 * module. Further optimization would have to focus here.
 */
static void MixColumn128(word8 a[4][BC_128_OPT]) {
	/* Mix the four bytes of every column in a linear way
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
			#if	AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
			#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
			#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}
static void InvMixColumn128(word8 a[4][BC_128_OPT]) {
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
			#if	AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
			#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
			#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}
int rijndaelKeySched128 (
	word8 k[4][KC_128_OPT],
	word8 W[MAXROUNDS+1][4][MAXBC]) {

	/* Calculate the necessary round keys.
	 * Key and block size are fixed at 128 bits here, so the amount of
	 * round key material is known at compile time.
	 */
	int i, j, t, rconpointer = 0;
	word8 tk[4][KC_128_OPT];
	unsigned numSchedRows = (ROUNDS_128_OPT + 1) * BC_128_OPT;

	for(j = 0; j < KC_128_OPT; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;

	/* copy values into round key array */
	for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
		for(i = 0; i < 4; i++) {
			W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
		}
	}

	while (t < numSchedRows) {
		/* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++) {
			tk[i][0] ^= S[tk[(i+1)%4][KC_128_OPT-1]];
		}
		tk[0][0] ^= rcon[rconpointer++];

		for(j = 1; j < KC_128_OPT; j++) {
			for(i = 0; i < 4; i++) {
				tk[i][j] ^= tk[i][j-1];
			}
		}

		/* copy values into round key array */
		for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
			for(i = 0; i < 4; i++) {
				W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
			}
		}
	}

	return 0;
}
int rijndaelEncrypt128 (
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block.
	 */
	int r;

	/* begin with a key addition
	 */
	KeyAddition128(a,rk[0]);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = 1; r < ROUNDS_128_OPT; r++) {
		Substitution128(a,S);
		ShiftRow128(a,0);
		MixColumn128(a);
		KeyAddition128(a,rk[r]);
	}

	/* Last round is special: there is no MixColumn
	 */
	Substitution128(a,S);
	ShiftRow128(a,0);
	KeyAddition128(a,rk[ROUNDS_128_OPT]);

	return 0;
}
int rijndaelDecrypt128 (
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r;

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 *             in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 *  inverse table of S)
	 * (the inverse of ShiftRow is ShiftRow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition128(a,rk[ROUNDS_128_OPT]);
	Substitution128(a,Si);
	ShiftRow128(a,1);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = ROUNDS_128_OPT-1; r > 0; r--) {
		KeyAddition128(a,rk[r]);
		InvMixColumn128(a);
		Substitution128(a,Si);
		ShiftRow128(a,1);
	}

	/* End with the extra key addition
	 */
	KeyAddition128(a,rk[0]);

	return 0;
}

#endif	/* !GLADMAN_AES_128_ENABLE */