/*
 * Copyright (c) 2000-2001 Apple Computer, Inc. All Rights Reserved.
 *
 * The contents of this file constitute Original Code as defined in and are
 * subject to the Apple Public Source License Version 1.2 (the 'License').
 * You may not use this file except in compliance with the License. Please obtain
 * a copy of the License at http://www.apple.com/publicsource and read it before
 * using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
 * specific language governing rights and limitations under the License.
 */
/* rijndael-alg-ref.c   v2.0   August '99
 * Reference ANSI C code
 * authors: Paulo Barreto
 *          Vincent Rijmen
 *
 * PPC and 128-bit block optimization by Doug Mitchell May 2001.
 */

#include <string.h>		/* for memmove() */

#include "rijndael-alg-ref.h"
#include <cspdebugging.h>

#define SC	((BC - 4) >> 1)

#include "boxes-ref.h"
/* shift distances per row, indexed by [SC][row][d]; d = 0 encrypt, 1 decrypt */
static const word8 shifts[3][4][2] = {
	{ { 0, 0 },
	  { 1, 3 },
	  { 2, 2 },
	  { 3, 1 } },
	{ { 0, 0 },
	  { 1, 5 },
	  { 2, 4 },
	  { 3, 3 } },
	{ { 0, 0 },
	  { 1, 7 },
	  { 3, 5 },
	  { 4, 4 } }
};

#if		!GLADMAN_AES_128_ENABLE

/* 128 bit key/word shift table in bits */
static const word8 shifts128[4][2] = {
	{  0,  0 },
	{  8, 24 },
	{ 16, 16 },
	{ 24,  8 }
};

#endif	/* GLADMAN_AES_128_ENABLE */
#if		!AES_MUL_BY_LOOKUP
/*
 * Profiling measurements showed that the mul routine is where a large proportion of
 * the time is spent. Since the first argument to mul is always one of six
 * constants (2, 3, 0xe, etc.), we implement six 256-byte lookup tables to
 * do the multiplies. This eliminates the need for the log/antilog tables, so
 * it's only adding one kilobyte of const data. Throughput improvement for this
 * mod is a factor of 3.3 for encrypt and 4.1 for decrypt in the 128-bit optimized
 * case. Improvement for the general case (with a 256-bit key) is 1.46 for encrypt
 * and 1.88 for decrypt. (Decrypt wins more from this enhancement because
 * InvMixColumn does four muls, vs. two muls for MixColumn.) Measurements taken
 * on a 500 MHz G4 with 1 MB of L2 cache.
 */

/*
 * The mod 255 op in mul is really expensive.
 *
 * We know that b <= (254 * 2), so there are only two cases: either return b,
 * or return b - 255.
 *
 * On a G4 this single optimization results in a 24% speedup for encrypt and
 * a 25% speedup for decrypt.
 */
static inline word8 mod255(word32 b)
{
	/* b is at most 254 + 254, so at most one subtraction is needed */
	if(b >= 255) {
		b -= 255;
	}
	return b;
}
word8 mul(word8 a, word8 b) {
	/* multiply two elements of GF(2^m)
	 * needed for MixColumn and InvMixColumn
	 */
	if (a && b) return Alogtable[mod255(Logtable[a] + Logtable[b])];
	else return 0;
}
#endif	/* !AES_MUL_BY_LOOKUP */
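
/*
 * Illustrative sketch (added note, not in the original source): the
 * mulBy0xNN tables used when AES_MUL_BY_LOOKUP is set (presumably provided
 * by boxes-ref.h) could be generated offline from the log/antilog mul()
 * above. The helper name below is hypothetical and never compiled.
 */
#if 0
static void genMulTable(word8 c, word8 table[256]) {
	unsigned x;

	table[0] = 0;			/* mul() treats a zero operand specially */
	for(x = 1; x < 256; x++) {
		table[x] = mul(c, (word8)x);
	}
}
/* e.g. genMulTable(0x02, table2); ... repeat for 3, 9, 0xb, 0xd, 0xe */
#endif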
void KeyAddition(word8 a[4][MAXBC], word8 rk[4][MAXBC], word8 BC) {
	/* Exor corresponding text input and round key input bytes
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++) a[i][j] ^= rk[i][j];
}
void ShiftRow(word8 a[4][MAXBC], word8 d, word8 BC) {
	/* Row 0 remains unchanged
	 * The other three rows are shifted a variable amount
	 * (d selects the encrypt or decrypt shift distances)
	 */
	word8 tmp[MAXBC];
	int i, j;

	for(i = 1; i < 4; i++) {
		for(j = 0; j < BC; j++) tmp[j] = a[i][(j + shifts[SC][i][d]) % BC];
		for(j = 0; j < BC; j++) a[i][j] = tmp[j];
	}
}
void Substitution(word8 a[4][MAXBC], const word8 box[256], word8 BC) {
	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++) a[i][j] = box[a[i][j]];
}
void MixColumn(word8 a[4][MAXBC], word8 BC) {
	/* Mix the four bytes of every column in a linear way
	 * (coefficients 2, 3, 1, 1 in GF(2^8))
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
#if		AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
	}
}
void InvMixColumn(word8 a[4][MAXBC], word8 BC) {
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
#if		AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
	}
}
int rijndaelKeySched (
	word8 k[4][MAXKC],
	int keyBits,
	int blockBits,
	word8 W[MAXROUNDS+1][4][MAXBC]) {

	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int KC, BC, ROUNDS;
	int i, j, t, rconpointer = 0;
	word8 tk[4][MAXKC];

	switch (keyBits) {
	case 128: KC = 4; break;
	case 192: KC = 6; break;
	case 256: KC = 8; break;
	default : return (-1);
	}

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	for(j = 0; j < KC; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;

	/* copy values into round key array */
	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
		for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];

	while (t < (ROUNDS+1)*BC) { /* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++)
			tk[i][0] ^= S[tk[(i+1)%4][KC-1]];
		tk[0][0] ^= rcon[rconpointer++];

		if (KC != 8) {
			for(j = 1; j < KC; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
		}
		else {
			/* 256-bit keys apply an extra S-box step mid-word */
			for(j = 1; j < KC/2; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
			for(i = 0; i < 4; i++) tk[i][KC/2] ^= S[tk[i][KC/2 - 1]];
			for(j = KC/2 + 1; j < KC; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
		}

		/* copy values into round key array */
		for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
			for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];
	}

	return 0;
}
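
/*
 * Worked example (added note, not in the original source): for
 * keyBits = blockBits = 128, KC = BC = 4 and ROUNDS = 10, so the schedule
 * fills (ROUNDS+1)*BC = 44 columns of W -- eleven 4x4 round keys, with the
 * initial copy providing 4 columns and each while-loop pass 4 more.
 */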
int rijndaelEncrypt (
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block, general case.
	 */
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* begin with a key addition
	 */
	KeyAddition(a,rk[0],BC);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = 1; r < ROUNDS; r++) {
		Substitution(a,S,BC);
		ShiftRow(a,0,BC);
		MixColumn(a,BC);
		KeyAddition(a,rk[r],BC);
	}

	/* Last round is special: there is no MixColumn
	 */
	Substitution(a,S,BC);
	ShiftRow(a,0,BC);
	KeyAddition(a,rk[ROUNDS],BC);

	return 0;
}
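
/*
 * Usage sketch (illustrative, not in the original source; buffer contents
 * hypothetical): expand a 128-bit key, then encrypt one 128-bit block in
 * place with the general-case routine.
 */
#if 0
static void exampleEncryptOneBlock(void) {
	word8 k[4][MAXKC];              /* key bytes, k[row][column] */
	word8 a[4][MAXBC];              /* plaintext block, same layout */
	word8 W[MAXROUNDS+1][4][MAXBC]; /* expanded round keys */

	/* ... fill k[][] and a[][] ... */
	if (rijndaelKeySched(k, 128, 128, W) != 0) {
		return;                     /* unsupported key/block size */
	}
	rijndaelEncrypt(a, 128, 128, W); /* a[][] now holds the ciphertext */
}
#endif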
int rijndaelDecrypt (
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of ShiftRow is ShiftRow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition(a,rk[ROUNDS],BC);
	Substitution(a,Si,BC);
	ShiftRow(a,1,BC);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = ROUNDS-1; r > 0; r--) {
		KeyAddition(a,rk[r],BC);
		InvMixColumn(a,BC);
		Substitution(a,Si,BC);
		ShiftRow(a,1,BC);
	}

	/* End with the extra key addition
	 */
	KeyAddition(a,rk[0],BC);

	return 0;
}
#if		!GLADMAN_AES_128_ENABLE

/*
 * All of these 128-bit-key-and-block routines require 32-bit word-aligned
 * char array pointers. The key schedule arrays are easy; they come from
 * keyInstance, which has a 4-byte-aligned element preceding the key schedule.
 * Others require manual alignment of a local variable by the caller.
 */

static inline void KeyAddition128(
	word8 a[4][BC_128_OPT],
	word8 rk[4][MAXBC]) {

	/* these casts are endian-independent */
	((word32 *)a)[0] ^= *((word32 *)(&rk[0]));
	((word32 *)a)[1] ^= *((word32 *)(&rk[1]));
	((word32 *)a)[2] ^= *((word32 *)(&rk[2]));
	((word32 *)a)[3] ^= *((word32 *)(&rk[3]));
}

static void Substitution128(
	word8 a[4][BC_128_OPT],
	const word8 box[256]) {

	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	/* still to be optimized - larger S boxes? */
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC_128_OPT; j++) {
			a[i][j] = box[a[i][j]];
		}
	}
}
#if defined(__ppc__) && defined(__GNUC__)

static inline void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word32 lword = *((word32 *)word);
	asm("rlwnm %0,%1,%2,0,31" : "=r"(lword) : "0"(lword), "r"(rotCount));
	*((word32 *)word) = lword;
}

#else

/*
 * Insert your machine/compiler dependent code here,
 * or just use this, which works on any platform and compiler
 * which supports the __attribute__((aligned(4))) directive.
 */
static void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word8 tmp[BC_128_OPT] __attribute__((aligned(4)));
	unsigned bytes = rotCount / 8;

	tmp[0] = word[ bytes    & (BC_128_OPT-1)];
	tmp[1] = word[(1+bytes) & (BC_128_OPT-1)];
	tmp[2] = word[(2+bytes) & (BC_128_OPT-1)];
	tmp[3] = word[(3+bytes) & (BC_128_OPT-1)];
	*((word32 *)word) = *((word32 *)tmp);
}

#endif	/* __ppc__ && __GNUC__ */
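
/*
 * Illustrative check (added note, not in the original source): with the
 * portable version above, rotating the aligned word {1,2,3,4} left by 8
 * bits yields {2,3,4,1}.
 */
#if 0
static void rotateWordLeftExample(void) {
	word8 w[BC_128_OPT] __attribute__((aligned(4))) = { 1, 2, 3, 4 };
	rotateWordLeft(w, 8);	/* w is now { 2, 3, 4, 1 } */
}
#endif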
static inline void ShiftRow128(
	word8 a[4][BC_128_OPT],
	word8 d) {

	/* Row 0 remains unchanged
	 * The other three rows are shifted (actually rotated) a variable amount
	 */
	int i;

	for(i = 1; i < 4; i++) {
		rotateWordLeft(a[i], shifts128[i][d]);
	}
}
/*
 * The following two routines are where most of the time is spent in this
 * module. Further optimization would have to focus here.
 */
static void MixColumn128(word8 a[4][BC_128_OPT]) {
	/* Mix the four bytes of every column in a linear way
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
#if		AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}
static void InvMixColumn128(word8 a[4][BC_128_OPT]) {
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
#if		AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}
int rijndaelKeySched128 (
	word8 k[4][KC_128_OPT],
	word8 W[MAXROUNDS+1][4][MAXBC]) {

	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int i, j, t, rconpointer = 0;
	word8 tk[4][KC_128_OPT];
	unsigned numSchedRows = (ROUNDS_128_OPT + 1) * BC_128_OPT;

	for(j = 0; j < KC_128_OPT; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;

	/* copy values into round key array */
	for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
		for(i = 0; i < 4; i++) {
			W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
		}
	}

	while (t < numSchedRows) {
		/* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++) {
			tk[i][0] ^= S[tk[(i+1)%4][KC_128_OPT-1]];
		}
		tk[0][0] ^= rcon[rconpointer++];

		for(j = 1; j < KC_128_OPT; j++) {
			for(i = 0; i < 4; i++) {
				tk[i][j] ^= tk[i][j-1];
			}
		}

		/* copy values into round key array */
		for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
			for(i = 0; i < 4; i++) {
				W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
			}
		}
	}

	return 0;
}
int rijndaelEncrypt128 (
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block.
	 */
	int r;

	/* begin with a key addition
	 */
	KeyAddition128(a,rk[0]);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = 1; r < ROUNDS_128_OPT; r++) {
		Substitution128(a,S);
		ShiftRow128(a,0);
		MixColumn128(a);
		KeyAddition128(a,rk[r]);
	}

	/* Last round is special: there is no MixColumn
	 */
	Substitution128(a,S);
	ShiftRow128(a,0);
	KeyAddition128(a,rk[ROUNDS_128_OPT]);

	return 0;
}
int rijndaelDecrypt128 (
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r;

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of ShiftRow is ShiftRow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition128(a,rk[ROUNDS_128_OPT]);
	Substitution128(a,Si);
	ShiftRow128(a,1);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = ROUNDS_128_OPT-1; r > 0; r--) {
		KeyAddition128(a,rk[r]);
		InvMixColumn128(a);
		Substitution128(a,Si);
		ShiftRow128(a,1);
	}

	/* End with the extra key addition
	 */
	KeyAddition128(a,rk[0]);

	return 0;
}
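
/*
 * Usage sketch (illustrative, not in the original source): the 128-bit
 * optimized entry points require 4-byte-aligned state, which a caller can
 * guarantee with the same aligned attribute used by rotateWordLeft above.
 */
#if 0
static void example128(word8 rk[MAXROUNDS+1][4][MAXBC]) {
	word8 block[4][BC_128_OPT] __attribute__((aligned(4)));

	/* ... fill block[][] with plaintext; rk comes from rijndaelKeySched128 ... */
	rijndaelEncrypt128(block, rk);
	rijndaelDecrypt128(block, rk);	/* recovers the original plaintext */
}
#endif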
#endif	/* !GLADMAN_AES_128_ENABLE */