/*
 * Copyright (c) 2000-2001,2011-2012,2014 Apple Inc. All Rights Reserved.
 *
 * The contents of this file constitute Original Code as defined in and are
 * subject to the Apple Public Source License Version 1.2 (the 'License').
 * You may not use this file except in compliance with the License. Please obtain
 * a copy of the License at http://www.apple.com/publicsource and read it before
 * using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS
 * OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
 * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. Please see the License for the
 * specific language governing rights and limitations under the License.
 */
/* rijndael-alg-ref.c   v2.0   August '99
 * Reference ANSI C code
 * authors: Paulo Barreto
 *          Vincent Rijmen
 */
#include "rijndael-alg-ref.h"
#include <cspdebugging.h>

#define SC	((BC - 4) >> 1)

#include "boxes-ref.h"
static const word8 shifts[3][4][2] = {
	{ { 0, 0 },		/* row shift amounts (encrypt, decrypt) for BC = 4 */
	  { 1, 3 },
	  { 2, 2 },
	  { 3, 1 } },
	{ { 0, 0 },		/* BC = 6 */
	  { 1, 5 },
	  { 2, 4 },
	  { 3, 3 } },
	{ { 0, 0 },		/* BC = 8 */
	  { 1, 7 },
	  { 3, 5 },
	  { 4, 4 } }
};
#if		!GLADMAN_AES_128_ENABLE

/* 128 bit key/word shift table in bits */
static const word8 shifts128[4][2] = {
	{  0,  0 },
	{  8, 24 },
	{ 16, 16 },
	{ 24,  8 }
};

#endif	/* GLADMAN_AES_128_ENABLE */
#if		!AES_MUL_BY_LOOKUP
/*
 * Profiling measurements showed that the mul routine is where a large proportion of
 * the time is spent. Since the first argument to mul is always one of six
 * constants (2, 3, 0xe, etc.), we implement six 256-byte lookup tables to
 * do the multiplies. This eliminates the need for the log/antilog tables, so
 * it's only adding one kilobyte of const data. Throughput improvement for this
 * mod is a factor of 3.3 for encrypt and 4.1 for decrypt in the 128-bit optimized
 * case. Improvement for the general case (with a 256 bit key) is 1.46 for encrypt
 * and 1.88 for decrypt. (Decrypt wins more for this enhancement because
 * InvMixColumn does four muls, vs. 2 muls for MixColumn). Measurements taken
 * on a 500 MHz G4 with 1 MB of L2 cache.
 */
/*
 * The mod 255 op in mul is really expensive. Luckily we don't need a full
 * mod: we know that b <= (254 * 2), so there are only two cases. Either
 * return b, or return (b - 255).
 *
 * On a G4 this single optimization results in a 24% speedup for encrypt and
 * a 25% speedup for decrypt.
 */
static inline word8 mod255(word32 b)
{
	if(b >= 255) {
		b -= 255;
	}
	return b;
}
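
/*
 * Worked check: Logtable[] values lie in 0..254, so the sum passed to
 * mod255 is at most 508 = 254 * 2. One conditional subtraction therefore
 * always suffices: e.g. mod255(300) yields 300 - 255 = 45.
 */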
word8 mul(word8 a, word8 b) {
	/* multiply two elements of GF(2^m)
	 * needed for MixColumn and InvMixColumn
	 */
	if(a && b) return Alogtable[mod255(Logtable[a] + Logtable[b])];
	else return 0;
}
#endif	/* !AES_MUL_BY_LOOKUP */
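
/*
 * Illustrative sketch, not part of the original source: one way the six
 * 256-byte multiply tables described above (mulBy0x02 etc., shipped as
 * precomputed constants in boxes-ref.h) could be generated offline from
 * mul(). The helper name below is hypothetical.
 */
#if 0
static void genMulTable(word8 c, word8 table[256]) {
	unsigned x;
	for(x = 0; x < 256; x++) {
		table[x] = mul(c, (word8)x);	/* table[x] = c * x in GF(2^8) */
	}
	/* e.g. genMulTable(0x02, buf02); genMulTable(0x03, buf03); ... */
}
#endif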
void KeyAddition(word8 a[4][MAXBC], word8 rk[4][MAXBC], word8 BC) {
	/* Exor corresponding text input and round key input bytes
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++) a[i][j] ^= rk[i][j];
}
void ShiftRow(word8 a[4][MAXBC], word8 d, word8 BC) {
	/* Row 0 remains unchanged
	 * The other three rows are shifted a variable amount
	 */
	word8 tmp[MAXBC];
	int i, j;

	for(i = 1; i < 4; i++) {
		for(j = 0; j < BC; j++) tmp[j] = a[i][(j + shifts[SC][i][d]) % BC];
		for(j = 0; j < BC; j++) a[i][j] = tmp[j];
	}
}
void Substitution(word8 a[4][MAXBC], const word8 box[256], word8 BC) {
	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	for(i = 0; i < 4; i++)
		for(j = 0; j < BC; j++) a[i][j] = box[a[i][j]];
}
void MixColumn(word8 a[4][MAXBC], word8 BC) {
	/* Mix the four bytes of every column in a linear way
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
#if		AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
	}
}
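
/*
 * For reference, this implements the standard Rijndael MixColumn matrix:
 * each output column is this GF(2^8) matrix times the input column.
 *
 *   [ 02 03 01 01 ]
 *   [ 01 02 03 01 ]
 *   [ 01 01 02 03 ]
 *   [ 03 01 01 02 ]
 */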
void InvMixColumn(word8 a[4][MAXBC], word8 BC) {
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][MAXBC];
	int i, j;

	for(j = 0; j < BC; j++) {
		for(i = 0; i < 4; i++) {
#if		AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
#endif
		}
	}
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC; j++) a[i][j] = b[i][j];
	}
}
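
/*
 * For reference, this multiplies each column by the inverse of the
 * MixColumn matrix, again per the Rijndael specification:
 *
 *   [ 0e 0b 0d 09 ]
 *   [ 09 0e 0b 0d ]
 *   [ 0d 09 0e 0b ]
 *   [ 0b 0d 09 0e ]
 */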
int rijndaelKeySched (
	word8 k[4][MAXKC],
	int keyBits,
	int blockBits,
	word8 W[MAXROUNDS+1][4][MAXBC]) {

	/* Calculate the necessary round keys
	 * The number of calculations depends on keyBits and blockBits
	 */
	int KC, BC, ROUNDS;
	int i, j, t, rconpointer = 0;
	word8 tk[4][MAXKC];

	switch (keyBits) {
	case 128: KC = 4; break;
	case 192: KC = 6; break;
	case 256: KC = 8; break;
	default : return (-1);
	}

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	for(j = 0; j < KC; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;

	/* copy values into round key array */
	for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
		for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];

	while (t < (ROUNDS+1)*BC) { /* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++)
			tk[i][0] ^= S[tk[(i+1)%4][KC-1]];
		tk[0][0] ^= rcon[rconpointer++];

		if (KC != 8)
			for(j = 1; j < KC; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
		else {
			for(j = 1; j < KC/2; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
			for(i = 0; i < 4; i++) tk[i][KC/2] ^= S[tk[i][KC/2 - 1]];
			for(j = KC/2 + 1; j < KC; j++)
				for(i = 0; i < 4; i++) tk[i][j] ^= tk[i][j-1];
		}

		/* copy values into round key array */
		for(j = 0; (j < KC) && (t < (ROUNDS+1)*BC); j++, t++)
			for(i = 0; i < 4; i++) W[t / BC][i][t % BC] = tk[i][j];
	}

	return 0;
}
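
/*
 * Worked example: for keyBits = blockBits = 128, KC = BC = 4 and
 * ROUNDS = 10, so the loops above fill (ROUNDS+1)*BC = 44 columns of
 * round key material, i.e. 11 round keys of 16 bytes each.
 */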
int rijndaelEncrypt (
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block, general case.
	 */
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* begin with a key addition
	 */
	KeyAddition(a,rk[0],BC);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = 1; r < ROUNDS; r++) {
		Substitution(a,S,BC);
		ShiftRow(a,0,BC);
		MixColumn(a,BC);
		KeyAddition(a,rk[r],BC);
	}

	/* Last round is special: there is no MixColumn
	 */
	Substitution(a,S,BC);
	ShiftRow(a,0,BC);
	KeyAddition(a,rk[ROUNDS],BC);

	return 0;
}
int rijndaelDecrypt (
	word8 a[4][MAXBC],
	int keyBits,
	int blockBits,
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r, BC, ROUNDS;

	switch (blockBits) {
	case 128: BC = 4; break;
	case 192: BC = 6; break;
	case 256: BC = 8; break;
	default : return (-2);
	}

	switch (keyBits >= blockBits ? keyBits : blockBits) {
	case 128: ROUNDS = 10; break;
	case 192: ROUNDS = 12; break;
	case 256: ROUNDS = 14; break;
	default : return (-3); /* this cannot happen */
	}

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition(a,rk[ROUNDS],BC);
	Substitution(a,Si,BC);
	ShiftRow(a,1,BC);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = ROUNDS-1; r > 0; r--) {
		KeyAddition(a,rk[r],BC);
		InvMixColumn(a,BC);
		Substitution(a,Si,BC);
		ShiftRow(a,1,BC);
	}

	/* End with the extra key addition
	 */
	KeyAddition(a,rk[0],BC);

	return 0;
}
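
/*
 * Illustrative usage sketch, not part of the original source: scheduling a
 * 128-bit key and running one block through encrypt and decrypt. It assumes
 * the word8/MAXKC/MAXBC/MAXROUNDS definitions from rijndael-alg-ref.h; the
 * key and state arrays are indexed [row][column] as in the routines above,
 * and a real caller would fill them with actual key and plaintext bytes.
 */
#if 0
static int exampleOneBlock(void) {
	word8 k[4][MAXKC] = {{0}};			/* 128-bit key */
	word8 a[4][MAXBC] = {{0}};			/* one 128-bit block, processed in place */
	word8 W[MAXROUNDS+1][4][MAXBC];		/* round key schedule */

	if (rijndaelKeySched(k, 128, 128, W) != 0) return -1;
	if (rijndaelEncrypt(a, 128, 128, W) != 0) return -1;
	if (rijndaelDecrypt(a, 128, 128, W) != 0) return -1;	/* recovers the plaintext */
	return 0;
}
#endif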
#if		!GLADMAN_AES_128_ENABLE

/*
 * All of these 128-bit-key-and-block routines require 32-bit word-aligned
 * char array pointers. The key schedule arrays are easy; they come from
 * keyInstance which has a 4-byte-aligned element preceding the key schedule.
 * Others require manual alignment of a local variable by the caller.
 */
static inline void KeyAddition128(
	word8 a[4][BC_128_OPT],
	word8 rk[4][MAXBC]) {

	/* these casts are endian-independent */
	((word32 *)a)[0] ^= *((word32 *)(&rk[0]));
	((word32 *)a)[1] ^= *((word32 *)(&rk[1]));
	((word32 *)a)[2] ^= *((word32 *)(&rk[2]));
	((word32 *)a)[3] ^= *((word32 *)(&rk[3]));
}
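
/*
 * The casts are endian-independent because XOR operates bitwise: each byte
 * of the state word is XORed with the round key byte at the same address,
 * so the word-at-a-time form computes exactly what a byte loop would.
 */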
static void Substitution128(
	word8 a[4][BC_128_OPT],
	const word8 box[256]) {

	/* Replace every byte of the input by the byte at that place
	 * in the nonlinear S-box
	 */
	int i, j;

	/* still to be optimized - larger S boxes? */
	for(i = 0; i < 4; i++) {
		for(j = 0; j < BC_128_OPT; j++) {
			a[i][j] = box[a[i][j]];
		}
	}
}
#if		defined(__ppc__) && defined(__GNUC__)

static inline void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word32 lword = *((word32 *)word);
	asm("rlwnm %0,%1,%2,0,31" : "=r"(lword) : "0"(lword), "r"(rotCount));
	*((word32 *)word) = lword;
}

#else
/*
 * Insert your machine/compiler dependent code here,
 * or just use this, which works on any platform and compiler
 * which supports the __attribute__((aligned(4))) directive.
 */
static void rotateWordLeft(
	word8 *word,			// known to be word aligned
	unsigned rotCount)		// in bits
{
	word8 tmp[BC_128_OPT] __attribute__((aligned(4)));
	unsigned bytes = rotCount / 8;

	tmp[0] = word[bytes     & (BC_128_OPT-1)];
	tmp[1] = word[(1+bytes) & (BC_128_OPT-1)];
	tmp[2] = word[(2+bytes) & (BC_128_OPT-1)];
	tmp[3] = word[(3+bytes) & (BC_128_OPT-1)];
	*((word32 *)word) = *((word32 *)tmp);
}
#endif
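
/*
 * Worked example: with rotCount = 8, bytes = 1, so a word {w0,w1,w2,w3}
 * becomes {w1,w2,w3,w0}, i.e. a one-byte left rotation. The masking with
 * (BC_128_OPT-1) wraps the indices, which is valid because BC_128_OPT is
 * 4, a power of two.
 */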
static inline void ShiftRow128(
	word8 a[4][BC_128_OPT],
	word8 d) {

	/* Row 0 remains unchanged
	 * The other three rows are shifted (actually rotated) a variable amount
	 */
	int i;

	for(i = 1; i < 4; i++) {
		rotateWordLeft(a[i], shifts128[i][d]);
	}
}
/*
 * The following two routines are where most of the time is spent in this
 * module. Further optimization would have to focus here.
 */
static void MixColumn128(word8 a[4][BC_128_OPT]) {
	/* Mix the four bytes of every column in a linear way
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
#if		AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x02[a[i][j]]
				^ mulBy0x03[a[(i + 1) % 4][j]]
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#else
			b[i][j] = mul(2,a[i][j])
				^ mul(3,a[(i + 1) % 4][j])
				^ a[(i + 2) % 4][j]
				^ a[(i + 3) % 4][j];
#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}
static void InvMixColumn128(word8 a[4][BC_128_OPT]) {
	/* Mix the four bytes of every column in a linear way
	 * This is the opposite operation of MixColumn
	 */
	word8 b[4][BC_128_OPT];
	int i, j;

	for(j = 0; j < BC_128_OPT; j++) {
		for(i = 0; i < 4; i++) {
#if		AES_MUL_BY_LOOKUP
			b[i][j] = mulBy0x0e[a[i][j]]
				^ mulBy0x0b[a[(i + 1) % 4][j]]
				^ mulBy0x0d[a[(i + 2) % 4][j]]
				^ mulBy0x09[a[(i + 3) % 4][j]];
#else
			b[i][j] = mul(0xe,a[i][j])
				^ mul(0xb,a[(i + 1) % 4][j])
				^ mul(0xd,a[(i + 2) % 4][j])
				^ mul(0x9,a[(i + 3) % 4][j]);
#endif
		}
	}
	memmove(a, b, 4 * BC_128_OPT);
}
int rijndaelKeySched128 (
	word8 k[4][KC_128_OPT],
	word8 W[MAXROUNDS+1][4][MAXBC]) {

	/* Calculate the necessary round keys.
	 * Key and block size are fixed at 128 bits for this routine.
	 */
	int i, j, t, rconpointer = 0;
	word8 tk[4][KC_128_OPT];
	unsigned numSchedRows = (ROUNDS_128_OPT + 1) * BC_128_OPT;

	for(j = 0; j < KC_128_OPT; j++)
		for(i = 0; i < 4; i++)
			tk[i][j] = k[i][j];
	t = 0;

	/* copy values into round key array */
	for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
		for(i = 0; i < 4; i++) {
			W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
		}
	}

	while (t < numSchedRows) {
		/* while not enough round key material calculated */
		/* calculate new values */
		for(i = 0; i < 4; i++) {
			tk[i][0] ^= S[tk[(i+1)%4][KC_128_OPT-1]];
		}
		tk[0][0] ^= rcon[rconpointer++];

		for(j = 1; j < KC_128_OPT; j++) {
			for(i = 0; i < 4; i++) {
				tk[i][j] ^= tk[i][j-1];
			}
		}

		/* copy values into round key array */
		for(j = 0; (j < KC_128_OPT) && (t < numSchedRows); j++, t++) {
			for(i = 0; i < 4; i++) {
				W[t / BC_128_OPT][i][t % BC_128_OPT] = tk[i][j];
			}
		}
	}

	return 0;
}
int rijndaelEncrypt128 (
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	/* Encryption of one block.
	 */
	int r;

	/* begin with a key addition
	 */
	KeyAddition128(a,rk[0]);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = 1; r < ROUNDS_128_OPT; r++) {
		Substitution128(a,S);
		ShiftRow128(a,0);
		MixColumn128(a);
		KeyAddition128(a,rk[r]);
	}

	/* Last round is special: there is no MixColumn
	 */
	Substitution128(a,S);
	ShiftRow128(a,0);
	KeyAddition128(a,rk[ROUNDS_128_OPT]);

	return 0;
}
int rijndaelDecrypt128 (
	word8 a[4][BC_128_OPT],
	word8 rk[MAXROUNDS+1][4][MAXBC])
{
	int r;

	/* To decrypt: apply the inverse operations of the encrypt routine,
	 * in opposite order
	 *
	 * (KeyAddition is an involution: it's equal to its inverse)
	 * (the inverse of Substitution with table S is Substitution with the
	 * inverse table of S)
	 * (the inverse of Shiftrow is Shiftrow over a suitable distance)
	 */

	/* First the special round:
	 *   without InvMixColumn
	 *   with extra KeyAddition
	 */
	KeyAddition128(a,rk[ROUNDS_128_OPT]);
	Substitution128(a,Si);
	ShiftRow128(a,1);

	/* ROUNDS-1 ordinary rounds
	 */
	for(r = ROUNDS_128_OPT-1; r > 0; r--) {
		KeyAddition128(a,rk[r]);
		InvMixColumn128(a);
		Substitution128(a,Si);
		ShiftRow128(a,1);
	}

	/* End with the extra key addition
	 */
	KeyAddition128(a,rk[0]);

	return 0;
}
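
/*
 * Illustrative usage sketch for the 128-bit-optimized path, not part of
 * the original source. Per the alignment note above, local arrays passed
 * to these routines must be 4-byte aligned; the attribute below handles
 * that. A real caller would fill k and a with actual key/plaintext bytes.
 */
#if 0
static int exampleOneBlock128(void) {
	word8 k[4][KC_128_OPT] __attribute__((aligned(4))) = {{0}};
	word8 a[4][BC_128_OPT] __attribute__((aligned(4))) = {{0}};
	word8 W[MAXROUNDS+1][4][MAXBC];		/* round key schedule */

	if (rijndaelKeySched128(k, W) != 0) return -1;
	if (rijndaelEncrypt128(a, W) != 0) return -1;		/* encrypts in place */
	if (rijndaelDecrypt128(a, W) != 0) return -1;		/* recovers the plaintext */
	return 0;
}
#endif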
#endif	/* !GLADMAN_AES_128_ENABLE */