/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 These subroutines implement multiple block AES modes for ECB, CBC, CFB,
 OFB and CTR encryption. The code provides support for the VIA Advanced
 Cryptography Engine (ACE).

 NOTE: In the following subroutines, the AES contexts (ctx) must be
 16-byte aligned if VIA ACE is being used.
*/

/* ----------------------------------------------------------------------------------------------------------------

    aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

    For simplicity, I am assuming all variables are of a 128-bit data type.

    aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
    {
        while(num_blk--) {
            *iv ^= *ibuf++;
            aes_encrypt(iv, iv, ctx);
            *obuf++ = *iv;
        }
        return 0;
    }

    The following is an implementation of this function using the Intel AESNI instructions.
    This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
    Developers should instead call _aes_encrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and
    branches to this AESNI-based function only if it detects that AESNI is available.
    Calling this function blindly will surely cause a CRASH on systems with no AESNI support.

    Note that each block starts with *iv, which is the output of the previous block. Therefore, the CBC blocks
    are serially chained. This prevents us from arranging several blocks for encryption in parallel.

 ----------------------------------------------------------------------------------------------------------------*/
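
/* ----------------------------------------------------------------------------------------------------------------

    For reference only: a minimal C sketch (not part of this file's build) of the same serially-chained CBC
    encryption written with the AESNI intrinsics instead of hand-written assembly. The function name and the
    assumption that the 11 AES-128 round keys are laid out as 11 consecutive 16-byte values are illustrative,
    not taken from aes_modes.c.

    #include <wmmintrin.h>      // AESNI intrinsics; compile with -maes

    static void cbc_encrypt_aes128_sketch(const __m128i *ibuf, __m128i *iv, int num_blk,
                                          __m128i *obuf, const __m128i key[11])
    {
        __m128i v = _mm_loadu_si128(iv);
        while (num_blk--) {
            v = _mm_xor_si128(v, _mm_loadu_si128(ibuf++));  // *iv ^= *ibuf++
            v = _mm_xor_si128(v, key[0]);                   // round 0 (AddRoundKey)
            for (int r = 1; r < 10; ++r)
                v = _mm_aesenc_si128(v, key[r]);            // rounds 1..9
            v = _mm_aesenclast_si128(v, key[10]);           // final round
            _mm_storeu_si128(obuf++, v);                    // *obuf++ = *iv
        }
        _mm_storeu_si128(iv, v);                            // write back the chaining value
    }

    Note how the xor with the next plaintext block must wait for the previous aesenclast: this is the serial
    chain that the assembly below cannot break.

 ----------------------------------------------------------------------------------------------------------------*/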

    .text
    .align  4,0x90
    .globl  _aes_encrypt_cbc_hw
_aes_encrypt_cbc_hw:

    // push/save registers for local use
#if defined __i386__

    push    %ebp
    movl    %esp, %ebp
    push    %ebx
    push    %edi

#define sp  %esp

#else // __x86_64__

    push    %rbp
    mov     %rsp, %rbp
    push    %rbx
    push    %r13
    push    %r14
    push    %r15

#define sp  %rsp

#endif

    // if this is kernel code, need to save used xmm registers
#ifdef KERNEL

#if defined __i386__
    sub     $(8*16), %esp           // for possible xmm0-xmm7 save/restore
#else
    sub     $(16*16), %rsp          // xmm0-xmm15 save/restore
#endif

    movaps  %xmm0, (sp)
    movaps  %xmm1, 16(sp)
    movaps  %xmm2, 32(sp)
    movaps  %xmm3, 48(sp)
    movaps  %xmm4, 64(sp)
    movaps  %xmm5, 80(sp)
    movaps  %xmm6, 96(sp)
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)
#endif // __x86_64__

#endif // KERNEL

#define iv  %xmm0

#ifdef __i386__

    mov     12(%ebp), %eax          // in_iv
    mov     24(%ebp), %edx          // ctx
    movups  (%eax), iv              // iv = in_iv
    mov     8(%ebp), %ebx           // ibuf
    mov     16(%ebp), %ecx          // num_blk
    mov     20(%ebp), %edi          // obuf

#define ibuf    %ebx
#define obuf    %edi
#define num_blk %ecx
#define ctx     %edx

#else

    mov     %rdi, %rbx              // ibuf
    movups  (%rsi), iv              // iv = in_iv
    mov     %rdx, %r13              // num_blk
    mov     %rcx, %r14              // obuf
    mov     %r8, %r15               // ctx

#define ibuf    %rbx
#define num_blk %r13d
#define obuf    %r14
#define ctx     %r15

#endif

    mov     240(ctx), %eax          // key length field: 160/192/224 for aes-128/192/256
    cmp     $160, %eax              // aes-128 encrypt ?
    je      L_encrypt_128
    cmp     $192, %eax              // aes-192 encrypt ?
    je      L_encrypt_192
    cmp     $224, %eax              // aes-256 encrypt ?
    je      L_encrypt_256
    mov     $-1, %eax               // return error
    jmp     L_error

    //
    // aes-128 encrypt_cbc operation, up to L_HW_cbc_done
    //

L_encrypt_128:

    cmp     $1, num_blk             // check the number of blocks
    jl      L_HW_cbc_done           // if it is less than 1, nothing to do

    movups  (ctx), %xmm2            // key0
    movups  16(ctx), %xmm3          // key1
    movups  32(ctx), %xmm4          // key2
    movups  48(ctx), %xmm5          // key3
    movups  64(ctx), %xmm6          // key4
    movups  80(ctx), %xmm7          // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8          // key6
    movups  112(ctx), %xmm9         // key7
    movups  128(ctx), %xmm10        // key8
    movups  144(ctx), %xmm11        // key9
    movups  160(ctx), %xmm12        // keyA
#endif

    // while (num_blk--) {
    //     *iv ^= *ibuf++;
    //     aes_encrypt(iv, iv, ctx);
    //     *obuf++ = *iv;
    // }
0:
    movups  (ibuf), %xmm1           // *ibuf
    pxor    %xmm2, iv               // 1st instruction inside aes_encrypt
    pxor    %xmm1, iv               // *iv ^= *ibuf

    // finishing up the rest of aes_encrypt
    aesenc  %xmm3, iv
    aesenc  %xmm4, iv
    aesenc  %xmm5, iv
    aesenc  %xmm6, iv
    aesenc  %xmm7, iv
#if defined __x86_64__
    aesenc  %xmm8, iv
    aesenc  %xmm9, iv
    aesenc  %xmm10, iv
    aesenc  %xmm11, iv
    aesenclast %xmm12, iv
#else
    movups  96(ctx), %xmm1          // key6
    aesenc  %xmm1, iv
    movups  112(ctx), %xmm1         // key7
    aesenc  %xmm1, iv
    movups  128(ctx), %xmm1         // key8
    aesenc  %xmm1, iv
    movups  144(ctx), %xmm1         // key9
    aesenc  %xmm1, iv
    movups  160(ctx), %xmm1         // keyA
    aesenclast %xmm1, iv
#endif

    movups  iv, (obuf)              // *obuf = *iv;
    add     $16, obuf               // obuf++;
    add     $16, ibuf               // ibuf++;
    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)

L_HW_cbc_done:

    xor     %eax, %eax              // to return CRYPT_OK

L_error:

    // if kernel, restore xmm registers
#ifdef KERNEL
    movaps  0(sp), %xmm0
    movaps  16(sp), %xmm1
    movaps  32(sp), %xmm2
    movaps  48(sp), %xmm3
    movaps  64(sp), %xmm4
    movaps  80(sp), %xmm5
    movaps  96(sp), %xmm6
    movaps  112(sp), %xmm7
#if defined __x86_64__
    movaps  16*8(sp), %xmm8
    movaps  16*9(sp), %xmm9
    movaps  16*10(sp), %xmm10
    movaps  16*11(sp), %xmm11
    movaps  16*12(sp), %xmm12
    movaps  16*13(sp), %xmm13
    movaps  16*14(sp), %xmm14
    movaps  16*15(sp), %xmm15
#endif // __x86_64__
#endif // KERNEL

    // release used stack memory, restore used callee-saved registers, and return
#if defined __i386__
#ifdef KERNEL
    add     $(8*16), %esp
#endif
    pop     %edi
    pop     %ebx
#else
#ifdef KERNEL
    add     $(16*16), %rsp
#endif
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %rbx
#endif
    leave
    ret

    //
    // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_encrypt_192:

    cmp     $1, num_blk             // check the number of blocks
    jl      L_HW_cbc_done           // if it is less than 1, nothing to do

    movups  (ctx), %xmm2            // key0
    movups  16(ctx), %xmm3          // key1
    movups  32(ctx), %xmm4          // key2
    movups  48(ctx), %xmm5          // key3
    movups  64(ctx), %xmm6          // key4
    movups  80(ctx), %xmm7          // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8          // key6
    movups  112(ctx), %xmm9         // key7
    movups  128(ctx), %xmm10        // key8
    movups  144(ctx), %xmm11        // key9
    movups  160(ctx), %xmm12        // keyA
    movups  176(ctx), %xmm13        // keyB
    movups  192(ctx), %xmm14        // keyC
#endif

    // while (num_blk--) {
    //     *iv ^= *ibuf++;
    //     aes_encrypt(iv, iv, ctx);
    //     *obuf++ = *iv;
    // }
0:
    movups  (ibuf), %xmm1           // *ibuf
    pxor    %xmm1, iv               // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);

    pxor    %xmm2, iv
    aesenc  %xmm3, iv
    aesenc  %xmm4, iv
    aesenc  %xmm5, iv
    aesenc  %xmm6, iv
    aesenc  %xmm7, iv
#if defined __x86_64__
    aesenc  %xmm8, iv
    aesenc  %xmm9, iv
    aesenc  %xmm10, iv
    aesenc  %xmm11, iv
    aesenc  %xmm12, iv
    aesenc  %xmm13, iv
    aesenclast %xmm14, iv
#else
    movups  96(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  112(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  128(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  144(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  160(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  176(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  192(ctx), %xmm1
    aesenclast %xmm1, iv
#endif

    movups  iv, (obuf)              // *obuf = *iv;
    add     $16, ibuf               // ibuf++
    add     $16, obuf               // obuf++

    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done           // share the common exit code

    //
    // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_encrypt_256:

    cmp     $1, num_blk             // check the number of blocks
    jl      L_HW_cbc_done           // if it is less than 1, nothing to do

    movups  (ctx), %xmm2            // key0
    movups  16(ctx), %xmm3          // key1
    movups  32(ctx), %xmm4          // key2
    movups  48(ctx), %xmm5          // key3
    movups  64(ctx), %xmm6          // key4
    movups  80(ctx), %xmm7          // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8          // key6
    movups  112(ctx), %xmm9         // key7
    movups  128(ctx), %xmm10        // key8
    movups  144(ctx), %xmm11        // key9
    movups  160(ctx), %xmm12        // keyA
    movups  176(ctx), %xmm13        // keyB
    movups  192(ctx), %xmm14        // keyC
    movups  208(ctx), %xmm15        // keyD
    // movups 224(ctx), %xmm1       // keyE
#endif

    // while (num_blk--) {
    //     *iv ^= *ibuf++;
    //     aes_encrypt(iv, iv, ctx);
    //     *obuf++ = *iv;
    // }
0:
    movups  (ibuf), %xmm1           // *ibuf
    pxor    %xmm1, iv               // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);
    pxor    %xmm2, iv
    aesenc  %xmm3, iv
    aesenc  %xmm4, iv
    aesenc  %xmm5, iv
    aesenc  %xmm6, iv
    aesenc  %xmm7, iv
#if defined __x86_64__
    movups  224(ctx), %xmm1         // keyE
    aesenc  %xmm8, iv
    aesenc  %xmm9, iv
    aesenc  %xmm10, iv
    aesenc  %xmm11, iv
    aesenc  %xmm12, iv
    aesenc  %xmm13, iv
    aesenc  %xmm14, iv
    aesenc  %xmm15, iv
    aesenclast %xmm1, iv
#else
    movups  96(ctx), %xmm1          // key6
    aesenc  %xmm1, iv
    movups  112(ctx), %xmm1         // key7
    aesenc  %xmm1, iv
    movups  128(ctx), %xmm1         // key8
    aesenc  %xmm1, iv
    movups  144(ctx), %xmm1         // key9
    aesenc  %xmm1, iv
    movups  160(ctx), %xmm1         // keyA
    aesenc  %xmm1, iv
    movups  176(ctx), %xmm1         // keyB
    aesenc  %xmm1, iv
    movups  192(ctx), %xmm1         // keyC
    aesenc  %xmm1, iv
    movups  208(ctx), %xmm1         // keyD
    aesenc  %xmm1, iv
    movups  224(ctx), %xmm1         // keyE
    aesenclast %xmm1, iv
#endif

    movups  iv, (obuf)              // *obuf = *iv;
    add     $16, ibuf               // ibuf++
    add     $16, obuf               // obuf++

    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done           // share the common exit code



    //
    // --------- END of aes_encrypt_cbc_hw -------------------
    //


/* ----------------------------------------------------------------------------------------------------------------

    aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s):

    For simplicity, I am assuming all variables are of a 128-bit data type.

    aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
    {
        while(num_blk--) {
            aes_decrypt(ibuf, obuf, ctx);
            *obuf++ ^= *iv;
            *iv = *ibuf++;
        }
        return 0;
    }

    The following is an implementation of this function using the Intel AESNI instructions.
    This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
    Developers should instead call _aes_decrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and
    branches to this AESNI-based function only if it detects that AESNI is available.
    Calling this function blindly will surely cause a CRASH on systems with no AESNI support.

    Note that, unlike encryption, the decryption of one block does not depend on any other block.
    This gives us the opportunity to arrange several aes_decrypt operations in parallel to speed up the code.
    This is equivalent to what is described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
    The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc.

    Example C code packing 4 blocks per iteration is shown as follows:

        while ((num_blk-=4)>=0) {

            // the following 4 calls can be interleaved to exploit parallelism
            aes_decrypt(ibuf, obuf, ctx);
            aes_decrypt(ibuf+1, obuf+1, ctx);
            aes_decrypt(ibuf+2, obuf+2, ctx);
            aes_decrypt(ibuf+3, obuf+3, ctx);

            obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
            *iv = ibuf[3]; ibuf += 4; obuf += 4;
        }
        num_blk += 4;

 ----------------------------------------------------------------------------------------------------------------*/
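
/* ----------------------------------------------------------------------------------------------------------------

    For reference only: a minimal C sketch (not part of this file's build) of the 4-blocks-per-iteration idea
    written with the AESNI intrinsics. It assumes an AES-128 decryption key schedule dkey[0..10] already stored
    in the order in which the rounds are applied; that layout is an assumption for the sketch and differs from
    the descending byte offsets used by the assembly below. Residual blocks (num_blk % 4) are left to a scalar
    tail loop, as in the assembly.

    #include <wmmintrin.h>      // AESNI intrinsics; compile with -maes

    static void cbc_decrypt_aes128_by4_sketch(const __m128i *ibuf, __m128i *iv, int num_blk,
                                              __m128i *obuf, const __m128i dkey[11])
    {
        __m128i v = _mm_loadu_si128(iv);
        while ((num_blk -= 4) >= 0) {
            __m128i b0 = _mm_xor_si128(_mm_loadu_si128(ibuf+0), dkey[0]);
            __m128i b1 = _mm_xor_si128(_mm_loadu_si128(ibuf+1), dkey[0]);
            __m128i b2 = _mm_xor_si128(_mm_loadu_si128(ibuf+2), dkey[0]);
            __m128i b3 = _mm_xor_si128(_mm_loadu_si128(ibuf+3), dkey[0]);
            for (int r = 1; r < 10; ++r) {      // 4 independent streams keep the aes unit busy
                b0 = _mm_aesdec_si128(b0, dkey[r]);
                b1 = _mm_aesdec_si128(b1, dkey[r]);
                b2 = _mm_aesdec_si128(b2, dkey[r]);
                b3 = _mm_aesdec_si128(b3, dkey[r]);
            }
            b0 = _mm_aesdeclast_si128(b0, dkey[10]);
            b1 = _mm_aesdeclast_si128(b1, dkey[10]);
            b2 = _mm_aesdeclast_si128(b2, dkey[10]);
            b3 = _mm_aesdeclast_si128(b3, dkey[10]);
            _mm_storeu_si128(obuf+0, _mm_xor_si128(b0, v));                         // obuf[0] ^= *iv
            _mm_storeu_si128(obuf+1, _mm_xor_si128(b1, _mm_loadu_si128(ibuf+0)));   // obuf[1] ^= ibuf[0]
            _mm_storeu_si128(obuf+2, _mm_xor_si128(b2, _mm_loadu_si128(ibuf+1)));   // obuf[2] ^= ibuf[1]
            _mm_storeu_si128(obuf+3, _mm_xor_si128(b3, _mm_loadu_si128(ibuf+2)));   // obuf[3] ^= ibuf[2]
            v = _mm_loadu_si128(ibuf+3);                                            // *iv = ibuf[3]
            ibuf += 4; obuf += 4;
        }
        num_blk += 4;                           // 0..3 blocks remain for the scalar tail (omitted here)
        _mm_storeu_si128(iv, v);
    }

 ----------------------------------------------------------------------------------------------------------------*/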

    .text
    .align  4,0x90
    .globl  _aes_decrypt_cbc_hw
_aes_decrypt_cbc_hw:

    // push/save registers for local use
#if defined __i386__

    push    %ebp
    movl    %esp, %ebp
    push    %ebx                    // ibuf
    push    %edi                    // obuf

#define sp  %esp

#else // __x86_64__

    push    %rbp
    mov     %rsp, %rbp
    push    %rbx
    push    %r13
    push    %r14
    push    %r15

#define sp  %rsp

#endif


    // if kernel, allocate stack space to save xmm registers
#ifdef KERNEL
#if defined __i386__
    sub     $(8*16), %esp
#else
    sub     $(16*16), %rsp
#endif
    movaps  %xmm0, (sp)
    movaps  %xmm1, 16(sp)
    movaps  %xmm2, 32(sp)
    movaps  %xmm3, 48(sp)
    movaps  %xmm4, 64(sp)
    movaps  %xmm5, 80(sp)
    movaps  %xmm6, 96(sp)
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)
#endif // __x86_64__
#endif

#undef iv
#define iv  %xmm0

#if defined __i386__
    mov     12(%ebp), %eax          // in_iv
    mov     24(%ebp), %edx          // ctx
    movups  (%eax), iv              // iv = in_iv
    mov     8(%ebp), %ebx           // ibuf
    mov     16(%ebp), %ecx          // num_blk
    mov     20(%ebp), %edi          // obuf

#define ibuf    %ebx
#define obuf    %edi
#define num_blk %ecx
#define ctx     %edx

#else // __x86_64__, rdi/rsi/rdx/rcx/r8

    mov     %rdi, %rbx              // ibuf
    movups  (%rsi), iv              // iv = in_iv
    mov     %rdx, %r13              // num_blk
    mov     %rcx, %r14              // obuf
    mov     %r8, %r15               // ctx

#define ibuf    %rbx
#define num_blk %r13d
#define obuf    %r14
#define ctx     %r15

#endif

    mov     240(ctx), %eax          // key length field: 160/192/224 for aes-128/192/256
    cmp     $160, %eax              // aes-128 decrypt
    je      L_decrypt_128
    cmp     $192, %eax              // aes-192 decrypt
    je      L_decrypt_192
    cmp     $224, %eax              // aes-256 decrypt
    je      L_decrypt_256

    mov     $-1, %eax               // wrong aes length, return -1
    jmp     L_error                 // early exit due to wrong aes length


    //
    // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_decrypt_128:

    cmp     $1, num_blk
    jl      L_HW_cbc_done           // if num_blk < 1, early return

    // aes-128 decrypt expanded keys
    movups  160(ctx), %xmm3
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#if defined __x86_64__
    movups  80(ctx), %xmm8
    movups  64(ctx), %xmm9
    movups  48(ctx), %xmm10
    movups  32(ctx), %xmm11
    movups  16(ctx), %xmm12
    movups  0(ctx), %xmm13
#endif

    // performs 4-block decryption per iteration to exploit parallelism across blocks

    // while ((num_blk-=4)>=0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }

    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code

0:


#if defined __x86_64__

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14        // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15        // tmp = 4th ibuf

    // for x86_64, the expanded keys are already stored in xmm3-xmm13

    // aes-128 decrypt round 0 per 4 blocks
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm14
    pxor    %xmm3, %xmm15

    // aes-128 decrypt round 1 per 4 blocks
    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    // aes-128 decrypt round 2 per 4 blocks
    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    // aes-128 decrypt round 3 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    // aes-128 decrypt round 4 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    // aes-128 decrypt round 5 per 4 blocks
    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    // aes-128 decrypt round 6 per 4 blocks
    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    // aes-128 decrypt round 7 per 4 blocks
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // aes-128 decrypt round 8 per 4 blocks
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // aes-128 decrypt round 9 per 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast %xmm13, %xmm1
    aesdeclast %xmm13, %xmm2
    aesdeclast %xmm13, %xmm14
    aesdeclast %xmm13, %xmm15

    pxor    iv, %xmm1               // obuf[0] ^= *iv;
    movups  (ibuf), iv              // ibuf[0]
    pxor    iv, %xmm2               // obuf[1] ^= ibuf[0];
    movups  16(ibuf), iv            // ibuf[1]
    pxor    iv, %xmm14              // obuf[2] ^= ibuf[1];
    movups  32(ibuf), iv            // ibuf[2]
    pxor    iv, %xmm15              // obuf[3] ^= ibuf[2];
    movups  48(ibuf), iv            // *iv = ibuf[3]

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm14, 32(obuf)        // write 3rd obuf
    movups  %xmm15, 48(obuf)        // write 4th obuf


#else

    // aes_decrypt_cbc per 4 blocks using aes-128 for i386
    //   xmm1/xmm2/xmm4/xmm5 used for obuf per block
    //   xmm3 = key0
    //   xmm0 = iv
    //   xmm6/xmm7 dynamically loaded with the other expanded keys

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4         // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5         // tmp = 4th ibuf

    // aes_decrypt
    // for i386, sequentially load expanded keys into xmm6/xmm7

    movups  144(ctx), %xmm6         // key1

    // aes-128 decrypt round 0 per 4 blocks
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm4
    pxor    %xmm3, %xmm5

    movups  128(ctx), %xmm7         // key2

    // aes-128 decrypt round 1 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  112(ctx), %xmm6         // key3

    // aes-128 decrypt round 2 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  96(ctx), %xmm7          // key4

    // aes-128 decrypt round 3 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  80(ctx), %xmm6          // key5

    // aes-128 decrypt round 4 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  64(ctx), %xmm7          // key6

    // aes-128 decrypt round 5 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  48(ctx), %xmm6          // key7

    // aes-128 decrypt round 6 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  32(ctx), %xmm7          // key8

    // aes-128 decrypt round 7 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  16(ctx), %xmm6          // key9

    // aes-128 decrypt round 8 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  0(ctx), %xmm7           // keyA

    // aes-128 decrypt round 9 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast %xmm7, %xmm1
    aesdeclast %xmm7, %xmm2
    aesdeclast %xmm7, %xmm4
    aesdeclast %xmm7, %xmm5

    pxor    iv, %xmm1               // 1st obuf ^= iv;
    movups  (ibuf), iv              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // 2nd obuf ^= iv;
    movups  16(ibuf), iv            // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4               // 3rd obuf ^= iv;
    movups  32(ibuf), iv            // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5               // 4th obuf ^= iv;
    movups  48(ibuf), iv            // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm4, 32(obuf)         // write 3rd obuf
    movups  %xmm5, 48(obuf)         // write 4th obuf
#endif

    add     $64, ibuf               // ibuf += 4;
    add     $64, obuf               // obuf += 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0, repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing

#if defined __i386__
    // reload expanded keys that may be needed for the remaining blocks
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#endif

    test    $2, num_blk             // check whether num_blk has 2 blocks
    je      9f                      // if num_blk & 2 == 0, skip the per-pair processing code

    // do the remaining 2 blocks together

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf

    // aes_decrypt
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
#if defined __x86_64__
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdeclast %xmm13, %xmm1
    aesdeclast %xmm13, %xmm2
#else
    movups  80(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    movups  64(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    movups  48(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    movups  32(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    movups  16(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    movups  0(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdeclast %xmm7, %xmm1
    aesdeclast %xmm7, %xmm2
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#endif

    pxor    iv, %xmm1               // obuf[0] ^= *iv;
    movups  (ibuf), iv              // ibuf[0]
    pxor    iv, %xmm2               // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv            // *iv = ibuf[1]

    movups  %xmm1, (obuf)           // write obuf[0]
    movups  %xmm2, 16(obuf)         // write obuf[1]

    add     $32, ibuf               // ibuf += 2
    add     $32, obuf               // obuf += 2

9:
    test    $1, num_blk             // check whether num_blk has residual 1 block
    je      L_HW_cbc_done           // if num_blk == 0, no need for residual processing code

    movups  (ibuf), %xmm2           // tmp = ibuf
    // aes_decrypt
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm2
    aesdec  %xmm7, %xmm2
#if defined __x86_64__
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdeclast %xmm13, %xmm2
#else
    movups  80(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  64(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  48(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  32(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  16(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  (ctx), %xmm1
    aesdeclast %xmm1, %xmm2
#endif

    pxor    iv, %xmm2               // *obuf ^= *iv;
    movups  (ibuf), iv              // *iv = *ibuf;
    movups  %xmm2, (obuf)           // write *obuf

    jmp     L_HW_cbc_done

    //
    // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_decrypt_192:

    cmp     $1, num_blk
    jl      L_HW_cbc_done           // if num_blk < 1, early return

    // aes-192 decrypt expanded keys
    movups  192(ctx), %xmm3
    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
#if defined __x86_64__
    movups  112(ctx), %xmm8
    movups  96(ctx), %xmm9
    movups  80(ctx), %xmm10
    movups  64(ctx), %xmm11
    movups  48(ctx), %xmm12
    movups  32(ctx), %xmm13
    movups  16(ctx), %xmm14
    movups  (ctx), %xmm15
#endif

    // performs 4-block decryption per iteration to exploit parallelism across blocks

    // while ((num_blk-=4)>=0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }

    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code
0:

#if defined __x86_64__

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14        // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15        // tmp = 4th ibuf

    // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
    // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards

    // round 0 for 4 blocks
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm14
    pxor    %xmm3, %xmm15

    // round 1 for 4 blocks
    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    // round 2 for 4 blocks
    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    // round 3 for 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    // round 4 for 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    // round 5 for 4 blocks
    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    // round 6 for 4 blocks
    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    // round 7 for 4 blocks
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // round 8 for 4 blocks
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // round 9 for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    movups  16(ctx), %xmm12

    // round A for 4 blocks
    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15

    movups  (ctx), %xmm13

    // round B for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    movups  48(ctx), %xmm12         // restore %xmm12 to its original key

    // round C (last) for 4 blocks
    aesdeclast %xmm13, %xmm1
    aesdeclast %xmm13, %xmm2
    aesdeclast %xmm13, %xmm14
    aesdeclast %xmm13, %xmm15

    movups  32(ctx), %xmm13         // restore %xmm13 to its original key

    pxor    iv, %xmm1               // obuf[0] ^= *iv;
    movups  (ibuf), iv              // ibuf[0]
    pxor    iv, %xmm2               // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv            // ibuf[1]
    pxor    iv, %xmm14              // obuf[2] ^= ibuf[1]
    movups  32(ibuf), iv            // ibuf[2]
    pxor    iv, %xmm15              // obuf[3] ^= ibuf[2]
    movups  48(ibuf), iv            // *iv = ibuf[3]

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm14, 32(obuf)        // write 3rd obuf
    movups  %xmm15, 48(obuf)        // write 4th obuf

    add     $64, ibuf               // ibuf += 4;
    add     $64, obuf               // obuf += 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0, repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, prepare to return

    movups  16(ctx), %xmm14         // restore %xmm14 to its key
    movups  (ctx), %xmm15           // restore %xmm15 to its key

#else

    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4         // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5         // tmp = 4th ibuf

    // aes_decrypt
    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  176(ctx), %xmm6
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm4
    pxor    %xmm3, %xmm5

    movups  160(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  144(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  128(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  112(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  96(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  80(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  64(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  48(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  32(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  16(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  0(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    aesdeclast %xmm7, %xmm1
    aesdeclast %xmm7, %xmm2
    aesdeclast %xmm7, %xmm4
    aesdeclast %xmm7, %xmm5

    pxor    iv, %xmm1               // 1st obuf ^= iv;
    movups  (ibuf), iv              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // 2nd obuf ^= iv;
    movups  16(ibuf), iv            // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4               // 3rd obuf ^= iv;
    movups  32(ibuf), iv            // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5               // 4th obuf ^= iv;
    movups  48(ibuf), iv            // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm4, 32(obuf)         // write 3rd obuf
    movups  %xmm5, 48(obuf)         // write 4th obuf

    add     $64, ibuf               // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf               // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0, repeat the loop


9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing

    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7

#endif

    // per-block aes_decrypt_cbc loop

0:
    movups  (ibuf), %xmm2           // tmp = ibuf

    // aes_decrypt
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm2
    aesdec  %xmm7, %xmm2
#if defined __x86_64__
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdeclast %xmm15, %xmm2
#else
    movups  112(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  96(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  80(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  64(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  48(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  32(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  16(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  (ctx), %xmm1
    aesdeclast %xmm1, %xmm2
#endif

    pxor    iv, %xmm2               // obuf ^= iv;
    movups  (ibuf), iv              // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)           // write obuf

    add     $16, ibuf               // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf               // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done

    //
    // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_decrypt_256:

    cmp     $1, num_blk
    jl      L_HW_cbc_done

    movups  224(ctx), %xmm3
    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
#if defined __x86_64__
    movups  144(ctx), %xmm8
    movups  128(ctx), %xmm9
    movups  112(ctx), %xmm10
    movups  96(ctx), %xmm11
    movups  80(ctx), %xmm12
    movups  64(ctx), %xmm13
    movups  48(ctx), %xmm14
    movups  32(ctx), %xmm15
    // movups 16(ctx), %xmm14
    // movups (ctx), %xmm15
#endif

#if defined __x86_64__

    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code
0:
    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14        // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15        // tmp = 4th ibuf

    // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm14
    pxor    %xmm3, %xmm15

    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  48(ctx), %xmm12

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  32(ctx), %xmm13

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  16(ctx), %xmm12

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  (ctx), %xmm13

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  80(ctx), %xmm12

    aesdeclast %xmm13, %xmm1
    aesdeclast %xmm13, %xmm2
    aesdeclast %xmm13, %xmm14
    aesdeclast %xmm13, %xmm15
    movups  64(ctx), %xmm13

    pxor    iv, %xmm1               // obuf ^= iv;
    movups  (ibuf), iv              // memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // obuf ^= iv;
    movups  16(ibuf), iv            // memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm14              // obuf ^= iv;
    movups  32(ibuf), iv            // memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm15              // obuf ^= iv;
    movups  48(ibuf), iv            // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm14, 32(obuf)        // write 3rd obuf
    movups  %xmm15, 48(obuf)        // write 4th obuf

    add     $64, ibuf               // ibuf += AES_BLOCK_SIZE*4;
    add     $64, obuf               // obuf += AES_BLOCK_SIZE*4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0, repeat the loop

9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing

    movups  48(ctx), %xmm14
    movups  32(ctx), %xmm15

#else

    sub     $4, num_blk             // pre-decrement num_blk by 4
    jl      9f                      // if num_blk < 4, skip the per-4-blocks processing code
0:
    movups  (ibuf), %xmm1           // tmp = 1st ibuf
    movups  16(ibuf), %xmm2         // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4         // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5         // tmp = 4th ibuf

    // aes_decrypt
    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  208(ctx), %xmm6
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm4
    pxor    %xmm3, %xmm5

    movups  192(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  176(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  160(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  144(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  128(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  112(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  96(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  80(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  64(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  48(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  32(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  16(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  0(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    aesdeclast %xmm7, %xmm1
    aesdeclast %xmm7, %xmm2
    aesdeclast %xmm7, %xmm4
    aesdeclast %xmm7, %xmm5

    pxor    iv, %xmm1               // 1st obuf ^= iv;
    movups  (ibuf), iv              // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2               // 2nd obuf ^= iv;
    movups  16(ibuf), iv            // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4               // 3rd obuf ^= iv;
    movups  32(ibuf), iv            // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5               // 4th obuf ^= iv;
    movups  48(ibuf), iv            // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
    movups  %xmm1, (obuf)           // write 1st obuf
    movups  %xmm2, 16(obuf)         // write 2nd obuf
    movups  %xmm4, 32(obuf)         // write 3rd obuf
    movups  %xmm5, 48(obuf)         // write 4th obuf

    add     $64, ibuf               // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf               // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk             // num_blk -= 4
    jge     0b                      // if num_blk >= 0, repeat the loop


9:  add     $4, num_blk             // post-increment num_blk by 4
    je      L_HW_cbc_done           // if num_blk == 0, no need for further processing

    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7

#endif

0:
    movups  (ibuf), %xmm2           // tmp = ibuf

    // aes_decrypt
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm2
    aesdec  %xmm7, %xmm2
#if defined __x86_64__
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdec  %xmm15, %xmm2
#else
    movups  144(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  128(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  112(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  96(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  80(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  64(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  48(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  32(ctx), %xmm1
    aesdec  %xmm1, %xmm2
#endif
    movups  16(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  (ctx), %xmm1
    aesdeclast %xmm1, %xmm2

    pxor    iv, %xmm2               // obuf ^= iv;
    movups  (ibuf), iv              // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)           // write obuf

    add     $16, ibuf               // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf               // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk             // num_blk--
    jg      0b                      // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done

    //
    // --------- END of aes_decrypt_cbc_hw -------------------
    //