/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 These subroutines implement multiple block AES modes for ECB, CBC, CFB,
 OFB and CTR encryption. The code provides support for the VIA Advanced
 Cryptography Engine (ACE).

 NOTE: In the following subroutines, the AES contexts (ctx) must be
 16 byte aligned if VIA ACE is being used.
*/

/* modified 3/5/10 cclee */
/* Cleaned up the parts related to VIA ACE and hand-optimized aes_cbc_encrypt and aes_cbc_decrypt. */
/* Moved the xmm register save/restore, originally inside the callee functions, into these 2 caller functions. */

/* HW-AES specific implementation cclee 3-12-10 */
/* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled,
   and if kHasAES is detected, branch to the hw-specific functions here. */


/*
   This file defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- the Intel Westmere HW AES-based implementation
   of _aes_encrypt_cbc and _aes_decrypt_cbc.

   These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available.
   They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.

   The AES HW is detected 1st thing in
       _aes_encrypt_cbc (aes_modes_asm.s)
       _aes_decrypt_cbc (aes_modes_asm.s)
   and, if AES HW is detected, branch without link (ie, jump) to the functions here.

   The implementation here follows the examples in the Intel White Paper
   "Intel Advanced Encryption Standard (AES) Instruction Set" Rev. 2.01

   Note: Rev. 03 Final 2010 01 26 is available. Looks like some code changed from Rev. 2.01.

   cclee 3-13-10
*/

/*
   The function _aes_decrypt_cbc_hw previously simply decrypted serially, block by block.
   In our group meeting, Eric/Ali suggested that I take a look at combining multiple blocks
   in a loop and interleaving multiple aesdec instructions to absorb/hide stalls and improve the decrypt throughput.

   The idea was actually described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55).

   This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode.
   On a K18 (2.4GHz core-i5/2.66GHz core-i7), the x86_64 decrypt throughput (in xnu-iokit) has been improved
   from 1180/1332 to 1667/1858 MBytes/sec. This is approximately a 1.40x speedup in the decryption.
   The encrypt throughput is not changed.

   I also enhanced the assembly code comments.

   cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.)

*/

/* ----------------------------------------------------------------------------------------------------------------

    aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :

    For simplicity, I am assuming all variables are in 128-bit data type.

    aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
    {
        while (num_blk--) {
            *iv ^= *ibuf++;
            aes_encrypt(iv, iv, ctx);
            *obuf++ = *iv;
        }
        return 0;
    }

    The following is an implementation of this function using Intel AESNI.
    This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
    Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
    to this aesni-based function should it detect that aesni is available.
    Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.

    Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks
    are serially chained. This prevents us from arranging several blocks for encryption in parallel.

   ----------------------------------------------------------------------------------------------------------------*/
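
/*
   For illustration only: a minimal C intrinsics sketch (NOT part of this build) of the serial aes-128
   cbc encrypt loop implemented below. It assumes <wmmintrin.h> and a caller-supplied expanded key
   schedule key[0..10] (11 round keys), mirroring the 160-byte schedule read from ctx:

        __m128i iv = _mm_loadu_si128((const __m128i *) in_iv);
        while (num_blk--) {
            iv = _mm_xor_si128(iv, _mm_loadu_si128((const __m128i *) ibuf++)); // *iv ^= *ibuf++
            iv = _mm_xor_si128(iv, key[0]);             // round 0 : AddRoundKey
            for (int r = 1; r < 10; r++)
                iv = _mm_aesenc_si128(iv, key[r]);      // rounds 1..9
            iv = _mm_aesenclast_si128(iv, key[10]);     // final round
            _mm_storeu_si128((__m128i *) obuf++, iv);   // *obuf++ = *iv
        }

   Each iteration consumes the previous ciphertext as iv, which is the serial chain mentioned above.
*/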

    .text
    .align  4,0x90
    .globl  _aes_encrypt_cbc_hw
_aes_encrypt_cbc_hw:

    // push/save registers for local use
#if defined __i386__

    push    %ebp
    movl    %esp, %ebp
    push    %ebx
    push    %edi

    #define sp  %esp

#else   // __x86_64__

    push    %rbp
    mov     %rsp, %rbp
    push    %rbx
    push    %r13
    push    %r14
    push    %r15

    #define sp  %rsp

#endif

    // if this is kernel code, need to save used xmm registers
#ifdef  KERNEL

#if defined __i386__
    sub     $(8*16), %esp       // for possible xmm0-xmm7 save/restore
#else
    sub     $(16*16), %rsp      // xmm0-xmm15 save/restore
#endif

    movaps  %xmm0, (sp)
    movaps  %xmm1, 16(sp)
    movaps  %xmm2, 32(sp)
    movaps  %xmm3, 48(sp)
    movaps  %xmm4, 64(sp)
    movaps  %xmm5, 80(sp)
    movaps  %xmm6, 96(sp)
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)
#endif  // __x86_64__

#endif  // KERNEL

    #define iv  %xmm0

#ifdef  __i386__

    mov     12(%ebp), %eax      // in_iv
    mov     24(%ebp), %edx      // ctx
    movups  (%eax), iv          // iv = in_iv
    mov     8(%ebp), %ebx       // ibuf
    mov     16(%ebp), %ecx      // num_blk
    mov     20(%ebp), %edi      // obuf

    #define ibuf    %ebx
    #define obuf    %edi
    #define num_blk %ecx
    #define ctx     %edx

#else

    mov     %rdi, %rbx          // ibuf
    movups  (%rsi), iv          // iv = in_iv
    mov     %rdx, %r13          // num_blk
    mov     %rcx, %r14          // obuf
    mov     %r8, %r15           // ctx

    #define ibuf    %rbx
    #define num_blk %r13d
    #define obuf    %r14
    #define ctx     %r15

#endif

    mov     240(ctx), %eax      // aes length
    cmp     $160, %eax          // aes-128 encrypt ?
    je      L_encrypt_128
    cmp     $192, %eax          // aes-192 encrypt ?
    je      L_encrypt_192
    cmp     $224, %eax          // aes-256 encrypt ?
    je      L_encrypt_256
    mov     $-1, %eax           // return error
    jmp     L_error
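
    // Note (added for clarity): the value at offset 240 of ctx is the expanded key-schedule length in bytes;
    // 160/192/224 = 16 bytes per round for the 10/12/14 rounds of aes-128/192/256 respectively.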

    //
    // aes-128 encrypt_cbc operation, up to L_HW_cbc_done
    //

L_encrypt_128:

    cmp     $1, num_blk         // check number of blocks
    jl      L_HW_cbc_done       // should it be less than 1, nothing to do

    movups  (ctx), %xmm2        // key0
    movups  16(ctx), %xmm3      // key1
    movups  32(ctx), %xmm4      // key2
    movups  48(ctx), %xmm5      // key3
    movups  64(ctx), %xmm6      // key4
    movups  80(ctx), %xmm7      // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8      // key6
    movups  112(ctx), %xmm9     // key7
    movups  128(ctx), %xmm10    // key8
    movups  144(ctx), %xmm11    // key9
    movups  160(ctx), %xmm12    // keyA
#endif

    // while (num_blk--) {
    //     *iv ^= *ibuf++;
    //     aes_encrypt(iv, iv, ctx);
    //     *obuf++ = *iv;
    // }
0:
    movups  (ibuf), %xmm1       // *ibuf
    pxor    %xmm2, iv           // 1st instruction inside aes_encrypt
    pxor    %xmm1, iv           // *iv ^= *ibuf

    // finishing up the rest of aes_encrypt
    aesenc  %xmm3, iv
    aesenc  %xmm4, iv
    aesenc  %xmm5, iv
    aesenc  %xmm6, iv
    aesenc  %xmm7, iv
#if defined __x86_64__
    aesenc  %xmm8, iv
    aesenc  %xmm9, iv
    aesenc  %xmm10, iv
    aesenc  %xmm11, iv
    aesenclast  %xmm12, iv
#else
    movups  96(ctx), %xmm1      // key6
    aesenc  %xmm1, iv
    movups  112(ctx), %xmm1     // key7
    aesenc  %xmm1, iv
    movups  128(ctx), %xmm1     // key8
    aesenc  %xmm1, iv
    movups  144(ctx), %xmm1     // key9
    aesenc  %xmm1, iv
    movups  160(ctx), %xmm1     // keyA
    aesenclast  %xmm1, iv
#endif

    movups  iv, (obuf)          // *obuf = *iv;
    add     $16, obuf           // obuf++;
    add     $16, ibuf           // ibuf++;
    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    // the following will be branched to from all other cases (encrypt/decrypt 128/192/256)

L_HW_cbc_done:

    xor     %eax, %eax          // to return CRYPT_OK

L_error:

    // if kernel, restore xmm registers
#ifdef  KERNEL
    movaps  0(sp), %xmm0
    movaps  16(sp), %xmm1
    movaps  32(sp), %xmm2
    movaps  48(sp), %xmm3
    movaps  64(sp), %xmm4
    movaps  80(sp), %xmm5
    movaps  96(sp), %xmm6
    movaps  112(sp), %xmm7
#if defined __x86_64__
    movaps  16*8(sp), %xmm8
    movaps  16*9(sp), %xmm9
    movaps  16*10(sp), %xmm10
    movaps  16*11(sp), %xmm11
    movaps  16*12(sp), %xmm12
    movaps  16*13(sp), %xmm13
    movaps  16*14(sp), %xmm14
    movaps  16*15(sp), %xmm15
#endif  // __x86_64__
#endif  // KERNEL

    // release used stack memory, restore used callee-saved registers, and return
#if defined __i386__
#ifdef  KERNEL
    add     $(8*16), %esp
#endif
    pop     %edi
    pop     %ebx
#else
#ifdef  KERNEL
    add     $(16*16), %rsp
#endif
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %rbx
#endif
    leave
    ret

    //
    // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_encrypt_192:

    cmp     $1, num_blk         // check number of blocks
    jl      L_HW_cbc_done       // should it be less than 1, nothing to do

    movups  (ctx), %xmm2        // key0
    movups  16(ctx), %xmm3      // key1
    movups  32(ctx), %xmm4      // key2
    movups  48(ctx), %xmm5      // key3
    movups  64(ctx), %xmm6      // key4
    movups  80(ctx), %xmm7      // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8      // key6
    movups  112(ctx), %xmm9     // key7
    movups  128(ctx), %xmm10    // key8
    movups  144(ctx), %xmm11    // key9
    movups  160(ctx), %xmm12    // keyA
    movups  176(ctx), %xmm13    // keyB
    movups  192(ctx), %xmm14    // keyC
#endif

    // while (num_blk--) {
    //     *iv ^= *ibuf++;
    //     aes_encrypt(iv, iv, ctx);
    //     *obuf++ = *iv;
    // }
0:
    movups  (ibuf), %xmm1       // *ibuf
    pxor    %xmm1, iv           // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);

    pxor    %xmm2, iv
    aesenc  %xmm3, iv
    aesenc  %xmm4, iv
    aesenc  %xmm5, iv
    aesenc  %xmm6, iv
    aesenc  %xmm7, iv
#if defined __x86_64__
    aesenc  %xmm8, iv
    aesenc  %xmm9, iv
    aesenc  %xmm10, iv
    aesenc  %xmm11, iv
    aesenc  %xmm12, iv
    aesenc  %xmm13, iv
    aesenclast  %xmm14, iv
#else
    movups  96(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  112(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  128(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  144(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  160(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  176(ctx), %xmm1
    aesenc  %xmm1, iv
    movups  192(ctx), %xmm1
    aesenclast  %xmm1, iv
#endif

    movups  iv, (obuf)          // *obuf = *iv;
    add     $16, ibuf           // ibuf++
    add     $16, obuf           // obuf++

    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done       // share with the common exit code

    //
    // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_encrypt_256:

    cmp     $1, num_blk         // check number of blocks
    jl      L_HW_cbc_done       // should it be less than 1, nothing to do

    movups  (ctx), %xmm2        // key0
    movups  16(ctx), %xmm3      // key1
    movups  32(ctx), %xmm4      // key2
    movups  48(ctx), %xmm5      // key3
    movups  64(ctx), %xmm6      // key4
    movups  80(ctx), %xmm7      // key5
#if defined __x86_64__
    movups  96(ctx), %xmm8      // key6
    movups  112(ctx), %xmm9     // key7
    movups  128(ctx), %xmm10    // key8
    movups  144(ctx), %xmm11    // key9
    movups  160(ctx), %xmm12    // keyA
    movups  176(ctx), %xmm13    // keyB
    movups  192(ctx), %xmm14    // keyC
    movups  208(ctx), %xmm15    // keyD
    // movups  224(ctx), %xmm1  // keyE
#endif

    // while (num_blk--) {
    //     *iv ^= *ibuf++;
    //     aes_encrypt(iv, iv, ctx);
    //     *obuf++ = *iv;
    // }
0:
    movups  (ibuf), %xmm1       // *ibuf
    pxor    %xmm1, iv           // *iv ^= *ibuf

    // aes_encrypt(iv, iv, ctx);
    pxor    %xmm2, iv
    aesenc  %xmm3, iv
    aesenc  %xmm4, iv
    aesenc  %xmm5, iv
    aesenc  %xmm6, iv
    aesenc  %xmm7, iv
#if defined __x86_64__
    movups  224(ctx), %xmm1     // keyE
    aesenc  %xmm8, iv
    aesenc  %xmm9, iv
    aesenc  %xmm10, iv
    aesenc  %xmm11, iv
    aesenc  %xmm12, iv
    aesenc  %xmm13, iv
    aesenc  %xmm14, iv
    aesenc  %xmm15, iv
    aesenclast  %xmm1, iv
#else
    movups  96(ctx), %xmm1      // key6
    aesenc  %xmm1, iv
    movups  112(ctx), %xmm1     // key7
    aesenc  %xmm1, iv
    movups  128(ctx), %xmm1     // key8
    aesenc  %xmm1, iv
    movups  144(ctx), %xmm1     // key9
    aesenc  %xmm1, iv
    movups  160(ctx), %xmm1     // keyA
    aesenc  %xmm1, iv
    movups  176(ctx), %xmm1     // keyB
    aesenc  %xmm1, iv
    movups  192(ctx), %xmm1     // keyC
    aesenc  %xmm1, iv
    movups  208(ctx), %xmm1     // keyD
    aesenc  %xmm1, iv
    movups  224(ctx), %xmm1     // keyE
    aesenclast  %xmm1, iv
#endif

    movups  iv, (obuf)          // *obuf = *iv;
    add     $16, ibuf           // ibuf++
    add     $16, obuf           // obuf++

    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done       // share with the common exit code



    //
    // --------- END of aes_encrypt_cbc_hw -------------------
    //


/* ----------------------------------------------------------------------------------------------------------------

    aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :

    For simplicity, I am assuming all variables are in 128-bit data type.

    aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
    {
        while (num_blk--) {
            aes_decrypt(ibuf, obuf, ctx);
            *obuf++ ^= *iv;
            *iv = *ibuf++;
        }
        return 0;
    }

    The following is an implementation of this function using Intel AESNI.
    This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
    Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
    to this aesni-based function should it detect that aesni is available.
    Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.

    Note that the decryption operations are independent across blocks.
    This gives us the opportunity to arrange aes_decrypt operations in parallel to speed up the code.
    This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55).
    The following assembly code exploits this idea to achieve an ~1.4x speedup in aes_decrypt_cbc.

    Example C code for packing 4 blocks in an iteration is shown as follows:

        while ((num_blk-=4)>=0) {

            // the following 4 functions can be interleaved to exploit parallelism
            aes_decrypt(ibuf, obuf, ctx);
            aes_decrypt(ibuf+1, obuf+1, ctx);
            aes_decrypt(ibuf+2, obuf+2, ctx);
            aes_decrypt(ibuf+3, obuf+3, ctx);

            obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
            *iv = ibuf[3]; ibuf += 4; obuf += 4;
        }
        num_blk += 4;

   ----------------------------------------------------------------------------------------------------------------*/
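
/*
   For illustration only: a minimal C intrinsics sketch (NOT part of this build) of the 4-way interleaving
   for one aes-128 iteration. It assumes <wmmintrin.h> and a caller-supplied decrypt key schedule
   dkey[0..10], ordered as consumed (dkey[0] applied first, dkey[10] used with aesdeclast):

        __m128i b0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)ibuf + 0), dkey[0]);
        __m128i b1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)ibuf + 1), dkey[0]);
        __m128i b2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)ibuf + 2), dkey[0]);
        __m128i b3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)ibuf + 3), dkey[0]);
        for (int r = 1; r < 10; r++) {          // each round is issued for all 4 blocks back to back,
            b0 = _mm_aesdec_si128(b0, dkey[r]); // so the 4 independent aesdec chains hide each
            b1 = _mm_aesdec_si128(b1, dkey[r]); // other's latency
            b2 = _mm_aesdec_si128(b2, dkey[r]);
            b3 = _mm_aesdec_si128(b3, dkey[r]);
        }
        b0 = _mm_aesdeclast_si128(b0, dkey[10]);
        b1 = _mm_aesdeclast_si128(b1, dkey[10]);
        b2 = _mm_aesdeclast_si128(b2, dkey[10]);
        b3 = _mm_aesdeclast_si128(b3, dkey[10]);

   The CBC xors and the iv update then follow as in the example C code above.
*/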

    .text
    .align  4,0x90
    .globl  _aes_decrypt_cbc_hw
_aes_decrypt_cbc_hw:

    // push/save registers for local use
#if defined __i386__

    push    %ebp
    movl    %esp, %ebp
    push    %ebx                // ibuf
    push    %edi                // obuf

    #define sp  %esp

#else   // __x86_64__

    push    %rbp
    mov     %rsp, %rbp
    push    %rbx
    push    %r13
    push    %r14
    push    %r15

    #define sp  %rsp

#endif


    // if kernel, allocate stack space to save xmm registers
#ifdef  KERNEL
#if defined __i386__
    sub     $(8*16), %esp
#else
    sub     $(16*16), %rsp
#endif
    movaps  %xmm0, (sp)
    movaps  %xmm1, 16(sp)
    movaps  %xmm2, 32(sp)
    movaps  %xmm3, 48(sp)
    movaps  %xmm4, 64(sp)
    movaps  %xmm5, 80(sp)
    movaps  %xmm6, 96(sp)
    movaps  %xmm7, 112(sp)
#if defined __x86_64__
    movaps  %xmm8, 16*8(sp)
    movaps  %xmm9, 16*9(sp)
    movaps  %xmm10, 16*10(sp)
    movaps  %xmm11, 16*11(sp)
    movaps  %xmm12, 16*12(sp)
    movaps  %xmm13, 16*13(sp)
    movaps  %xmm14, 16*14(sp)
    movaps  %xmm15, 16*15(sp)
#endif  // __x86_64__
#endif

    #undef  iv
    #define iv  %xmm0

#if defined __i386__
    mov     12(%ebp), %eax      // in_iv
    mov     24(%ebp), %edx      // ctx
    movups  (%eax), iv          // iv = in_iv
    mov     8(%ebp), %ebx       // ibuf
    mov     16(%ebp), %ecx      // num_blk
    mov     20(%ebp), %edi      // obuf

    #define ibuf    %ebx
    #define obuf    %edi
    #define num_blk %ecx
    #define ctx     %edx

#else   // __x86_64__, rdi/rsi/rdx/rcx/r8

    mov     %rdi, %rbx          // ibuf
    movups  (%rsi), iv          // iv = in_iv
    mov     %rdx, %r13          // num_blk
    mov     %rcx, %r14          // obuf
    mov     %r8, %r15           // ctx

    #define ibuf    %rbx
    #define num_blk %r13d
    #define obuf    %r14
    #define ctx     %r15

#endif

    mov     240(ctx), %eax      // aes length
    cmp     $160, %eax          // aes-128 decrypt
    je      L_decrypt_128
    cmp     $192, %eax          // aes-192 decrypt
    je      L_decrypt_192
    cmp     $224, %eax          // aes-256 decrypt
    je      L_decrypt_256

    mov     $-1, %eax           // wrong aes length, to return -1
    jmp     L_error             // early exit due to wrong aes length


    //
    // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_decrypt_128:

    cmp     $1, num_blk
    jl      L_HW_cbc_done       // if num_blk < 1, early return

    // aes-128 decrypt expanded keys
    movups  160(ctx), %xmm3
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#if defined __x86_64__
    movups  80(ctx), %xmm8
    movups  64(ctx), %xmm9
    movups  48(ctx), %xmm10
    movups  32(ctx), %xmm11
    movups  16(ctx), %xmm12
    movups  0(ctx), %xmm13
#endif

    // performs 4 block decryption in an iteration to exploit decrypt in parallel

    // while ((num_blk-=4)>=0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }

    sub     $4, num_blk         // pre decrement num_blk by 4
    jl      9f                  // if num_blk < 4, skip the per-4-blocks processing code

0:


#if defined __x86_64__

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14    // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15    // tmp = 4th ibuf

    // for x86_64, the expanded keys are already stored in xmm3-xmm13

    // aes-128 decrypt round 0 per 4 blocks
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm14
    pxor    %xmm3, %xmm15

    // aes-128 decrypt round 1 per 4 blocks
    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    // aes-128 decrypt round 2 per 4 blocks
    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    // aes-128 decrypt round 3 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    // aes-128 decrypt round 4 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    // aes-128 decrypt round 5 per 4 blocks
    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    // aes-128 decrypt round 6 per 4 blocks
    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    // aes-128 decrypt round 7 per 4 blocks
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // aes-128 decrypt round 8 per 4 blocks
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // aes-128 decrypt round 9 per 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15

    pxor    iv, %xmm1           // obuf[0] ^= *iv;
    movups  (ibuf), iv          // ibuf[0]
    pxor    iv, %xmm2           // obuf[1] ^= ibuf[0];
    movups  16(ibuf), iv        // ibuf[1]
    pxor    iv, %xmm14          // obuf[2] ^= ibuf[1];
    movups  32(ibuf), iv        // ibuf[2]
    pxor    iv, %xmm15          // obuf[3] ^= ibuf[2];
    movups  48(ibuf), iv        // *iv = ibuf[3]

    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm14, 32(obuf)    // write 3rd obuf
    movups  %xmm15, 48(obuf)    // write 4th obuf


#else

    // aes_decrypt_cbc per 4 blocks using aes-128 for i386
    // xmm1/xmm2/xmm4/xmm5 used for obuf per block
    // xmm3 = key0
    // xmm0 = iv
    // xmm6/xmm7 dynamically loaded with the other expanded keys

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4     // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5     // tmp = 4th ibuf

    // aes_decrypt
    // for i386, sequentially load expanded keys into xmm6/xmm7

    movups  144(ctx), %xmm6     // key1

    // aes-128 decrypt round 0 per 4 blocks
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm4
    pxor    %xmm3, %xmm5

    movups  128(ctx), %xmm7     // key2

    // aes-128 decrypt round 1 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  112(ctx), %xmm6     // key3

    // aes-128 decrypt round 2 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  96(ctx), %xmm7      // key4

    // aes-128 decrypt round 3 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  80(ctx), %xmm6      // key5

    // aes-128 decrypt round 4 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  64(ctx), %xmm7      // key6

    // aes-128 decrypt round 5 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  48(ctx), %xmm6      // key7

    // aes-128 decrypt round 6 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  32(ctx), %xmm7      // key8

    // aes-128 decrypt round 7 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  16(ctx), %xmm6      // key9

    // aes-128 decrypt round 8 per 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  0(ctx), %xmm7       // keyA

    // aes-128 decrypt round 9 per 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    // aes-128 decrypt round 10 (last) per 4 blocks
    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5

    pxor    iv, %xmm1           // 1st obuf ^= iv;
    movups  (ibuf), iv          // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2           // 2nd obuf ^= iv;
    movups  16(ibuf), iv        // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4           // 3rd obuf ^= iv;
    movups  32(ibuf), iv        // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5           // 4th obuf ^= iv;
    movups  48(ibuf), iv        // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm4, 32(obuf)     // write 3rd obuf
    movups  %xmm5, 48(obuf)     // write 4th obuf
#endif

    add     $64, ibuf           // ibuf += 4;
    add     $64, obuf           // obuf += 4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop

9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, no further processing needed

#if defined __i386__
    // reloaded, as they might be needed as expanded keys in the remaining blocks
    movups  144(ctx), %xmm4
    movups  128(ctx), %xmm5
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#endif

    test    $2, num_blk         // check whether num_blk has 2 blocks
    je      9f                  // if num_blk & 2 == 0, skip the per-pair processing code

    // do the remaining 2 blocks together

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf

    // aes_decrypt
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
#if defined __x86_64__
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
#else
    movups  80(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    movups  64(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    movups  48(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    movups  32(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    movups  16(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    movups  0(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    movups  112(ctx), %xmm6
    movups  96(ctx), %xmm7
#endif

    pxor    iv, %xmm1           // obuf[0] ^= *iv;
    movups  (ibuf), iv          // ibuf[0]
    pxor    iv, %xmm2           // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv        // *iv = ibuf[1]

    movups  %xmm1, (obuf)       // write obuf[0]
    movups  %xmm2, 16(obuf)     // write obuf[1]

    add     $32, ibuf           // ibuf += 2
    add     $32, obuf           // obuf += 2

9:
    test    $1, num_blk         // check whether num_blk has a residual 1 block
    je      L_HW_cbc_done       // if num_blk == 0, no need for residual processing code

    movups  (ibuf), %xmm2       // tmp = ibuf
    // aes_decrypt
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm2
    aesdec  %xmm7, %xmm2
#if defined __x86_64__
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdeclast  %xmm13, %xmm2
#else
    movups  80(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  64(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  48(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  32(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  16(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  (ctx), %xmm1
    aesdeclast  %xmm1, %xmm2
#endif

    pxor    iv, %xmm2           // *obuf ^= *iv;
    movups  (ibuf), iv          // *iv = *ibuf;
    movups  %xmm2, (obuf)       // write *obuf

    jmp     L_HW_cbc_done

    //
    // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_decrypt_192:

    cmp     $1, num_blk
    jl      L_HW_cbc_done       // if num_blk < 1, early return

    // aes-192 decrypt expanded keys
    movups  192(ctx), %xmm3
    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7
#if defined __x86_64__
    movups  112(ctx), %xmm8
    movups  96(ctx), %xmm9
    movups  80(ctx), %xmm10
    movups  64(ctx), %xmm11
    movups  48(ctx), %xmm12
    movups  32(ctx), %xmm13
    movups  16(ctx), %xmm14
    movups  (ctx), %xmm15
#endif

    // performs 4 block decryption in an iteration to exploit decrypt in parallel

    // while ((num_blk-=4)>=0) {
    //     aes_decrypt(ibuf, obuf, ctx);
    //     aes_decrypt(ibuf+1, obuf+1, ctx);
    //     aes_decrypt(ibuf+2, obuf+2, ctx);
    //     aes_decrypt(ibuf+3, obuf+3, ctx);
    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
    // }

    sub     $4, num_blk         // pre decrement num_blk by 4
    jl      9f                  // if num_blk < 4, skip the per-4-blocks processing code
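
    // Note (added for clarity): aes-192 needs 13 round keys. On x86_64 they are preloaded into
    // xmm3-xmm15 above, but the 4-block loop below also needs xmm14/xmm15 as data registers, so
    // the two keys they held (16(ctx) and (ctx)) are cycled through %xmm12/%xmm13 mid-loop and
    // the original keys are restored afterwards.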
0:

#if defined __x86_64__

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14    // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15    // tmp = 4th ibuf

    // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
    // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards

    // round 0 for 4 blocks
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm14
    pxor    %xmm3, %xmm15

    // round 1 for 4 blocks
    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    // round 2 for 4 blocks
    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    // round 3 for 4 blocks
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    // round 4 for 4 blocks
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    // round 5 for 4 blocks
    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    // round 6 for 4 blocks
    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    // round 7 for 4 blocks
    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    // round 8 for 4 blocks
    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    // round 9 for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    movups  16(ctx), %xmm12

    // round A for 4 blocks
    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15

    movups  (ctx), %xmm13

    // round B for 4 blocks
    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15

    movups  48(ctx), %xmm12     // restore %xmm12 to its original key

    // round C (last) for 4 blocks
    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15

    movups  32(ctx), %xmm13     // restore %xmm13 to its original key

    pxor    iv, %xmm1           // obuf[0] ^= *iv;
    movups  (ibuf), iv          // ibuf[0]
    pxor    iv, %xmm2           // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv        // ibuf[1]
    pxor    iv, %xmm14          // obuf[2] ^= ibuf[1]
    movups  32(ibuf), iv        // ibuf[2]
    pxor    iv, %xmm15          // obuf[3] ^= ibuf[2]
    movups  48(ibuf), iv        // *iv = ibuf[3]

    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm14, 32(obuf)    // write 3rd obuf
    movups  %xmm15, 48(obuf)    // write 4th obuf

    add     $64, ibuf           // ibuf += 4;
    add     $64, obuf           // obuf += 4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop

9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, prepare to return

    movups  16(ctx), %xmm14     // restore %xmm14 to its key
    movups  (ctx), %xmm15       // restore %xmm15 to its key

#else

    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4     // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5     // tmp = 4th ibuf

    // aes_decrypt
    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  176(ctx), %xmm6
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm4
    pxor    %xmm3, %xmm5

    movups  160(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  144(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  128(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  112(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  96(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  80(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  64(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  48(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  32(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  16(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  0(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5

    pxor    iv, %xmm1           // 1st obuf ^= iv;
    movups  (ibuf), iv          // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2           // 2nd obuf ^= iv;
    movups  16(ibuf), iv        // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4           // 3rd obuf ^= iv;
    movups  32(ibuf), iv        // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5           // 4th obuf ^= iv;
    movups  48(ibuf), iv        // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm4, 32(obuf)     // write 3rd obuf
    movups  %xmm5, 48(obuf)     // write 4th obuf

    add     $64, ibuf           // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf           // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop


9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, no further processing needed

    movups  176(ctx), %xmm4
    movups  160(ctx), %xmm5
    movups  144(ctx), %xmm6
    movups  128(ctx), %xmm7

#endif

    // per-block aes_decrypt_cbc loop

0:
    movups  (ibuf), %xmm2       // tmp = ibuf

    // aes_decrypt
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm2
    aesdec  %xmm7, %xmm2
#if defined __x86_64__
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdeclast  %xmm15, %xmm2
#else
    movups  112(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  96(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  80(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  64(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  48(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  32(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  16(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  (ctx), %xmm1
    aesdeclast  %xmm1, %xmm2
#endif

    pxor    iv, %xmm2           // obuf ^= iv;
    movups  (ibuf), iv          // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)       // write obuf

    add     $16, ibuf           // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf           // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done

    //
    // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
    //

L_decrypt_256:

    cmp     $1, num_blk
    jl      L_HW_cbc_done

    // aes-256 decrypt expanded keys
    movups  224(ctx), %xmm3
    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7
#if defined __x86_64__
    movups  144(ctx), %xmm8
    movups  128(ctx), %xmm9
    movups  112(ctx), %xmm10
    movups  96(ctx), %xmm11
    movups  80(ctx), %xmm12
    movups  64(ctx), %xmm13
    movups  48(ctx), %xmm14
    movups  32(ctx), %xmm15
    // movups  16(ctx), %xmm14
    // movups  (ctx), %xmm15
#endif

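    // Note (added for clarity): aes-256 needs 15 round keys, more than fit in xmm3-xmm15 alongside
    // the 4 data blocks, so on x86_64 the tail of the schedule (48/32/16/0(ctx)) is cycled through
    // %xmm12/%xmm13 inside the loop below, and %xmm14/%xmm15 are reloaded with their keys before
    // the remainder processing.
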
#if defined __x86_64__

    sub     $4, num_blk         // pre decrement num_blk by 4
    jl      9f                  // if num_blk < 4, skip the per-4-blocks processing code
0:
    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm14    // tmp = 3rd ibuf
    movups  48(ibuf), %xmm15    // tmp = 4th ibuf

    // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm14
    pxor    %xmm3, %xmm15

    aesdec  %xmm4, %xmm1
    aesdec  %xmm4, %xmm2
    aesdec  %xmm4, %xmm14
    aesdec  %xmm4, %xmm15

    aesdec  %xmm5, %xmm1
    aesdec  %xmm5, %xmm2
    aesdec  %xmm5, %xmm14
    aesdec  %xmm5, %xmm15

    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm14
    aesdec  %xmm6, %xmm15

    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm14
    aesdec  %xmm7, %xmm15

    aesdec  %xmm8, %xmm1
    aesdec  %xmm8, %xmm2
    aesdec  %xmm8, %xmm14
    aesdec  %xmm8, %xmm15

    aesdec  %xmm9, %xmm1
    aesdec  %xmm9, %xmm2
    aesdec  %xmm9, %xmm14
    aesdec  %xmm9, %xmm15

    aesdec  %xmm10, %xmm1
    aesdec  %xmm10, %xmm2
    aesdec  %xmm10, %xmm14
    aesdec  %xmm10, %xmm15

    aesdec  %xmm11, %xmm1
    aesdec  %xmm11, %xmm2
    aesdec  %xmm11, %xmm14
    aesdec  %xmm11, %xmm15

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  48(ctx), %xmm12

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  32(ctx), %xmm13

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  16(ctx), %xmm12

    aesdec  %xmm13, %xmm1
    aesdec  %xmm13, %xmm2
    aesdec  %xmm13, %xmm14
    aesdec  %xmm13, %xmm15
    movups  (ctx), %xmm13

    aesdec  %xmm12, %xmm1
    aesdec  %xmm12, %xmm2
    aesdec  %xmm12, %xmm14
    aesdec  %xmm12, %xmm15
    movups  80(ctx), %xmm12     // restore %xmm12 to its original key

    aesdeclast  %xmm13, %xmm1
    aesdeclast  %xmm13, %xmm2
    aesdeclast  %xmm13, %xmm14
    aesdeclast  %xmm13, %xmm15
    movups  64(ctx), %xmm13     // restore %xmm13 to its original key

    pxor    iv, %xmm1           // obuf[0] ^= *iv;
    movups  (ibuf), iv          // ibuf[0]
    pxor    iv, %xmm2           // obuf[1] ^= ibuf[0]
    movups  16(ibuf), iv        // ibuf[1]
    pxor    iv, %xmm14          // obuf[2] ^= ibuf[1]
    movups  32(ibuf), iv        // ibuf[2]
    pxor    iv, %xmm15          // obuf[3] ^= ibuf[2]
    movups  48(ibuf), iv        // *iv = ibuf[3]

    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm14, 32(obuf)    // write 3rd obuf
    movups  %xmm15, 48(obuf)    // write 4th obuf

    add     $64, ibuf           // ibuf += AES_BLOCK_SIZE*4;
    add     $64, obuf           // obuf += AES_BLOCK_SIZE*4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop

9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, no further processing needed

    movups  48(ctx), %xmm14     // restore %xmm14 to its key
    movups  32(ctx), %xmm15     // restore %xmm15 to its key

#else

    sub     $4, num_blk         // pre decrement num_blk by 4
    jl      9f                  // if num_blk < 4, skip the per-4-blocks processing code
0:
    movups  (ibuf), %xmm1       // tmp = 1st ibuf
    movups  16(ibuf), %xmm2     // tmp = 2nd ibuf
    movups  32(ibuf), %xmm4     // tmp = 3rd ibuf
    movups  48(ibuf), %xmm5     // tmp = 4th ibuf

    // aes_decrypt
    // for i386, sequentially load expanded keys into xmm6/xmm7
    movups  208(ctx), %xmm6
    pxor    %xmm3, %xmm1
    pxor    %xmm3, %xmm2
    pxor    %xmm3, %xmm4
    pxor    %xmm3, %xmm5

    movups  192(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  176(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  160(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  144(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  128(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  112(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  96(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  80(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  64(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  48(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  32(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    movups  16(ctx), %xmm6
    aesdec  %xmm7, %xmm1
    aesdec  %xmm7, %xmm2
    aesdec  %xmm7, %xmm4
    aesdec  %xmm7, %xmm5

    movups  0(ctx), %xmm7
    aesdec  %xmm6, %xmm1
    aesdec  %xmm6, %xmm2
    aesdec  %xmm6, %xmm4
    aesdec  %xmm6, %xmm5

    aesdeclast  %xmm7, %xmm1
    aesdeclast  %xmm7, %xmm2
    aesdeclast  %xmm7, %xmm4
    aesdeclast  %xmm7, %xmm5

    pxor    iv, %xmm1           // 1st obuf ^= iv;
    movups  (ibuf), iv          // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm2           // 2nd obuf ^= iv;
    movups  16(ibuf), iv        // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm4           // 3rd obuf ^= iv;
    movups  32(ibuf), iv        // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
    pxor    iv, %xmm5           // 4th obuf ^= iv;
    movups  48(ibuf), iv        // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
    movups  %xmm1, (obuf)       // write 1st obuf
    movups  %xmm2, 16(obuf)     // write 2nd obuf
    movups  %xmm4, 32(obuf)     // write 3rd obuf
    movups  %xmm5, 48(obuf)     // write 4th obuf

    add     $64, ibuf           // ibuf += AES_BLOCK_SIZE * 4;
    add     $64, obuf           // obuf += AES_BLOCK_SIZE * 4;

    sub     $4, num_blk         // num_blk -= 4
    jge     0b                  // if num_blk >= 0, repeat the loop


9:  add     $4, num_blk         // post increment num_blk by 4
    je      L_HW_cbc_done       // if num_blk == 0, no further processing needed

    movups  208(ctx), %xmm4
    movups  192(ctx), %xmm5
    movups  176(ctx), %xmm6
    movups  160(ctx), %xmm7

#endif

0:
    movups  (ibuf), %xmm2       // tmp = ibuf

    // aes_decrypt
    pxor    %xmm3, %xmm2
    aesdec  %xmm4, %xmm2
    aesdec  %xmm5, %xmm2
    aesdec  %xmm6, %xmm2
    aesdec  %xmm7, %xmm2
#if defined __x86_64__
    aesdec  %xmm8, %xmm2
    aesdec  %xmm9, %xmm2
    aesdec  %xmm10, %xmm2
    aesdec  %xmm11, %xmm2
    aesdec  %xmm12, %xmm2
    aesdec  %xmm13, %xmm2
    aesdec  %xmm14, %xmm2
    aesdec  %xmm15, %xmm2
#else
    movups  144(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  128(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  112(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  96(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  80(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  64(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  48(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  32(ctx), %xmm1
    aesdec  %xmm1, %xmm2
#endif
    movups  16(ctx), %xmm1
    aesdec  %xmm1, %xmm2
    movups  (ctx), %xmm1
    aesdeclast  %xmm1, %xmm2

    pxor    iv, %xmm2           // obuf ^= iv;
    movups  (ibuf), iv          // memcpy(iv, tmp, AES_BLOCK_SIZE);

    movups  %xmm2, (obuf)       // write obuf

    add     $16, ibuf           // ibuf += AES_BLOCK_SIZE;
    add     $16, obuf           // obuf += AES_BLOCK_SIZE;
    sub     $1, num_blk         // num_blk--
    jg      0b                  // if num_blk > 0, repeat the loop

    jmp     L_HW_cbc_done

    //
    // --------- END of aes_decrypt_cbc_hw -------------------
    //