/* Source: apple/xnu (tag xnu-7195.60.75), file osfmk/arm64/corecrypto/sha256_compress_arm64.s */
1 /*
2 * Copyright (c) 2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
 29 This file provides an arm64 (SHA-2 crypto extensions) hand implementation of the following function
30
31 void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
32
33 which is a C function in sha2.c (from xnu).
34
35 sha256 algorithm per block description:
36
37 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
38 2. load 8 digests a-h from ctx->state
39 3. for r = 0:15
40 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
41 d += T1;
42 h = T1 + Sigma0(a) + Maj(a,b,c)
43 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
44 4. for r = 16:63
45 W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
46 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
47 d += T1;
48 h = T1 + Sigma0(a) + Maj(a,b,c)
49 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
50
51 In the assembly implementation:
52 - a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
53 - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
54 - the 8 digests (a-h) will be stored in GPR or memory
55
56 the implementation per block looks like
57
58 ----------------------------------------------------------------------------
59
60 load W(0:15) (big-endian per 4 bytes) into q0:q3
61 pre_calculate and store W+K(0:15) in stack
62
63 load digests a-h from ctx->state;
64
65 for (r=0;r<48;r+=4) {
66 digests a-h update and permute round r:r+3
67 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
68 }
69
70 for (r=48;r<64;r+=4) {
71 digests a-h update and permute round r:r+3
72 }
73
74 ctx->states += digests a-h;
75
76 ----------------------------------------------------------------------------
77
78 our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
79 into the last 16 rounds of its previous block:
80
81 ----------------------------------------------------------------------------
82
83 load W(0:15) (big-endian per 4 bytes) into q0:q3
84 pre_calculate and store W+K(0:15) in stack
85
86 L_loop:
87
88 load digests a-h from ctx->state;
89
90 for (r=0;r<48;r+=4) {
91 digests a-h update and permute round r:r+3
92 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
93 }
94
95 num_block--;
96 if (num_block==0) jmp L_last_block;
97
98 for (r=48;r<64;r+=4) {
99 digests a-h update and permute round r:r+3
100 load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
101 pre_calculate and store W+K([r:r+3]%16) in stack
102 }
103
104 ctx->states += digests a-h;
105
106 jmp L_loop;
107
108 L_last_block:
109
110 for (r=48;r<64;r+=4) {
111 digests a-h update and permute round r:r+3
112 }
113
114 ctx->states += digests a-h;
115
116 ------------------------------------------------------------------------
117
118 Apple CoreOS vector & numerics
119 */
120
121 #if defined(__arm64__)
122
123 #include "arm64_isa_compatibility.h"
124
125 .subsections_via_symbols
126 .text
127
128 .p2align 4
129
130 K256:
131 .long 0x428a2f98
132 .long 0x71374491
133 .long 0xb5c0fbcf
134 .long 0xe9b5dba5
135 .long 0x3956c25b
136 .long 0x59f111f1
137 .long 0x923f82a4
138 .long 0xab1c5ed5
139 .long 0xd807aa98
140 .long 0x12835b01
141 .long 0x243185be
142 .long 0x550c7dc3
143 .long 0x72be5d74
144 .long 0x80deb1fe
145 .long 0x9bdc06a7
146 .long 0xc19bf174
147 .long 0xe49b69c1
148 .long 0xefbe4786
149 .long 0x0fc19dc6
150 .long 0x240ca1cc
151 .long 0x2de92c6f
152 .long 0x4a7484aa
153 .long 0x5cb0a9dc
154 .long 0x76f988da
155 .long 0x983e5152
156 .long 0xa831c66d
157 .long 0xb00327c8
158 .long 0xbf597fc7
159 .long 0xc6e00bf3
160 .long 0xd5a79147
161 .long 0x06ca6351
162 .long 0x14292967
163 .long 0x27b70a85
164 .long 0x2e1b2138
165 .long 0x4d2c6dfc
166 .long 0x53380d13
167 .long 0x650a7354
168 .long 0x766a0abb
169 .long 0x81c2c92e
170 .long 0x92722c85
171 .long 0xa2bfe8a1
172 .long 0xa81a664b
173 .long 0xc24b8b70
174 .long 0xc76c51a3
175 .long 0xd192e819
176 .long 0xd6990624
177 .long 0xf40e3585
178 .long 0x106aa070
179 .long 0x19a4c116
180 .long 0x1e376c08
181 .long 0x2748774c
182 .long 0x34b0bcb5
183 .long 0x391c0cb3
184 .long 0x4ed8aa4a
185 .long 0x5b9cca4f
186 .long 0x682e6ff3
187 .long 0x748f82ee
188 .long 0x78a5636f
189 .long 0x84c87814
190 .long 0x8cc70208
191 .long 0x90befffa
192 .long 0xa4506ceb
193 .long 0xbef9a3f7
194 .long 0xc67178f2
195
196
197 .p2align 4
198
199 .globl _AccelerateCrypto_SHA256_compress
200 _AccelerateCrypto_SHA256_compress:
201
202
203 #define hashes x0
204 #define numblocks x1
205 #define data x2
206 #define ktable x3
207
208 #ifdef __ILP32__
209 uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
210 #endif
211
212
213 adrp ktable, K256@page
214 cbnz numblocks, 1f // if number of blocks is nonzero, go on for sha256 transform operation
215 ret lr // otherwise, return
216 1:
217 add ktable, ktable, K256@pageoff
218
219 #if BUILDKERNEL
220 // save q0-q7, q16-q24 8+8+1=19
221 sub x4, sp, #17*16
222 sub sp, sp, #17*16
223 st1.4s {v0, v1, v2, v3}, [x4], #64
224 st1.4s {v4, v5, v6, v7}, [x4], #64
225 st1.4s {v16, v17, v18, v19}, [x4], #64
226 st1.4s {v20, v21, v22, v23}, [x4], #64
227 st1.4s {v24}, [x4], #16
228 #endif
229
230 ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
231
232 rev32.16b v0, v0 // byte swap of 1st 4 ints
233 ldr q21, [ktable, #16*0]
234 rev32.16b v1, v1 // byte swap of 2nd 4 ints
235 ldr q16, [hashes, #0]
236 rev32.16b v2, v2 // byte swap of 3rd 4 ints
237 ldr q17, [hashes, #16]
238 rev32.16b v3, v3 // byte swap of 4th 4 ints
239 ldr q22, [ktable, #16*1]
240
241 mov.16b v18, v16
242 ldr q23, [ktable, #16*2]
243 add.4s v4, v0, v21 // 1st 4 input + K256
244 ldr q24, [ktable, #16*3]
245 add.4s v5, v1, v22 // 2nd 4 input + K256
246 mov.16b v19, v17
247 add.4s v6, v2, v23 // 3rd 4 input + K256
248 add.4s v7, v3, v24 // 4th 4 input + K256
249 add ktable, ktable, #16*4
250
251
252 .macro sha256_round
253 mov.16b v20, v18
254 SHA256SU0 $0, $1
255 SHA256H 18, 19, $4
256 SHA256SU1 $0, $2, $3
257 SHA256H2 19, 20, $4
258 add.4s $6, $5, $7
259 .endm
260
261 // 4 vector hashes update and load next vector rounds
262 .macro sha256_hash_load_round
263 mov.16b v20, v18
264 SHA256H 18, 19, $0
265 rev32.16b $1, $1
266 SHA256H2 19, 20, $0
267 add.4s $2, $1, $3
268 .endm
269
270 .macro sha256_hash_round
271 mov.16b v20, v18
272 SHA256H 18, 19, $0
273 SHA256H2 19, 20, $0
274 .endm
275
276 // 12 vector hash and sequence update rounds
277 mov w4, #3
278 L_i_loop:
279 mov.16b v20, v18
280 ldr q21, [ktable, #0] // k0
281 SHA256SU0 0, 1
282 ldr q22, [ktable, #16] // k1
283 SHA256H 18, 19, 4
284 ldr q23, [ktable, #32] // k2
285 SHA256SU1 0, 2, 3
286 ldr q24, [ktable, #48] // k3
287 SHA256H2 19, 20, 4
288 add ktable, ktable, #64
289 add.4s v4, v0, v21
290
291 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
292 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
293 subs w4, w4, #1
294 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
295 b.gt L_i_loop
296
297 subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
298 b.le L_wrapup
299
300 sub ktable, ktable, #256
301
302 L_loop:
303
304 ldr q0, [data, #0]
305 mov.16b v20, v18
306 ldr q21, [ktable,#0]
307 SHA256H 18, 19, 4
308 ldr q1, [data, #16]
309 rev32.16b v0, v0
310 ldr q2, [data, #32]
311 SHA256H2 19, 20, 4
312 ldr q3, [data, #48]
313 add.4s v4, v0, v21
314
315 ldr q22, [ktable,#16]
316 mov.16b v20, v18
317 add data, data, #64
318 SHA256H 18, 19, 5
319 ldr q23, [ktable,#32]
320 rev32.16b v1, v1
321 ldr q24, [ktable,#48]
322 SHA256H2 19, 20, 5
323 add.4s v5, v1, v22
324
325 sha256_hash_load_round 6, v2, v6, v23
326 sha256_hash_load_round 7, v3, v7, v24
327
328 add.4s v18, v16, v18
329 add.4s v19, v17, v19
330 mov.16b v16, v18
331 mov.16b v17, v19
332
333 // 12 vector hash and sequence update rounds
334 mov.16b v20, v18
335 ldr q21, [ktable, #16*4] // k0
336 SHA256SU0 0, 1
337 ldr q22, [ktable, #16*5] // k1
338 SHA256H 18, 19, 4
339 ldr q23, [ktable, #16*6] // k2
340 SHA256SU1 0, 2, 3
341 ldr q24, [ktable, #16*7] // k3
342 SHA256H2 19, 20, 4
343 add.4s v4, v0, v21
344
345 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
346 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
347 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
348 mov.16b v20, v18
349 ldr q21, [ktable, #16*8] // k0
350 SHA256SU0 0, 1
351 ldr q22, [ktable, #16*9] // k1
352 SHA256H 18, 19, 4
353 ldr q23, [ktable, #16*10] // k2
354 SHA256SU1 0, 2, 3
355 ldr q24, [ktable, #16*11] // k3
356 SHA256H2 19, 20, 4
357 add.4s v4, v0, v21
358
359 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
360 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
361 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
362
363 mov.16b v20, v18
364 ldr q21, [ktable, #16*12] // k0
365 SHA256SU0 0, 1
366 ldr q22, [ktable, #16*13] // k1
367 SHA256H 18, 19, 4
368 ldr q23, [ktable, #16*14] // k2
369 SHA256SU1 0, 2, 3
370 ldr q24, [ktable, #16*15] // k3
371 SHA256H2 19, 20, 4
372 add.4s v4, v0, v21
373
374 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
375 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
376 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
377
378 subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
379 b.gt L_loop
380
381 L_wrapup:
382
383 sha256_hash_round 4
384 sha256_hash_round 5
385 sha256_hash_round 6
386 sha256_hash_round 7
387
388 add.4s v16, v16, v18
389 add.4s v17, v17, v19
390 st1.4s {v16,v17}, [hashes] // hashes q16 : d,c,b,a q17 : h,g,f,e
391
392 #if BUILDKERNEL
393 // restore q9-q13, q0-q7, q16-q31
394 ld1.4s {v0, v1, v2, v3}, [sp], #64
395 ld1.4s {v4, v5, v6, v7}, [sp], #64
396 ld1.4s {v16, v17, v18, v19}, [sp], #64
397 ld1.4s {v20, v21, v22, v23}, [sp], #64
398 ld1.4s {v24}, [sp], #16
399 #endif
400
401 ret lr
402
403
404 #endif // arm64
405