/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
	This file provides an arm64 hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).
	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c);
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
			W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c);
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
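
	as a concrete reference, the round above in plain C (a sketch following the
	FIPS 180-4 definitions; ror32 and the helper names are ours, not from sha2.c):

		#include <stdint.h>

		static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

		static uint32_t Ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) ^ (~x & z); }
		static uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }
		static uint32_t Sigma0(uint32_t x) { return ror32(x,  2) ^ ror32(x, 13) ^ ror32(x, 22); }
		static uint32_t Sigma1(uint32_t x) { return ror32(x,  6) ^ ror32(x, 11) ^ ror32(x, 25); }
		static uint32_t sigma0(uint32_t x) { return ror32(x,  7) ^ ror32(x, 18) ^ (x >>  3); }
		static uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }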

	In the assembly implementation:
		- a circular window of the message schedule W(r:r+15) is updated and stored in q0-q3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) are kept in GPRs or memory
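
	a C sketch of that circular window (our illustration, with hypothetical names;
	it uses sigma0/sigma1 from the sketch above): only the most recent 16 schedule
	words are kept, and slot i%16 is refilled for round i+16 once round i has
	consumed it:

		// one 4-round refill step: rounds r..r+3 have consumed W/WK slots
		// (r..r+3)%16, which are rewritten for rounds r+16..r+19
		static void update_W_WK(uint32_t W[16], uint32_t WK[16],
		                        const uint32_t K[64], int r)   // r = 0, 4, ..., 44
		{
		    for (int i = r; i < r + 4; i++) {
		        W[i % 16] += sigma1(W[(i + 14) % 16]) + W[(i + 9) % 16]
		                   + sigma0(W[(i + 1) % 16]);          // W[i+16] overwrites W[i]
		        WK[i % 16] = W[i % 16] + K[i + 16];
		    }
		}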

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0; r<48; r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48; r<64; r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------
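
	the same per-block flow in C (a sketch built from the helpers above; the
	function and parameter names are ours):

		static void sha256_block(uint32_t state[8], const unsigned char in[64],
		                         const uint32_t K[64])
		{
		    uint32_t W[16], WK[16], s[8];

		    // W(0:15): big-endian (per 4 bytes) load of the 64-byte block,
		    // plus the W+K pre-calculation
		    for (int i = 0; i < 16; i++) {
		        W[i] = ((uint32_t)in[4*i] << 24) | ((uint32_t)in[4*i+1] << 16)
		             | ((uint32_t)in[4*i+2] <<  8) |  (uint32_t)in[4*i+3];
		        WK[i] = W[i] + K[i];
		    }

		    for (int i = 0; i < 8; i++) s[i] = state[i];    // digests a..h

		    for (int r = 0; r < 64; r += 4) {
		        for (int j = r; j < r + 4; j++) {           // update and permute
		            uint32_t T1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + WK[j % 16];
		            uint32_t T2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);
		            s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + T1;
		            s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = T1 + T2;
		        }
		        if (r < 48)                                 // refill the window
		            update_W_WK(W, WK, K, r);
		    }

		    for (int i = 0; i < 8; i++) state[i] += s[i];   // ctx->states += digests
		}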

	our implementation (which allows multiple blocks per call) pipelines the loading of W/WK
	of a future block into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0; r<48; r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0) jmp L_last_block;

	for (r=48; r<64; r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->states += digests a-h;

	jmp L_loop;

L_last_block:

	for (r=48; r<64; r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------
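
	in C terms the multi-block entry point is just a loop over the per-block
	compress; the assembly wins by folding the next block's W/WK preparation into
	rounds 48:63 of the current block, hiding the load/byte-swap latency. a sketch
	using sha256_block from above (assuming ctx->state holds the 8 chaining words
	and K256 is the round-constant table):

		void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks)
		{
		    while (num_blocks-- > 0) {
		        sha256_block(ctx->state, (const unsigned char *)data, K256);
		        data += 64;                     // one 64-byte block per iteration
		    }
		}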

	Apple CoreOS vector & numerics
*/

#if defined(__arm64__)

#include "arm64_isa_compatibility.h"

	.subsections_via_symbols

	.globl	_AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:

	uxtw	numblocks, numblocks	// in arm64_32, size_t is 32-bit, so zero-extend it

	adrp	ktable, K256@page
	cbnz	numblocks, 1f			// if the number of blocks is nonzero, go on with the sha256 transform
	ret		lr						// otherwise, return
1:
	add		ktable, ktable, K256@pageoff

	// save q0-q7 and q16-q24 (8 + 8 + 1 = 17 vector registers)
	sub		x4, sp, #17*16
	sub		sp, sp, #17*16
	st1.4s	{v0, v1, v2, v3}, [x4], #64
	st1.4s	{v4, v5, v6, v7}, [x4], #64
	st1.4s	{v16, v17, v18, v19}, [x4], #64
	st1.4s	{v20, v21, v22, v23}, [x4], #64
	st1.4s	{v24}, [x4], #16
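	// (the matching ld1.4s sequence at the end of this routine reloads these 17 registers)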

	ld1.4s	{v0, v1, v2, v3}, [data], #64	// w0,w1,w2,w3 need to be byte-swapped into big-endian

	rev32.16b	v0, v0				// byte swap of 1st 4 ints
	ldr		q21, [ktable, #16*0]
	rev32.16b	v1, v1				// byte swap of 2nd 4 ints
	ldr		q16, [hashes, #0]
	rev32.16b	v2, v2				// byte swap of 3rd 4 ints
	ldr		q17, [hashes, #16]
	rev32.16b	v3, v3				// byte swap of 4th 4 ints
	ldr		q22, [ktable, #16*1]

	ldr		q23, [ktable, #16*2]
	add.4s	v4, v0, v21				// 1st 4 input words + K256
	ldr		q24, [ktable, #16*3]
	add.4s	v5, v1, v22				// 2nd 4 input words + K256
	add.4s	v6, v2, v23				// 3rd 4 input words + K256
	add.4s	v7, v3, v24				// 4th 4 input words + K256
	add		ktable, ktable, #16*4
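	// v4-v7 now hold W+K for rounds 0:15, and ktable points at K256[16]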

	// 4 vector hashes update and load next vector rounds
	.macro sha256_hash_load_round

	.macro sha256_hash_round

	// 12 vector hash and sequence update rounds

	ldr		q21, [ktable, #0]		// k0
	ldr		q22, [ktable, #16]		// k1
	ldr		q23, [ktable, #32]		// k2
	ldr		q24, [ktable, #48]		// k3

	add		ktable, ktable, #64

	sha256_round	0, 1, 2, 3, 4, v0, v4, v21
	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24
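	// the rotated index arguments (0,1,2,3 -> 1,2,3,0 -> ...) implement the
	// a,b,c,d,e,f,g,h -> h,a,b,c,d,e,f,g digest permutation across the 4 rounds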

	subs	numblocks, numblocks, #1	// pre-decrement num_blocks by 1

	sub		ktable, ktable, #256		// rewind ktable to the start of K256

	ldr		q21, [ktable, #0]		// k0
	ldr		q22, [ktable, #16]		// k1
	ldr		q23, [ktable, #32]		// k2
	ldr		q24, [ktable, #48]		// k3

	sha256_hash_load_round	4, v0, v4, v21
	sha256_hash_load_round	5, v1, v5, v22
	sha256_hash_load_round	6, v2, v6, v23
	sha256_hash_load_round	7, v3, v7, v24

	// 12 vector hash and sequence update rounds

	ldr		q21, [ktable, #16*4]	// k0
	ldr		q22, [ktable, #16*5]	// k1
	ldr		q23, [ktable, #16*6]	// k2
	ldr		q24, [ktable, #16*7]	// k3

	sha256_round	0, 1, 2, 3, 4, v0, v4, v21
	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	ldr		q21, [ktable, #16*8]	// k0
	ldr		q22, [ktable, #16*9]	// k1
	ldr		q23, [ktable, #16*10]	// k2
	ldr		q24, [ktable, #16*11]	// k3

	sha256_round	0, 1, 2, 3, 4, v0, v4, v21
	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	ldr		q21, [ktable, #16*12]	// k0
	ldr		q22, [ktable, #16*13]	// k1
	ldr		q23, [ktable, #16*14]	// k2
	ldr		q24, [ktable, #16*15]	// k3

	sha256_round	0, 1, 2, 3, 4, v0, v4, v21
	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	subs	numblocks, numblocks, #1	// pre-decrement num_blocks by 1

	st1.4s	{v16, v17}, [hashes]		// hashes q16 : d,c,b,a   q17 : h,g,f,e

	// restore q0-q7 and q16-q24
	ld1.4s	{v0, v1, v2, v3}, [sp], #64
	ld1.4s	{v4, v5, v6, v7}, [sp], #64
	ld1.4s	{v16, v17, v18, v19}, [sp], #64
	ld1.4s	{v20, v21, v22, v23}, [sp], #64
	ld1.4s	{v24}, [sp], #16