/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
	This file provides an arm64 hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).
	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c);
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
			W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c);
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
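
	as a concrete reference, the round above in plain C (a sketch following the
	FIPS 180-4 definitions; ror32 and the helper names are ours, not from sha2.c):

		#include <stdint.h>

		static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

		static uint32_t Ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) ^ (~x & z); }
		static uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }
		static uint32_t Sigma0(uint32_t x) { return ror32(x,  2) ^ ror32(x, 13) ^ ror32(x, 22); }
		static uint32_t Sigma1(uint32_t x) { return ror32(x,  6) ^ ror32(x, 11) ^ ror32(x, 25); }
		static uint32_t sigma0(uint32_t x) { return ror32(x,  7) ^ ror32(x, 18) ^ (x >>  3); }
		static uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }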

	In the assembly implementation:
		- a circular window of the message schedule W(r:r+15) is updated and stored in q0-q3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) are kept in GPRs or memory
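
	a C sketch of that circular window (our illustration, with hypothetical names;
	it uses sigma0/sigma1 from the sketch above): only the most recent 16 schedule
	words are kept, and slot i%16 is refilled for round i+16 once round i has
	consumed it:

		// one 4-round refill step: rounds r..r+3 have consumed W/WK slots
		// (r..r+3)%16, which are rewritten for rounds r+16..r+19
		static void update_W_WK(uint32_t W[16], uint32_t WK[16],
		                        const uint32_t K[64], int r)   // r = 0, 4, ..., 44
		{
		    for (int i = r; i < r + 4; i++) {
		        W[i % 16] += sigma1(W[(i + 14) % 16]) + W[(i + 9) % 16]
		                   + sigma0(W[(i + 1) % 16]);          // W[i+16] overwrites W[i]
		        WK[i % 16] = W[i % 16] + K[i + 16];
		    }
		}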

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0; r<48; r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48; r<64; r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------
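
	the same per-block flow in C (a sketch built from the helpers above; the
	function and parameter names are ours):

		static void sha256_block(uint32_t state[8], const unsigned char in[64],
		                         const uint32_t K[64])
		{
		    uint32_t W[16], WK[16], s[8];

		    // W(0:15): big-endian (per 4 bytes) load of the 64-byte block,
		    // plus the W+K pre-calculation
		    for (int i = 0; i < 16; i++) {
		        W[i] = ((uint32_t)in[4*i] << 24) | ((uint32_t)in[4*i+1] << 16)
		             | ((uint32_t)in[4*i+2] <<  8) |  (uint32_t)in[4*i+3];
		        WK[i] = W[i] + K[i];
		    }

		    for (int i = 0; i < 8; i++) s[i] = state[i];    // digests a..h

		    for (int r = 0; r < 64; r += 4) {
		        for (int j = r; j < r + 4; j++) {           // update and permute
		            uint32_t T1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + WK[j % 16];
		            uint32_t T2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);
		            s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + T1;
		            s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = T1 + T2;
		        }
		        if (r < 48)                                 // refill the window
		            update_W_WK(W, WK, K, r);
		    }

		    for (int i = 0; i < 8; i++) state[i] += s[i];   // ctx->states += digests
		}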

	our implementation (which allows multiple blocks per call) pipelines the loading of W/WK
	of a future block into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0; r<48; r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0) jmp L_last_block;

	for (r=48; r<64; r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->states += digests a-h;

	jmp L_loop;

L_last_block:

	for (r=48; r<64; r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------
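
	in C terms the multi-block entry point is just a loop over the per-block
	compress; the assembly wins by folding the next block's W/WK preparation into
	rounds 48:63 of the current block, hiding the load/byte-swap latency. a sketch
	using sha256_block from above (assuming ctx->state holds the 8 chaining words
	and K256 is the round-constant table):

		void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks)
		{
		    while (num_blocks-- > 0) {
		        sha256_block(ctx->state, (const unsigned char *)data, K256);
		        data += 64;                     // one 64-byte block per iteration
		    }
		}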

	Apple CoreOS vector & numerics
*/

#if defined(__arm64__)

#include "arm64_isa_compatibility.h"

	.subsections_via_symbols

	.globl	_AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:

	uxtw	numblocks, numblocks	// in arm64_32, size_t is 32-bit, so zero-extend it

	adrp	ktable, K256@page
	cbnz	numblocks, 1f			// if the number of blocks is nonzero, go on with the sha256 transform
	ret		lr						// otherwise, return
1:
	add		ktable, ktable, K256@pageoff

	// save q0-q7 and q16-q24 (8 + 8 + 1 = 17 vector registers)
	sub		x4, sp, #17*16
	sub		sp, sp, #17*16
	st1.4s	{v0, v1, v2, v3}, [x4], #64
	st1.4s	{v4, v5, v6, v7}, [x4], #64
	st1.4s	{v16, v17, v18, v19}, [x4], #64
	st1.4s	{v20, v21, v22, v23}, [x4], #64
	st1.4s	{v24}, [x4], #16
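	// (the matching ld1.4s sequence at the end of this routine reloads these 17 registers)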

	ld1.4s	{v0, v1, v2, v3}, [data], #64	// w0,w1,w2,w3 need to be byte-swapped into big-endian

	rev32.16b	v0, v0				// byte swap of 1st 4 ints
	ldr		q21, [ktable, #16*0]
	rev32.16b	v1, v1				// byte swap of 2nd 4 ints
	ldr		q16, [hashes, #0]
	rev32.16b	v2, v2				// byte swap of 3rd 4 ints
	ldr		q17, [hashes, #16]
	rev32.16b	v3, v3				// byte swap of 4th 4 ints
	ldr		q22, [ktable, #16*1]

	ldr		q23, [ktable, #16*2]
	add.4s	v4, v0, v21				// 1st 4 input words + K256
	ldr		q24, [ktable, #16*3]
	add.4s	v5, v1, v22				// 2nd 4 input words + K256
	add.4s	v6, v2, v23				// 3rd 4 input words + K256
	add.4s	v7, v3, v24				// 4th 4 input words + K256
	add		ktable, ktable, #16*4
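	// v4-v7 now hold W+K for rounds 0:15, and ktable points at K256[16]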

	// 4 vector hashes update and load next vector rounds
	.macro sha256_hash_load_round

	.macro sha256_hash_round

	// 12 vector hash and sequence update rounds

	ldr		q21, [ktable, #0]		// k0
	ldr		q22, [ktable, #16]		// k1
	ldr		q23, [ktable, #32]		// k2
	ldr		q24, [ktable, #48]		// k3

	add		ktable, ktable, #64

	sha256_round	0, 1, 2, 3, 4, v0, v4, v21
	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24
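	// the rotated index arguments (0,1,2,3 -> 1,2,3,0 -> ...) implement the
	// a,b,c,d,e,f,g,h -> h,a,b,c,d,e,f,g digest permutation across the 4 rounds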

	subs	numblocks, numblocks, #1	// pre-decrement num_blocks by 1

	sub		ktable, ktable, #256		// rewind ktable to the start of K256

	ldr		q21, [ktable, #0]		// k0
	ldr		q22, [ktable, #16]		// k1
	ldr		q23, [ktable, #32]		// k2
	ldr		q24, [ktable, #48]		// k3

	sha256_hash_load_round	4, v0, v4, v21
	sha256_hash_load_round	5, v1, v5, v22
	sha256_hash_load_round	6, v2, v6, v23
	sha256_hash_load_round	7, v3, v7, v24

	// 12 vector hash and sequence update rounds

	ldr		q21, [ktable, #16*4]	// k0
	ldr		q22, [ktable, #16*5]	// k1
	ldr		q23, [ktable, #16*6]	// k2
	ldr		q24, [ktable, #16*7]	// k3

	sha256_round	0, 1, 2, 3, 4, v0, v4, v21
	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	ldr		q21, [ktable, #16*8]	// k0
	ldr		q22, [ktable, #16*9]	// k1
	ldr		q23, [ktable, #16*10]	// k2
	ldr		q24, [ktable, #16*11]	// k3

	sha256_round	0, 1, 2, 3, 4, v0, v4, v21
	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	ldr		q21, [ktable, #16*12]	// k0
	ldr		q22, [ktable, #16*13]	// k1
	ldr		q23, [ktable, #16*14]	// k2
	ldr		q24, [ktable, #16*15]	// k3

	sha256_round	0, 1, 2, 3, 4, v0, v4, v21
	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	subs	numblocks, numblocks, #1	// pre-decrement num_blocks by 1

	st1.4s	{v16, v17}, [hashes]		// hashes q16 : d,c,b,a   q17 : h,g,f,e

	// restore q0-q7 and q16-q24
	ld1.4s	{v0, v1, v2, v3}, [sp], #64
	ld1.4s	{v4, v5, v6, v7}, [sp], #64
	ld1.4s	{v16, v17, v18, v19}, [sp], #64
	ld1.4s	{v20, v21, v22, v23}, [sp], #64
	ld1.4s	{v24}, [sp], #16