2 This file provides a hand-written x86_64/i386 implementation of the following function
4 void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
6 which is a C function in sha2.c (from xnu).
8 The code SHA256_Transform_nossse3 is a clone of SHA256_Transform
9 with all ssse3 instructions replaced with sse3 or below instructions.
11 For performance reasons, this function should not be called directly. This file should work
12 together with the one that implements SHA256_Transform. There, cpu_capabilities is probed to detect
13 ssse3. If ssse3 is not supported, execution branches to this no-ssse3-specific function.
15 sha256 algorithm per block description:
17 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
18 2. load 8 digests a-h from ctx->state
20 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
22 h = T1 + Sigma0(a) + Maj(a,b,c)
23 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
25 W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
26 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
28 h = T1 + Sigma0(a) + Maj(a,b,c)
29 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
31 In the assembly implementation:
32 - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
33 - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
34 - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)
36 the implementation per block looks like
38 ----------------------------------------------------------------------------
40 load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
41 pre_calculate and store W+K(0:15) in stack
43 load digests a-h from ctx->state;
46 digests a-h update and permute round r:r+3
47 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
50 for (r=48;r<64;r+=4) {
51 digests a-h update and permute round r:r+3
54 ctx->state += digests a-h;
56 ----------------------------------------------------------------------------
58 our implementation (which allows multiple blocks per call) pipelines the loading of W/WK for a future block
59 into the last 16 rounds of the previous block:
61 ----------------------------------------------------------------------------
63 load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
64 pre_calculate and store W+K(0:15) in stack
68 load digests a-h from ctx->state;
71 digests a-h update and permute round r:r+3
72 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
76 if (num_block==0) jmp L_last_block;
78 for (r=48;r<64;r+=4) {
79 digests a-h update and permute round r:r+3
80 load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
81 pre_calculate and store W+K([r:r+3]%16) in stack
84 ctx->state += digests a-h;
90 for (r=48;r<64;r+=4) {
91 digests a-h update and permute round r:r+3
94 ctx->state += digests a-h;
96 ------------------------------------------------------------------------
98 Apple CoreOS vector & numerics
103 #include <i386/cpu_capabilities.h>
105 #include <System/i386/cpu_capabilities.h>
108 // associate variables with registers or memory
110 #if defined (__x86_64__)
114 #define num_blocks %rdx
126 #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)
128 #define xmm_save 80(sp) // starting address for xmm save/restore
131 #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
132 #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument
133 #define data_addr 24+stack_size(sp) // 2nd caller argument
134 #define num_blocks 28+stack_size(sp) // 3rd caller argument
145 #define K 76(sp) // pointer to K256[] table
146 #define xmm_save 96(sp) // starting address for xmm save/restore
153 // a window (16 words) of message schedule
159 // circular buffer for WK[(r:r+15)%16]
160 #define WK(x) (x&15)*4(sp)
162 // #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
170 xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
173 // #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
180 xor s, t // (x&y) ^ (y&z)
183 xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
186 /* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
187 // #define R(b,x) ((x) >> (b))
188 /* 32-bit Rotate-right (used in SHA-256): */
189 // #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b))))
191 // #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
193 // performs sigma0_256 on 4 words in an xmm register
194 // use xmm6/xmm7 as intermediate registers
198 psrld $$3, $0 // SHR3(x)
199 psrld $$7, %xmm6 // part of ROTR7
200 pslld $$14, %xmm7 // part of ROTR18
203 psrld $$11, %xmm6 // part of ROTR18
204 pslld $$11, %xmm7 // part of ROTR7
209 // #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
211 // performs sigma1_256 on 4 words in an xmm register
212 // use xmm6/xmm7 as intermediate registers
216 psrld $$10, $0 // SHR10(x)
217 psrld $$17, %xmm6 // part of ROTR17
219 pslld $$13, %xmm7 // part of ROTR19
221 psrld $$2, %xmm6 // part of ROTR19
223 pslld $$2, %xmm7 // part of ROTR17
227 // #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
232 ror $$2, t // S32(2, (x))
233 ror $$13, s // S32(13, (x))
234 xor s, t // S32(2, (x)) ^ S32(13, (x))
235 ror $$9, s // S32(22, (x))
236 xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
239 // #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
243 ror $$6, s // S32(6, (x))
244 mov s, t // S32(6, (x))
245 ror $$5, s // S32(11, (x))
246 xor s, t // S32(6, (x)) ^ S32(11, (x))
247 ror $$14, s // S32(25, (x))
248 xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
251 // per round digests update
254 add t, $7 // use h to store h+Sigma1(e)
255 Ch $4, $5, $6 // t = Ch (e, f, g);
256 add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
257 add WK($8), t // h = T1
258 add t, $3 // d += T1;
260 Sigma0 $0 // t = Sigma0(a);
261 add t, $7 // h = T1 + Sigma0(a);
262 Maj $0, $1, $2 // t = Maj(a,b,c)
263 add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c);
266 // per 4 rounds digests update and permutation
267 // permutation is absorbed by rotating the roles of digests a-h
269 round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
270 round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
271 round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
272 round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
275 // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
276 .macro message_schedule
278 // 4 32-bit K256 words in xmm5
279 #if defined (__x86_64__)
285 add $$16, K // K points to next K256 word for next iteration
286 movdqa $1, %xmm4 // W7:W4
288 palignr $$4, $0, %xmm4 // W4:W1
289 #else // no-ssse3 implementation of palignr
295 sigma0 %xmm4 // sigma0(W4:W1)
296 movdqa $3, %xmm6 // W15:W12
297 paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1)
299 palignr $$4, $2, %xmm6 // W12:W9
300 #else // no-ssse3 implementation of palignr
306 paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
307 movdqa $3, %xmm4 // W15:W12
308 psrldq $$8, %xmm4 // 0,0,W15,W14
309 sigma1 %xmm4 // sigma1(0,0,W15,W14)
310 paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
311 movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16
312 pslldq $$8, %xmm4 // W17, W16, 0, 0
313 sigma1 %xmm4 // sigma1(W17,W16,0,0)
314 paddd %xmm4, $0 // W19:W16
315 paddd $0, %xmm5 // WK
319 // this macro is used in the last 16 rounds of a current block
320 // it reads part of the next message (4 4-byte words per invocation), loads it into W[r:r+3], computes WK[r:r+3],
321 // and saves the result into the stack to prepare for the next block
324 #if defined (__x86_64__)
326 movdqu $0*16(data), $1 // read 4 4-byte words
327 pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
328 #else // no-ssse3 implementation
338 mov 12+$0*16(data), s
343 movdqu $0*16(K), %xmm4 // K[r:r+3]
347 movdqu $0*16(t), $1 // read 4 4-byte words
348 pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
349 #else // no-ssse3 implementation
365 movdqu $0*16(t), %xmm4 // K[r:r+3]
367 paddd $1, %xmm4 // WK[r:r+3]
368 movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer
373 #if defined (__x86_64__) || defined (__i386__)
375 .globl _SHA256_Transform_nossse3
377 _SHA256_Transform_nossse3:
379 // push callee-saved registers
380 #if defined (__x86_64__)
394 // allocate stack space
397 // if kernel code, save used xmm registers
399 movdqa %xmm0, 0*16+xmm_save
400 movdqa %xmm1, 1*16+xmm_save
401 movdqa %xmm2, 2*16+xmm_save
402 movdqa %xmm3, 3*16+xmm_save
403 movdqa %xmm4, 4*16+xmm_save
404 movdqa %xmm5, 5*16+xmm_save
405 movdqa %xmm6, 6*16+xmm_save
406 movdqa %xmm7, 7*16+xmm_save
409 // set up pointer to table K256[]
410 #if defined (__x86_64__)
417 // load W[0:15] into xmm0-xmm3
433 #if defined (__x86_64__)
452 // compute WK[0:15] and save in stack
453 #if defined (__x86_64__)
454 movdqu 0*16(K), %xmm4
455 movdqu 1*16(K), %xmm5
456 movdqu 2*16(K), %xmm6
457 movdqu 3*16(K), %xmm7
460 movdqu 0*16(t), %xmm4
461 movdqu 1*16(t), %xmm5
462 movdqu 2*16(t), %xmm6
463 movdqu 3*16(t), %xmm7
477 // digests a-h = ctx->states;
478 #if defined (__x86_64__)
502 // rounds 0:47 interleaved with W/WK update for rounds 16:63
503 rounds a, b, c, d, e, f, g, h, 0
504 message_schedule W0,W1,W2,W3,16
505 rounds e, f, g, h, a, b, c, d, 4
506 message_schedule W1,W2,W3,W0,20
507 rounds a, b, c, d, e, f, g, h, 8
508 message_schedule W2,W3,W0,W1,24
509 rounds e, f, g, h, a, b, c, d, 12
510 message_schedule W3,W0,W1,W2,28
511 rounds a, b, c, d, e, f, g, h, 16
512 message_schedule W0,W1,W2,W3,32
513 rounds e, f, g, h, a, b, c, d, 20
514 message_schedule W1,W2,W3,W0,36
515 rounds a, b, c, d, e, f, g, h, 24
516 message_schedule W2,W3,W0,W1,40
517 rounds e, f, g, h, a, b, c, d, 28
518 message_schedule W3,W0,W1,W2,44
519 rounds a, b, c, d, e, f, g, h, 32
520 message_schedule W0,W1,W2,W3,48
521 rounds e, f, g, h, a, b, c, d, 36
522 message_schedule W1,W2,W3,W0,52
523 rounds a, b, c, d, e, f, g, h, 40
524 message_schedule W2,W3,W0,W1,56
525 rounds e, f, g, h, a, b, c, d, 44
526 message_schedule W3,W0,W1,W2,60
528 // revert K to the beginning of K256[]
529 #if defined __x86_64__
535 sub $1, num_blocks // num_blocks--
536 je L_final_block // if final block, wrap up final rounds
538 // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
539 rounds a, b, c, d, e, f, g, h, 48
541 rounds e, f, g, h, a, b, c, d, 52
543 rounds a, b, c, d, e, f, g, h, 56
545 rounds e, f, g, h, a, b, c, d, 60
549 #if defined (__x86_64__)
555 // ctx->state += digests a-h
556 #if defined (__x86_64__)
580 jmp L_loop // branch for next block
582 // wrap up digest update round 48:63 for final block
584 rounds a, b, c, d, e, f, g, h, 48
585 rounds e, f, g, h, a, b, c, d, 52
586 rounds a, b, c, d, e, f, g, h, 56
587 rounds e, f, g, h, a, b, c, d, 60
589 // ctx->state += digests a-h
590 #if defined (__x86_64__)
614 // if kernel, restore xmm0-xmm7
616 movdqa 0*16+xmm_save, %xmm0
617 movdqa 1*16+xmm_save, %xmm1
618 movdqa 2*16+xmm_save, %xmm2
619 movdqa 3*16+xmm_save, %xmm3
620 movdqa 4*16+xmm_save, %xmm4
621 movdqa 5*16+xmm_save, %xmm5
622 movdqa 6*16+xmm_save, %xmm6
623 movdqa 7*16+xmm_save, %xmm7
626 // free allocated stack memory
629 // restore callee-saved registers
630 #if defined (__x86_64__)
648 #endif // x86_64/i386