2 This file provides a hand-optimized x86_64/i386 assembly implementation of the following function
4 void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
6 which is a C function in sha2.c (from xnu).
8 The code first probes cpu_capabilities to detect whether SSSE3 is supported. If not, it branches to
9 SHA256_Transform_nossse3 (in a separate source file sha256nossse3.s) that was cloned from this file
10 with all ssse3 instructions replaced with sse3 or below instructions.
12 sha256 algorithm per block description:
14 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
15 2. load 8 digests a-h from ctx->state
17 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
19 h = T1 + Sigma0(a) + Maj(a,b,c)
20 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
22 W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
23 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
25 h = T1 + Sigma0(a) + Maj(a,b,c)
26 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
28 In the assembly implementation:
29 - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
30 - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
31 - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)
33 the implementation per block looks like
35 ----------------------------------------------------------------------------
37 load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
38 pre_calculate and store W+K(0:15) in stack
40 load digests a-h from ctx->state;
43 digests a-h update and permute round r:r+3
44 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
47 for (r=48;r<64;r+=4) {
48 digests a-h update and permute round r:r+3
51 ctx->state += digests a-h;
53 ----------------------------------------------------------------------------
55 our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
56 into the last 16 rounds of its previous block:
58 ----------------------------------------------------------------------------
60 load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
61 pre_calculate and store W+K(0:15) in stack
65 load digests a-h from ctx->state;
68 digests a-h update and permute round r:r+3
69 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
73 if (num_block==0) jmp L_last_block;
75 for (r=48;r<64;r+=4) {
76 digests a-h update and permute round r:r+3
77 load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
78 pre_calculate and store W+K([r:r+3]%16) in stack
81 ctx->state += digests a-h;
87 for (r=48;r<64;r+=4) {
88 digests a-h update and permute round r:r+3
91 ctx->state += digests a-h;
93 ------------------------------------------------------------------------
95 Apple CoreOS vector & numerics
100 #include <i386/cpu_capabilities.h>
102 #include <System/i386/cpu_capabilities.h>
105 // associate variables with registers or memory
107 #if defined (__x86_64__)
111 #define num_blocks %rdx
123 #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)
125 #define L_aligned_bswap 64(sp) // bswap : big-endian loading of 4-byte words
126 #define xmm_save 80(sp) // starting address for xmm save/restore
129 #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
130 #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument
131 #define data_addr 24+stack_size(sp) // 2nd caller argument
132 #define num_blocks 28+stack_size(sp) // 3rd caller argument
143 #define K 76(sp) // pointer to K256[] table
144 #define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words
145 #define xmm_save 96(sp) // starting address for xmm save/restore
152 // a window (16 words) of message schedule
158 // circular buffer for WK[(r:r+15)%16]
159 #define WK(x) (x&15)*4(sp)
161 // #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
169 xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z));
172 // #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
179 xor s, t // (x&y) ^ (y&z)
182 xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
185 /* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
186 // #define R(b,x) ((x) >> (b))
187 /* 32-bit Rotate-right (used in SHA-256): */
188 // #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b))))
190 // #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
192 // performs sigma0_256 on 4 words in an xmm register
193 // use xmm6/xmm7 as intermediate registers
197 psrld $$3, $0 // SHR3(x)
198 psrld $$7, %xmm6 // part of ROTR7
199 pslld $$14, %xmm7 // part of ROTR18
202 psrld $$11, %xmm6 // part of ROTR18
203 pslld $$11, %xmm7 // part of ROTR7
208 // #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
210 // performs sigma1_256 on 4 words in an xmm register
211 // use xmm6/xmm7 as intermediate registers
215 psrld $$10, $0 // SHR10(x)
216 psrld $$17, %xmm6 // part of ROTR17
218 pslld $$13, %xmm7 // part of ROTR19
220 psrld $$2, %xmm6 // part of ROTR19
222 pslld $$2, %xmm7 // part of ROTR17
226 // #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
231 ror $$2, t // S32(2, (x))
232 ror $$13, s // S32(13, (x))
233 xor s, t // S32(2, (x)) ^ S32(13, (x))
234 ror $$9, s // S32(22, (x))
235 xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
238 // #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
242 ror $$6, s // S32(6, (x))
243 mov s, t // S32(6, (x))
244 ror $$5, s // S32(11, (x))
245 xor s, t // S32(6, (x)) ^ S32(11, (x))
246 ror $$14, s // S32(25, (x))
247 xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
250 // per round digests update
253 add t, $7 // use h to store h+Sigma1(e)
254 Ch $4, $5, $6 // t = Ch (e, f, g);
255 add $7, t // t = h+Sigma1(e)+Ch(e,f,g);
256 add WK($8), t // h = T1
257 add t, $3 // d += T1;
259 Sigma0 $0 // t = Sigma0(a);
260 add t, $7 // h = T1 + Sigma0(a);
261 Maj $0, $1, $2 // t = Maj(a,b,c)
262 add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c);
265 // per 4 rounds digests update and permutation
266 // permutation is absorbed by rotating the roles of digests a-h
268 round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
269 round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
270 round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
271 round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
274 // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
275 .macro message_schedule
277 // 4 32-bit K256 words in xmm5
278 #if defined (__x86_64__)
284 add $$16, K // K points to next K256 word for next iteration
285 movdqa $1, %xmm4 // W7:W4
286 palignr $$4, $0, %xmm4 // W4:W1
287 sigma0 %xmm4 // sigma0(W4:W1)
288 movdqa $3, %xmm6 // W15:W12
289 paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1)
290 palignr $$4, $2, %xmm6 // W12:W9
291 paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
292 movdqa $3, %xmm4 // W15:W12
293 psrldq $$8, %xmm4 // 0,0,W15,W14
294 sigma1 %xmm4 // sigma1(0,0,W15,W14)
295 paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
296 movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16
297 pslldq $$8, %xmm4 // W17, W16, 0, 0
298 sigma1 %xmm4 // sigma1(W17,W16,0,0)
299 paddd %xmm4, $0 // W19:W16
300 paddd $0, %xmm5 // WK
304 // this macro is used in the last 16 rounds of a current block
305 // it reads the next message (16 4-byte words), loads them into W[r:r+3] 4 words at a time, computes WK[r:r+3]
306 // and save into stack to prepare for next block
309 #if defined (__x86_64__)
310 movdqu $0*16(data), $1 // read 4 4-byte words
311 pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
312 movdqu $0*16(K), %xmm4 // K[r:r+3]
315 movdqu $0*16(t), $1 // read 4 4-byte words
316 pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
318 movdqu $0*16(t), %xmm4 // K[r:r+3]
320 paddd $1, %xmm4 // WK[r:r+3]
321 movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer
326 #if defined (__x86_64__) || defined (__i386__)
328 .globl _SHA256_Transform
333 // detect SSSE3 and dispatch appropriate code branch
334 #if defined __x86_64__
335 movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
336 mov (%rax), %eax // %eax = __cpu_capabilities
339 leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
340 mov (%eax), %eax // %eax = __cpu_capabilities
342 mov _COMM_PAGE_CPU_CAPABILITIES, %eax
345 test $(kHasSupplementalSSE3), %eax
346 je _SHA256_Transform_nossse3 // branch to no-ssse3 code
348 // push callee-saved registers
349 #if defined (__x86_64__)
363 // allocate stack space
366 // if kernel code, save used xmm registers
368 movdqa %xmm0, 0*16+xmm_save
369 movdqa %xmm1, 1*16+xmm_save
370 movdqa %xmm2, 2*16+xmm_save
371 movdqa %xmm3, 3*16+xmm_save
372 movdqa %xmm4, 4*16+xmm_save
373 movdqa %xmm5, 5*16+xmm_save
374 movdqa %xmm6, 6*16+xmm_save
375 movdqa %xmm7, 7*16+xmm_save
378 // set up bswap parameters in the aligned stack space and pointer to table K256[]
379 #if defined (__x86_64__)
381 lea L_bswap(%rip), %rax
389 movdqa %xmm0, L_aligned_bswap
391 // load W[0:15] into xmm0-xmm3
392 #if defined (__x86_64__)
393 movdqu 0*16(data), W0
394 movdqu 1*16(data), W1
395 movdqu 2*16(data), W2
396 movdqu 3*16(data), W3
406 pshufb L_aligned_bswap, W0
407 pshufb L_aligned_bswap, W1
408 pshufb L_aligned_bswap, W2
409 pshufb L_aligned_bswap, W3
411 // compute WK[0:15] and save in stack
412 #if defined (__x86_64__)
413 movdqu 0*16(K), %xmm4
414 movdqu 1*16(K), %xmm5
415 movdqu 2*16(K), %xmm6
416 movdqu 3*16(K), %xmm7
419 movdqu 0*16(t), %xmm4
420 movdqu 1*16(t), %xmm5
421 movdqu 2*16(t), %xmm6
422 movdqu 3*16(t), %xmm7
436 // digests a-h = ctx->states;
437 #if defined (__x86_64__)
461 // rounds 0:47 interleaved with W/WK update for rounds 16:63
462 rounds a, b, c, d, e, f, g, h, 0
463 message_schedule W0,W1,W2,W3,16
464 rounds e, f, g, h, a, b, c, d, 4
465 message_schedule W1,W2,W3,W0,20
466 rounds a, b, c, d, e, f, g, h, 8
467 message_schedule W2,W3,W0,W1,24
468 rounds e, f, g, h, a, b, c, d, 12
469 message_schedule W3,W0,W1,W2,28
470 rounds a, b, c, d, e, f, g, h, 16
471 message_schedule W0,W1,W2,W3,32
472 rounds e, f, g, h, a, b, c, d, 20
473 message_schedule W1,W2,W3,W0,36
474 rounds a, b, c, d, e, f, g, h, 24
475 message_schedule W2,W3,W0,W1,40
476 rounds e, f, g, h, a, b, c, d, 28
477 message_schedule W3,W0,W1,W2,44
478 rounds a, b, c, d, e, f, g, h, 32
479 message_schedule W0,W1,W2,W3,48
480 rounds e, f, g, h, a, b, c, d, 36
481 message_schedule W1,W2,W3,W0,52
482 rounds a, b, c, d, e, f, g, h, 40
483 message_schedule W2,W3,W0,W1,56
484 rounds e, f, g, h, a, b, c, d, 44
485 message_schedule W3,W0,W1,W2,60
487 // revert K to the beginning of K256[]
488 #if defined __x86_64__
494 sub $1, num_blocks // num_blocks--
495 je L_final_block // if final block, wrap up final rounds
497 // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
498 rounds a, b, c, d, e, f, g, h, 48
500 rounds e, f, g, h, a, b, c, d, 52
502 rounds a, b, c, d, e, f, g, h, 56
504 rounds e, f, g, h, a, b, c, d, 60
508 #if defined (__x86_64__)
514 // ctx->states += digests a-h
515 #if defined (__x86_64__)
539 jmp L_loop // branch for next block
541 // wrap up digest update round 48:63 for final block
543 rounds a, b, c, d, e, f, g, h, 48
544 rounds e, f, g, h, a, b, c, d, 52
545 rounds a, b, c, d, e, f, g, h, 56
546 rounds e, f, g, h, a, b, c, d, 60
548 // ctx->states += digests a-h
549 #if defined (__x86_64__)
573 // if kernel, restore xmm0-xmm7
575 movdqa 0*16+xmm_save, %xmm0
576 movdqa 1*16+xmm_save, %xmm1
577 movdqa 2*16+xmm_save, %xmm2
578 movdqa 3*16+xmm_save, %xmm3
579 movdqa 4*16+xmm_save, %xmm4
580 movdqa 5*16+xmm_save, %xmm5
581 movdqa 6*16+xmm_save, %xmm6
582 movdqa 7*16+xmm_save, %xmm7
585 // free allocated stack memory
588 // restore callee-saved registers
589 #if defined (__x86_64__)
616 #endif // x86_64/i386