]> git.saurik.com Git - apple/xnu.git/blame - bsd/crypto/sha2/intel/sha256nossse3.s
xnu-1699.22.81.tar.gz
[apple/xnu.git] / bsd / crypto / sha2 / intel / sha256nossse3.s
CommitLineData
6d2010ae
A
/*
	This file provides an x86_64/i386 hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	The code SHA256_Transform_nossse3 is a clone of SHA256_Transform
	with all ssse3 instructions replaced with sse3 or below instructions.

	For performance reasons, this function should not be called directly. This file should work
	together with the one that implements SHA256_Transform. There, cpu_capabilities is probed to detect
	ssse3. If ssse3 is not supported, execution is branched to this no-ssse3-specific function.

	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c)
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
			W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c)
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------

	our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0) jmp L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->states += digests a-h;

	jmp	L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
	cclee 8-3-10
*/
101
#if defined KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

	// associate variables with registers or memory

#if defined (__x86_64__)
	// x86_64: SysV AMD64 ABI — arguments arrive in rdi/rsi/rdx;
	// all 8 working digests fit in GPRs (r8d-r15d, saved in the prologue)
	#define	sp			%rsp
	#define	ctx			%rdi	// 1st argument: SHA256_ctx *ctx
	#define	data		%rsi	// 2nd argument: char *data
	#define	num_blocks	%rdx	// 3rd argument: unsigned int num_blocks

	#define	a	%r8d
	#define	b	%r9d
	#define	c	%r10d
	#define	d	%r11d
	#define	e	%r12d
	#define	f	%r13d
	#define	g	%r14d
	#define	h	%r15d

	#define	K	%rbx			// pointer into the K256[] round-constant table
	#define	stack_size	(8+16*8+16+64)	// 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)

	#define	xmm_save	80(sp)		// starting address for xmm save/restore
#else
	// i386: cdecl — arguments read from the stack above the frame;
	// too few GPRs, so digests c, f, h and the K pointer spill to the stack
	#define	sp	%esp
	#define	stack_size	(12+16*8+16+16+64)	// 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
	#define	ctx_addr	20+stack_size(sp)	// ret_addr + 4 registers = 20, 1st caller argument
	#define	data_addr	24+stack_size(sp)	// 2nd caller argument
	#define	num_blocks	28+stack_size(sp)	// 3rd caller argument

	#define	a	%ebx
	#define	b	%edx
	#define	c	64(sp)
	#define	d	%ebp
	#define	e	%esi
	#define	f	68(sp)
	#define	g	%edi
	#define	h	72(sp)

	#define	K			76(sp)	// pointer to K256[] table
	#define	xmm_save	96(sp)	// starting address for xmm save/restore
#endif

	// 2 local variables (scratch; clobbered by every helper macro below)
	#define	t	%eax
	#define	s	%ecx

	// a window (16 words) of message schedule
	#define	W0	%xmm0
	#define	W1	%xmm1
	#define	W2	%xmm2
	#define	W3	%xmm3

	// circular buffer for WK[(r:r+15)%16], held at the bottom of the stack frame
	#define	WK(x)	(x&15)*4(sp)
161
// #define Ch(x,y,z)	(((x) & (y)) ^ ((~(x)) & (z)))

	// t = Ch($0,$1,$2); clobbers s
	.macro Ch
	mov	$0, t		// x
	mov	$0, s		// x
	not	t		// ~x
	and	$1, s		// x & y
	and	$2, t		// ~x & z
	xor	s, t		// t = ((x) & (y)) ^ ((~(x)) & (z));
	.endm
172
// #define Maj(x,y,z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

	// t = Maj($0,$1,$2); clobbers s
	.macro Maj
	mov	$0, t	// x
	mov	$1, s	// y
	and	s, t	// x&y
	and	$2, s	// y&z
	xor	s, t	// (x&y) ^ (y&z)
	mov	$2, s	// z
	and	$0, s	// (x&z)
	xor	s, t	// t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
	.endm
185
/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
// #define R(b,x) 	((x) >> (b))
/* 32-bit Rotate-right (used in SHA-256): */
// #define S32(b,x)	(((x) >> (b)) | ((x) << (32 - (b))))

// #define sigma0_256(x)	(S32(7,  (x)) ^ S32(18, (x)) ^ R(3 ,   (x)))

	// performs sigma0_256 on 4 words in an xmm register, in place ($0)
	// each rotate is built from a shift pair: ROTR7  = (x>>7)  | (x<<25),
	// ROTR18 = (x>>18) | (x<<14); shift counts below are cumulative
	// uses xmm6/xmm7 as intermediate registers (clobbered)
	.macro sigma0
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$3, $0			// SHR3(x)
	psrld	$$7, %xmm6		// part of ROTR7 (x>>7)
	pslld	$$14, %xmm7		// part of ROTR18 (x<<14)
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	psrld	$$11, %xmm6		// part of ROTR18 (now x>>18)
	pslld	$$11, %xmm7		// part of ROTR7 (now x<<25)
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	.endm
208
// #define sigma1_256(x)	(S32(17, (x)) ^ S32(19, (x)) ^ R(10,   (x)))

	// performs sigma1_256 on 4 words in an xmm register, in place ($0)
	// ROTR17 = (x>>17) | (x<<15), ROTR19 = (x>>19) | (x<<13); shift counts cumulative
	// uses xmm6/xmm7 as intermediate registers (clobbered)
	.macro sigma1
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$10, $0		// SHR10(x)
	psrld	$$17, %xmm6		// part of ROTR17 (x>>17)
	pxor	%xmm6, $0
	pslld	$$13, %xmm7		// part of ROTR19 (x<<13)
	pxor	%xmm7, $0
	psrld	$$2, %xmm6		// part of ROTR19 (now x>>19)
	pxor	%xmm6, $0
	pslld	$$2, %xmm7		// part of ROTR17 (now x<<15)
	pxor	%xmm7, $0
	.endm
226
// #define Sigma0_256(x)	(S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))

	// t = Sigma0($0); clobbers s; rotate counts on s are cumulative (2, 13, 13+9=22)
	.macro Sigma0
	mov	$0, t		// x
	mov	$0, s		// x
	ror	$$2, t		// S32(2,  (x))
	ror	$$13, s		// S32(13, (x))
	xor	s, t		// S32(2,  (x)) ^ S32(13, (x))
	ror	$$9, s		// S32(22, (x))
	xor	s, t		// t = (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))
	.endm
238
// #define Sigma1_256(x)	(S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))

	// t = Sigma1($0); clobbers s; rotate counts on s are cumulative (6, 6+5=11, 11+14=25)
	.macro Sigma1
	mov	$0, s		// x
	ror	$$6, s		// S32(6,  (x))
	mov	s, t		// S32(6,  (x))
	ror	$$5, s		// S32(11, (x))
	xor	s, t		// S32(6,  (x)) ^ S32(11, (x))
	ror	$$14, s		// S32(25, (x))
	xor	s, t		// t = (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))
	.endm
250
	// per round digests update
	// round a,b,c,d,e,f,g,h,r : one SHA-256 round using the precomputed
	// W[r]+K[r] from the WK stack buffer; clobbers t and s
	.macro round
	Sigma1	$4			// t = Sigma1(e)
	add	t, $7			// use h to store h+Sigma1(e)
	Ch	$4, $5, $6		// t = Ch (e, f, g);
	add	$7, t			// t = h+Sigma1(e)+Ch(e,f,g);
	add	WK($8), t		// t = T1 = h+Sigma1(e)+Ch(e,f,g)+W[r]+K[r]
	add	t, $3			// d += T1;
	mov	t, $7			// h = T1
	Sigma0	$0			// t = Sigma0(a);
	add	t, $7			// h = T1 + Sigma0(a);
	Maj	$0, $1, $2		// t = Maj(a,b,c)
	add	t, $7			// h = T1 + Sigma0(a) + Maj(a,b,c);
	.endm
265
	// per 4 rounds digests update and permutation
	// permutation is absorbed by rotating the roles of digests a-h,
	// so no data actually moves between registers
	.macro rounds
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8
	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8
	.endm
274
	// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
	// message_schedule Wa,Wb,Wc,Wd,r : given the current 16-word window in
	// $0..$3 (oldest 4 words in $0), computes W[r:r+3] in place in $0 and
	// stores W+K into WK(r); advances K by 16 bytes
	// clobbers xmm4-xmm7, and t on i386
	.macro message_schedule

	// 4 32-bit K256 words in xmm5
#if defined (__x86_64__)
	movdqu	(K), %xmm5
#else
	mov	K, t
	movdqu	(t), %xmm5
#endif
	add	$$16, K			// K points to next K256 word for next iteration
	movdqa	$1, %xmm4		// W7:W4
#if 0
	palignr	$$4, $0, %xmm4		// W4:W1
#else	// no-ssse3 implementation of palignr: shift both halves and OR them together
	movdqa	$0, %xmm7
	pslldq	$$12, %xmm4		// keep only W4 in the top lane
	psrldq	$$4, %xmm7		// W3:W1 shifted down
	por	%xmm7, %xmm4		// = palignr 4 -> W4:W1
#endif
	sigma0	%xmm4			// sigma0(W4:W1)
	movdqa	$3, %xmm6		// W15:W12
	paddd	%xmm4, $0		// $0 = W3:W0 + sigma0(W4:W1)
#if 0
	palignr	$$4, $2, %xmm6		// W12:W9
#else	// no-ssse3 implementation of palignr
	movdqa	$2, %xmm7
	pslldq	$$12, %xmm6
	psrldq	$$4, %xmm7
	por	%xmm7, %xmm6		// = palignr 4 -> W12:W9
#endif
	paddd	%xmm6, $0		// $0 = W12:W9 + sigma0(W4:W1) + W3:W0
	// sigma1(W[r-2]) needs the two newest words, which depend on the two
	// just-computed ones, so it is applied in two half-register steps
	movdqa	$3, %xmm4		// W15:W12
	psrldq	$$8, %xmm4		// 0,0,W15,W14
	sigma1	%xmm4			// sigma1(0,0,W15,W14)
	paddd	%xmm4, $0		// sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$0, %xmm4		// W19-sigma1(W17), W18-sigma1(W16), W17, W16
	pslldq	$$8, %xmm4		// W17, W16, 0, 0
	sigma1	%xmm4			// sigma1(W17,W16,0,0)
	paddd	%xmm4, $0		// W19:W16
	paddd	$0, %xmm5		// WK = W + K
	movdqa	%xmm5, WK($4)		// save into the stack circular buffer
	.endm
318
	// this macro is used in the last 16 rounds of a current block
	// it reads the next message (16 4-byte words), loads it into 4 words W[r:r+3]
	// ($1, big-endian swapped via bswap since pshufb is ssse3), computes WK[r:r+3],
	// and saves into the stack to prepare for the next block
	// clobbers s, xmm4, and t on i386; the WK slot is used as bounce storage
	// for the byte-swapped words before being overwritten with W+K

	.macro update_W_WK
#if defined (__x86_64__)
#if 0
	movdqu	$0*16(data), $1			// read 4 4-byte words
	pshufb	L_aligned_bswap, $1		// big-endian of each 4-byte word, W[r:r+3]
#else	// no-ssse3 implementation: scalar bswap of each word through s
	mov	0+$0*16(data), s
	bswap	s
	mov	s, 0+WK($0*4)
	mov	4+$0*16(data), s
	bswap	s
	mov	s, 4+WK($0*4)
	mov	8+$0*16(data), s
	bswap	s
	mov	s, 8+WK($0*4)
	mov	12+$0*16(data), s
	bswap	s
	mov	s, 12+WK($0*4)
	movdqa	WK($0*4), $1			// $1 = W[r:r+3]
#endif
	movdqu	$0*16(K), %xmm4			// K[r:r+3]
#else
	mov	data_addr, t
#if 0
	movdqu	$0*16(t), $1			// read 4 4-byte words
	pshufb	L_aligned_bswap, $1		// big-endian of each 4-byte word, W[r:r+3]
#else	// no-ssse3 implementation
	mov	0+$0*16(t), s
	bswap	s
	mov	s, 0+WK($0*4)
	mov	4+$0*16(t), s
	bswap	s
	mov	s, 4+WK($0*4)
	mov	8+$0*16(t), s
	bswap	s
	mov	s, 8+WK($0*4)
	mov	12+$0*16(t), s
	bswap	s
	mov	s, 12+WK($0*4)
	movdqa	WK($0*4), $1			// $1 = W[r:r+3]
#endif
	mov	K, t
	movdqu	$0*16(t), %xmm4			// K[r:r+3]
#endif
	paddd	$1, %xmm4			// WK[r:r+3]
	movdqa	%xmm4, WK($0*4)			// save WK[r:r+3] into stack circular buffer
	.endm
370
	.text

#if defined (__x86_64__) || defined (__i386__)

	.globl	_SHA256_Transform_nossse3

//-----------------------------------------------------------------------
// void SHA256_Transform_nossse3(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
//
// Processes num_blocks 64-byte blocks from data into ctx->state.
// Intended to be reached only via the SHA256_Transform dispatcher when
// ssse3 is absent (see file header).
// x86_64: SysV ABI (ctx=rdi, data=rsi, num_blocks=rdx);
// i386:   cdecl, arguments on the stack.
// K256[] is the SHA-256 round-constant table, defined elsewhere
// (in the companion ssse3 implementation file).
// Callee-saved GPRs are pushed; xmm0-xmm7 are saved/restored only in
// kernel builds (user-space xmm regs are volatile).
//-----------------------------------------------------------------------
_SHA256_Transform_nossse3:

	// push callee-saved registers
#if defined (__x86_64__)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
#else
	push	%ebp
	push	%ebx
	push	%esi
	push	%edi
#endif

	// allocate stack space (WK buffer, xmm save area, spills; see stack_size)
	sub	$stack_size, sp

	// if kernel code, save used xmm registers
#if KERNEL
	movdqa	%xmm0, 0*16+xmm_save
	movdqa	%xmm1, 1*16+xmm_save
	movdqa	%xmm2, 2*16+xmm_save
	movdqa	%xmm3, 3*16+xmm_save
	movdqa	%xmm4, 4*16+xmm_save
	movdqa	%xmm5, 5*16+xmm_save
	movdqa	%xmm6, 6*16+xmm_save
	movdqa	%xmm7, 7*16+xmm_save
#endif

	// set up pointer to table K256[]
#if defined (__x86_64__)
	lea	_K256(%rip), K
#else
	lea	_K256, t
	mov	t, K
#endif

	// load W[0:15] into xmm0-xmm3
	// mybswap i, base : big-endian load of words 4i..4i+3 from (base) into
	// the stack staging area; clobbers a, b, e, d — safe because the
	// digests have not been loaded yet at this point
	.macro mybswap
	movl	0+$0*16($1), a
	movl	4+$0*16($1), b
	movl	8+$0*16($1), e
	movl	12+$0*16($1), d
	bswap	a
	bswap	b
	bswap	e
	bswap	d
	movl	a, $0*16(sp)
	movl	b, 4+$0*16(sp)
	movl	e, 8+$0*16(sp)
	movl	d, 12+$0*16(sp)
	.endm

#if defined (__x86_64__)
	mybswap	0, data
	mybswap	1, data
	mybswap	2, data
	mybswap	3, data
	add	$64, data		// data -> next block
#else
	mov	data_addr, t
	mybswap	0, t
	mybswap	1, t
	mybswap	2, t
	mybswap	3, t
	add	$64, data_addr		// data -> next block
#endif
	movdqa	0*16(sp), W0
	movdqa	1*16(sp), W1
	movdqa	2*16(sp), W2
	movdqa	3*16(sp), W3

	// compute WK[0:15] and save in stack
#if defined (__x86_64__)
	movdqu	0*16(K), %xmm4
	movdqu	1*16(K), %xmm5
	movdqu	2*16(K), %xmm6
	movdqu	3*16(K), %xmm7
#else
	mov	K, t
	movdqu	0*16(t), %xmm4
	movdqu	1*16(t), %xmm5
	movdqu	2*16(t), %xmm6
	movdqu	3*16(t), %xmm7
#endif
	add	$64, K			// K now points at K256[16]
	paddd	%xmm0, %xmm4
	paddd	%xmm1, %xmm5
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm4, WK(0)
	movdqa	%xmm5, WK(4)
	movdqa	%xmm6, WK(8)
	movdqa	%xmm7, WK(12)

L_loop:

	// digests a-h = ctx->states;
#if defined (__x86_64__)
	mov	0*4(ctx), a
	mov	1*4(ctx), b
	mov	2*4(ctx), c
	mov	3*4(ctx), d
	mov	4*4(ctx), e
	mov	5*4(ctx), f
	mov	6*4(ctx), g
	mov	7*4(ctx), h
#else
	// on i386, c/f/h live in memory: bounce through s (no mem-to-mem mov)
	mov	ctx_addr, t
	mov	0*4(t), a
	mov	1*4(t), b
	mov	2*4(t), s
	mov	s, c
	mov	3*4(t), d
	mov	4*4(t), e
	mov	5*4(t), s
	mov	s, f
	mov	6*4(t), g
	mov	7*4(t), s
	mov	s, h
#endif

	// rounds 0:47 interleaved with W/WK update for rounds 16:63
	// each message_schedule advances K by 16; W0-W3 roles rotate by one
	// register per 4 rounds to keep the window circular
	rounds	a, b, c, d, e, f, g, h, 0
	message_schedule W0,W1,W2,W3,16
	rounds	e, f, g, h, a, b, c, d, 4
	message_schedule W1,W2,W3,W0,20
	rounds	a, b, c, d, e, f, g, h, 8
	message_schedule W2,W3,W0,W1,24
	rounds	e, f, g, h, a, b, c, d, 12
	message_schedule W3,W0,W1,W2,28
	rounds	a, b, c, d, e, f, g, h, 16
	message_schedule W0,W1,W2,W3,32
	rounds	e, f, g, h, a, b, c, d, 20
	message_schedule W1,W2,W3,W0,36
	rounds	a, b, c, d, e, f, g, h, 24
	message_schedule W2,W3,W0,W1,40
	rounds	e, f, g, h, a, b, c, d, 28
	message_schedule W3,W0,W1,W2,44
	rounds	a, b, c, d, e, f, g, h, 32
	message_schedule W0,W1,W2,W3,48
	rounds	e, f, g, h, a, b, c, d, 36
	message_schedule W1,W2,W3,W0,52
	rounds	a, b, c, d, e, f, g, h, 40
	message_schedule W2,W3,W0,W1,56
	rounds	e, f, g, h, a, b, c, d, 44
	message_schedule W3,W0,W1,W2,60

	// revert K to the beginning of K256[]
	// (initial +64 plus 12 x +16 from message_schedule = 256 bytes consumed)
#if defined __x86_64__
	sub	$256, K
#else
	subl	$256, K
#endif

	sub	$1, num_blocks		// num_blocks--
	je	L_final_block		// if final block, wrap up final rounds

	// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
	// update_W_WK reads K[0:15] (K was just reset above) and the next data block
	rounds	a, b, c, d, e, f, g, h, 48
	update_W_WK	0, W0
	rounds	e, f, g, h, a, b, c, d, 52
	update_W_WK	1, W1
	rounds	a, b, c, d, e, f, g, h, 56
	update_W_WK	2, W2
	rounds	e, f, g, h, a, b, c, d, 60
	update_W_WK	3, W3

	add	$64, K			// K -> K256[16], matching the state expected at L_loop
#if defined (__x86_64__)
	add	$64, data		// data -> next block
#else
	add	$64, data_addr
#endif

	// ctx->states += digests a-h
#if defined (__x86_64__)
	add	a, 0*4(ctx)
	add	b, 1*4(ctx)
	add	c, 2*4(ctx)
	add	d, 3*4(ctx)
	add	e, 4*4(ctx)
	add	f, 5*4(ctx)
	add	g, 6*4(ctx)
	add	h, 7*4(ctx)
#else
	// memory-resident digests (c/f/h) bounce through s
	mov	ctx_addr, t
	add	a, 0*4(t)
	add	b, 1*4(t)
	mov	c, s
	add	s, 2*4(t)
	add	d, 3*4(t)
	add	e, 4*4(t)
	mov	f, s
	add	s, 5*4(t)
	add	g, 6*4(t)
	mov	h, s
	add	s, 7*4(t)
#endif

	jmp	L_loop			// branch for next block

	// wrap up digest update round 48:63 for final block (no W/WK prefetch needed)
L_final_block:
	rounds	a, b, c, d, e, f, g, h, 48
	rounds	e, f, g, h, a, b, c, d, 52
	rounds	a, b, c, d, e, f, g, h, 56
	rounds	e, f, g, h, a, b, c, d, 60

	// ctx->states += digests a-h
#if defined (__x86_64__)
	add	a, 0*4(ctx)
	add	b, 1*4(ctx)
	add	c, 2*4(ctx)
	add	d, 3*4(ctx)
	add	e, 4*4(ctx)
	add	f, 5*4(ctx)
	add	g, 6*4(ctx)
	add	h, 7*4(ctx)
#else
	mov	ctx_addr, t
	add	a, 0*4(t)
	add	b, 1*4(t)
	mov	c, s
	add	s, 2*4(t)
	add	d, 3*4(t)
	add	e, 4*4(t)
	mov	f, s
	add	s, 5*4(t)
	add	g, 6*4(t)
	mov	h, s
	add	s, 7*4(t)
#endif

	// if kernel, restore xmm0-xmm7
#if KERNEL
	movdqa	0*16+xmm_save, %xmm0
	movdqa	1*16+xmm_save, %xmm1
	movdqa	2*16+xmm_save, %xmm2
	movdqa	3*16+xmm_save, %xmm3
	movdqa	4*16+xmm_save, %xmm4
	movdqa	5*16+xmm_save, %xmm5
	movdqa	6*16+xmm_save, %xmm6
	movdqa	7*16+xmm_save, %xmm7
#endif

	// free allocated stack memory
	add	$stack_size, sp

	// restore callee-saved registers
#if defined (__x86_64__)
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
#else
	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
#endif

	// return
	ret


#endif	// x86_64/i386
649