/*
	This file provides the x86_64/i386 hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	The code first probes __cpu_capabilities to detect whether ssse3 is supported. If not, it branches to
	SHA256_Transform_nossse3 (in a separate source file sha256nossse3.s) that was cloned from this file
	with all ssse3 instructions replaced with sse3 or below instructions.

	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
				W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g

	In the assembly implementation:
		- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) are stored in GPRs or m32 (all in GPRs for x86_64, and some in m32 for i386)

	the implementation per block looks like

	----------------------------------------------------------------------------
	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->state += digests a-h;
	----------------------------------------------------------------------------

	our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------
	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
	}

	num_block--;
	if (num_block==0)	jmp	L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->state += digests a-h;

	jmp	L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->state += digests a-h;
	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
	cclee 8-3-10
*/

#if defined KERNEL
#include <i386/cpu_capabilities.h>			// __cpu_capabilities / kHasSupplementalSSE3
#else
#include <System/i386/cpu_capabilities.h>	// _COMM_PAGE_CPU_CAPABILITIES / kHasSupplementalSSE3
#endif

// associate variables with registers or memory

#if defined (__x86_64__)
	#define	sp			%rsp
	#define	ctx			%rdi
	#define	data		%rsi
	#define	num_blocks	%rdx

	#define	a			%r8d
	#define	b			%r9d
	#define	c			%r10d
	#define	d			%r11d
	#define	e			%r12d
	#define	f			%r13d
	#define	g			%r14d
	#define	h			%r15d

	#define	K			%rbx				// pointer to K256[] table

	#define	stack_size	(8+16*8+16+64)		// 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)

	#define	L_aligned_bswap	64(sp)			// bswap : big-endian loading of 4-byte words
	#define	xmm_save	80(sp)				// starting address for xmm save/restore
#else
	#define	sp			%esp
	#define	stack_size	(12+16*8+16+16+64)	// 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)

	#define	ctx_addr	20+stack_size(sp)	// ret_addr + 4 saved registers = 20, 1st caller argument
	#define	data_addr	24+stack_size(sp)	// 2nd caller argument
	#define	num_blocks	28+stack_size(sp)	// 3rd caller argument

	#define	a			%ebx
	#define	b			%edx
	#define	c			64(sp)
	#define	d			%ebp
	#define	e			%esi
	#define	f			68(sp)
	#define	g			%edi
	#define	h			72(sp)

	#define	K			76(sp)				// pointer to K256[] table

	#define	L_aligned_bswap	80(sp)			// bswap : big-endian loading of 4-byte words
	#define	xmm_save	96(sp)				// starting address for xmm save/restore
#endif

// 2 local variables
#define	t	%eax
#define	s	%ecx

// a window (16 words) of the message schedule
#define	W0	%xmm0
#define	W1	%xmm1
#define	W2	%xmm2
#define	W3	%xmm3

// circular buffer for WK[(r:r+15)%16]
#define	WK(x)	(x&15)*4(sp)
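/*
	For reference, a minimal C sketch of the per-round digest update that the round macro
	below implements (names here are illustrative, not necessarily those used in sha2.c);
	wk is the precomputed W[r]+K[r] fetched from the WK circular buffer:

	#include <stdint.h>

	#define ROTR32(b,x)	(((x) >> (b)) | ((x) << (32 - (b))))
	#define CH(x,y,z)	(((x) & (y)) ^ ((~(x)) & (z)))
	#define MAJ(x,y,z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
	#define SIGMA0(x)	(ROTR32(2,(x)) ^ ROTR32(13,(x)) ^ ROTR32(22,(x)))
	#define SIGMA1(x)	(ROTR32(6,(x)) ^ ROTR32(11,(x)) ^ ROTR32(25,(x)))

	// s[0..7] = a,b,c,d,e,f,g,h
	static void sha256_round(uint32_t s[8], uint32_t wk)
	{
		uint32_t T1 = s[7] + SIGMA1(s[4]) + CH(s[4], s[5], s[6]) + wk;
		uint32_t T2 = SIGMA0(s[0]) + MAJ(s[0], s[1], s[2]);
		s[3] += T1;			// d += T1;
		s[7]  = T1 + T2;	// h = T1 + Sigma0(a) + Maj(a,b,c);
		// the caller then permutes a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
	}
*/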
// #define Ch(x,y,z)	(((x) & (y)) ^ ((~(x)) & (z)))

	.macro	Ch
	mov		$0, t			// x
	mov		$0, s			// x
	not		t				// ~x
	and		$1, s			// x & y
	and		$2, t			// ~x & z
	xor		s, t			// t = ((x) & (y)) ^ ((~(x)) & (z));
	.endm

// #define Maj(x,y,z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

	.macro	Maj
	mov		$0, t			// x
	mov		$1, s			// y
	and		s, t			// x&y
	and		$2, s			// y&z
	xor		s, t			// (x&y) ^ (y&z)
	mov		$2, s			// z
	and		$0, s			// x&z
	xor		s, t			// t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
	.endm

/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
// #define R(b,x)		((x) >> (b))
/* 32-bit Rotate-right (used in SHA-256): */
// #define S32(b,x)	(((x) >> (b)) | ((x) << (32 - (b))))

// #define sigma0_256(x)	(S32(7, (x)) ^ S32(18, (x)) ^ R(3, (x)))

// performs sigma0_256 on 4 words in an xmm register
// uses xmm6/xmm7 as intermediate registers
	.macro	sigma0
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$3, $0			// SHR3(x)
	psrld	$$7, %xmm6		// part of ROTR7
	pslld	$$14, %xmm7		// part of ROTR18
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	psrld	$$11, %xmm6		// part of ROTR18
	pslld	$$11, %xmm7		// part of ROTR7
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	.endm

// #define sigma1_256(x)	(S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))

// performs sigma1_256 on 4 words in an xmm register
// uses xmm6/xmm7 as intermediate registers
	.macro	sigma1
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$10, $0		// SHR10(x)
	psrld	$$17, %xmm6		// part of ROTR17
	pxor	%xmm6, $0
	pslld	$$13, %xmm7		// part of ROTR19
	pxor	%xmm7, $0
	psrld	$$2, %xmm6		// part of ROTR19
	pxor	%xmm6, $0
	pslld	$$2, %xmm7		// part of ROTR17
	pxor	%xmm7, $0
	.endm

// #define Sigma0_256(x)	(S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))

	.macro	Sigma0
	mov		$0, t			// x
	mov		$0, s			// x
	ror		$$2, t			// S32(2, (x))
	ror		$$13, s			// S32(13, (x))
	xor		s, t			// S32(2, (x)) ^ S32(13, (x))
	ror		$$9, s			// S32(22, (x))
	xor		s, t			// t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
	.endm

// #define Sigma1_256(x)	(S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))

	.macro	Sigma1
	mov		$0, s			// x
	ror		$$6, s			// S32(6, (x))
	mov		s, t			// S32(6, (x))
	ror		$$5, s			// S32(11, (x))
	xor		s, t			// S32(6, (x)) ^ S32(11, (x))
	ror		$$14, s			// S32(25, (x))
	xor		s, t			// t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
	.endm

// per round digests update
	.macro	round
	Sigma1	$4				// t = Sigma1(e);
	add		t, $7			// use h to store h + Sigma1(e)
	Ch		$4, $5, $6		// t = Ch(e, f, g);
	add		$7, t			// t = h + Sigma1(e) + Ch(e,f,g);
	add		WK($8), t		// t = T1 = h + Sigma1(e) + Ch(e,f,g) + WK
	add		t, $3			// d += T1;
	mov		t, $7			// h = T1
	Sigma0	$0				// t = Sigma0(a);
	add		t, $7			// h = T1 + Sigma0(a);
	Maj		$0, $1, $2		// t = Maj(a,b,c)
	add		t, $7			// h = T1 + Sigma0(a) + Maj(a,b,c);
	.endm

// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
	.macro	rounds
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8
	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8
	.endm
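/*
	SSE2 has no packed rotate instruction, so the sigma0/sigma1 macros above build each
	ROTR from a shift pair, ROTR(b,x) == ((x >> b) | (x << (32-b))), and share the shift
	registers between the two rotates. A scalar C sketch of sigma0_256 decomposed the
	same way (illustrative only; the macro does this on 4 words at once):

	#include <stdint.h>

	static uint32_t sigma0_256(uint32_t x)
	{
		uint32_t r  = x >> 3;	// SHR3(x)
		uint32_t lo = x >> 7;	// right part of ROTR7
		uint32_t hi = x << 14;	// left part of ROTR18
		r ^= lo;
		r ^= hi;
		lo >>= 11;				// now x >> 18 : right part of ROTR18
		hi <<= 11;				// now x << 25 : left part of ROTR7
		return r ^ lo ^ hi;		// SHR3(x) ^ ROTR7(x) ^ ROTR18(x)
	}
*/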
// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
	.macro	message_schedule

	// 4 32-bit K256 words in xmm5
#if defined (__x86_64__)
	movdqu	(K), %xmm5
#else
	mov		K, t
	movdqu	(t), %xmm5
#endif
	add		$$16, K			// K points to the next 4 K256 words for the next iteration
	movdqa	$1, %xmm4		// W7:W4
	palignr	$$4, $0, %xmm4	// W4:W1
	sigma0	%xmm4			// sigma0(W4:W1)
	movdqa	$3, %xmm6		// W15:W12
	paddd	%xmm4, $0		// $0 = W3:W0 + sigma0(W4:W1)
	palignr	$$4, $2, %xmm6	// W12:W9
	paddd	%xmm6, $0		// $0 = W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$3, %xmm4		// W15:W12
	psrldq	$$8, %xmm4		// 0,0,W15,W14
	sigma1	%xmm4			// sigma1(0,0,W15,W14)
	paddd	%xmm4, $0		// sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$0, %xmm4		// W19-sigma1(W17), W18-sigma1(W16), W17, W16
	pslldq	$$8, %xmm4		// W17, W16, 0, 0
	sigma1	%xmm4			// sigma1(W17,W16,0,0)
	paddd	%xmm4, $0		// W19:W16
	paddd	$0, %xmm5		// WK
	movdqa	%xmm5, WK($4)
	.endm

// this macro is used in the last 16 rounds of the current block
// it reads the next message (16 4-byte words), loads it into 4 words W[r:r+3], computes WK[r:r+3]
// and saves it into the stack to prepare for the next block
	.macro	update_W_WK
#if defined (__x86_64__)
	movdqu	$0*16(data), $1			// read 4 4-byte words
	pshufb	L_aligned_bswap, $1		// big-endian of each 4-byte word, W[r:r+3]
	movdqu	$0*16(K), %xmm4			// K[r:r+3]
#else
	mov		data_addr, t
	movdqu	$0*16(t), $1			// read 4 4-byte words
	pshufb	L_aligned_bswap, $1		// big-endian of each 4-byte word, W[r:r+3]
	mov		K, t
	movdqu	$0*16(t), %xmm4			// K[r:r+3]
#endif
	paddd	$1, %xmm4				// WK[r:r+3]
	movdqa	%xmm4, WK($0*4)			// save WK[r:r+3] into the stack circular buffer
	.endm
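/*
	The recurrence that message_schedule evaluates, four schedule words per invocation,
	is (scalar C, for 16 <= r < 64; sigma0_256/sigma1_256 as in the comments above):

	#include <stdint.h>

	static void update_schedule(uint32_t W[64], int r)
	{
		W[r] = W[r-16] + sigma0_256(W[r-15]) + W[r-7] + sigma1_256(W[r-2]);
	}

	The vector version cannot apply sigma1 to all 4 lanes at once: when computing
	W[r:r+3], the sigma1 inputs for the upper two lanes are W[r] and W[r+1], which do
	not exist yet. Hence the two-pass trick above: first add sigma1(0,0,W[r-1],W[r-2])
	to finish W[r] and W[r+1], then add sigma1(W[r+1],W[r],0,0) to finish W[r+2] and
	W[r+3]. The result is also added to the next 4 K256 words and stored in the WK
	stack circular buffer, so each round pays only one memory-operand add for W+K.
*/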
	.text

#if defined (__x86_64__) || defined (__i386__)

	.globl	_SHA256_Transform

_SHA256_Transform:

	// detect SSSE3 and dispatch to the appropriate code branch
#if defined __x86_64__
	movq	__cpu_capabilities@GOTPCREL(%rip), %rax		// %rax -> __cpu_capabilities
	mov		(%rax), %eax								// %eax = __cpu_capabilities
#else	// i386
#if defined KERNEL
	leal	__cpu_capabilities, %eax					// %eax -> __cpu_capabilities
	mov		(%eax), %eax								// %eax = __cpu_capabilities
#else
	mov		_COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
	test	$(kHasSupplementalSSE3), %eax
	je		_SHA256_Transform_nossse3					// branch to no-ssse3 code

	// push callee-saved registers
#if defined (__x86_64__)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
#else
	push	%ebp
	push	%ebx
	push	%esi
	push	%edi
#endif

	// allocate stack space
	sub		$stack_size, sp

	// if kernel code, save used xmm registers
#if KERNEL
	movdqa	%xmm0, 0*16+xmm_save
	movdqa	%xmm1, 1*16+xmm_save
	movdqa	%xmm2, 2*16+xmm_save
	movdqa	%xmm3, 3*16+xmm_save
	movdqa	%xmm4, 4*16+xmm_save
	movdqa	%xmm5, 5*16+xmm_save
	movdqa	%xmm6, 6*16+xmm_save
	movdqa	%xmm7, 7*16+xmm_save
#endif

	// set up the bswap parameter in the aligned stack space and the pointer to the K256[] table
#if defined (__x86_64__)
	lea		_K256(%rip), K
	lea		L_bswap(%rip), %rax
	movdqa	(%rax), %xmm0
#else
	lea		_K256, t
	mov		t, K
	lea		L_bswap, %eax
	movdqa	(%eax), %xmm0
#endif
	movdqa	%xmm0, L_aligned_bswap

	// load W[0:15] into xmm0-xmm3
#if defined (__x86_64__)
	movdqu	0*16(data), W0
	movdqu	1*16(data), W1
	movdqu	2*16(data), W2
	movdqu	3*16(data), W3
	add		$64, data
#else
	mov		data_addr, t
	movdqu	0*16(t), W0
	movdqu	1*16(t), W1
	movdqu	2*16(t), W2
	movdqu	3*16(t), W3
	add		$64, data_addr
#endif
	pshufb	L_aligned_bswap, W0
	pshufb	L_aligned_bswap, W1
	pshufb	L_aligned_bswap, W2
	pshufb	L_aligned_bswap, W3

	// compute WK[0:15] and save in stack
#if defined (__x86_64__)
	movdqu	0*16(K), %xmm4
	movdqu	1*16(K), %xmm5
	movdqu	2*16(K), %xmm6
	movdqu	3*16(K), %xmm7
#else
	mov		K, t
	movdqu	0*16(t), %xmm4
	movdqu	1*16(t), %xmm5
	movdqu	2*16(t), %xmm6
	movdqu	3*16(t), %xmm7
#endif
	add		$64, K
	paddd	%xmm0, %xmm4
	paddd	%xmm1, %xmm5
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm4, WK(0)
	movdqa	%xmm5, WK(4)
	movdqa	%xmm6, WK(8)
	movdqa	%xmm7, WK(12)

L_loop:

	// digests a-h = ctx->state;
#if defined (__x86_64__)
	mov		0*4(ctx), a
	mov		1*4(ctx), b
	mov		2*4(ctx), c
	mov		3*4(ctx), d
	mov		4*4(ctx), e
	mov		5*4(ctx), f
	mov		6*4(ctx), g
	mov		7*4(ctx), h
#else
	mov		ctx_addr, t
	mov		0*4(t), a
	mov		1*4(t), b
	mov		2*4(t), s
	mov		s, c
	mov		3*4(t), d
	mov		4*4(t), e
	mov		5*4(t), s
	mov		s, f
	mov		6*4(t), g
	mov		7*4(t), s
	mov		s, h
#endif

	// rounds 0:47 interleaved with W/WK updates for rounds 16:63
	rounds	a, b, c, d, e, f, g, h, 0
	message_schedule	W0,W1,W2,W3,16
	rounds	e, f, g, h, a, b, c, d, 4
	message_schedule	W1,W2,W3,W0,20
	rounds	a, b, c, d, e, f, g, h, 8
	message_schedule	W2,W3,W0,W1,24
	rounds	e, f, g, h, a, b, c, d, 12
	message_schedule	W3,W0,W1,W2,28
	rounds	a, b, c, d, e, f, g, h, 16
	message_schedule	W0,W1,W2,W3,32
	rounds	e, f, g, h, a, b, c, d, 20
	message_schedule	W1,W2,W3,W0,36
	rounds	a, b, c, d, e, f, g, h, 24
	message_schedule	W2,W3,W0,W1,40
	rounds	e, f, g, h, a, b, c, d, 28
	message_schedule	W3,W0,W1,W2,44
	rounds	a, b, c, d, e, f, g, h, 32
	message_schedule	W0,W1,W2,W3,48
	rounds	e, f, g, h, a, b, c, d, 36
	message_schedule	W1,W2,W3,W0,52
	rounds	a, b, c, d, e, f, g, h, 40
	message_schedule	W2,W3,W0,W1,56
	rounds	e, f, g, h, a, b, c, d, 44
	message_schedule	W3,W0,W1,W2,60

	// revert K to the beginning of K256[]
#if defined __x86_64__
	sub		$256, K
#else
	subl	$256, K
#endif

	sub		$1, num_blocks				// num_blocks--
	je		L_final_block				// if final block, wrap up final rounds

	// rounds 48:63 interleaved with W/WK initialization for the next block's rounds 0:15
	rounds	a, b, c, d, e, f, g, h, 48
	update_W_WK	0, W0
	rounds	e, f, g, h, a, b, c, d, 52
	update_W_WK	1, W1
	rounds	a, b, c, d, e, f, g, h, 56
	update_W_WK	2, W2
	rounds	e, f, g, h, a, b, c, d, 60
	update_W_WK	3, W3

	add		$64, K
#if defined (__x86_64__)
	add		$64, data
#else
	add		$64, data_addr
#endif

	// ctx->state += digests a-h
#if defined (__x86_64__)
	add		a, 0*4(ctx)
	add		b, 1*4(ctx)
	add		c, 2*4(ctx)
	add		d, 3*4(ctx)
	add		e, 4*4(ctx)
	add		f, 5*4(ctx)
	add		g, 6*4(ctx)
	add		h, 7*4(ctx)
#else
	mov		ctx_addr, t
	add		a, 0*4(t)
	add		b, 1*4(t)
	mov		c, s
	add		s, 2*4(t)
	add		d, 3*4(t)
	add		e, 4*4(t)
	mov		f, s
	add		s, 5*4(t)
	add		g, 6*4(t)
	mov		h, s
	add		s, 7*4(t)
#endif

	jmp		L_loop						// branch for next block

	// wrap up digest update rounds 48:63 for the final block
L_final_block:
	rounds	a, b, c, d, e, f, g, h, 48
	rounds	e, f, g, h, a, b, c, d, 52
	rounds	a, b, c, d, e, f, g, h, 56
	rounds	e, f, g, h, a, b, c, d, 60

	// ctx->state += digests a-h
#if defined (__x86_64__)
	add		a, 0*4(ctx)
	add		b, 1*4(ctx)
	add		c, 2*4(ctx)
	add		d, 3*4(ctx)
	add		e, 4*4(ctx)
	add		f, 5*4(ctx)
	add		g, 6*4(ctx)
	add		h, 7*4(ctx)
#else
	mov		ctx_addr, t
	add		a, 0*4(t)
	add		b, 1*4(t)
	mov		c, s
	add		s, 2*4(t)
	add		d, 3*4(t)
	add		e, 4*4(t)
	mov		f, s
	add		s, 5*4(t)
	add		g, 6*4(t)
	mov		h, s
	add		s, 7*4(t)
#endif

	// if kernel, restore xmm0-xmm7
#if KERNEL
	movdqa	0*16+xmm_save, %xmm0
	movdqa	1*16+xmm_save, %xmm1
	movdqa	2*16+xmm_save, %xmm2
	movdqa	3*16+xmm_save, %xmm3
	movdqa	4*16+xmm_save, %xmm4
	movdqa	5*16+xmm_save, %xmm5
	movdqa	6*16+xmm_save, %xmm6
	movdqa	7*16+xmm_save, %xmm7
#endif

	// free allocated stack memory
	add		$stack_size, sp

	// restore callee-saved registers
#if defined (__x86_64__)
	pop		%r15
	pop		%r14
	pop		%r13
	pop		%r12
	pop		%rbx
	pop		%rbp
#else
	pop		%edi
	pop		%esi
	pop		%ebx
	pop		%ebp
#endif

	// return
	ret

	.const
	.align	4, 0x90

L_bswap:
	.long	0x00010203
	.long	0x04050607
	.long	0x08090a0b
	.long	0x0c0d0e0f
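/*
	L_bswap above is a pshufb control mask: destination byte i is taken from source
	byte mask[i]. Stored little-endian, .long 0x00010203 puts bytes 03,02,01,00 in
	lanes 0-3, so each 4-byte word is byte-reversed, i.e. loaded big-endian. The
	scalar C equivalent per word (illustrative):

	#include <stdint.h>

	static uint32_t load_be32(const uint8_t *p)
	{
		return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
		       ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
	}
*/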
#endif	// x86_64/i386
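/*
	Usage sketch from C, per the prototype at the top of this file. The SHA256_ctx
	layout shown here is an assumption (only the leading state[8] words are read and
	written by this routine, per the loads/stores from offset 0 of ctx above); data
	must hold num_blocks complete 64-byte blocks:

	#include <stdint.h>

	typedef struct {
		uint32_t state[8];		// a-h digests; assumed to be the first field
		// ... remaining fields per sha2.c ...
	} SHA256_ctx;

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	void hash_blocks(SHA256_ctx *ctx, char *buf, unsigned int n)
	{
		SHA256_Transform(ctx, buf, n);	// processes n 64-byte blocks in one call
	}
*/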