/*
	This file provides x86_64/i386 hand implementation of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	The code SHA256_Transform_nossse3 is a clone of SHA256_Transform
	with all ssse3 instructions replaced with sse3 or below instructions.

	For performance reason, this function should not be called directly. This file should be working
	together with the one that implements SHA256_Transform. There, cpu_capabilities is probed to detect
	ssse3. If ssse3 is not supported, the execution will be branched to this no-ssse3-specific function.

	sha256 algorithm per block description:

		1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte) 
		2. load 8 digests a-h from ctx->state
		3. for r = 0:15
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
		4. for r = 16:63
				W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
				T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
				d += T1;
				h = T1 + Sigma0(a) + Maj(a,b,c)
				permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
				
	In the assembly implementation:	
		- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
		- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
		- the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)

	the implementation per block looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 
	pre_calculate and store W+K(0:15) in stack

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration 
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	----------------------------------------------------------------------------

	our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block 
	into the last 16 rounds of its previous block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 
	pre_calculate and store W+K(0:15) in stack

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration 
	}

	num_block--;
	if (num_block==0)	jmp L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3 
		pre_calculate and store W+K([r:r+3]%16) in stack
	}

	ctx->states += digests a-h;

	jmp	L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->states += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
	cclee 8-3-10
*/

#if defined	KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif

	// associate variables with registers or memory

#if defined	(__x86_64__)
	#define	sp			%rsp
	#define	ctx			%rdi
	#define data		%rsi
	#define	num_blocks	%rdx

	#define	a			%r8d
	#define	b			%r9d
	#define	c			%r10d
	#define	d			%r11d
	#define	e			%r12d
	#define	f			%r13d
	#define	g			%r14d
	#define	h			%r15d

	#define	K			%rbx
	#define stack_size	(8+16*8+16+64)	// 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)

	#define	xmm_save	80(sp)			// starting address for xmm save/restore
#else
	#define	sp 	%esp
	#define stack_size	(12+16*8+16+16+64)	// 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
	#define	ctx_addr	20+stack_size(sp)	// ret_addr + 4 registers = 20, 1st caller argument
	#define	data_addr	24+stack_size(sp)	// 2nd caller argument
	#define	num_blocks	28+stack_size(sp)	// 3rd caller argument

	#define	a	%ebx
	#define	b	%edx
	#define	c	64(sp)
	#define	d	%ebp
	#define	e	%esi
	#define	f	68(sp)
	#define	g	%edi
	#define	h	72(sp)

	#define	K	76(sp)					// pointer to K256[] table
	#define	xmm_save	96(sp)			// starting address for xmm save/restore
#endif

	// 2 local variables
	#define	t	%eax
	#define	s	%ecx

	// a window (16 words) of message scheule
	#define	W0	%xmm0
	#define	W1	%xmm1
	#define	W2	%xmm2
	#define	W3	%xmm3

	// circular buffer for WK[(r:r+15)%16]
	#define WK(x)   (x&15)*4(sp)

// #define Ch(x,y,z)   (((x) & (y)) ^ ((~(x)) & (z)))

	.macro Ch
	mov		$0, t		// x
	mov		$0, s		// x
	not		t			// ~x
	and		$1, s		// x & y
	and		$2, t		// ~x & z
	xor		s, t		// t = ((x) & (y)) ^ ((~(x)) & (z));
	.endm

// #define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

	.macro	Maj
	mov		$0, t		// x
	mov		$1, s		// y
	and		s, t		// x&y
	and		$2, s		// y&z
	xor		s, t		// (x&y) ^ (y&z)
	mov		$2, s		// z
	and		$0, s		// (x&z)
	xor		s, t		// t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) 
	.endm

/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
// #define R(b,x)      ((x) >> (b))
/* 32-bit Rotate-right (used in SHA-256): */
// #define S32(b,x)    (((x) >> (b)) | ((x) << (32 - (b))))

// #define sigma0_256(x)   (S32(7,  (x)) ^ S32(18, (x)) ^ R(3 ,   (x)))

	// performs sigma0_256 on 4 words on an xmm registers
	// use xmm6/xmm7 as intermediate registers
	.macro	sigma0
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$3, $0			// SHR3(x)
	psrld	$$7, %xmm6		// part of ROTR7
	pslld	$$14, %xmm7		// part of ROTR18
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	psrld	$$11, %xmm6		// part of ROTR18
	pslld	$$11, %xmm7		// part of ROTR7
	pxor	%xmm6, $0
	pxor	%xmm7, $0
	.endm

// #define sigma1_256(x)   (S32(17, (x)) ^ S32(19, (x)) ^ R(10,   (x)))

	// performs sigma1_256 on 4 words on an xmm registers
	// use xmm6/xmm7 as intermediate registers
	.macro	sigma1
	movdqa	$0, %xmm6
	movdqa	$0, %xmm7
	psrld	$$10, $0		// SHR10(x)
	psrld	$$17, %xmm6		// part of ROTR17
	pxor	%xmm6, $0
	pslld	$$13, %xmm7		// part of ROTR19
	pxor	%xmm7, $0
	psrld	$$2, %xmm6		// part of ROTR19
	pxor	%xmm6, $0
	pslld	$$2, %xmm7		// part of ROTR17
	pxor	%xmm7, $0
	.endm

// #define Sigma0_256(x)   (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))

	.macro	Sigma0
	mov		$0, t			// x
	mov		$0, s			// x
	ror		$$2, t			// S32(2,  (x))
	ror		$$13, s			// S32(13,  (x))
	xor		s, t			// S32(2,  (x)) ^ S32(13, (x))
	ror		$$9, s			// S32(22,  (x))
	xor		s, t			// t = (S32(2,  (x)) ^ S32(13, (x)) ^ S32(22, (x)))
	.endm

// #define Sigma1_256(x)   (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))

	.macro	Sigma1
	mov		$0, s			// x
	ror		$$6, s			// S32(6,  (x))
	mov		s, t			// S32(6,  (x))
	ror		$$5, s			// S32(11, (x))
	xor		s, t			// S32(6,  (x)) ^ S32(11, (x))
	ror		$$14, s			// S32(25, (x))
	xor		s, t			// t = (S32(6,  (x)) ^ S32(11, (x)) ^ S32(25, (x)))
	.endm

	// per round digests update
	.macro	round
	Sigma1	$4				// t = T1
	add		t, $7			// use h to store h+Sigma1(e)
	Ch		$4, $5, $6		// t = Ch (e, f, g);
	add		$7, t			// t = h+Sigma1(e)+Ch(e,f,g);
	add		WK($8), t		// h = T1
	add		t, $3			// d += T1;
	mov		t, $7			// h = T1
	Sigma0	$0				// t = Sigma0(a);
	add		t, $7			// h = T1 + Sigma0(a);
	Maj		$0, $1, $2		// t = Maj(a,b,c)
	add		t, $7			// h = T1 + Sigma0(a) + Maj(a,b,c);			
	.endm

	// per 4 rounds digests update and permutation
	// permutation is absorbed by rotating the roles of digests a-h
	.macro	rounds
	round	$0, $1, $2, $3, $4, $5, $6, $7, 0+$8
	round	$7, $0, $1, $2, $3, $4, $5, $6, 1+$8
	round	$6, $7, $0, $1, $2, $3, $4, $5, 2+$8
	round	$5, $6, $7, $0, $1, $2, $3, $4, 3+$8
	.endm

	// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future 
	.macro	message_schedule

	// 4 32-bit K256 words in xmm5
#if defined	(__x86_64__)
	movdqu	(K), %xmm5
#else
	mov		K, t
	movdqu	(t), %xmm5 
#endif	
	add		$$16, K				// K points to next K256 word for next iteration
	movdqa	$1, %xmm4 			// W7:W4
#if 0
	palignr	$$4, $0, %xmm4		// W4:W1
#else	// no-ssse3 implementation of palignr
	movdqa  $0, %xmm7
    pslldq  $$12, %xmm4
    psrldq  $$4, %xmm7
    por     %xmm7, %xmm4
#endif
	sigma0	%xmm4				// sigma0(W4:W1)
	movdqa	$3, %xmm6 			// W15:W12
	paddd	%xmm4, $0			// $0 = W3:W0 + sigma0(W4:W1) 
#if 0
	palignr	$$4, $2, %xmm6		// W12:W9
#else	// no-ssse3 implementation of palignr
	movdqa  $2, %xmm7
    pslldq  $$12, %xmm6
    psrldq  $$4, %xmm7
    por     %xmm7, %xmm6
#endif
	paddd	%xmm6, $0			// $0 = W12:W9 + sigma0(W4:W1) + W3:W0	
	movdqa	$3, %xmm4			// W15:W12
	psrldq	$$8, %xmm4			// 0,0,W15,W14	
	sigma1	%xmm4				// sigma1(0,0,W15,W14)
	paddd	%xmm4, $0			// sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
	movdqa	$0, %xmm4			// W19-sigma1(W17), W18-sigma1(W16), W17, W16
	pslldq	$$8, %xmm4			// W17, W16, 0, 0
	sigma1	%xmm4				// sigma1(W17,W16,0,0)
	paddd	%xmm4, $0			// W19:W16
	paddd	$0, %xmm5			// WK
	movdqa	%xmm5, WK($4)
	.endm

	// this macro is used in the last 16 rounds of a current block
	// it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3]
	// and save into stack to prepare for next block

	.macro	update_W_WK
#if defined (__x86_64__)
#if 0
	movdqu	$0*16(data), $1		// read 4 4-byte words
	pshufb	L_aligned_bswap, $1	// big-endian of each 4-byte word, W[r:r+3]
#else	// no-ssse3 implementation
	mov     0+$0*16(data), s
    bswap   s
    mov     s, 0+WK($0*4)
    mov     4+$0*16(data), s
    bswap   s
    mov     s, 4+WK($0*4)
    mov     8+$0*16(data), s
    bswap   s
    mov     s, 8+WK($0*4)
    mov     12+$0*16(data), s
    bswap   s
    mov     s, 12+WK($0*4)
    movdqa  WK($0*4), $1
#endif
	movdqu	$0*16(K), %xmm4		// K[r:r+3]
#else
	mov		data_addr, t
#if 0
	movdqu	$0*16(t), $1		// read 4 4-byte words
	pshufb	L_aligned_bswap, $1	// big-endian of each 4-byte word, W[r:r+3]
#else	// no-ssse3 implementation
	mov     0+$0*16(t), s
    bswap   s
    mov     s, 0+WK($0*4)
    mov     4+$0*16(t), s
    bswap   s
    mov     s, 4+WK($0*4)
    mov     8+$0*16(t), s
    bswap   s
    mov     s, 8+WK($0*4)
    mov     12+$0*16(t), s
    bswap   s
    mov     s, 12+WK($0*4)
    movdqa  WK($0*4), $1
#endif
	mov		K, t
	movdqu	$0*16(t), %xmm4		// K[r:r+3]
#endif
	paddd	$1, %xmm4			// WK[r:r+3]
	movdqa	%xmm4, WK($0*4)		// save WK[r:r+3] into stack circular buffer
	.endm

	.text

#if defined (__x86_64__) || defined (__i386__)

	.globl	_SHA256_Transform_nossse3

_SHA256_Transform_nossse3:

	// push callee-saved registers
#if defined	(__x86_64__)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
#else
    push    %ebp
	push    %ebx
    push    %esi
    push    %edi
#endif

	// allocate stack space
	sub		$stack_size, sp

	// if kernel code, save used xmm registers
#if	KERNEL
	movdqa	%xmm0, 0*16+xmm_save
	movdqa	%xmm1, 1*16+xmm_save
	movdqa	%xmm2, 2*16+xmm_save
	movdqa	%xmm3, 3*16+xmm_save
	movdqa	%xmm4, 4*16+xmm_save
	movdqa	%xmm5, 5*16+xmm_save
	movdqa	%xmm6, 6*16+xmm_save
	movdqa	%xmm7, 7*16+xmm_save
#endif

	// set up pointer to table K256[]
#if defined (__x86_64__)
	lea		_K256(%rip), K
#else
	lea		_K256, t
	mov		t, K
#endif

	// load W[0:15] into xmm0-xmm3
    .macro  mybswap
    movl    0+$0*16($1), a
    movl    4+$0*16($1), b
    movl    8+$0*16($1), e
    movl    12+$0*16($1), d
    bswap   a
    bswap   b
    bswap   e
    bswap   d
    movl    a, $0*16(sp)
    movl    b, 4+$0*16(sp)
    movl    e, 8+$0*16(sp)
    movl    d, 12+$0*16(sp)
    .endm

#if defined (__x86_64__)
    mybswap 0, data
    mybswap 1, data
    mybswap 2, data
    mybswap 3, data
    add     $64, data
#else
    mov     data_addr, t
    mybswap 0, t
    mybswap 1, t
    mybswap 2, t
    mybswap 3, t
    add     $64, data_addr
#endif
    movdqa  0*16(sp), W0
    movdqa  1*16(sp), W1
    movdqa  2*16(sp), W2
    movdqa  3*16(sp), W3

	// compute WK[0:15] and save in stack
#if defined (__x86_64__)
	movdqu	0*16(K), %xmm4	
	movdqu	1*16(K), %xmm5
	movdqu	2*16(K), %xmm6	
	movdqu	3*16(K), %xmm7
#else
	mov		K, t
	movdqu	0*16(t), %xmm4	
	movdqu	1*16(t), %xmm5
	movdqu	2*16(t), %xmm6	
	movdqu	3*16(t), %xmm7
#endif
	add		$64, K
	paddd	%xmm0, %xmm4
	paddd	%xmm1, %xmm5
	paddd	%xmm2, %xmm6
	paddd	%xmm3, %xmm7
	movdqa	%xmm4, WK(0)
	movdqa	%xmm5, WK(4)
	movdqa	%xmm6, WK(8)
	movdqa	%xmm7, WK(12)

L_loop:

	// digests a-h = ctx->states;
#if defined (__x86_64__)
	mov		0*4(ctx), a
	mov		1*4(ctx), b
	mov		2*4(ctx), c
	mov		3*4(ctx), d
	mov		4*4(ctx), e
	mov		5*4(ctx), f
	mov		6*4(ctx), g
	mov		7*4(ctx), h
#else
	mov		ctx_addr, t
	mov 	0*4(t), a
	mov 	1*4(t), b
	mov 	2*4(t), s
	mov		s, c
	mov 	3*4(t), d
	mov 	4*4(t), e
	mov 	5*4(t), s
	mov		s, f
	mov 	6*4(t), g
	mov 	7*4(t), s
	mov		s, h
#endif

	// rounds 0:47 interleaved with W/WK update for rounds 16:63
	rounds	a, b, c, d, e, f, g, h, 0
	message_schedule W0,W1,W2,W3,16
	rounds	e, f, g, h, a, b, c, d, 4 
	message_schedule W1,W2,W3,W0,20
	rounds	a, b, c, d, e, f, g, h, 8
	message_schedule W2,W3,W0,W1,24
	rounds	e, f, g, h, a, b, c, d, 12 
	message_schedule W3,W0,W1,W2,28
	rounds	a, b, c, d, e, f, g, h, 16
	message_schedule W0,W1,W2,W3,32
	rounds	e, f, g, h, a, b, c, d, 20 
	message_schedule W1,W2,W3,W0,36
	rounds	a, b, c, d, e, f, g, h, 24
	message_schedule W2,W3,W0,W1,40
	rounds	e, f, g, h, a, b, c, d, 28 
	message_schedule W3,W0,W1,W2,44
	rounds	a, b, c, d, e, f, g, h, 32
	message_schedule W0,W1,W2,W3,48
	rounds	e, f, g, h, a, b, c, d, 36 
	message_schedule W1,W2,W3,W0,52
	rounds	a, b, c, d, e, f, g, h, 40
	message_schedule W2,W3,W0,W1,56
	rounds	e, f, g, h, a, b, c, d, 44 
	message_schedule W3,W0,W1,W2,60

	// revert K to the beginning of K256[]
#if defined __x86_64__
	sub		$256, K
#else
	subl	$256, K
#endif

	sub		$1, num_blocks				// num_blocks--
	je		L_final_block				// if final block, wrap up final rounds

	// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15 
	rounds	a, b, c, d, e, f, g, h, 48
	update_W_WK	0, W0
	rounds	e, f, g, h, a, b, c, d, 52 
	update_W_WK	1, W1
	rounds	a, b, c, d, e, f, g, h, 56
	update_W_WK	2, W2
	rounds	e, f, g, h, a, b, c, d, 60 
	update_W_WK	3, W3

	add		$64, K
#if defined (__x86_64__)
	add		$64, data
#else
	add		$64, data_addr
#endif

	// ctx->states += digests a-h
#if	defined (__x86_64__)
	add		a, 0*4(ctx)
	add		b, 1*4(ctx)
	add		c, 2*4(ctx)
	add		d, 3*4(ctx)
	add		e, 4*4(ctx)
	add		f, 5*4(ctx)
	add		g, 6*4(ctx)
	add		h, 7*4(ctx)
#else
	mov		ctx_addr, t
	add		a, 0*4(t)
	add		b, 1*4(t)
	mov		c, s
	add		s, 2*4(t)
	add		d, 3*4(t)
	add		e, 4*4(t)
	mov		f, s
	add		s, 5*4(t)
	add		g, 6*4(t)
	mov		h, s
	add		s, 7*4(t)
#endif

	jmp		L_loop				// branch for next block

	// wrap up digest update round 48:63 for final block
L_final_block:
	rounds	a, b, c, d, e, f, g, h, 48
	rounds	e, f, g, h, a, b, c, d, 52 
	rounds	a, b, c, d, e, f, g, h, 56
	rounds	e, f, g, h, a, b, c, d, 60 

	// ctx->states += digests a-h
#if	defined (__x86_64__)
	add		a, 0*4(ctx)
	add		b, 1*4(ctx)
	add		c, 2*4(ctx)
	add		d, 3*4(ctx)
	add		e, 4*4(ctx)
	add		f, 5*4(ctx)
	add		g, 6*4(ctx)
	add		h, 7*4(ctx)
#else
	mov		ctx_addr, t
	add		a, 0*4(t)
	add		b, 1*4(t)
	mov		c, s
	add		s, 2*4(t)
	add		d, 3*4(t)
	add		e, 4*4(t)
	mov		f, s
	add		s, 5*4(t)
	add		g, 6*4(t)
	mov		h, s
	add		s, 7*4(t)
#endif

	// if kernel, restore xmm0-xmm7
#if	KERNEL
	movdqa	0*16+xmm_save, %xmm0
	movdqa	1*16+xmm_save, %xmm1
	movdqa	2*16+xmm_save, %xmm2
	movdqa	3*16+xmm_save, %xmm3
	movdqa	4*16+xmm_save, %xmm4
	movdqa	5*16+xmm_save, %xmm5
	movdqa	6*16+xmm_save, %xmm6
	movdqa	7*16+xmm_save, %xmm7
#endif

	// free allocated stack memory
	add		$stack_size, sp

	// restore callee-saved registers
#if defined (__x86_64__)
	pop		%r15
	pop		%r14
	pop		%r13
	pop		%r12
	pop		%rbx
	pop		%rbp
#else
    pop		%edi
    pop		%esi
	pop		%ebx
    pop		%ebp
#endif

	// return
	ret


#endif		// x86_64/i386