/*
 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
 *
 * This document is the property of Apple Inc.
 * It is considered confidential and proprietary.
 *
 * This document may not be reproduced or transmitted in any form,
 * in whole or in part, without the express written permission of
 * Apple Inc.
 */

/* 
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO .  This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */

/*
 * CKSUM_ERR is the routine called on the error exit to report that the
 * chain ran out of data.  Kernel builds use _kprintf; all other builds
 * must define LIBSYSCALL_INTERFACE and use _fprintf_stderr.
 */
#if defined(KERNEL)
#define	CKSUM_ERR _kprintf
#elif defined(LIBSYSCALL_INTERFACE)
#define	CKSUM_ERR _fprintf_stderr
#else
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* KERNEL / LIBSYSCALL_INTERFACE */

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
 * to be authentic; it only cares about 3 fields.
 *
 * The constants below are the byte offsets of those 3 fields from the
 * start of the structure, as used by the loads in this file.
 */
#define	M_NEXT	0	// link to the next element; loaded as a 64-bit pointer
#define	M_DATA	16	// 8-byte address, would be aligned to 8-byte boundary
#define	M_LEN	24	// length of this element; loaded as a 32-bit value

	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:


/*
 * 64-bit version.
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); the caller is responsible
 * for folding the 32-bit sum into 16 bits and performing the 1's
 * complement if applicable.
 *
 * Arguments (see the C prototype below):
 *	x0 = m, w1 = len, w2 = off, w3 = initial_sum
 * Result:
 *	w0
 * Registers written besides the arguments: x4-x12 and the condition
 * flags.  v0-v7 are used by the 64-byte loop but are saved to and
 * restored from the stack around that use.
 */

/* 
 * uint32_t
 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
 * {
 * 	int mlen;
 * 	uint64_t sum, partial;
 * 	unsigned int final_acc;
 * 	uint8_t *data;
 * 	boolean_t needs_swap, started_on_odd;
 *
 * 	VERIFY(len >= 0);
 * 	VERIFY(off >= 0);
 *
 * 	needs_swap = FALSE;
 * 	started_on_odd = FALSE;
 * 	sum = initial_sum;
 */

	#define	m		x0
	#define	len		x1
	#define	off		x2
	#define	sum		x3
	#define	needs_swap	x4
	#define	started_on_odd	x5
	#define	mlen			x6
	#define	Wmlen			w6
	#define t       x7
	#define	data	x8

	mov	needs_swap, #0		// needs_swap = FALSE;
	mov	started_on_odd, #0	// started_on_odd = FALSE;

	/*
	 * len, off and initial_sum are 32-bit C arguments, yet the code
	 * below operates on the full 64-bit registers (64-bit compares,
	 * address arithmetic with off, 64-bit subtraction from len).  The
	 * upper 32 bits of a register carrying a 32-bit argument are not
	 * guaranteed by the calling convention, so clear them for all
	 * three.  For the non-negative values the routine requires
	 * (VERIFY above) this equals sign extension.
	 */
	mov	w1, w1			// len: clear higher half
	mov	w2, w2			// off: clear higher half
	mov	w3, w3			// sum: clear higher half


/*
 *	for (;;) {
 *		if (PREDICT_FALSE(m == NULL)) {
 *			CKSUM_ERR("%s: out of data\n", __func__);
 *			return (-1);
 *		}
 *		mlen = m->m_len;
 *		if (mlen > off) {
 *			mlen -= off;
 *			data = mtod(m, uint8_t *) + off;
 *			goto post_initial_offset;
 *		}
 *		off -= mlen;
 *		if (len == 0)
 *			break;
 *		m = m->m_next;
 *	}
 *
 * Walks the chain, consuming "off" bytes, until the element that contains
 * the starting offset is found; that element is then summed starting at
 * L_post_initial_offset.
 *
 * The "break" for len == 0 must end up in the final 16-bit fold, exactly
 * as in the C code above (the summing loop is skipped because len is 0).
 * L_continue provides that: with len == 0 its loop test fails and control
 * falls into the final fold.  m is known to be non-NULL at that point, so
 * the m->m_next load performed at L_continue is harmless.
 */

0:
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	cmp	mlen, off
	b.le	1f
	ldr	data, [m, #M_DATA]	// mtod(m, uint8_t *)
	sub	mlen, mlen, off		// mlen -= off;
	add	data, data, off		// data = mtod(m, uint8_t *) + off;
	b	L_post_initial_offset
1:
	sub	off, off, mlen		// off -= mlen;
	cbz	len, L_continue		// if (len == 0) break;  (fold and return)
	ldr	m, [m, #M_NEXT]		// m = m->m_next;
	b	0b

L_loop:	// for (; len > 0; m = m->m_next) {
/*
 *		if (PREDICT_FALSE(m == NULL)) {
 *			CKSUM_ERR("%s: out of data\n", __func__);
 *			return (-1);
 *		}
 *		mlen = m->m_len;
 *		data = mtod(m, uint8_t *);
 */
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	ldr	data, [m, #M_DATA]	// mtod(m, uint8_t *)

L_post_initial_offset:
/*
 *		if (mlen == 0) continue;
 *		if (mlen > len) mlen = len;
 *		if (mlen == 0) continue;	(len == 0, see below)
 *		len -= mlen;
 *
 * L_loop is only entered with len > 0, but this label is also reached
 * directly from the initial-offset loop, which does not test len.  With
 * len == 0 the clamp leaves mlen == 0; continuing from there would let
 * the odd-address step consume a byte and drive mlen to -1, after which
 * the bit tests on mlen in the tail code would sum bytes that were never
 * requested.  Re-test mlen after the clamp so that case contributes
 * nothing to the sum.
 */

	cbz	mlen, L_continue	// empty element: nothing to add
	cmp	mlen, len
	csel	mlen, mlen, len, le	// mlen = MIN(mlen, len)
	cbz	mlen, L_continue	// len == 0: nothing to add
	sub	len, len, mlen		// len -= mlen

/*
 *		partial = 0;
 *		if ((uintptr_t)data & 1) {
 *			started_on_odd = !started_on_odd;
 *			partial = *data << 8;
 *			++data;
 *			--mlen;
 *		}
 *		needs_swap = started_on_odd;
 *
 *		if ((uintptr_t)data & 2) {
 *			if (mlen < 2)
 *				goto trailing_bytes;
 *			partial += *(uint16_t *)(void *)data;
 *			data += 2;
 *			mlen -= 2;
 *		}
 *
 * partial lives in x7.  x10 is zeroed here because the tail code adds it
 * with adc to fold a carry back into partial, and L_trailing_bytes can be
 * reached straight from this section.
 */

	mov	x7, #0			// partial = 0
	mov	x10, #0			// zero addend for carry folding
	tst	data, #1
	b.eq	L_even_address
	ldrb	w7, [data], #1		// partial = *data++
	sub	mlen, mlen, #1		// --mlen
	lsl	w7, w7, #8		// partial <<= 8
	eor	started_on_odd, started_on_odd, #1
L_even_address:

	mov	needs_swap, started_on_odd
	tst	data, #2
	b.eq	L_word_aligned
	cmp	mlen, #2
	b.lt	L_trailing_bytes	// fewer than 2 bytes left
	ldrh	w9, [data], #2		// *(uint16_t *)data, data += 2
	add	w7, w7, w9		// partial += halfword
	sub	mlen, mlen, #2		// mlen -= 2
L_word_aligned:

/*
 *		while (mlen >= 64) {
 *			__builtin_prefetch(data + 32);
 *			__builtin_prefetch(data + 64);
 *			partial += *(uint32_t *)(void *)data;
 *			partial += *(uint32_t *)(void *)(data + 4);
 *			partial += *(uint32_t *)(void *)(data + 8);
 *			partial += *(uint32_t *)(void *)(data + 12);
 *			partial += *(uint32_t *)(void *)(data + 16);
 *			partial += *(uint32_t *)(void *)(data + 20);
 *			partial += *(uint32_t *)(void *)(data + 24);
 *			partial += *(uint32_t *)(void *)(data + 28);
 *			partial += *(uint32_t *)(void *)(data + 32);
 *			partial += *(uint32_t *)(void *)(data + 36);
 *			partial += *(uint32_t *)(void *)(data + 40);
 *			partial += *(uint32_t *)(void *)(data + 44);
 *			partial += *(uint32_t *)(void *)(data + 48);
 *			partial += *(uint32_t *)(void *)(data + 52);
 *			partial += *(uint32_t *)(void *)(data + 56);
 *			partial += *(uint32_t *)(void *)(data + 60);
 *			data += 64;
 *			mlen -= 64;
 *		//	if (PREDICT_FALSE(partial & (3ULL << 62))) {
 *		//		if (needs_swap)
 *		//			partial = (partial << 8) +
 *		//			    (partial >> 56);
 *		//		sum += (partial >> 32);
 *		//		sum += (partial & 0xffffffff);
 *		//		partial = 0;
 *		//	}
 *		}
 *
 * The code below performs these additions with vector instructions
 * instead of the scalar adds shown above: the 32-bit words of each
 * 64-byte block are pair-wise added into 64-bit lanes of v0-v3.  There
 * is no prefetch instruction and no mid-loop overflow check in the
 * assembly.
 *
 * mlen is decremented ahead of the data it accounts for, so on the way
 * out of this section it is negative; only its low six bits are examined
 * afterwards, and those still equal the number of bytes left over.
*/

	// pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
	subs	mlen, mlen, #64
	b.lt	L32_bytes

	// save used vector registers
	// (8 registers x 16 bytes below sp; x11 walks the save area so sp stays put)
	sub	sp, sp, #8*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16 
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16 

	// spread partial into 8 8-byte registers in v0-v3
	// (partial seeds the low lane of v3; every other accumulator lane starts at 0)
	fmov	s3, w7
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2

	// load the 1st 64 bytes (16 32-bit words)
	ld1.4s	{v4,v5,v6,v7},[data],#64

	// branch to finish off if mlen<64
	// (i.e. fewer than 64 bytes follow the block that was just loaded)
	subs	mlen, mlen, #64
	b.lt	L64_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into
	 * 8 8-byte accumulators per iteration.
	 *
	 * Each accumulate of a previously loaded register is paired with
	 * the load of its replacement from the next 64-byte block.  The
	 * b.ge at the bottom tests the flags set by the subs at the top;
	 * the vector instructions in between do not alter them.
	 */
L64_loop:
	subs        mlen, mlen, #64             // mlen -= 64

	uadalp.2d   v0, v4
	ld1.4s      {v4},[data], #16

	uadalp.2d   v1, v5
	ld1.4s      {v5},[data], #16

	uadalp.2d   v2, v6
	ld1.4s      {v6},[data], #16

	uadalp.2d   v3, v7
	ld1.4s      {v7},[data], #16

	b.ge        L64_loop

L64_finishup:
	// accumulate the last 64 bytes that were loaded but not yet added
	uadalp.2d   v0, v4
	uadalp.2d   v1, v5
	uadalp.2d   v2, v6
	uadalp.2d   v3, v7

	// reduce the eight 64-bit lanes of v0-v3 to a single 64-bit value
	add.2d      v0, v0, v1
	add.2d      v2, v2, v3
	addp.2d     d0, v0
	addp.2d     d2, v2
	add.2d      v0, v0, v2
	fmov        x7, d0			// partial in x7 now

	// restore used vector registers
	// (the two post-indexed loads also move sp back up by the 8*16 bytes reserved above)
	ld1.4s      {v0, v1, v2, v3}, [sp], #4*16
	ld1.4s      {v4, v5, v6, v7}, [sp], #4*16

/*
 * Tail of the current element.  Bits 5..0 of mlen select, in turn, a
 * 32-, 16-, 8-, 4-, 2- and 1-byte chunk; only those bits are tested, so
 * it does not matter that mlen may be negative here.  Every chunk is
 * added to the 64-bit partial sum in x7 and the carry out of the addition
 * is added back in (adc with the zero register).
 */
L32_bytes:
	tst	mlen, #32
	b.eq	L16_bytes
	ldp	x9, x10, [data], #16
	ldp	x11, x12, [data], #16
	adds	x7, x7, x9
	adcs	x7, x7, x10
	adcs	x7, x7, x11
	adcs	x7, x7, x12
	adc	x7, x7, xzr		// fold the final carry

L16_bytes:
	tst	mlen, #16
	b.eq	L8_bytes
	ldp	x9, x10, [data], #16
	adds	x7, x7, x9
	adcs	x7, x7, x10
	adc	x7, x7, xzr		// fold the final carry

L8_bytes:
	tst	mlen, #8
	b.eq	L4_bytes
	ldr	x9, [data], #8
	adds	x7, x7, x9
	adc	x7, x7, xzr

L4_bytes:
	tst	mlen, #4
	b.eq	L2_bytes
	ldr	w9, [data], #4		// zero-extended into x9
	adds	x7, x7, x9
	adc	x7, x7, xzr

L2_bytes:
	tst	mlen, #2
	b.eq	L_trailing_bytes
	ldrh	w9, [data], #2		// zero-extended into x9
	adds	x7, x7, x9
	adc	x7, x7, xzr

L_trailing_bytes:
	tst	mlen, #1
	b.eq	L0_bytes
	ldrb	w9, [data], #1		// last, unpaired byte
	adds	x7, x7, x9
	adc	x7, x7, xzr
	// the next element now starts at the opposite byte parity
	eor	started_on_odd, started_on_odd, #1

L0_bytes:
/*
 *		if (needs_swap)
 *			partial = (partial << 8) + (partial >> 56);
 *
 * A rotate right by 56 is the same as that rotate left by 8.  The rotated
 * value is computed unconditionally and selected only when needs_swap is
 * non-zero.
 */
	ror	x9, x7, #56
	cmp	needs_swap, #0
	csel	x7, x9, x7, ne

/*
 *		sum += (partial >> 32) + (partial & 0xffffffff);
 *		sum = (sum >> 32) + (sum & 0xffffffff);
 *	}
 */
	lsr	x9, x7, #32		// partial >> 32
	and	x7, x7, #0xffffffff	// partial & 0xffffffff
	add	x3, x3, x9
	add	x3, x3, x7
	lsr	x9, x3, #32		// sum >> 32
	and	x3, x3, #0xffffffff	// sum & 0xffffffff
	add	x3, x3, x9

L_continue:
	ldr	m, [m, #M_NEXT]			// m = m->m_next
	cmp	len, #0
	b.gt	L_loop				// more bytes requested

/*
 *	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
 *	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	return (final_acc & 0xffff);
 * }
 */

	// split the 64-bit sum (x3) into its four 16-bit fields
	lsr	x9, x3, #48
	ubfx	x10, x3, #32, #16
	ubfx	x11, x3, #16, #16
	and	x12, x3, #0xffff

	// final_acc = sum of the four fields
	add	w9, w9, w10
	add	w11, w11, w12
	add	w9, w9, w11

	// first fold of the bits above 16 back into the low 16
	lsr	w10, w9, #16
	and	w9, w9, #0xffff
	add	w9, w9, w10

	// second fold
	lsr	w10, w9, #16
	and	w9, w9, #0xffff
	add	w9, w9, w10

	/*
	 * If we were to 1's complement it (XOR with 0xffff):
	 *
	 * eor    	w0, w9, #0xffff
	 */
	and	w0, w9, #0xffff

	ret	lr

/*
 * Error exit, taken when m is NULL while data is still expected: hand the
 * message below to CKSUM_ERR and return -1.
 *
 * The bl overwrites lr with the address of the instruction that follows
 * it, so the return address of this routine has to be preserved across
 * the call; otherwise the final ret would branch back to that instruction
 * forever instead of returning to the caller.  A frame record (x29/x30)
 * is pushed for that purpose and popped again before returning.
 */
Lin_cksum_whoops:
	stp	x29, x30, [sp, #-16]!	// save fp and the return address
	mov	x29, sp
	adrp	x0, Lin_cksum_whoops_str@page
	add	x0, x0, Lin_cksum_whoops_str@pageoff
	bl	#CKSUM_ERR
	ldp	x29, x30, [sp], #16	// restore fp and the return address
	mov	x0, #-1
	ret	lr

Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5