/*
 * Copyright (c) 2009-2017 Apple Inc. All rights reserved.
 *
 * This document is the property of Apple Inc.
 * It is considered confidential and proprietary.
 *
 * This document may not be reproduced or transmitted in any form,
 * in whole or in part, without the express written permission of
 * Apple Inc.
 */

/* $NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $ */

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef KERNEL
#include "../../../osfmk/arm/arch.h"
#include "../../../osfmk/arm/proc_reg.h"

#if __ARM_VFP__ < 3
#error "Unsupported: __ARM_VFP__ < 3"
#endif /* __ARM_VFP__ < 3 */
#define	CKSUM_ERR	_kprintf
#else /* !KERNEL */
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define	CKSUM_ERR	_fprintf_stderr
#define	__ARM_VFP__	3
#endif /* !KERNEL */

/*
 * The following defaults the implementation to little-endian architectures.
 */
#define	LITTLE_ENDIAN	1
#define	BYTE_ORDER	LITTLE_ENDIAN

	.syntax unified

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
 * to be authentic; it only cares about 3 fields.
 */
#define	M_NEXT	0
#define	M_DATA	8
#define	M_LEN	12

/*
 * APPLE MODIFICATION
 *
 * The use of R7 in this code as a data register prevents
 * the use of debugging or instrumentation tools, which is an acceptable
 * tradeoff considering the potential gain in performance.
 */
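/*
 * For reference, a minimal C sketch of the "mbuf-like" layout that the
 * M_NEXT/M_DATA/M_LEN offsets above assume on 32-bit ARM (ILP32).  This is
 * an illustration only -- struct cksum_mbuf_like and the field at offset 4
 * are placeholders (e.g. m_nextpkt in a real mbuf); this routine never
 * touches anything but the three fields it names.
 *
 *	struct cksum_mbuf_like {
 *		struct cksum_mbuf_like *m_next;	// offset 0  (M_NEXT)
 *		void *m_unused;			// offset 4  (not read here)
 *		char *m_data;			// offset 8  (M_DATA)
 *		int32_t m_len;			// offset 12 (M_LEN)
 *	};
 */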
/*
 * Hand-optimised implementations for ARM/Xscale
 */

	.macro	EnableVFP
#ifdef KERNEL
	push	{r0, r1, r2, r12}
	bl	_enable_kernel_vfp_context
	pop	{r0, r1, r2, r12}
#endif /* KERNEL */
	.endm

/*
 * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
 *     uint32_t initial_sum);
 *
 * Entry:
 *	r0	m
 *	r1	len
 *	r2	off
 *	r3	initial_sum
 *
 * Function-wide register usage
 *	r8	accumulated sum
 *	r9	remaining length to parse
 *	ip	pointer to next mbuf
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); the caller is responsible
 * for folding the 32-bit sum into 16 bits and performing the 1's
 * complement if applicable.
 */
	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:
	stmfd	sp!, {r4-r11, lr}

	mov	r8, r3			/* Accumulate sum in r8 */
	mov	r9, r1			/* save len in r9 */
	mov	ip, r0			/* set ip to the current mbuf */

	cmp	r9, #0			/* length is 0? */
	bne	.Lin_cksum_skip_loop	/* if not, proceed further */
	mov	r0, r8			/* otherwise, return initial sum */

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_skip_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_skip_entry:
	subs	r2, r2, r1		/* offset = offset - mbuf length */
	blt	.Lin_cksum_skip_done	/* if offset has gone negative start with this mbuf */
	cmp	ip, #0x00
	bne	.Lin_cksum_skip_loop
	b	.Lin_cksum_whoops

.Lin_cksum_skip_done:
	add	r0, r2, r0		/* data += offset (offset is < 0) */
	add	r0, r0, r1		/* data += length of mbuf */
					/* data == start of data to cksum */
	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
	mov	r10, #0x00
	b	.Lin_cksum_entry

.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_entry:
	cmp	r9, r1
	movlt	r1, r9
	sub	r9, r9, r1
	eor	r11, r10, r0
	add	r10, r10, r1
	adds	r2, r1, #0x00
	beq	.Lin_cksum_next

/*
 * APPLE MODIFICATION
 *
 * Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function
 * inline. This results in slightly faster code, and also permits the whole
 * function to be included in kernel profiling data.
 */

/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 */
	mov	r2, #0

	/* We first have to word-align the buffer.  */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	cmp	r7, #0x02
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/* Combine the three bytes depending on endianness and alignment */
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
	subs	r1, r1, r7		/* Update length */
	beq	.Lin_cksum_next		/* All done? */

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#if __ARM_VFP__ >= 3
	cmp	r1, #512		// do this if r1 is at least 512
	blt	9f

	EnableVFP

	and	r3, r1, #~0x3f

	vpush	{q0-q7}

	// move r2 to s16 (q4) for neon computation
	veor	q4, q4, q4
	vld1.32	{q0-q1}, [r0]!
	vmov	s16, r2
	vld1.32	{q2-q3}, [r0]!

	// pre-decrement size by 0x80 (128 bytes are loaded before the accumulate loop below)
	subs	r3, r3, #0x80

	vpadal.u32	q4, q0
	vld1.32	{q0}, [r0]!
	vpaddl.u32	q5, q1
	vld1.32	{q1}, [r0]!
	vpaddl.u32	q6, q2
	vld1.32	{q2}, [r0]!
	vpaddl.u32	q7, q3
	vld1.32	{q3}, [r0]!
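	/*
	 * NEON main loop: q4-q7 hold four independent accumulators of
	 * 2 x 64-bit lanes each.  vpadal.u32 widens adjacent pairs of
	 * 32-bit words to 64 bits before accumulating, so no carry
	 * handling is needed inside the loop.  Each iteration folds in
	 * the 64 bytes loaded on the previous pass while loading the
	 * next 64 bytes into q0-q3.
	 */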
0:
	subs	r3, r3, #0x40		// decrement size by 64

	vpadal.u32	q4, q0
	vld1.32	{q0}, [r0]!
	vpadal.u32	q5, q1
	vld1.32	{q1}, [r0]!
	vpadal.u32	q6, q2
	vld1.32	{q2}, [r0]!
	vpadal.u32	q7, q3
	vld1.32	{q3}, [r0]!

	bgt	0b

	vpadal.u32	q4, q0
	vpadal.u32	q5, q1
	vpadal.u32	q6, q2
	vpadal.u32	q7, q3

	vpadal.u32	q4, q5
	vpadal.u32	q6, q7
	vpadal.u32	q4, q6

	vadd.i64	d8, d9

	vpaddl.u32	d8, d8
	vpaddl.u32	d8, d8
	vpaddl.u32	d8, d8

	vmov	r2, s16
	vpop	{q0-q7}

	ands	r1, r1, #0x3f		// residual bytes
	beq	.Lin_cksum_next
9:
#endif /* __ARM_VFP__ >= 3 */

	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:
	adds	r1, r1, #0x40
	beq	.Lin_cksum_next
	cmp	r1, #0x20
	blt	.Lcksumdata_less_than_32
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
	beq	.Lin_cksum_next

.Lcksumdata_less_than_32:
	/* There are less than 32 bytes left */
	and	r3, r1, #0x18
	rsb	r4, r3, #0x18
	sub	r1, r1, r3
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	addne	pc, pc, r4

/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	nop
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04
	beq	.Lin_cksum_next

	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrbgt	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
	adds	r2, r2, r3
	adc	r2, r2, #0x00

.Lin_cksum_next:
	tst	r11, #0x01
	movne	r2, r2, ror #8
	adds	r8, r8, r2
	adc	r8, r8, #0x00
	cmp	ip, #0x00
	bne	.Lin_cksum_loop

	mov	r1, #0xff
	orr	r1, r1, #0xff00
	and	r0, r8, r1
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16
	and	r0, r0, r1
	/*
	 * If we were to 1's complement it (XOR with 0xffff):
	 *
	 *	eor	r0, r0, r1
	 */

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_whoops:
	adr	r0, .Lin_cksum_whoops_str
	bl	#CKSUM_ERR
	mov	r0, #-1
	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5
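/*
 * Usage note (illustrative, not taken from this file's callers): the tail
 * code above already folds the accumulated sum into the low 16 bits of r0,
 * and the function comment leaves the final 1's complement to the caller.
 * A hypothetical C caller might finish an IP-style checksum roughly as
 * follows; the extra fold is defensive and harmless on an already-folded
 * value:
 *
 *	uint32_t sum = os_cpu_in_cksum_mbuf(m, len, off, initial_sum);
 *	sum = (sum & 0xffff) + (sum >> 16);		// re-fold any carry
 *	uint16_t cksum = (uint16_t)(~sum & 0xffff);	// final 1's complement
 */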