[apple/xnu.git] / bsd / dev / arm / cpu_in_cksum.s

/*
 * Copyright (c) 2009-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*	$NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $	*/

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Build configuration.
 *
 * Kernel builds include the osfmk ARM headers, refuse to build when
 * __ARM_VFP__ is below 3, and report errors through the _kprintf symbol.
 *
 * Non-kernel builds require LIBSYSCALL_INTERFACE to be defined, report
 * errors through the _fprintf_stderr symbol, and define __ARM_VFP__ as 3
 * so that the code guarded by "__ARM_VFP__ >= 3" is always assembled.
 */
#ifdef KERNEL
#include "../../../osfmk/arm/arch.h"
#include "../../../osfmk/arm/proc_reg.h"

#if __ARM_VFP__ < 3
#error "Unsupported: __ARM_VFP__ < 3"
#endif /* __ARM_VFP__ < 3 */
#define	CKSUM_ERR _kprintf
#else /* !KERNEL */
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define	CKSUM_ERR _fprintf_stderr
#define	__ARM_VFP__	3
#endif /* !KERNEL */

/*
 * The following definitions hard-wire the implementation to little-endian
 * byte order: BYTE_ORDER is unconditionally LITTLE_ENDIAN, so the
 * "BYTE_ORDER != LITTLE_ENDIAN" branches in this file are never assembled.
 */
#define	LITTLE_ENDIAN	1
#define	BYTE_ORDER	LITTLE_ENDIAN

/* Use unified assembler syntax (e.g. "ldrbge" rather than "ldrgeb"). */
.syntax unified

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
 * to be authentic; it only cares about the 3 fields below.
 *
 * Each value is a byte offset from the start of the mbuf-like structure:
 *	M_NEXT	pointer to the next element of the chain (0 ends the chain)
 *	M_DATA	pointer to the first data byte of this element
 *	M_LEN	number of data bytes in this element
 *
 * NOTE(review): these offsets are hard-coded; presumably they must match
 * the layout of the real struct mbuf on 32-bit ARM -- confirm whenever that
 * structure changes.
 */
#define	M_NEXT	0
#define	M_DATA	8
#define	M_LEN	12

/*
 * APPLE MODIFICATION
 *
 * The use of R7 in this code as data register prevents
 * the use of debugging or instrumentation tools, which is an acceptable
 * tradeoff considering the potential gain in performance.
 */

/*
 * Hand-optimised implementations for ARM/Xscale
 */

	.macro EnableVFP
	/*
	 * Kernel builds only: call _enable_kernel_vfp_context ahead of the
	 * NEON code, keeping r0-r2 and r12 (ip) intact across the call.
	 * r3 and lr are not preserved here.  Expands to nothing when KERNEL
	 * is not defined.
	 */
#if defined(KERNEL)
	stmfd	sp!, {r0-r2, r12}
	bl	_enable_kernel_vfp_context
	ldmfd	sp!, {r0-r2, r12}
#endif /* KERNEL */
	.endm


/*
 * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
 *     uint32_t initial_sum);
 *
 * Sums `len' bytes of data, starting `off' bytes into the mbuf chain `m',
 * on top of `initial_sum'.
 *
 * Entry:
 *	r0	m
 *	r1	len
 *	r2	off
 *	r3	initial_sum
 *
 * Function wide register usage
 *	r8	accumulated sum
 *	r9	remaining length to parse
 *	r10	number of bytes summed so far (0 at the first summed byte)
 *	r11	(bytes summed before the current mbuf) ^ (address of the
 *		first byte to sum in the current mbuf); if bit 0 is set the
 *		partial sum of that mbuf is rotated by 8 bits before being
 *		added to r8
 *	ip	pointer to next mbuf
 *
 * Returns (r0):
 *	- initial_sum, unchanged, when len is 0;
 *	- -1, after reporting through CKSUM_ERR, when the chain ends before
 *	  `off' bytes have been skipped;
 *	- otherwise the accumulated sum folded into 16 bits, without the
 *	  1's complement; the caller is responsible for performing the
 *	  1's complement if applicable.
 *
 * r4-r11 are saved on entry and restored on every return path.
 */
	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:
	stmfd	sp!, {r4-r11,lr}

	mov	r8, r3			/* Accumulate sum in r8 */
	mov	r9, r1			/* save len in r9 */
	mov	ip, r0			/* set ip to the current mbuf */

	cmp	r9, #0			/* length is 0? */
	bne	.Lin_cksum_skip_loop	/* if not, proceed further */
	mov	r0, r8			/* otherwise, return initial sum */

	ldmfd	sp!, {r4-r11, pc}

	/*
	 * Skip phase: walk the chain until the mbuf that contains byte
	 * `off' is found.  After the loads r1 = length, r0 = data pointer
	 * and ip = next mbuf.  Running off the end of the chain is an error.
	 */
.Lin_cksum_skip_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_skip_entry:
	subs	r2, r2, r1		/* offset = offset - mbuf length */
	blt	.Lin_cksum_skip_done	/* if offset has gone negative start with this mbuf */
	cmp	ip, #0x00		/* any mbuf left to skip into? */
	bne	.Lin_cksum_skip_loop
	b	.Lin_cksum_whoops	/* chain exhausted before `off' */

.Lin_cksum_skip_done:
	add	r0, r2, r0		/* data += offset (offset is < 0) */ 
	add	r0, r0, r1		/* data += length of mbuf */
					/* data == start of data to cksum */
	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
	mov	r10, #0x00		/* no bytes summed yet */
	b	.Lin_cksum_entry

	/*
	 * Sum phase: one iteration per mbuf.  The per-mbuf partial sum is
	 * built in r2 and merged into r8 at .Lin_cksum_next.
	 */
.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_entry:
	cmp	r9, r1			/* clamp r1 to the bytes still wanted */
	movlt	r1, r9
	sub	r9, r9, r1		/* remaining -= bytes taken from this mbuf */
	eor	r11, r10, r0		/* bit 0: stream offset parity ^ address parity */
	add	r10, r10, r1		/* bytes summed so far, including this mbuf */
	adds	r2, r1, #0x00		/* r2 = r1; sets Z when there is nothing to sum */

	beq	.Lin_cksum_next		/* empty segment: r2 == 0 is its partial sum */

/*
 * APPLE MODIFICATION
 *
 * Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function
 * inline. This results in slightly faster code, and also permits the whole
 * function to be included in kernel profiling data.
 */

/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *
 * (The code is inlined, so there is no return address: every exit
 * continues at .Lin_cksum_next.)
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 */
	mov	r2, #0

	/* We first have to word-align the buffer.  */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04		/* r7 = bytes (1-3) up to the next word boundary */
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	cmp	r7, #0x02		/* flags select how many bytes to fetch and, below, how to combine them */
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/*
	 * Combine the three bytes depending on endianness and alignment.
	 * The flags still come from "cmp r7, #0x02": eq means exactly two
	 * bytes were fetched (buffer started on an even address), ne means
	 * one or three bytes were fetched (buffer started on an odd address).
	 */
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
	subs	r1, r1, r7		/* Update length */
	beq	.Lin_cksum_next		/* All done? */

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:

#if __ARM_VFP__ >= 3

	/*
	 * NEON path for segments of at least 512 bytes.  The multiple-of-64
	 * part of the length (r3) is summed 64 bytes at a time into the
	 * 64-bit lanes of q4-q7; the low 6 bits of r1 are left for the
	 * scalar code that follows label 9.
	 */
	cmp		r1, #512	// do this if r1 is at least 512
	blt		9f

	EnableVFP

	and		r3, r1, #~0x3f	// r3 = length rounded down to a multiple of 64

	vpush	{q0-q7}			// preserve the NEON registers used below

	// move r2 to s16 (q4) for neon computation
	veor        q4, q4, q4
	vld1.32     {q0-q1}, [r0]!
	vmov        s16, r2
	vld1.32     {q2-q3}, [r0]!

	// pre-decrement size by 128: the 64 bytes loaded above plus the
	// 64 bytes loaded by the next four vld1 instructions
	subs	r3, r3, #0x80

	// vpadal accumulates into q4 (it already holds r2); vpaddl
	// initialises q5-q7, so those need no explicit zeroing
	vpadal.u32  q4, q0
	vld1.32     {q0}, [r0]!
	vpaddl.u32  q5, q1
	vld1.32     {q1}, [r0]!
	vpaddl.u32  q6, q2
	vld1.32     {q2}, [r0]!
	vpaddl.u32  q7, q3
	vld1.32     {q3}, [r0]!

0:
	subs	r3, r3, #0x40		// decrement size by 64

	// the NEON instructions below leave the flags from subs intact
	vpadal.u32  q4, q0
	vld1.32     {q0}, [r0]!
	vpadal.u32  q5, q1
	vld1.32     {q1}, [r0]!
	vpadal.u32  q6, q2
	vld1.32     {q2}, [r0]!
	vpadal.u32  q7, q3
	vld1.32     {q3}, [r0]!

	bgt		0b

	// accumulate the last 64 bytes that were loaded
	vpadal.u32  q4, q0
	vpadal.u32  q5, q1
	vpadal.u32  q6, q2
	vpadal.u32  q7, q3

	// merge the four accumulators into q4, then into d8
	vpadal.u32  q4, q5
	vpadal.u32  q6, q7
	vpadal.u32  q4, q6
	vadd.i64    d8, d9

	// repeatedly add the two 32-bit halves of d8 so that the sum
	// ends up in the low 32 bits (s16)
	vpaddl.u32  d8, d8
	vpaddl.u32  d8, d8
	vpaddl.u32  d8, d8

	vmov    r2, s16

	vpop   {q0-q7}

	ands    r1, r1, #0x3f		// residual bytes
	beq 	.Lin_cksum_next

9:

#endif /* __ARM_VFP__ >= 3 */

	/*
	 * Scalar path: 64 bytes per iteration.  ldmia does not touch the
	 * flags, so the carry chain runs unbroken through each group and is
	 * folded back in by the final "adc r2, r2, #0".
	 */
	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:

	adds	r1, r1, #0x40		/* undo the last subtraction: r1 = bytes left (< 64) */
	beq	.Lin_cksum_next

	cmp	r1, #0x20

	blt	.Lcksumdata_less_than_32
	/* One 32-byte group */
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
	beq	.Lin_cksum_next

.Lcksumdata_less_than_32:
	/*
	 * There are less than 32 bytes left.
	 *
	 * r3 = bytes covered by whole 8-byte groups (0, 8, 16 or 24) and
	 * r4 = (0x18 - r3) * 1.5, i.e. 12 bytes of code for every group
	 * that is absent.  The computed jump below skips that much code.
	 * This relies on every instruction being 4 bytes and on pc reading
	 * as the address of the current instruction + 8; the nop pads the
	 * first group to four instructions so that the targets line up.
	 * When r4 is 0 the jump is not taken and all three groups run.
	 * Do not insert or remove instructions between here and the
	 * "Less than 8 bytes remaining" code without redoing this arithmetic.
	 */
	and	r3, r1, #0x18
	rsb	r4, r3, #0x18
	sub	r1, r1, r3
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	addne	pc, pc, r4

/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	nop
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04
	beq	.Lin_cksum_next

	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrbgt	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01		/* eq: first byte is at an even address */
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
	adds	r2, r2, r3
	adc	r2, r2, #0x00

	/*
	 * Merge this mbuf's partial sum (r2) into the running sum (r8),
	 * rotating it by 8 bits first when bit 0 of r11 is set, then move
	 * on to the next mbuf if there is one.
	 */
.Lin_cksum_next:
	tst	r11, #0x01
	movne	r2, r2, ror #8
	adds	r8, r8, r2
	adc	r8, r8, #0x00
	cmp	ip, #00
	bne	.Lin_cksum_loop

	/* Fold the 32-bit sum in r8 into 16 bits; r1 = 0xffff */
	mov	r1, #0xff
	orr	r1, r1, #0xff00
	and	r0, r8, r1
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16
	and	r0, r0, r1
	/*
	 * If we were to 1's complement it (XOR with 0xffff):
	 *
	 * eor	r0, r0, r1
	 */

	ldmfd	sp!, {r4-r11, pc}

	/*
	 * Error path: the chain ended before `off' bytes could be skipped.
	 * Report it through CKSUM_ERR and return -1.
	 */
.Lin_cksum_whoops:
	adr	r0, .Lin_cksum_whoops_str
	bl	#CKSUM_ERR
	mov	r0, #-1

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5
Commit	Line	Data
5ba3f43e	1	/*
a39ff7e2	2	* Copyright (c) 2009-2018 Apple Inc. All rights reserved.
5ba3f43e	3	*
a39ff7e2	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5ba3f43e	5	*
a39ff7e2 A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
5ba3f43e A	27	*/
	28
	29	/* $NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $ */
	30
	31	/*
	32	* Copyright 2003 Wasabi Systems, Inc.
	33	* All rights reserved.
	34	*
	35	* Written by Steve C. Woodford for Wasabi Systems, Inc.
	36	*
	37	* Redistribution and use in source and binary forms, with or without
	38	* modification, are permitted provided that the following conditions
	39	* are met:
	40	* 1. Redistributions of source code must retain the above copyright
	41	* notice, this list of conditions and the following disclaimer.
	42	* 2. Redistributions in binary form must reproduce the above copyright
	43	* notice, this list of conditions and the following disclaimer in the
	44	* documentation and/or other materials provided with the distribution.
	45	* 3. All advertising materials mentioning features or use of this software
	46	* must display the following acknowledgement:
	47	* This product includes software developed for the NetBSD Project by
	48	* Wasabi Systems, Inc.
	49	* 4. The name of Wasabi Systems, Inc. may not be used to endorse
	50	* or promote products derived from this software without specific prior
	51	* written permission.
	52	*
	53	* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
	54	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
	55	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	56	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
	57	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	58	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	59	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	60	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	61	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	62	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	63	* POSSIBILITY OF SUCH DAMAGE.
	64	*/
	65
	66	#ifdef KERNEL
	67	#include "../../../osfmk/arm/arch.h"
	68	#include "../../../osfmk/arm/proc_reg.h"
	69
	70	#if __ARM_VFP__ < 3
	71	#error "Unsupported: __ARM_VFP__ < 3"
	72	#endif /* __ARM_VFP__ < 3 */
	73	#define CKSUM_ERR _kprintf
	74	#else /* !KERNEL */
	75	#ifndef LIBSYSCALL_INTERFACE
	76	#error "LIBSYSCALL_INTERFACE not defined"
	77	#endif /* !LIBSYSCALL_INTERFACE */
	78	#define CKSUM_ERR _fprintf_stderr
	79	#define __ARM_VFP__ 3
	80	#endif /* !KERNEL */
	81
	82	/*
	83	* The following default the implementation to little-endian architectures.
	84	*/
	85	#define LITTLE_ENDIAN 1
	86	#define BYTE_ORDER LITTLE_ENDIAN
	87
	88	.syntax unified
	89
	90	/*
91	* XXX: adi@apple.com:
92	*
93	* Ugly, but we have little choice, since relying on genassym and <assym.s>
94	* is not possible unless this code lives in osfmk. Note also that this
95	* routine expects "mbuf-like" argument, and it does not expect the mbuf to be
96	* authentic; it only cares about 3 fields.
97	*/
98	#define M_NEXT 0
99	#define M_DATA 8
100	#define M_LEN 12
101
102	/*
103	* APPLE MODIFICATION
104	*
105	* The use of R7 in this code as data register prevents
106	* the use of debugging or instrumentation tools, which is an acceptable
107	* tradeoff considering the potential gain in performance.
108	*/
109
110	/*
111	* Hand-optimised implementations for ARM/Xscale
112	*/
113
114	.macro EnableVFP
115	#ifdef KERNEL
116	push {r0, r1, r2, r12}
117	bl _enable_kernel_vfp_context
118	pop {r0, r1, r2, r12}
119	#endif /* KERNEL */
120	.endm
121
122
123	/*
124	* uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
125	* uint32_t initial_sum);
126	*
127	* Entry:
128	* r0 m
129	* r1 len
130	* r2 off
131	* r3 initial_sum
132	*
133	* Function wide register usage
134	* r8 accumulated sum
135	* r9 remaining length to parse
136	* ip pointer to next mbuf
137	*
138	* This function returns the partial 16-bit checksum accumulated in
139	* a 32-bit variable (withouth 1's complement); caller is responsible
140	* for folding the 32-bit sum into 16-bit and performinng the 1's
141	* complement if applicable
142	*/
143	.globl _os_cpu_in_cksum_mbuf
144	.text
145	.align 4
146	_os_cpu_in_cksum_mbuf:
147	stmfd sp!, {r4-r11,lr}
148
149	mov r8, r3 /* Accumulate sum in r8 */
150	mov r9, r1 /* save len in r9 */
151	mov ip, r0 /* set ip to the current mbuf */
152
153	cmp r9, #0 /* length is 0? */
154	bne .Lin_cksum_skip_loop /* if not, proceed further */
155	mov r0, r8 /* otherwise, return initial sum */
156
157	ldmfd sp!, {r4-r11, pc}
158
159	.Lin_cksum_skip_loop:
160	ldr r1, [ip, #(M_LEN)]
161	ldr r0, [ip, #(M_DATA)]
162	ldr ip, [ip, #(M_NEXT)]
163	.Lin_cksum_skip_entry:
164	subs r2, r2, r1 /* offset = offset - mbuf length */
165	blt .Lin_cksum_skip_done /* if offset has gone negative start with this mbuf */
166	cmp ip, #0x00
167	bne .Lin_cksum_skip_loop
168	b .Lin_cksum_whoops
169
170	.Lin_cksum_skip_done:
171	add r0, r2, r0 /* data += offset (offset is < 0) */
172	add r0, r0, r1 /* data += length of mbuf */
173	/* data == start of data to cksum */
174	rsb r1, r2, #0x00 /* length = remainder of mbuf to read */
175	mov r10, #0x00
176	b .Lin_cksum_entry
177
178	.Lin_cksum_loop:
179	ldr r1, [ip, #(M_LEN)]
180	ldr r0, [ip, #(M_DATA)]
181	ldr ip, [ip, #(M_NEXT)]
182	.Lin_cksum_entry:
183	cmp r9, r1
184	movlt r1, r9
185	sub r9, r9, r1
186	eor r11, r10, r0
187	add r10, r10, r1
188	adds r2, r1, #0x00
189
190	beq .Lin_cksum_next
191
192	/*
193	* APPLE MODIFICATION
194	*
195	* Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function
196	* inline. This results in slightly faster code, and also permits the whole
197	* function to be included in kernel profiling data.
198	*/
199
200	/*
201	* The main in*_cksum() workhorse...
202	*
203	* Entry parameters:
204	* r0 Pointer to buffer
205	* r1 Buffer length
206	* lr Return address
207	*
208	* Returns:
209	* r2 Accumulated 32-bit sum
210	*
211	* Clobbers:
212	* r0-r7
213	*/
214	mov r2, #0
215
216	/* We first have to word-align the buffer. */
217	ands r7, r0, #0x03
218	beq .Lcksumdata_wordaligned
219	rsb r7, r7, #0x04
220	cmp r1, r7 /* Enough bytes left to make it? */
221	blt .Lcksumdata_endgame
222	cmp r7, #0x02
223	ldrb r4, [r0], #0x01 /* Fetch 1st byte */
224	ldrbge r5, [r0], #0x01 /* Fetch 2nd byte */
225	movlt r5, #0x00
226	ldrbgt r6, [r0], #0x01 /* Fetch 3rd byte */
227	movle r6, #0x00
228	/* Combine the three bytes depending on endianness and alignment */
229	#if BYTE_ORDER != LITTLE_ENDIAN
230	orreq r2, r5, r4, lsl #8
231	orreq r2, r2, r6, lsl #24
232	orrne r2, r4, r5, lsl #8
233	orrne r2, r2, r6, lsl #16
234	#else
235	orreq r2, r4, r5, lsl #8
236	orreq r2, r2, r6, lsl #16
237	orrne r2, r5, r4, lsl #8
238	orrne r2, r2, r6, lsl #24
239	#endif
240	subs r1, r1, r7 /* Update length */
241	beq .Lin_cksum_next /* All done? */
242
243	/* Buffer is now word aligned */
244	.Lcksumdata_wordaligned:
245
246	#if __ARM_VFP__ >= 3
247
248	cmp r1, #512 // do this if r1 is at least 512
249	blt 9f
250
251	EnableVFP
252
253	and r3, r1, #~0x3f
254
255	vpush {q0-q7}
256
257	// move r2 to s16 (q4) for neon computation
258	veor q4, q4, q4
259	vld1.32 {q0-q1}, [r0]!
260	vmov s16, r2
261	vld1.32 {q2-q3}, [r0]!
262
263	// pre-decrement size by 64
264	subs r3, r3, #0x80
265
266	vpadal.u32 q4, q0
267	vld1.32 {q0}, [r0]!
268	vpaddl.u32 q5, q1
269	vld1.32 {q1}, [r0]!
270	vpaddl.u32 q6, q2
271	vld1.32 {q2}, [r0]!
272	vpaddl.u32 q7, q3
273	vld1.32 {q3}, [r0]!
274
275	0:
276	subs r3, r3, #0x40 // decrement size by 64
277
278	vpadal.u32 q4, q0
279	vld1.32 {q0}, [r0]!
280	vpadal.u32 q5, q1
281	vld1.32 {q1}, [r0]!
282	vpadal.u32 q6, q2
283	vld1.32 {q2}, [r0]!
284	vpadal.u32 q7, q3
285	vld1.32 {q3}, [r0]!
286
287	bgt 0b
288
289	vpadal.u32 q4, q0
290	vpadal.u32 q5, q1
291	vpadal.u32 q6, q2
292	vpadal.u32 q7, q3
293
294	vpadal.u32 q4, q5
295	vpadal.u32 q6, q7
296	vpadal.u32 q4, q6
297	vadd.i64 d8, d9
298
299	vpaddl.u32 d8, d8
300	vpaddl.u32 d8, d8
301	vpaddl.u32 d8, d8
302
303	vmov r2, s16
304
305	vpop {q0-q7}
306
307	ands r1, r1, #0x3f // residual bytes
308	beq .Lin_cksum_next
309
310	9:
311
312	#endif /* __ARM_VFP__ >= 3 */
313
314	subs r1, r1, #0x40
315	blt .Lcksumdata_bigloop_end
316
317	.Lcksumdata_bigloop:
318	ldmia r0!, {r3, r4, r5, r6}
319	adds r2, r2, r3
320	adcs r2, r2, r4
321	adcs r2, r2, r5
322	ldmia r0!, {r3, r4, r5, r7}
323	adcs r2, r2, r6
324	adcs r2, r2, r3
325	adcs r2, r2, r4
326	adcs r2, r2, r5
327	ldmia r0!, {r3, r4, r5, r6}
328	adcs r2, r2, r7
329	adcs r2, r2, r3
330	adcs r2, r2, r4
331	adcs r2, r2, r5
332	ldmia r0!, {r3, r4, r5, r7}
333	adcs r2, r2, r6
334	adcs r2, r2, r3
335	adcs r2, r2, r4
336	adcs r2, r2, r5
337	adcs r2, r2, r7
338	adc r2, r2, #0x00
339	subs r1, r1, #0x40
340	bge .Lcksumdata_bigloop
341	.Lcksumdata_bigloop_end:
342
343	adds r1, r1, #0x40
344	beq .Lin_cksum_next
345
346	cmp r1, #0x20
347
348	blt .Lcksumdata_less_than_32
349	ldmia r0!, {r3, r4, r5, r6}
350	adds r2, r2, r3
351	adcs r2, r2, r4
352	adcs r2, r2, r5
353	ldmia r0!, {r3, r4, r5, r7}
354	adcs r2, r2, r6
355	adcs r2, r2, r3
356	adcs r2, r2, r4
357	adcs r2, r2, r5
358	adcs r2, r2, r7
359	adc r2, r2, #0x00
360	subs r1, r1, #0x20
361	beq .Lin_cksum_next
362
363	.Lcksumdata_less_than_32:
364	/* There are less than 32 bytes left */
365	and r3, r1, #0x18
366	rsb r4, r3, #0x18
367	sub r1, r1, r3
368	adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */
369	addne pc, pc, r4
370
371	/*
372	* Note: We use ldm here, even on Xscale, since the combined issue/result
373	* latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
374	*/
375	/* At least 24 bytes remaining... */
376	ldmia r0!, {r4, r5}
377	nop
378	adcs r2, r2, r4
379	adcs r2, r2, r5
380
381	/* At least 16 bytes remaining... */
382	ldmia r0!, {r4, r5}
383	adcs r2, r2, r4
384	adcs r2, r2, r5
385
386	/* At least 8 bytes remaining... */
387	ldmia r0!, {r4, r5}
388	adcs r2, r2, r4
389	adcs r2, r2, r5
390
391	/* Less than 8 bytes remaining... */
392	adc r2, r2, #0x00
393	subs r1, r1, #0x04
394	blt .Lcksumdata_lessthan4
395
396	ldr r4, [r0], #0x04
397	sub r1, r1, #0x04
398	adds r2, r2, r4
399	adc r2, r2, #0x00
400
401	/* Deal with < 4 bytes remaining */
402	.Lcksumdata_lessthan4:
403	adds r1, r1, #0x04
404	beq .Lin_cksum_next
405
406	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
407	.Lcksumdata_endgame:
408	ldrb r3, [r0] /* Fetch first byte */
409	cmp r1, #0x02
410	ldrbge r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */
411	movlt r4, #0x00
412	ldrbgt r5, [r0, #0x02]
413	movle r5, #0x00
414	/* Combine the three bytes depending on endianness and alignment */
415	tst r0, #0x01
416	#if BYTE_ORDER != LITTLE_ENDIAN
417	orreq r3, r4, r3, lsl #8
418	orreq r3, r3, r5, lsl #24
419	orrne r3, r3, r4, lsl #8
420	orrne r3, r3, r5, lsl #16
421	#else
422	orreq r3, r3, r4, lsl #8
423	orreq r3, r3, r5, lsl #16
424	orrne r3, r4, r3, lsl #8
425	orrne r3, r3, r5, lsl #24
426	#endif
427	adds r2, r2, r3
428	adc r2, r2, #0x00
429
430	.Lin_cksum_next:
431	tst r11, #0x01
432	movne r2, r2, ror #8
433	adds r8, r8, r2
434	adc r8, r8, #0x00
435	cmp ip, #00
436	bne .Lin_cksum_loop
437
438	mov r1, #0xff
439	orr r1, r1, #0xff00
440	and r0, r8, r1
441	add r0, r0, r8, lsr #16
442	add r0, r0, r0, lsr #16
443	and r0, r0, r1
444	/*
445	* If we were to 1's complement it (XOR with 0xffff):
446	*
447	* eor r0, r0, r1
448	*/
449
450	ldmfd sp!, {r4-r11, pc}
451
452	.Lin_cksum_whoops:
453	adr r0, .Lin_cksum_whoops_str
454	bl #CKSUM_ERR
455	mov r0, #-1
456
457	ldmfd sp!, {r4-r11, pc}
458
459	.Lin_cksum_whoops_str:
460	.asciz "os_cpu_in_cksum_mbuf: out of data\n"
461	.align 5