/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO.  This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c.
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */

#ifdef KERNEL
#define CKSUM_ERR _kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define CKSUM_ERR _fprintf_stderr
#endif /* !KERNEL */

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
 * to be authentic; it only cares about 3 fields.
 */
#if defined(__LP64__)
#define M_NEXT  0
#define M_DATA  16      // 8-byte address, would be aligned to 8-byte boundary
#define M_LEN   24
#else
#define M_NEXT  0
#define M_DATA  8
#define M_LEN   12
#endif
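
/*
 * For illustration, the offsets above correspond to a layout along these
 * lines (a sketch only; the field names below are placeholders rather than
 * the real struct mbuf definition, and the second pointer slot simply
 * accounts for the gap between M_NEXT and M_DATA):
 *
 *      struct mbuf_like {
 *              struct mbuf_like *m_next;       // at M_NEXT: next buffer in chain
 *              void             *m_unused;     // pointer-sized gap before M_DATA
 *              uint8_t          *m_data;       // at M_DATA: start of payload
 *              int32_t           m_len;        // at M_LEN: payload bytes in this buffer
 *      };
 */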

        .globl  _os_cpu_in_cksum_mbuf
        .text
        .align  4
_os_cpu_in_cksum_mbuf:


/*
 * 64-bit version.
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); the caller is responsible
 * for folding the 32-bit sum into 16 bits and performing the 1's
 * complement if applicable.
 */
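
/*
 * A sketch of the caller-side folding described above (illustrative only;
 * the actual folding lives in the C checksum code that calls this routine):
 *
 *      uint32_t sum = os_cpu_in_cksum_mbuf(m, len, off, 0);
 *      sum = (sum >> 16) + (sum & 0xffff);     // fold 32 bits into 17
 *      sum = (sum >> 16) + (sum & 0xffff);     // fold the possible carry
 *      uint16_t cksum = ~sum & 0xffff;         // 1's complement, if applicable
 */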

/*
 * uint32_t
 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
 * {
 *      int mlen;
 *      uint64_t sum, partial;
 *      unsigned int final_acc;
 *      uint8_t *data;
 *      boolean_t needs_swap, started_on_odd;
 *
 *      VERIFY(len >= 0);
 *      VERIFY(off >= 0);
 *
 *      needs_swap = FALSE;
 *      started_on_odd = FALSE;
 *      sum = initial_sum;
 */

#define m               x0
#define len             x1
#define off             x2
#define sum             x3
#define needs_swap      x4
#define started_on_odd  x5
#define mlen            x6
#define Wmlen           w6
#define t               x7
#define data            x8
#if defined(__LP64__)
#define ptr_m           x0
#define ptr_data        x8
#else
#define ptr_m           w0
#define ptr_data        w8
#endif


        mov     needs_swap, #0          // needs_swap = FALSE;
        mov     started_on_odd, #0      // started_on_odd = FALSE;
        mov     w3, w3                  // zero-extend initial_sum (clear upper 32 bits of x3)


/*
 *      for (;;) {
 *              if (PREDICT_FALSE(m == NULL)) {
 *                      CKSUM_ERR("%s: out of data\n", __func__);
 *                      return (-1);
 *              }
 *              mlen = m->m_len;
 *              if (mlen > off) {
 *                      mlen -= off;
 *                      data = mtod(m, uint8_t *) + off;
 *                      goto post_initial_offset;
 *              }
 *              off -= mlen;
 *              if (len == 0)
 *                      break;
 *              m = m->m_next;
 *      }
 */

0:
        cbz     m, Lin_cksum_whoops     // if (m == NULL) return -1;
        ldr     Wmlen, [m, #M_LEN]      // mlen = m->m_len;
        cmp     mlen, off
        b.le    1f                      // if (mlen <= off), consume this mbuf against off
        ldr     ptr_data, [m, #M_DATA]  // mtod(m, uint8_t *)
        sub     mlen, mlen, off         // mlen -= off;
        add     data, data, off         // data = mtod(m, uint8_t *) + off;
        b       L_post_initial_offset
1:
        sub     off, off, mlen          // off -= mlen;
        cbnz    len, 2f
        mov     x0, x3                  // len == 0: return accumulated sum
        ret     lr
2:
        ldr     ptr_m, [m, #M_NEXT]     // m = m->m_next;
        b       0b

L_loop: // for (; len > 0; m = m->m_next) {
/*
 *      if (PREDICT_FALSE(m == NULL)) {
 *              CKSUM_ERR("%s: out of data\n", __func__);
 *              return (-1);
 *      }
 *      mlen = m->m_len;
 *      data = mtod(m, uint8_t *);
 */
        cbz     m, Lin_cksum_whoops     // if (m == NULL) return -1;
        ldr     Wmlen, [m, #M_LEN]      // mlen = m->m_len;
        ldr     ptr_data, [m, #M_DATA]  // mtod(m, uint8_t *)

L_post_initial_offset:
/*
 *      if (mlen == 0) continue;
 *      if (mlen > len) mlen = len;
 *      len -= mlen;
 */

        cbz     mlen, L_continue
        cmp     mlen, len
        csel    mlen, mlen, len, le     // mlen = min(mlen, len)
        sub     len, len, mlen          // len -= mlen;

/*
 *      partial = 0;
 *      if ((uintptr_t)data & 1) {
 *              started_on_odd = !started_on_odd;
 *              partial = *data << 8;
 *              ++data;
 *              --mlen;
 *      }
 *      needs_swap = started_on_odd;
 */

        tst     data, #1
        mov     x7, #0                  // partial = 0
        mov     x10, #0                 // x10 = 0 (zero source for carry adds)
        b.eq    1f
        ldrb    w7, [data], #1          // partial = *data++
        eor     started_on_odd, started_on_odd, #1
        sub     mlen, mlen, #1          // --mlen
        lsl     w7, w7, #8              // partial <<= 8
1:


/*
 *      if ((uintptr_t)data & 2) {
 *              if (mlen < 2)
 *                      goto trailing_bytes;
 *              partial += *(uint16_t *)(void *)data;
 *              data += 2;
 *              mlen -= 2;
 *      }
 */
        tst     data, #2
        mov     needs_swap, started_on_odd
        b.eq    1f
        cmp     mlen, #2
        b.lt    L_trailing_bytes
        ldrh    w9, [data], #2          // partial += *(uint16_t *)data; data += 2
        sub     mlen, mlen, #2
        add     w7, w7, w9
1:

/*
 *      if ((uintptr_t)data & 4) {
 *              if (mlen < 4)
 *                      goto L2_bytes;
 *              partial += *(uint32_t *)(void *)data;
 *              data += 4;
 *              mlen -= 4;
 *      }
 */
        // align on 8-byte boundary if applicable
        tst     data, #4
        b.eq    1f
        cmp     mlen, #4
        b.lt    L2_bytes
        ldr     w9, [data], #4          // partial += *(uint32_t *)data; data += 4
        sub     mlen, mlen, #4
        adds    w7, w7, w9
        adc     x7, x7, x10             // assumes x10 is still #0 as set above
1:

/*
 *      while (mlen >= 64) {
 *              __builtin_prefetch(data + 32);
 *              __builtin_prefetch(data + 64);
 *              partial += *(uint32_t *)(void *)data;
 *              partial += *(uint32_t *)(void *)(data + 4);
 *              partial += *(uint32_t *)(void *)(data + 8);
 *              partial += *(uint32_t *)(void *)(data + 12);
 *              partial += *(uint32_t *)(void *)(data + 16);
 *              partial += *(uint32_t *)(void *)(data + 20);
 *              partial += *(uint32_t *)(void *)(data + 24);
 *              partial += *(uint32_t *)(void *)(data + 28);
 *              partial += *(uint32_t *)(void *)(data + 32);
 *              partial += *(uint32_t *)(void *)(data + 36);
 *              partial += *(uint32_t *)(void *)(data + 40);
 *              partial += *(uint32_t *)(void *)(data + 44);
 *              partial += *(uint32_t *)(void *)(data + 48);
 *              partial += *(uint32_t *)(void *)(data + 52);
 *              partial += *(uint32_t *)(void *)(data + 56);
 *              partial += *(uint32_t *)(void *)(data + 60);
 *              data += 64;
 *              mlen -= 64;
 *              // if (PREDICT_FALSE(partial & (3ULL << 62))) {
 *              //      if (needs_swap)
 *              //              partial = (partial << 8) +
 *              //                  (partial >> 56);
 *              //      sum += (partial >> 32);
 *              //      sum += (partial & 0xffffffff);
 *              //      partial = 0;
 *              // }
 *      }
 */
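
/*
 * The vector code below accumulates the 16 32-bit words of each 64-byte
 * block into 64-bit lanes with pairwise add-and-accumulate (uadalp); four
 * vector accumulators (8 64-bit lanes) are used so loads and accumulations
 * can overlap.  A rough intrinsics equivalent, for illustration only
 * (assumes <arm_neon.h>; the shipped implementation is the hand-scheduled
 * assembly that follows):
 *
 *      uint64x2_t a0 = vdupq_n_u64(0), a1 = a0, a2 = a0, a3 = a0;
 *      while (mlen >= 64) {
 *              a0 = vpadalq_u32(a0, vld1q_u32((const uint32_t *)(data +  0)));
 *              a1 = vpadalq_u32(a1, vld1q_u32((const uint32_t *)(data + 16)));
 *              a2 = vpadalq_u32(a2, vld1q_u32((const uint32_t *)(data + 32)));
 *              a3 = vpadalq_u32(a3, vld1q_u32((const uint32_t *)(data + 48)));
 *              data += 64;
 *              mlen -= 64;
 *      }
 *      a0 = vaddq_u64(vaddq_u64(a0, a1), vaddq_u64(a2, a3));
 *      partial += vgetq_lane_u64(a0, 0) + vgetq_lane_u64(a0, 1);
 */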

        // pre-decrement mlen by 64; if fewer than 64 bytes remain, try 32 bytes next
        subs    mlen, mlen, #64
        b.lt    L32_bytes

        // save used vector registers
        sub     sp, sp, #8*16
        mov     x11, sp
        st1.4s  {v0, v1, v2, v3}, [x11], #4*16
        st1.4s  {v4, v5, v6, v7}, [x11], #4*16

        // spread partial into 8 8-byte registers in v0-v3
        fmov    s3, w7                  // seed v3 lane 0 with partial
        eor.16b v0, v0, v0
        eor.16b v1, v1, v1
        eor.16b v2, v2, v2

        // load the 1st 64 bytes (16 32-bit words)
        ld1.4s  {v4, v5, v6, v7}, [data], #64

        // branch to finish off if mlen < 64
        subs    mlen, mlen, #64
        b.lt    L64_finishup

/*
 * loop for loading and accumulating 16 32-bit words into
 * 8 8-byte accumulators per iteration.
 */
L64_loop:
        subs    mlen, mlen, #64         // mlen -= 64

        uadalp.2d       v0, v4
        ld1.4s  {v4}, [data], #16

        uadalp.2d       v1, v5
        ld1.4s  {v5}, [data], #16

        uadalp.2d       v2, v6
        ld1.4s  {v6}, [data], #16

        uadalp.2d       v3, v7
        ld1.4s  {v7}, [data], #16

        b.ge    L64_loop

L64_finishup:
        uadalp.2d       v0, v4
        uadalp.2d       v1, v5
        uadalp.2d       v2, v6
        uadalp.2d       v3, v7

        add.2d  v0, v0, v1
        add.2d  v2, v2, v3
        addp.2d d0, v0
        addp.2d d2, v2
        add.2d  v0, v0, v2
        fmov    x7, d0                  // partial in x7 now

        // restore used vector registers
        ld1.4s  {v0, v1, v2, v3}, [sp], #4*16
        ld1.4s  {v4, v5, v6, v7}, [sp], #4*16

L32_bytes:
        // mlen is biased by a multiple of -64 here; that leaves bits 0-5
        // intact, so the tst checks below still see the true residual length
        tst     mlen, #32
        b.eq    L16_bytes
        ldp     x9, x10, [data], #16
        ldp     x11, x12, [data], #16
        adds    x7, x7, x9
        mov     x9, #0
        adcs    x7, x7, x10
        adcs    x7, x7, x11
        adcs    x7, x7, x12
        adc     x7, x7, x9

L16_bytes:
        tst     mlen, #16
        b.eq    L8_bytes
        ldp     x9, x10, [data], #16
        adds    x7, x7, x9
        mov     x9, #0
        adcs    x7, x7, x10
        adc     x7, x7, x9

L8_bytes:
        tst     mlen, #8
        mov     x10, #0
        b.eq    L4_bytes
        ldr     x9, [data], #8
        adds    x7, x7, x9
        adc     x7, x7, x10

L4_bytes:
        tst     mlen, #4
        b.eq    L2_bytes
        ldr     w9, [data], #4
        adds    x7, x7, x9
        adc     x7, x7, x10

L2_bytes:
        tst     mlen, #2
        b.eq    L_trailing_bytes
        ldrh    w9, [data], #2
        adds    x7, x7, x9
        adc     x7, x7, x10

L_trailing_bytes:
        tst     mlen, #1
        b.eq    L0_bytes
        ldrb    w9, [data], #1
        adds    x7, x7, x9
        adc     x7, x7, x10
        eor     started_on_odd, started_on_odd, #1

L0_bytes:
/*
 *      if (needs_swap)
 *              partial = (partial << 8) + (partial >> 56);
 */
        cbz     needs_swap, 1f
        ror     x7, x7, #56             // rotate right by 56 == (partial << 8) | (partial >> 56)
1:
/*
 *      sum += (partial >> 32) + (partial & 0xffffffff);
 *      sum = (sum >> 32) + (sum & 0xffffffff);
 * }
 */

        add     x3, x3, x7, lsr #32     // sum += partial >> 32
        mov     w7, w7                  // partial &= 0xffffffff
        add     x3, x3, x7              // sum += partial & 0xffffffff
        mov     w7, w3                  // w7 = sum & 0xffffffff
        add     x3, x7, x3, lsr #32     // sum = (sum >> 32) + (sum & 0xffffffff)

L_continue:
        cmp     len, #0
        ldr     ptr_m, [m, #M_NEXT]     // m = m->m_next
        b.gt    L_loop

/*
 *      final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
 *          ((sum >> 16) & 0xffff) + (sum & 0xffff);
 *      final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *      final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *      return (final_acc & 0xffff);
 * }
 */
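
/*
 * Worked example of the fold above (illustrative values only): for
 * sum = 0x000123456789abcd,
 *      final_acc = 0x0001 + 0x2345 + 0x6789 + 0xabcd = 0x1369c
 *      final_acc = 0x0001 + 0x369c = 0x369d
 *      final_acc = 0x0000 + 0x369d = 0x369d
 * so 0x369d is returned, still without the 1's complement.
 */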

        mov     w4, #0x00ffff           // 16-bit mask
        and     x0, x4, x3, lsr #48
        and     x1, x4, x3, lsr #32
        and     x2, x4, x3, lsr #16
        and     x3, x4, x3
        add     w0, w0, w1
        add     w2, w2, w3
        add     w0, w0, w2
        and     w1, w4, w0, lsr #16
        and     w0, w4, w0
        add     w0, w0, w1
        and     w1, w4, w0, lsr #16
        and     w0, w4, w0
        add     w0, w0, w1
/*
 * If we were to 1's complement it (XOR with 0xffff):
 *
 *      eor     w0, w0, w4
 */
        and     w0, w0, w4

        ret     lr

Lin_cksum_whoops:
        adrp    x0, Lin_cksum_whoops_str@page
        add     x0, x0, Lin_cksum_whoops_str@pageoff
        bl      #CKSUM_ERR
        mov     x0, #-1
        ret     lr

Lin_cksum_whoops_str:
        .asciz  "os_cpu_in_cksum_mbuf: out of data\n"
        .align  5