/*
 * Source provenance: Apple XNU (xnu-4903.241.1), bsd/dev/arm64/cpu_in_cksum.s
 * (recovered from a git-blame web view; blame line numbers stripped)
 */
1/*
2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
31 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
32 * on the 64-bit part in netinet/cpu_in_cksum.c
33 *
34 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
35 */
36
#ifdef KERNEL
#define CKSUM_ERR _kprintf		/* in-kernel builds report errors via kprintf */
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define CKSUM_ERR _fprintf_stderr	/* userland (Libsyscall) builds report via stderr */
#endif /* !KERNEL */
45
46/*
47 * XXX: adi@apple.com:
48 *
49 * Ugly, but we have little choice, since relying on genassym and <assym.s>
50 * is not possible unless this code lives in osfmk. Note also that this
51 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
52 * authentic; it only cares about 3 fields.
53 */
54#if defined(__LP64__)
55#define M_NEXT 0
56#define M_DATA 16 // 8-byte address, would be aligned to 8-byte boundary
57#define M_LEN 24
58#else
59#define M_NEXT 0
60#define M_DATA 8
61#define M_LEN 12
62#endif
63
	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:


/*
 * 64-bit version.
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (withouth 1's complement); caller is responsible
 * for folding the 32-bit sum into 16-bit and performinng the 1's
 * complement if applicable
 */

/*
 * Reference C implementation this assembly follows
 * (see netinet/cpu_in_cksum.c):
 *
 * uint32_t
 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
 * {
 *	int mlen;
 *	uint64_t sum, partial;
 *	unsigned int final_acc;
 *	uint8_t *data;
 *	boolean_t needs_swap, started_on_odd;
 *
 *	VERIFY(len >= 0);
 *	VERIFY(off >= 0);
 *
 *	needs_swap = FALSE;
 *	started_on_odd = FALSE;
 *	sum = initial_sum;
 */

	/*
	 * Register roles (AAPCS64: args arrive in x0-x3).
	 * x7 holds the per-mbuf "partial" sum; x9-x12 are scratch.
	 */
	#define m x0
	#define len x1
	#define off x2
	#define sum x3
	#define needs_swap x4
	#define started_on_odd x5
	#define mlen x6
	#define Wmlen w6
	#define t x7
	#define data x8
#if defined(__LP64__)
	#define ptr_m x0
	#define ptr_data x8
#else
	#define ptr_m w0
	#define ptr_data w8
#endif


	mov	needs_swap, #0		// needs_swap = FALSE;
	mov	started_on_odd, #0	// started_on_odd = FALSE;
	mov	w3, w3			// clear higher half of initial_sum
	/*
	 * NOTE(review): only w3 is explicitly zero-extended here, yet len (x1)
	 * and off (x2) are also used as full 64-bit registers below (cmp/cbnz).
	 * This assumes the caller left the upper 32 bits of those int args
	 * zeroed — TODO confirm against the ABI/callers.
	 */


/*
 * Initial walk of the mbuf chain: skip "off" bytes, locating the first
 * byte to checksum.
 *
 *	for (;;) {
 *		if (PREDICT_FALSE(m == NULL)) {
 *			CKSUM_ERR("%s: out of data\n", __func__);
 *			return (-1);
 *		}
 *		mlen = m->m_len;
 *		if (mlen > off) {
 *			mlen -= off;
 *			data = mtod(m, uint8_t *) + off;
 *			goto post_initial_offset;
 *		}
 *		off -= mlen;
 *		if (len == 0)
 *			break;
 *		m = m->m_next;
 *	}
 */

0:
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len; (32-bit load zero-extends)
	cmp	mlen, off		// does the offset land inside this mbuf?
	b.le	1f
	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)
	sub	mlen, mlen, off		// mlen -= off;
	add	data, data, off		// data = mtod(m, uint8_t *) + off;
	b	L_post_initial_offset
1:
	sub	off, off, mlen		// consume this whole mbuf: off -= mlen
	cbnz	len, 2f			// more to checksum? keep walking
	mov	x0, x3			// len == 0: return accumulated sum
	ret	lr
2:
	ldr	ptr_m, [m, #M_NEXT]	// m = m->m_next
	b	0b

L_loop: // for (; len > 0; m = m->m_next) {
/*
 *	if (PREDICT_FALSE(m == NULL)) {
 *		CKSUM_ERR("%s: out of data\n", __func__);
 *		return (-1);
 *	}
 *	mlen = m->m_len;
 *	data = mtod(m, uint8_t *);
 */
	cbz	m, Lin_cksum_whoops	// if (m == NULL) return -1;
	ldr	Wmlen, [m, #M_LEN]	// mlen = m->m_len;
	ldr	ptr_data, [m, #M_DATA]	// mtod(m, uint8_t *)

L_post_initial_offset:
/*
 *	if (mlen == 0) continue;
 *	if (mlen > len) mlen = len;
 *	len -= mlen;
 */

	cbz	mlen, L_continue	// empty mbuf: skip to next in chain
	cmp	mlen, len
	csel	mlen, mlen, len, le	// mlen = MIN(mlen, len)
	sub	len, len, mlen

/*
 * Align "data" to a 2-byte boundary, tracking odd-byte starts so the
 * final byte-swap correction can be applied.
 *
 *	partial = 0;
 *	if ((uintptr_t)data & 1) {
 *		started_on_odd = !started_on_odd;
 *		partial = *data << 8;
 *		++data;
 *		--mlen;
 *	}
 *	needs_swap = started_on_odd;
 */

	tst	data, #1
	mov	x7, #0			// partial = 0
	mov	x10, #0			// x10 = 0, used as adc source in the tails below
	b.eq	1f
	ldrb	w7, [data], #1		// consume the leading odd byte
	eor	started_on_odd, started_on_odd, #1
	sub	mlen, mlen, #1
	lsl	w7, w7, #8		// odd-start byte occupies the high lane
1:


/*
 * Align "data" to a 4-byte boundary.
 *
 *	if ((uintptr_t)data & 2) {
 *		if (mlen < 2)
 *			goto trailing_bytes;
 *		partial += *(uint16_t *)(void *)data;
 *		data += 2;
 *		mlen -= 2;
 *	}
 */
	tst	data, #2
	mov	needs_swap, started_on_odd
	b.eq	1f
	cmp	mlen, #2
	b.lt	L_trailing_bytes
	ldrh	w9, [data], #2
	sub	mlen, mlen, #2
	add	w7, w7, w9
1:

/*
 * Main vectorized loop: accumulate 64 bytes (16 32-bit words) per
 * iteration into eight 64-bit NEON lanes (v0-v3, 2 lanes each), so no
 * per-word carry handling is needed.
 *
 *	while (mlen >= 64) {
 *		__builtin_prefetch(data + 32);
 *		__builtin_prefetch(data + 64);
 *		partial += *(uint32_t *)(void *)data;
 *		partial += *(uint32_t *)(void *)(data + 4);
 *		partial += *(uint32_t *)(void *)(data + 8);
 *		partial += *(uint32_t *)(void *)(data + 12);
 *		partial += *(uint32_t *)(void *)(data + 16);
 *		partial += *(uint32_t *)(void *)(data + 20);
 *		partial += *(uint32_t *)(void *)(data + 24);
 *		partial += *(uint32_t *)(void *)(data + 28);
 *		partial += *(uint32_t *)(void *)(data + 32);
 *		partial += *(uint32_t *)(void *)(data + 36);
 *		partial += *(uint32_t *)(void *)(data + 40);
 *		partial += *(uint32_t *)(void *)(data + 44);
 *		partial += *(uint32_t *)(void *)(data + 48);
 *		partial += *(uint32_t *)(void *)(data + 52);
 *		partial += *(uint32_t *)(void *)(data + 56);
 *		partial += *(uint32_t *)(void *)(data + 60);
 *		data += 64;
 *		mlen -= 64;
 *		// if (PREDICT_FALSE(partial & (3ULL << 62))) {
 *		//	if (needs_swap)
 *		//		partial = (partial << 8) +
 *		//		    (partial >> 56);
 *		//	sum += (partial >> 32);
 *		//	sum += (partial & 0xffffffff);
 *		//	partial = 0;
 *		// }
 *	}
 */

	// pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
	subs	mlen, mlen, #64
	b.lt	L32_bytes

	// save used vector registers
	// (16-byte stores to sp keep the required 16-byte sp alignment)
	sub	sp, sp, #8*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16

	// spread partial into 8 8-byte registers in v0-v3
	fmov	s3, w7			// seed lane 0 of v3 with partial; fmov zeroes the rest
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2

	// load the 1st 64 bytes (16 32-bit words)
	ld1.4s	{v4,v5,v6,v7},[data],#64

	// branch to finish off if mlen<64
	subs	mlen, mlen, #64
	b.lt	L64_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into
	 * 8 8-byte accumulators per iteration.
	 * (uadalp: pairwise add-accumulate of 32-bit lanes into 64-bit lanes;
	 * loads for the next iteration are interleaved with the adds.)
	 */
L64_loop:
	subs	mlen, mlen, #64		// mlen -= 64

	uadalp.2d	v0, v4
	ld1.4s	{v4},[data], #16

	uadalp.2d	v1, v5
	ld1.4s	{v5},[data], #16

	uadalp.2d	v2, v6
	ld1.4s	{v6},[data], #16

	uadalp.2d	v3, v7
	ld1.4s	{v7},[data], #16

	b.ge	L64_loop

L64_finishup:
	// fold the last 64 bytes, then reduce the 8 lanes to one 64-bit partial
	uadalp.2d	v0, v4
	uadalp.2d	v1, v5
	uadalp.2d	v2, v6
	uadalp.2d	v3, v7

	add.2d	v0, v0, v1
	add.2d	v2, v2, v3
	addp.2d	d0, v0			// d0 = v0.d[0] + v0.d[1]
	addp.2d	d2, v2			// d2 = v2.d[0] + v2.d[1]
	add.2d	v0, v0, v2		// lane 0 of v0 now holds d0 + d2
	fmov	x7, d0			// partial in x7 now

	// restore used vector registers
	ld1.4s	{v0, v1, v2, v3}, [sp], #4*16
	ld1.4s	{v4, v5, v6, v7}, [sp], #4*16

/*
 * Scalar tails: mlen < 64 remaining.  Each stage uses adds/adcs so any
 * carry out of x7 is folded back in via the zeroed scratch register.
 */
L32_bytes:
	tst	mlen, #32
	b.eq	L16_bytes
	ldp	x9, x10, [data], #16
	ldp	x11, x12, [data], #16
	adds	x7, x7, x9
	mov	x9, #0			// re-zero scratch for the final adc
	adcs	x7, x7, x10
	adcs	x7, x7, x11
	adcs	x7, x7, x12
	adc	x7, x7, x9		// fold the last carry back in

L16_bytes:
	tst	mlen, #16
	b.eq	L8_bytes
	ldp	x9, x10, [data], #16
	adds	x7, x7, x9
	mov	x9, #0
	adcs	x7, x7, x10
	adc	x7, x7, x9

L8_bytes:
	tst	mlen, #8
	mov	x10, #0			// x10 = 0 for the adc in the 8/4/2/1-byte tails
	b.eq	L4_bytes
	ldr	x9,[data],#8
	adds	x7, x7, x9
	adc	x7, x7, x10

L4_bytes:
	tst	mlen, #4
	b.eq	L2_bytes
	ldr	w9,[data],#4
	adds	x7, x7, x9
	adc	x7, x7, x10

L2_bytes:
	tst	mlen, #2
	b.eq	L_trailing_bytes
	ldrh	w9,[data],#2
	adds	x7, x7, x9
	adc	x7, x7, x10

L_trailing_bytes:
	tst	mlen, #1
	b.eq	L0_bytes
	ldrb	w9,[data],#1
	adds	x7, x7, x9
	adc	x7, x7, x10
	eor	started_on_odd, started_on_odd, #1	// mbuf ended mid-word

L0_bytes:
/*
 *	if (needs_swap)
 *		partial = (partial << 8) + (partial >> 56);
 */
	cbz	needs_swap, 1f
	ror	x7, x7, #56		// byte-rotate == the swap expression above
1:
/*
 * Fold the 64-bit partial into the running 32-bit sum.
 *
 *	sum += (partial >> 32) + (partial & 0xffffffff);
 *	sum = (sum >> 32) + (sum & 0xffffffff);
 *	}
 */

	add	x3, x3, x7, lsr #32	// sum += high half of partial
	mov	w7, w7			// keep only low half of partial
	add	x3, x3, x7		// sum += low half of partial
	mov	w7, w3
	add	x3, x7, x3, lsr #32	// fold sum back into 33 bits max

L_continue:
	cmp	len, #0
	ldr	ptr_m, [m, #M_NEXT]	// m = m->m_next
	b.gt	L_loop

/*
 * Final reduction of the accumulated sum to 16 bits (no 1's complement;
 * that is the caller's job).
 *
 *	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
 *	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *	return (final_acc & 0xffff);
 * }
 */

	mov	w4, #0x00ffff		// 16-bit mask
	and	x0, x4, x3, lsr #48
	and	x1, x4, x3, lsr #32
	and	x2, x4, x3, lsr #16
	and	x3, x4, x3
	add	w0, w0, w1
	add	w2, w2, w3
	add	w0, w0, w2
	and	w1, w4, w0, lsr #16
	and	w0, w4, w0
	add	w0, w0, w1
	and	w1, w4, w0, lsr #16
	and	w0, w4, w0
	add	w0, w0, w1
	/*
	 * If we were to 1's complement it (XOR with 0xffff):
	 *
	 *	eor	w0, w0, w4
	 */
	and	w0, w0, w4

	ret	lr

/*
 * Error path: ran off the end of the mbuf chain.  Logs via CKSUM_ERR
 * (kprintf or stderr depending on build) and returns -1.
 */
Lin_cksum_whoops:
	adrp	x0, Lin_cksum_whoops_str@page
	add	x0, x0, Lin_cksum_whoops_str@pageoff
	bl	#CKSUM_ERR
	mov	x0, #-1
	ret	lr

Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5