2 * Copyright (c) 2009-2017 Apple Inc. All rights reserved.
4 * This document is the property of Apple Inc.
5 * It is considered confidential and proprietary.
7 * This document may not be reproduced or transmitted in any form,
8 * in whole or in part, without the express written permission of
12 /* $NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $ */
15 * Copyright 2003 Wasabi Systems, Inc.
16 * All rights reserved.
18 * Written by Steve C. Woodford for Wasabi Systems, Inc.
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
23 * 1. Redistributions of source code must retain the above copyright
24 * notice, this list of conditions and the following disclaimer.
25 * 2. Redistributions in binary form must reproduce the above copyright
26 * notice, this list of conditions and the following disclaimer in the
27 * documentation and/or other materials provided with the distribution.
28 * 3. All advertising materials mentioning features or use of this software
29 * must display the following acknowledgement:
30 * This product includes software developed for the NetBSD Project by
31 * Wasabi Systems, Inc.
32 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
33 * or promote products derived from this software without specific prior
36 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
37 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
38 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
40 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
41 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
42 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
43 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
44 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
46 * POSSIBILITY OF SUCH DAMAGE.
50 #include "../../../osfmk/arm/arch.h"
51 #include "../../../osfmk/arm/proc_reg.h"
/*
 * NOTE(review): this extract has missing lines (the embedded original
 * line numbers are discontiguous).  The #error/#endif below clearly
 * close an "#if __ARM_VFP__ < 3" guard whose opening line was dropped,
 * and the two CKSUM_ERR definitions were presumably selected by
 * KERNEL vs. userland (LIBSYSCALL_INTERFACE) conditionals that are not
 * visible here — TODO confirm against the full file.
 */
54 #error "Unsupported: __ARM_VFP__ < 3"
55 #endif /* __ARM_VFP__ < 3 */
/* Kernel build: report checksum errors via kprintf. */
56 #define CKSUM_ERR _kprintf
58 #ifndef LIBSYSCALL_INTERFACE
59 #error "LIBSYSCALL_INTERFACE not defined"
60 #endif /* !LIBSYSCALL_INTERFACE */
/* Userland (Libsyscall) build: report checksum errors to stderr. */
61 #define CKSUM_ERR _fprintf_stderr
66 * The following default the implementation to little-endian architectures.
68 #define LITTLE_ENDIAN 1
69 #define BYTE_ORDER LITTLE_ENDIAN
76 * Ugly, but we have little choice, since relying on genassym and <assym.s>
77 * is not possible unless this code lives in osfmk. Note also that this
78 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
79 * authentic; it only cares about 3 fields.
88 * The use of R7 in this code as data register prevents
89 * the use of debugging or instrumentation tools, which is an acceptable
90 * tradeoff considering the potential gain in performance.
94 * Hand-optimised implementations for ARM/Xscale
/*
 * NOTE(review): fragment — preserves the argument registers r0-r2 and
 * the intra-procedure scratch register r12 across a call to
 * _enable_kernel_vfp_context, so the mbuf/len/off arguments survive
 * the call.  Presumably expanded from a kernel-only macro guarded by
 * preprocessor conditionals dropped from this extract — TODO confirm.
 */
99 push {r0, r1, r2, r12}
100 bl _enable_kernel_vfp_context
101 pop {r0, r1, r2, r12}
107 * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
108 * uint32_t initial_sum);
116 * Function wide register usage
118 * r9 remaining length to parse
119 * ip pointer to next mbuf
121 * This function returns the partial 16-bit checksum accumulated in
122 * a 32-bit variable (without 1's complement); caller is responsible
123 * for folding the 32-bit sum into 16-bit and performing the 1's
124 * complement if applicable
/*
 * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
 *     uint32_t initial_sum)
 *
 * Entry (AAPCS32): r0 = m, r1 = len, r2 = off, r3 = initial_sum.
 * Returns the accumulated 32-bit partial sum in r0 — NOT folded to
 * 16 bits and NOT 1's-complemented; that is the caller's job.
 *
 * Register roles established below:
 *   r8 = running 32-bit sum
 *   r9 = remaining length to checksum
 *   ip = pointer to the current/next mbuf in the chain
 *
 * NOTE(review): the embedded original line numbers jump, so
 * instructions are missing from this extract; some referenced labels
 * are not visible here.  Do not treat this region as complete.
 */
126 .globl _os_cpu_in_cksum_mbuf
129 _os_cpu_in_cksum_mbuf:
/* Save the callee-saved registers r4-r11 plus lr; matching ldmfd below
 * restores them and returns by loading pc. */
130 stmfd sp!, {r4-r11,lr}
132 mov r8, r3 /* Accumulate sum in r8 */
133 mov r9, r1 /* save len in r9 */
134 mov ip, r0 /* set ip to the current mbuf */
/* Fast path: a zero-length request returns the initial sum untouched. */
136 cmp r9, #0 /* length is 0? */
137 bne .Lin_cksum_skip_loop /* if not, proceed further */
138 mov r0, r8 /* otherwise, return initial sum */
140 ldmfd sp!, {r4-r11, pc}
/*
 * Walk the mbuf chain until the mbuf containing byte 'off' is found.
 * M_LEN/M_DATA/M_NEXT are field offsets into the "mbuf-like" struct
 * (defined in a part of the file not visible in this extract).
 */
142 .Lin_cksum_skip_loop:
143 ldr r1, [ip, #(M_LEN)]
144 ldr r0, [ip, #(M_DATA)]
145 ldr ip, [ip, #(M_NEXT)]
146 .Lin_cksum_skip_entry:
147 subs r2, r2, r1 /* offset = offset - mbuf length */
148 blt .Lin_cksum_skip_done /* if offset has gone negative start with this mbuf */
/* NOTE(review): the compare that sets the flags for this bne (original
 * line 149, presumably a null-check of ip) was dropped by extraction. */
150 bne .Lin_cksum_skip_loop
153 .Lin_cksum_skip_done:
/* r2 is <= 0 here: back up from the end of this mbuf to byte 'off'. */
154 add r0, r2, r0 /* data += offset (offset is < 0) */
155 add r0, r0, r1 /* data += length of mbuf */
156 /* data == start of data to cksum */
157 rsb r1, r2, #0x00 /* length = remainder of mbuf to read */
/* Advance to the next mbuf: reload length/data pointers for it. */
162 ldr r1, [ip, #(M_LEN)]
163 ldr r0, [ip, #(M_DATA)]
164 ldr ip, [ip, #(M_NEXT)]
178 * Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function
179 * inline. This results in slightly faster code, and also permits the whole
180 * function to be included in kernel profiling data.
184 * The main in*_cksum() workhorse...
187 * r0 Pointer to buffer
192 * r2 Accumulated 32-bit sum
/*
 * Step 1: consume 1-3 leading bytes so r0 becomes word-aligned.
 * The conditional ldrb/orr forms below are predicated on flags set by
 * instructions dropped from this extract (presumably an alignment test
 * on r0 — TODO confirm); r7 appears to hold the byte count needed to
 * reach alignment.
 */
199 /* We first have to word-align the buffer. */
201 beq .Lcksumdata_wordaligned
203 cmp r1, r7 /* Enough bytes left to make it? */
204 blt .Lcksumdata_endgame
206 ldrb r4, [r0], #0x01 /* Fetch 1st byte */
207 ldrbge r5, [r0], #0x01 /* Fetch 2nd byte */
209 ldrbgt r6, [r0], #0x01 /* Fetch 3rd byte */
/* Merge the 1-3 fetched bytes into a 16-bit-checksum-correct word;
 * the eq/ne predicates select between the two possible byte layouts. */
211 /* Combine the three bytes depending on endianness and alignment */
212 #if BYTE_ORDER != LITTLE_ENDIAN
213 orreq r2, r5, r4, lsl #8
214 orreq r2, r2, r6, lsl #24
215 orrne r2, r4, r5, lsl #8
216 orrne r2, r2, r6, lsl #16
218 orreq r2, r4, r5, lsl #8
219 orreq r2, r2, r6, lsl #16
220 orrne r2, r5, r4, lsl #8
221 orrne r2, r2, r6, lsl #24
223 subs r1, r1, r7 /* Update length */
224 beq .Lin_cksum_next /* All done? */
/*
 * Step 2 (VFP/NEON fast path, guarded by __ARM_VFP__ >= 3): for large
 * buffers (>= 512 bytes) sum 64 bytes per iteration with quad-register
 * vector loads.  Most of the vector-accumulate body was dropped from
 * this extract.
 */
226 /* Buffer is now word aligned */
227 .Lcksumdata_wordaligned:
231 cmp r1, #512 // do this if r1 is at least 512
240 // move r2 to s16 (q4) for neon computation
242 vld1.32 {q0-q1}, [r0]!
244 vld1.32 {q2-q3}, [r0]!
246 // pre-decrement size by 64
259 subs r3, r3, #0x40 // decrement size by 64
290 ands r1, r1, #0x3f // residual bytes
295 #endif /* __ARM_VFP__ >= 3 */
/*
 * Step 3 (scalar path): 64-bytes-per-iteration loop using four ldmia
 * multi-register loads; the adcs accumulation instructions between the
 * loads were dropped from this extract.  The .Lcksumdata_bigloop label
 * that bge targets is likewise not visible here.
 */
298 blt .Lcksumdata_bigloop_end
301 ldmia r0!, {r3, r4, r5, r6}
305 ldmia r0!, {r3, r4, r5, r7}
310 ldmia r0!, {r3, r4, r5, r6}
315 ldmia r0!, {r3, r4, r5, r7}
323 bge .Lcksumdata_bigloop
324 .Lcksumdata_bigloop_end:
/* 32-byte chunk, then progressively smaller tails. */
331 blt .Lcksumdata_less_than_32
332 ldmia r0!, {r3, r4, r5, r6}
336 ldmia r0!, {r3, r4, r5, r7}
346 .Lcksumdata_less_than_32:
347 /* There are less than 32 bytes left */
351 adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */
355 * Note: We use ldm here, even on Xscale, since the combined issue/result
356 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
358 /* At least 24 bytes remaining... */
364 /* At least 16 bytes remaining... */
369 /* At least 8 bytes remaining... */
374 /* Less than 8 bytes remaining... */
377 blt .Lcksumdata_lessthan4
/*
 * Step 4: final 1-3 bytes, mirror of the alignment preamble above.
 * The ldrb/ldrbge/ldrbgt predication depends on flags from dropped
 * instructions (presumably a test of the residual count).
 */
384 /* Deal with < 4 bytes remaining */
385 .Lcksumdata_lessthan4:
389 /* Deal with 1 to 3 remaining bytes, possibly misaligned */
391 ldrb r3, [r0] /* Fetch first byte */
393 ldrbge r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */
395 ldrbgt r5, [r0, #0x02]
397 /* Combine the three bytes depending on endianness and alignment */
399 #if BYTE_ORDER != LITTLE_ENDIAN
400 orreq r3, r4, r3, lsl #8
401 orreq r3, r3, r5, lsl #24
402 orrne r3, r3, r4, lsl #8
403 orrne r3, r3, r5, lsl #16
405 orreq r3, r3, r4, lsl #8
406 orreq r3, r3, r5, lsl #16
407 orrne r3, r4, r3, lsl #8
408 orrne r3, r3, r5, lsl #24
/*
 * Fold the 32-bit accumulator (r8) toward 16 bits by adding the high
 * halfword into the low halfword; part of the fold sequence was
 * dropped from this extract.  Result is returned in r0, still without
 * the final 1's complement.
 */
424 add r0, r0, r8, lsr #16
425 add r0, r0, r0, lsr #16
428 * If we were to 1's complement it (XOR with 0xffff):
433 ldmfd sp!, {r4-r11, pc}
/*
 * Error path: the mbuf chain ran out of data before 'len' bytes were
 * summed.  Report via CKSUM_ERR (the bl was dropped from this extract)
 * and return through the standard epilogue.
 */
436 adr r0, .Lin_cksum_whoops_str
440 ldmfd sp!, {r4-r11, pc}
442 .Lin_cksum_whoops_str:
443 .asciz "os_cpu_in_cksum_mbuf: out of data\n"