]> git.saurik.com Git - apple/xnu.git/blame - bsd/dev/arm/cpu_in_cksum.s
xnu-6153.61.1.tar.gz
[apple/xnu.git] / bsd / dev / arm / cpu_in_cksum.s
CommitLineData
5ba3f43e 1/*
a39ff7e2 2 * Copyright (c) 2009-2018 Apple Inc. All rights reserved.
5ba3f43e 3 *
a39ff7e2 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5ba3f43e 5 *
a39ff7e2
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
5ba3f43e
A
27 */
28
29/* $NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $ */
30
31/*
32 * Copyright 2003 Wasabi Systems, Inc.
33 * All rights reserved.
34 *
35 * Written by Steve C. Woodford for Wasabi Systems, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed for the NetBSD Project by
48 * Wasabi Systems, Inc.
49 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
50 * or promote products derived from this software without specific prior
51 * written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
56 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
57 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
58 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
59 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
60 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
61 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
62 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
63 * POSSIBILITY OF SUCH DAMAGE.
64 */
65
/*
 * Build configuration.
 *
 * KERNEL builds pull in the osfmk ARM headers, refuse to build when
 * __ARM_VFP__ is below 3, and report errors through _kprintf.
 * Non-kernel builds require LIBSYSCALL_INTERFACE to be defined, report
 * errors through _fprintf_stderr, and force __ARM_VFP__ to 3.
 */
66#ifdef KERNEL
67#include "../../../osfmk/arm/arch.h"
68#include "../../../osfmk/arm/proc_reg.h"
69
70#if __ARM_VFP__ < 3
71#error "Unsupported: __ARM_VFP__ < 3"
72#endif /* __ARM_VFP__ < 3 */
/* CKSUM_ERR is the routine branched to on the "out of data" error path. */
73#define CKSUM_ERR _kprintf
74#else /* !KERNEL */
75#ifndef LIBSYSCALL_INTERFACE
76#error "LIBSYSCALL_INTERFACE not defined"
77#endif /* !LIBSYSCALL_INTERFACE */
78#define CKSUM_ERR _fprintf_stderr
79#define __ARM_VFP__ 3
80#endif /* !KERNEL */
81
82/*
83 * The following defaults the implementation to little-endian architectures.
 * BYTE_ORDER is compared against LITTLE_ENDIAN by the #if blocks that
 * combine the leading and trailing odd bytes of a buffer.
84 */
85#define LITTLE_ENDIAN 1
86#define BYTE_ORDER LITTLE_ENDIAN
87
88.syntax unified
89
90/*
91 * XXX: adi@apple.com:
92 *
93 * Ugly, but we have little choice, since relying on genassym and <assym.s>
94 * is not possible unless this code lives in osfmk. Note also that this
95 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
96 * authentic; it only cares about 3 fields.
 *
 * The three values below are the byte offsets added to the mbuf pointer
 * when this file loads the next-mbuf pointer (M_NEXT), the data pointer
 * (M_DATA) and the length (M_LEN). They are hard-coded here, so they
 * must be kept in step with the structure layout by hand.
97 */
98#define M_NEXT 0
99#define M_DATA 8
100#define M_LEN 12
101
102/*
103 * APPLE MODIFICATION
104 *
105 * The use of R7 in this code as data register prevents
106 * the use of debugging or instrumentation tools, which is an acceptable
107 * tradeoff considering the potential gain in performance.
108 */
109
110/*
111 * Hand-optimised implementations for ARM/Xscale
112 */
113
/*
 * EnableVFP
 *
 * In KERNEL builds, calls _enable_kernel_vfp_context with r0, r1, r2 and
 * r12 saved on the stack before the call and restored after it. In
 * non-KERNEL builds the macro expands to nothing.
 *
 * The bl overwrites lr, and r3 is not saved around the call, so the
 * expansion site must not hold live values in either register.
 */
114 .macro EnableVFP
115#ifdef KERNEL
116 push {r0, r1, r2, r12}
117 bl _enable_kernel_vfp_context
118 pop {r0, r1, r2, r12}
119#endif /* KERNEL */
120 .endm
121
122
123/*
124 * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
125 * uint32_t initial_sum);
126 *
127 * Entry:
128 * r0 m
129 * r1 len
130 * r2 off
131 * r3 initial_sum
132 *
133 * Function wide register usage
134 * r8 accumulated sum
135 * r9 remaining length to parse
136 * ip pointer to next mbuf
 * r10 number of bytes handed to the summing code so far
 * r11 r10 XOR the data pointer of the current chunk; bit 0 decides
 *     whether the chunk's partial sum is rotated by 8 bits before
 *     it is added to r8
137 *
138 * This function returns the partial 16-bit checksum accumulated in
139 * a 32-bit variable (without 1's complement); caller is responsible
140 * for folding the 32-bit sum into 16-bit and performing the 1's
141 * complement if applicable
 *
 * NOTE(review): the code that precedes the final return of this routine
 * already folds r8 down to 16 bits, so a caller-side fold changes nothing
 * on that path. The zero-length early return below hands back
 * initial_sum unmodified, and the out-of-data path returns -1. The
 * wording above is kept from the original header; confirm the intended
 * contract.
142 */
143 .globl _os_cpu_in_cksum_mbuf
144 .text
145 .align 4
146_os_cpu_in_cksum_mbuf:
147 stmfd sp!, {r4-r11,lr}
148
149 mov r8, r3 /* Accumulate sum in r8 */
150 mov r9, r1 /* save len in r9 */
151 mov ip, r0 /* set ip to the current mbuf */
152
153 cmp r9, #0 /* length is 0? */
154 bne .Lin_cksum_skip_loop /* if not, proceed further */
155 mov r0, r8 /* otherwise, return initial sum */
156
157 ldmfd sp!, {r4-r11, pc}
158
 /*
  * Skip whole mbufs until the requested offset falls inside one.
  * Each pass loads r1 = length and r0 = data pointer of the mbuf in ip,
  * then advances ip to the next mbuf.
  */
159.Lin_cksum_skip_loop:
160 ldr r1, [ip, #(M_LEN)]
161 ldr r0, [ip, #(M_DATA)]
162 ldr ip, [ip, #(M_NEXT)]
163.Lin_cksum_skip_entry:
164 subs r2, r2, r1 /* offset = offset - mbuf length */
165 blt .Lin_cksum_skip_done /* if offset has gone negative start with this mbuf */
166 cmp ip, #0x00
167 bne .Lin_cksum_skip_loop
 /* The chain ended before the offset was reached: report the error. */
168 b .Lin_cksum_whoops
169
 /*
  * Here r2 = (offset into this mbuf) - (mbuf length), which is negative.
  * The two adds leave r0 = data + offset into this mbuf, and the rsb
  * leaves r1 = number of bytes from that point to the end of the mbuf.
  */
170.Lin_cksum_skip_done:
171 add r0, r2, r0 /* data += offset (offset is < 0) */
172 add r0, r0, r1 /* data += length of mbuf */
173 /* data == start of data to cksum */
174 rsb r1, r2, #0x00 /* length = remainder of mbuf to read */
175 mov r10, #0x00 /* no bytes summed yet */
176 b .Lin_cksum_entry
177
/*
 * Per-mbuf setup. .Lin_cksum_loop loads the next mbuf (r1 = length,
 * r0 = data pointer) and advances ip; .Lin_cksum_entry is also entered
 * directly for the first mbuf, with r0/r1 already adjusted for the offset.
 */
178.Lin_cksum_loop:
179 ldr r1, [ip, #(M_LEN)]
180 ldr r0, [ip, #(M_DATA)]
181 ldr ip, [ip, #(M_NEXT)]
182.Lin_cksum_entry:
183 cmp r9, r1 /* signed compare: remaining length vs chunk length */
184 movlt r1, r9 /* r1 = min(chunk length, remaining length) */
185 sub r9, r9, r1 /* remaining length -= bytes taken from this chunk */
186 eor r11, r10, r0 /* bit 0 = parity of (bytes summed so far) XOR (data address) */
187 add r10, r10, r1 /* bytes summed so far += this chunk */
188 adds r2, r1, #0x00 /* r2 = r1, setting Z when the chunk is empty */
189
 /* Empty chunk: r2 is 0 here, so the code at .Lin_cksum_next adds nothing. */
190 beq .Lin_cksum_next
191
192/*
193 * APPLE MODIFICATION
194 *
195 * Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function
196 * inline. This results in slightly faster code, and also permits the whole
197 * function to be included in kernel profiling data.
198 */
199
200/*
201 * The main in*_cksum() workhorse...
202 *
203 * Entry parameters:
204 * r0 Pointer to buffer
205 * r1 Buffer length
206 * lr Return address
207 *
208 * Returns:
209 * r2 Accumulated 32-bit sum
210 *
211 * Clobbers:
212 * r0-r7
 *
 * Because this code is now inline, it is entered by falling through from
 * the instruction above and it finishes by branching to .Lin_cksum_next;
 * lr is not used as a return address here.
213 */
214 mov r2, #0
215
216 /* We first have to word-align the buffer. */
217 ands r7, r0, #0x03 /* r7 = address modulo 4 */
218 beq .Lcksumdata_wordaligned
219 rsb r7, r7, #0x04 /* r7 = bytes needed to reach a word boundary (1..3) */
220 cmp r1, r7 /* Enough bytes left to make it? */
221 blt .Lcksumdata_endgame
 /*
  * The flags set by this compare stay live through the conditional
  * loads, moves and orr instructions below (none of them set flags):
  * ge = at least 2 bytes to fetch, gt = 3 bytes to fetch,
  * eq = exactly 2 bytes, ne = 1 or 3 bytes.
  */
222 cmp r7, #0x02
223 ldrb r4, [r0], #0x01 /* Fetch 1st byte */
224 ldrbge r5, [r0], #0x01 /* Fetch 2nd byte */
225 movlt r5, #0x00
226 ldrbgt r6, [r0], #0x01 /* Fetch 3rd byte */
227 movle r6, #0x00
228 /* Combine the three bytes depending on endianness and alignment */
229#if BYTE_ORDER != LITTLE_ENDIAN
230 orreq r2, r5, r4, lsl #8
231 orreq r2, r2, r6, lsl #24
232 orrne r2, r4, r5, lsl #8
233 orrne r2, r2, r6, lsl #16
234#else
235 orreq r2, r4, r5, lsl #8
236 orreq r2, r2, r6, lsl #16
237 orrne r2, r5, r4, lsl #8
238 orrne r2, r2, r6, lsl #24
239#endif
240 subs r1, r1, r7 /* Update length */
241 beq .Lin_cksum_next /* All done? */
242
243 /* Buffer is now word aligned */
244.Lcksumdata_wordaligned:
245
246#if __ARM_VFP__ >= 3
247
 /*
  * NEON path, used only when at least 512 bytes remain in this chunk.
  * It consumes the largest multiple of 64 bytes that fits in r1, 64
  * bytes per loop pass, using four 128-bit accumulators (q4-q7) that
  * each hold two 64-bit partial sums. r2 is carried in through s16
  * and the folded 32-bit result is returned in r2.
  */
248 cmp r1, #512 // do this if r1 is at least 512
249 blt 9f
250
251 EnableVFP
252
253 and r3, r1, #~0x3f // r3 = r1 rounded down to a multiple of 64
254
255 vpush {q0-q7} // preserve the NEON registers used below
256
257 // move r2 to s16 (q4) for neon computation
258 veor q4, q4, q4
259 vld1.32 {q0-q1}, [r0]!
260 vmov s16, r2
261 vld1.32 {q2-q3}, [r0]!
262
263 // pre-decrement size by 128: the 64 bytes loaded above plus the
 // 64 bytes loaded by the four single-register loads just below
264 subs r3, r3, #0x80
265
 // First group: vpadal accumulates into q4 (which holds r2), while
 // vpaddl initialises q5-q7 without reading their previous contents.
266 vpadal.u32 q4, q0
267 vld1.32 {q0}, [r0]!
268 vpaddl.u32 q5, q1
269 vld1.32 {q1}, [r0]!
270 vpaddl.u32 q6, q2
271 vld1.32 {q2}, [r0]!
272 vpaddl.u32 q7, q3
273 vld1.32 {q3}, [r0]!
274
 // Main loop: accumulate the 64 bytes already held in q0-q3 while
 // loading the next 64. The flags from the subs survive to the bgt
 // because the vector instructions in between do not set them.
2750:
276 subs r3, r3, #0x40 // decrement size by 64
277
278 vpadal.u32 q4, q0
279 vld1.32 {q0}, [r0]!
280 vpadal.u32 q5, q1
281 vld1.32 {q1}, [r0]!
282 vpadal.u32 q6, q2
283 vld1.32 {q2}, [r0]!
284 vpadal.u32 q7, q3
285 vld1.32 {q3}, [r0]!
286
287 bgt 0b
288
 // Accumulate the last 64 bytes that were loaded but not yet summed.
289 vpadal.u32 q4, q0
290 vpadal.u32 q5, q1
291 vpadal.u32 q6, q2
292 vpadal.u32 q7, q3
293
 // Merge the four accumulators into q4, then the two halves of q4
 // into d8.
294 vpadal.u32 q4, q5
295 vpadal.u32 q6, q7
296 vpadal.u32 q4, q6
297 vadd.i64 d8, d9
298
 // Each vpaddl adds the upper and lower 32-bit halves of d8; three
 // passes fold the 64-bit total down to a value read from s16.
299 vpaddl.u32 d8, d8
300 vpaddl.u32 d8, d8
301 vpaddl.u32 d8, d8
302
303 vmov r2, s16
304
305 vpop {q0-q7}
306
307 ands r1, r1, #0x3f // residual bytes
308 beq .Lin_cksum_next
309
3109:
311
312#endif /* __ARM_VFP__ >= 3 */
313
/*
 * Integer path for word-aligned data: 64 bytes per pass of the big loop,
 * then at most one 32-byte step. r1 is biased by -64 while the loop runs
 * so that the subs/bge pair doubles as the loop test.
 */
314 subs r1, r1, #0x40
315 blt .Lcksumdata_bigloop_end
316
 /*
  * Sixteen words are added per pass. ldmia does not alter the flags, so
  * the carry chain started by the adds runs unbroken through every adcs;
  * the final adc folds the last carry back into r2. The fourth register
  * of each load alternates between r6 and r7 so that it can be added
  * after the next load has been issued.
  */
317.Lcksumdata_bigloop:
318 ldmia r0!, {r3, r4, r5, r6}
319 adds r2, r2, r3
320 adcs r2, r2, r4
321 adcs r2, r2, r5
322 ldmia r0!, {r3, r4, r5, r7}
323 adcs r2, r2, r6
324 adcs r2, r2, r3
325 adcs r2, r2, r4
326 adcs r2, r2, r5
327 ldmia r0!, {r3, r4, r5, r6}
328 adcs r2, r2, r7
329 adcs r2, r2, r3
330 adcs r2, r2, r4
331 adcs r2, r2, r5
332 ldmia r0!, {r3, r4, r5, r7}
333 adcs r2, r2, r6
334 adcs r2, r2, r3
335 adcs r2, r2, r4
336 adcs r2, r2, r5
337 adcs r2, r2, r7
338 adc r2, r2, #0x00
339 subs r1, r1, #0x40
340 bge .Lcksumdata_bigloop
341.Lcksumdata_bigloop_end:
342
343 adds r1, r1, #0x40 /* undo the -64 bias: r1 = bytes still to sum (0..63) */
344 beq .Lin_cksum_next
345
346 cmp r1, #0x20
347
348 blt .Lcksumdata_less_than_32
 /* At least 32 bytes remain: add eight words with one carry chain. */
349 ldmia r0!, {r3, r4, r5, r6}
350 adds r2, r2, r3
351 adcs r2, r2, r4
352 adcs r2, r2, r5
353 ldmia r0!, {r3, r4, r5, r7}
354 adcs r2, r2, r6
355 adcs r2, r2, r3
356 adcs r2, r2, r4
357 adcs r2, r2, r5
358 adcs r2, r2, r7
359 adc r2, r2, #0x00
360 subs r1, r1, #0x20
361 beq .Lin_cksum_next
362
363.Lcksumdata_less_than_32:
364 /* There are less than 32 bytes left */
 /*
  * Computed jump into the three 8-byte groups below.
  *   r3 = r1 & 0x18   bytes covered by whole 8-byte groups (0, 8, 16, 24)
  *   r4 = 0x18 - r3   bytes of 8-byte groups that must be skipped
  *   r1 -= r3         r1 becomes the 0..7 bytes left after the groups
  * r4 is then scaled by 1.5 (r4 + r4/2), because each 8-byte group is
  * three 4-byte instructions (12 bytes of code). When r4 is 0 the addne
  * is not executed and all three groups run.
  * NOTE(review): this relies on the ARM-state pc read-ahead and on the
  * exact instruction count of each group; the nop in the first group
  * appears to be there to make the offsets line up. Do not add or
  * remove instructions between the addne and the adc below without
  * re-deriving the offsets.
  */
365 and r3, r1, #0x18
366 rsb r4, r3, #0x18
367 sub r1, r1, r3
368 adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */
369 addne pc, pc, r4
370
371/*
372 * Note: We use ldm here, even on Xscale, since the combined issue/result
373 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
374 */
375 /* At least 24 bytes remaining... */
376 ldmia r0!, {r4, r5}
377 nop
378 adcs r2, r2, r4
379 adcs r2, r2, r5
380
381 /* At least 16 bytes remaining... */
382 ldmia r0!, {r4, r5}
383 adcs r2, r2, r4
384 adcs r2, r2, r5
385
386 /* At least 8 bytes remaining... */
387 ldmia r0!, {r4, r5}
388 adcs r2, r2, r4
389 adcs r2, r2, r5
390
391 /* Less than 8 bytes remaining... */
392 adc r2, r2, #0x00 /* fold the carry left by the groups above */
393 subs r1, r1, #0x04
394 blt .Lcksumdata_lessthan4
395
 /* One whole word remains: add it with its carry folded back in. */
396 ldr r4, [r0], #0x04
397 sub r1, r1, #0x04 /* extra -4 is cancelled by the +4 at the label below */
398 adds r2, r2, r4
399 adc r2, r2, #0x00
400
401 /* Deal with < 4 bytes remaining */
402.Lcksumdata_lessthan4:
403 adds r1, r1, #0x04 /* r1 = bytes still to sum (0..3) */
404 beq .Lin_cksum_next
405
406 /* Deal with 1 to 3 remaining bytes, possibly misaligned */
 /*
  * Reached either after the word-aligned code has consumed everything
  * but the last 1..3 bytes, or directly from the alignment prologue when
  * the chunk is too short to reach a word boundary. r0 is not advanced
  * here; the bytes are read at offsets 0, 1 and 2.
  */
407.Lcksumdata_endgame:
408 ldrb r3, [r0] /* Fetch first byte */
409 cmp r1, #0x02 /* ge: 2nd byte exists, gt: 3rd byte exists */
410 ldrbge r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */
411 movlt r4, #0x00
412 ldrbgt r5, [r0, #0x02]
413 movle r5, #0x00
414 /* Combine the three bytes depending on endianness and alignment */
415 tst r0, #0x01 /* eq: even start address, ne: odd start address */
416#if BYTE_ORDER != LITTLE_ENDIAN
417 orreq r3, r4, r3, lsl #8
418 orreq r3, r3, r5, lsl #24
419 orrne r3, r3, r4, lsl #8
420 orrne r3, r3, r5, lsl #16
421#else
422 orreq r3, r3, r4, lsl #8
423 orreq r3, r3, r5, lsl #16
424 orrne r3, r4, r3, lsl #8
425 orrne r3, r3, r5, lsl #24
426#endif
427 adds r2, r2, r3
428 adc r2, r2, #0x00 /* fold the carry back into the sum */
429
/*
 * Merge the partial sum of the chunk just processed (r2) into the running
 * sum (r8), then move on to the next mbuf or finish.
 */
430.Lin_cksum_next:
431 tst r11, #0x01 /* r11 bit 0 was computed when this chunk was entered */
432 movne r2, r2, ror #8 /* odd parity: rotate the partial sum by one byte */
433 adds r8, r8, r2
434 adc r8, r8, #0x00 /* fold the carry back into the sum */
 /*
  * The loop continues for as long as another mbuf is chained; the
  * remaining length in r9 is not tested here. Once r9 reaches 0 every
  * further chunk is clamped to length 0 and contributes nothing.
  */
435 cmp ip, #00
436 bne .Lin_cksum_loop
437
 /* Build r1 = 0xffff and fold the 32-bit sum in r8 to 16 bits in r0. */
438 mov r1, #0xff
439 orr r1, r1, #0xff00
440 and r0, r8, r1 /* low half */
441 add r0, r0, r8, lsr #16 /* + high half */
442 add r0, r0, r0, lsr #16 /* + carry out of the previous add */
443 and r0, r0, r1 /* keep 16 bits */
444 /*
445 * If we were to 1's complement it (XOR with 0xffff):
446 *
447 * eor r0, r0, r1
448 */
449
450 ldmfd sp!, {r4-r11, pc}
451
/*
 * Error path, reached when the mbuf chain ends before the requested
 * offset has been skipped. Passes the message below to CKSUM_ERR in r0
 * and returns -1 to the caller.
 */
452.Lin_cksum_whoops:
453 adr r0, .Lin_cksum_whoops_str
454 bl #CKSUM_ERR
455 mov r0, #-1
456
457 ldmfd sp!, {r4-r11, pc}
458
459.Lin_cksum_whoops_str:
460 .asciz "os_cpu_in_cksum_mbuf: out of data\n"
461 .align 5