/*
 * Copyright (c) 2009-2017 Apple Inc. All rights reserved.
 *
 * This document is the property of Apple Inc.
 * It is considered confidential and proprietary.
 *
 * This document may not be reproduced or transmitted in any form,
 * in whole or in part, without the express written permission of
 * Apple Inc.
 */

/* $NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $ */

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef KERNEL
#include "../../../osfmk/arm/arch.h"
#include "../../../osfmk/arm/proc_reg.h"

#if __ARM_VFP__ < 3
#error "Unsupported: __ARM_VFP__ < 3"
#endif /* __ARM_VFP__ < 3 */
#define CKSUM_ERR _kprintf
#else /* !KERNEL */
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define CKSUM_ERR _fprintf_stderr
#define __ARM_VFP__ 3
#endif /* !KERNEL */
/*
 * The following defaults the implementation to little-endian byte order.
 */
#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

	.syntax unified

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk. Note also that this
 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
 * to be authentic; it only cares about 3 fields.
 */
#define M_NEXT	0
#define M_DATA	8
#define M_LEN	12
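
/*
 * For illustration only: a minimal C sketch (hypothetical type name, not a
 * real kernel struct) of the "mbuf-like" layout these offsets assume on a
 * 32-bit ARM ABI with 4-byte pointers:
 *
 *	struct mbuf_like {
 *		struct mbuf_like *m_next;	// offset 0  (M_NEXT)
 *		void		*m_nextpkt;	// offset 4  (not used here)
 *		char		*m_data;	// offset 8  (M_DATA)
 *		int		m_len;		// offset 12 (M_LEN)
 *	};
 */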

/*
 * APPLE MODIFICATION
 *
 * The use of R7 (normally the frame pointer in Apple's ARM ABI) in this
 * code as a data register prevents the use of debugging or instrumentation
 * tools, which is an acceptable tradeoff considering the potential gain in
 * performance.
 */

/*
 * Hand-optimised implementations for ARM/Xscale
 */

	.macro EnableVFP
#ifdef KERNEL
	push	{r0, r1, r2, r12}
	bl	_enable_kernel_vfp_context
	pop	{r0, r1, r2, r12}
#endif /* KERNEL */
	.endm
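
/*
 * Assumption worth noting: r0-r2 and r12 are saved because they are the
 * AAPCS caller-saved registers still live at the macro's call sites, and
 * _enable_kernel_vfp_context is presumably free to clobber them.
 */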


/*
 * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
 *     uint32_t initial_sum);
 *
 * Entry:
 *	r0	m
 *	r1	len
 *	r2	off
 *	r3	initial_sum
 *
 * Function wide register usage
 *	r8	accumulated sum
 *	r9	remaining length to parse
 *	ip	pointer to next mbuf
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); the caller is responsible
 * for folding the 32-bit sum into 16 bits and performing the 1's
 * complement if applicable.
 */
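
/*
 * A sketch of how a caller might consume the result (hypothetical caller
 * code, not part of this file):
 *
 *	uint32_t sum = os_cpu_in_cksum_mbuf(m, len, off, 0);
 *	sum = (sum & 0xffff) + (sum >> 16);	// fold 32-bit sum into 16 bits
 *	sum += sum >> 16;			// fold any resulting carry
 *	uint16_t cksum = ~sum & 0xffff;		// 1's complement, if applicable
 */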
	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:
	stmfd	sp!, {r4-r11, lr}

	mov	r8, r3			/* Accumulate sum in r8 */
	mov	r9, r1			/* save len in r9 */
	mov	ip, r0			/* set ip to the current mbuf */

	cmp	r9, #0			/* length is 0? */
	bne	.Lin_cksum_skip_loop	/* if not, proceed further */
	mov	r0, r8			/* otherwise, return initial sum */

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_skip_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_skip_entry:
	subs	r2, r2, r1		/* offset = offset - mbuf length */
	blt	.Lin_cksum_skip_done	/* if offset has gone negative, start with this mbuf */
	cmp	ip, #0x00
	bne	.Lin_cksum_skip_loop
	b	.Lin_cksum_whoops

.Lin_cksum_skip_done:
	add	r0, r2, r0		/* data += offset (offset is < 0) */
	add	r0, r0, r1		/* data += length of mbuf */
					/* data == start of data to cksum */
	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
	mov	r10, #0x00
	b	.Lin_cksum_entry
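
/*
 * C analogue of the skip logic above (a sketch, using the entry-register
 * names as variables):
 *
 *	for (;;) {
 *		mlen = m->m_len; data = m->m_data; next = m->m_next;
 *		if ((off -= mlen) < 0)
 *			break;			// cksum starts in this mbuf
 *		if (next == NULL)
 *			goto whoops;		// ran out of data
 *		m = next;
 *	}
 *	data += mlen + off;			// off is negative here
 *	mlen = -off;				// bytes left in this mbuf
 */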

.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_entry:
	cmp	r9, r1			/* clamp chunk to remaining length */
	movlt	r1, r9
	sub	r9, r9, r1
	eor	r11, r10, r0		/* alignment parity of this chunk */
	add	r10, r10, r1		/* total bytes summed so far */
	adds	r2, r1, #0x00		/* set flags: anything to sum? */

	beq	.Lin_cksum_next
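
/*
 * The low bit of r11 (r10 ^ r0, computed above) records whether this chunk
 * begins at odd parity, either in the byte stream or in memory; on that
 * basis .Lin_cksum_next decides whether the chunk's partial sum must be
 * byte-rotated before it is folded into the running total in r8.
 */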

/*
 * APPLE MODIFICATION
 *
 * Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function
 * inline. This results in slightly faster code, and also permits the whole
 * function to be included in kernel profiling data.
 */

/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 */
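
/*
 * In C terms, the scalar code below is roughly (a sketch, equivalent up to
 * when the carries are folded):
 *
 *	uint64_t acc = 0;
 *	for (i = 0; i < nwords; i++)
 *		acc += word[i];			// the adds/adcs chains
 *	acc = (uint32_t)acc + (acc >> 32);	// the trailing 'adc r2, r2, #0'
 *
 * i.e. a 1's-complement accumulation where every carry out of bit 31 is
 * folded back into the sum.
 */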
	mov	r2, #0

	/* We first have to word-align the buffer. */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	cmp	r7, #0x02
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/* Combine the three bytes depending on endianness and alignment */
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
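
/*
 * The orreq/orrne forms above reuse the flags from 'cmp r7, #0x02': eq
 * means exactly two leading bytes were fetched, ne means one or three, so
 * each byte is merged into r2 at the lane the later byte-rotate fix-up in
 * .Lin_cksum_next expects.
 */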
	subs	r1, r1, r7		/* Update length */
	beq	.Lin_cksum_next		/* All done? */

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:

#if __ARM_VFP__ >= 3

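/*
 * NEON path, taken for buffers of at least 512 bytes: q4-q7 accumulate
 * 32-bit words into 64-bit lanes via vpadal, so no per-addition carry
 * handling is needed. A rough C analogue of one vpadal step (a sketch):
 *
 *	uint64_t lane[2] = { 0, 0 };
 *	for (i = 0; i < n; i += 4) {		// one 128-bit vector
 *		lane[0] += (uint64_t)w[i]     + w[i + 1];
 *		lane[1] += (uint64_t)w[i + 2] + w[i + 3];
 *	}
 */
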
	cmp	r1, #512	// take the NEON path only if r1 is at least 512
	blt	9f

	EnableVFP

	and	r3, r1, #~0x3f

	vpush	{q0-q7}

	// move r2 to s16 (q4) for neon computation
	veor	q4, q4, q4
	vld1.32	{q0-q1}, [r0]!
	vmov	s16, r2
	vld1.32	{q2-q3}, [r0]!

	// pre-decrement size by 128: two 64-byte batches are already loaded
	subs	r3, r3, #0x80

	vpadal.u32	q4, q0
	vld1.32	{q0}, [r0]!
	vpaddl.u32	q5, q1
	vld1.32	{q1}, [r0]!
	vpaddl.u32	q6, q2
	vld1.32	{q2}, [r0]!
	vpaddl.u32	q7, q3
	vld1.32	{q3}, [r0]!
0:
	subs	r3, r3, #0x40	// decrement size by 64

	vpadal.u32	q4, q0
	vld1.32	{q0}, [r0]!
	vpadal.u32	q5, q1
	vld1.32	{q1}, [r0]!
	vpadal.u32	q6, q2
	vld1.32	{q2}, [r0]!
	vpadal.u32	q7, q3
	vld1.32	{q3}, [r0]!

	bgt	0b

	// fold in the last four vectors still in flight
	vpadal.u32	q4, q0
	vpadal.u32	q5, q1
	vpadal.u32	q6, q2
	vpadal.u32	q7, q3

	// merge the four accumulators into q4; since 2^32 == 1 (mod 0xffff),
	// 64-bit lanes may be summed as 32-bit halves without losing the cksum
	vpadal.u32	q4, q5
	vpadal.u32	q6, q7
	vpadal.u32	q4, q6
	vadd.i64	d8, d9

	// fold the 64-bit sum in d8 down to 32 bits, absorbing the carries
	vpaddl.u32	d8, d8
	vpaddl.u32	d8, d8
	vpaddl.u32	d8, d8

	vmov	r2, s16

	vpop	{q0-q7}

	ands	r1, r1, #0x3f	// residual bytes
	beq	.Lin_cksum_next

9:

#endif /* __ARM_VFP__ >= 3 */

	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

	/* Scalar loop: sum 64 bytes per iteration, folding carries via adcs */
.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00		/* fold the final carry back in */
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:

	adds	r1, r1, #0x40
	beq	.Lin_cksum_next

	cmp	r1, #0x20

	blt	.Lcksumdata_less_than_32
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
	beq	.Lin_cksum_next

.Lcksumdata_less_than_32:
	/* There are fewer than 32 bytes left */
	and	r3, r1, #0x18
	rsb	r4, r3, #0x18
	sub	r1, r1, r3
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	addne	pc, pc, r4
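
/*
 * The 'addne pc, pc, r4' above is a computed jump into the unrolled copies
 * below: each 8-byte copy is three instructions (12 bytes), so r4 =
 * (0x18 - (r1 & 0x18)) * 3/2 is the distance to skip, and the nop pads the
 * first copy so the arithmetic lands on the right entry. The 'adds' also
 * clears the carry flag, so the first adcs starts with a clean carry.
 */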

/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	nop
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04
	beq	.Lin_cksum_next

	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrbgt	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
	adds	r2, r2, r3
	adc	r2, r2, #0x00

.Lin_cksum_next:
	tst	r11, #0x01		/* did this chunk start at odd parity? */
	movne	r2, r2, ror #8		/* if so, rotate the sum to swap byte lanes */
	adds	r8, r8, r2		/* fold chunk sum into the running total */
	adc	r8, r8, #0x00
	cmp	ip, #00			/* more mbufs to process? */
	bne	.Lin_cksum_loop

	/* Fold the 32-bit accumulator into a 16-bit partial sum */
	mov	r1, #0xff
	orr	r1, r1, #0xff00		/* r1 = 0xffff */
	and	r0, r8, r1
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16
	and	r0, r0, r1
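
/*
 * C equivalent of the fold above (a sketch):
 *
 *	r0 = (r8 & 0xffff) + (r8 >> 16);	// fold high half into low
 *	r0 += r0 >> 16;				// fold any resulting carry
 *	r0 &= 0xffff;				// 16-bit partial sum
 */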
	/*
	 * If we were to 1's complement it (XOR with 0xffff):
	 *
	 * eor	r0, r0, r1
	 */

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_whoops:
	adr	r0, .Lin_cksum_whoops_str
	bl	#CKSUM_ERR
	mov	r0, #-1			/* return all-ones to signal the error */

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5