]>
Commit | Line | Data |
---|---|---|
5ba3f43e | 1 | /* |
a39ff7e2 | 2 | * Copyright (c) 2009-2018 Apple Inc. All rights reserved. |
5ba3f43e | 3 | * |
a39ff7e2 | 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5ba3f43e | 5 | * |
a39ff7e2 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
5ba3f43e A |
27 | */ |
28 | ||
29 | /* $NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $ */ | |
30 | ||
31 | /* | |
32 | * Copyright 2003 Wasabi Systems, Inc. | |
33 | * All rights reserved. | |
34 | * | |
35 | * Written by Steve C. Woodford for Wasabi Systems, Inc. | |
36 | * | |
37 | * Redistribution and use in source and binary forms, with or without | |
38 | * modification, are permitted provided that the following conditions | |
39 | * are met: | |
40 | * 1. Redistributions of source code must retain the above copyright | |
41 | * notice, this list of conditions and the following disclaimer. | |
42 | * 2. Redistributions in binary form must reproduce the above copyright | |
43 | * notice, this list of conditions and the following disclaimer in the | |
44 | * documentation and/or other materials provided with the distribution. | |
45 | * 3. All advertising materials mentioning features or use of this software | |
46 | * must display the following acknowledgement: | |
47 | * This product includes software developed for the NetBSD Project by | |
48 | * Wasabi Systems, Inc. | |
49 | * 4. The name of Wasabi Systems, Inc. may not be used to endorse | |
50 | * or promote products derived from this software without specific prior | |
51 | * written permission. | |
52 | * | |
53 | * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND | |
54 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | |
55 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
56 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC | |
57 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
58 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
59 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
60 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
61 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
62 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
63 | * POSSIBILITY OF SUCH DAMAGE. | |
64 | */ | |
65 | ||
66 | #ifdef KERNEL | |
67 | #include "../../../osfmk/arm/arch.h" | |
68 | #include "../../../osfmk/arm/proc_reg.h" | |
69 | ||
70 | #if __ARM_VFP__ < 3 | |
71 | #error "Unsupported: __ARM_VFP__ < 3" | |
72 | #endif /* __ARM_VFP__ < 3 */ | |
73 | #define CKSUM_ERR _kprintf /* kernel build: routine called with the error string at .Lin_cksum_whoops */ | |
74 | #else /* !KERNEL */ | |
75 | #ifndef LIBSYSCALL_INTERFACE | |
76 | #error "LIBSYSCALL_INTERFACE not defined" | |
77 | #endif /* !LIBSYSCALL_INTERFACE */ | |
78 | #define CKSUM_ERR _fprintf_stderr /* non-kernel build: routine called with the error string at .Lin_cksum_whoops */ | |
79 | #define __ARM_VFP__ 3 /* satisfies the "__ARM_VFP__ >= 3" guard around the NEON path */ | |
80 | #endif /* !KERNEL */ | |
81 | ||
82 | /* | |
83 | * The following default the implementation to little-endian architectures. | |
84 | */ | |
85 | #define LITTLE_ENDIAN 1 | |
86 | #define BYTE_ORDER LITTLE_ENDIAN /* tested with "#if BYTE_ORDER != LITTLE_ENDIAN" to pick the byte-combining sequences */ | |
87 | ||
88 | .syntax unified | |
89 | ||
90 | /* | |
91 | * XXX: adi@apple.com: | |
92 | * | |
93 | * Ugly, but we have little choice, since relying on genassym and <assym.s> | |
94 | * is not possible unless this code lives in osfmk. Note also that this | |
95 | * routine expects an "mbuf-like" argument, and it does not expect the mbuf to be | |
96 | * authentic; it only cares about 3 fields. | |
97 | */ | |
98 | #define M_NEXT 0 /* byte offset of the pointer to the next mbuf (loaded into ip) */ | |
99 | #define M_DATA 8 /* byte offset of the data pointer (loaded into r0) */ | |
100 | #define M_LEN 12 /* byte offset of the 32-bit data length (loaded into r1) */ | |
101 | ||
102 | /* | |
103 | * APPLE MODIFICATION | |
104 | * | |
105 | * The use of R7 in this code as data register prevents | |
106 | * the use of debugging or instrumentation tools, which is an acceptable | |
107 | * tradeoff considering the potential gain in performance. | |
108 | */ | |
109 | ||
110 | /* | |
111 | * Hand-optimised implementations for ARM/Xscale | |
112 | */ | |
113 | ||
114 | .macro EnableVFP /* kernel builds: call _enable_kernel_vfp_context; expands to nothing otherwise */ | |
115 | #ifdef KERNEL | |
116 | push {r0, r1, r2, r12} /* keep r0-r2 and r12 (ip) intact across the call; r3 is not in the saved set */ | |
117 | bl _enable_kernel_vfp_context /* bl overwrites lr; the function expanding this macro has lr saved on its stack */ | |
118 | pop {r0, r1, r2, r12} | |
119 | #endif /* KERNEL */ | |
120 | .endm | |
121 | ||
122 | ||
123 | /* | |
124 | * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, | |
125 | * uint32_t initial_sum); | |
126 | * | |
127 | * Entry: | |
128 | * r0 m | |
129 | * r1 len | |
130 | * r2 off | |
131 | * r3 initial_sum | |
132 | * | |
133 | * Function wide register usage | |
134 | * r8 accumulated sum | |
135 | * r9 remaining length to parse | |
136 | * ip pointer to next mbuf | |
137 | * | |
138 | * This function returns the partial 16-bit checksum accumulated in | |
139 | * a 32-bit variable (without 1's complement); caller is responsible | |
140 | * for folding the 32-bit sum into 16-bit and performing the 1's | |
141 | * complement if applicable | |
142 | */ | |
143 | .globl _os_cpu_in_cksum_mbuf | |
144 | .text | |
145 | .align 4 | |
146 | _os_cpu_in_cksum_mbuf: | |
147 | stmfd sp!, {r4-r11,lr} /* save r4-r11 and lr; every exit path restores them, loading lr into pc */ | |
148 | ||
149 | mov r8, r3 /* Accumulate sum in r8 */ | |
150 | mov r9, r1 /* save len in r9 */ | |
151 | mov ip, r0 /* set ip to the current mbuf */ | |
152 | ||
153 | cmp r9, #0 /* length is 0? */ | |
154 | bne .Lin_cksum_skip_loop /* if not, proceed further */ | |
155 | mov r0, r8 /* otherwise, return initial sum */ | |
156 | ||
157 | ldmfd sp!, {r4-r11, pc} /* restore registers and return */ | |
158 | ||
159 | .Lin_cksum_skip_loop: /* walk the chain until the mbuf containing byte 'off' is reached */ | |
160 | ldr r1, [ip, #(M_LEN)] /* r1 = length of this mbuf */ | |
161 | ldr r0, [ip, #(M_DATA)] /* r0 = data pointer of this mbuf */ | |
162 | ldr ip, [ip, #(M_NEXT)] /* ip = next mbuf in the chain (0 at the end) */ | |
163 | .Lin_cksum_skip_entry: | |
164 | subs r2, r2, r1 /* offset = offset - mbuf length */ | |
165 | blt .Lin_cksum_skip_done /* if offset has gone negative start with this mbuf */ | |
166 | cmp ip, #0x00 /* any mbufs left? */ | |
167 | bne .Lin_cksum_skip_loop | |
168 | b .Lin_cksum_whoops /* chain ended before the offset was reached */ | |
169 | ||
170 | .Lin_cksum_skip_done: | |
171 | add r0, r2, r0 /* data += offset (offset is < 0) */ | |
172 | add r0, r0, r1 /* data += length of mbuf */ | |
173 | /* data == start of data to cksum */ | |
174 | rsb r1, r2, #0x00 /* length = remainder of mbuf to read */ | |
175 | mov r10, #0x00 /* r10 = number of bytes summed so far */ | |
176 | b .Lin_cksum_entry | |
177 | ||
178 | .Lin_cksum_loop: /* per-mbuf loop: load the next mbuf's fields, then sum its data */ | |
179 | ldr r1, [ip, #(M_LEN)] | |
180 | ldr r0, [ip, #(M_DATA)] | |
181 | ldr ip, [ip, #(M_NEXT)] | |
182 | .Lin_cksum_entry: | |
183 | cmp r9, r1 | |
184 | movlt r1, r9 /* r1 = min(mbuf length, remaining length), signed compare */ | |
185 | sub r9, r9, r1 /* remaining length -= bytes taken from this mbuf */ | |
186 | eor r11, r10, r0 /* bit 0 of r11 is set when the byte count so far and the data address differ in parity */ | |
187 | add r10, r10, r1 /* bytes summed so far += bytes taken from this mbuf */ | |
188 | adds r2, r1, #0x00 /* r2 = r1; sets Z when there is nothing to sum in this mbuf */ | |
189 | ||
190 | beq .Lin_cksum_next | |
191 | ||
192 | /* | |
193 | * APPLE MODIFICATION | |
194 | * | |
195 | * Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function | |
196 | * inline. This results in slightly faster code, and also permits the whole | |
197 | * function to be included in kernel profiling data. | |
198 | */ | |
199 | ||
200 | /* | |
201 | * The main in*_cksum() workhorse... | |
202 | * | |
203 | * Entry parameters: | |
204 | * r0 Pointer to buffer | |
205 | * r1 Buffer length | |
206 | * lr Return address (not used by this inlined copy, which exits via .Lin_cksum_next) | |
207 | * | |
208 | * Returns: | |
209 | * r2 Accumulated 32-bit sum | |
210 | * | |
211 | * Clobbers: | |
212 | * r0-r7 | |
213 | */ | |
214 | mov r2, #0 /* partial sum for this mbuf starts at zero */ | |
215 | ||
216 | /* We first have to word-align the buffer. */ | |
217 | ands r7, r0, #0x03 /* r7 = address misalignment (0-3) */ | |
218 | beq .Lcksumdata_wordaligned | |
219 | rsb r7, r7, #0x04 /* r7 = bytes needed to reach word alignment (1-3) */ | |
220 | cmp r1, r7 /* Enough bytes left to make it? */ | |
221 | blt .Lcksumdata_endgame | |
222 | cmp r7, #0x02 /* flags drive the conditional loads and ORs below; EQ (r7 == 2) means the address is even */ | |
223 | ldrb r4, [r0], #0x01 /* Fetch 1st byte */ | |
224 | ldrbge r5, [r0], #0x01 /* Fetch 2nd byte */ | |
225 | movlt r5, #0x00 | |
226 | ldrbgt r6, [r0], #0x01 /* Fetch 3rd byte */ | |
227 | movle r6, #0x00 | |
228 | /* Combine the three bytes depending on endianness and alignment */ | |
229 | #if BYTE_ORDER != LITTLE_ENDIAN | |
230 | orreq r2, r5, r4, lsl #8 | |
231 | orreq r2, r2, r6, lsl #24 | |
232 | orrne r2, r4, r5, lsl #8 | |
233 | orrne r2, r2, r6, lsl #16 | |
234 | #else | |
235 | orreq r2, r4, r5, lsl #8 | |
236 | orreq r2, r2, r6, lsl #16 | |
237 | orrne r2, r5, r4, lsl #8 | |
238 | orrne r2, r2, r6, lsl #24 | |
239 | #endif | |
240 | subs r1, r1, r7 /* Update length */ | |
241 | beq .Lin_cksum_next /* All done? */ | |
242 | ||
243 | /* Buffer is now word aligned */ | |
244 | .Lcksumdata_wordaligned: | |
245 | ||
246 | #if __ARM_VFP__ >= 3 | |
247 | ||
248 | cmp r1, #512 // do this if r1 is at least 512 | |
249 | blt 9f // shorter runs use the integer loops after label 9 | |
250 | ||
251 | EnableVFP | |
252 | ||
253 | and r3, r1, #~0x3f // r3 = byte count rounded down to a multiple of 64 | |
254 | ||
255 | vpush {q0-q7} // preserve the NEON registers used below; restored by the vpop | |
256 | ||
257 | // move r2 to s16 (q4) for neon computation | |
258 | veor q4, q4, q4 // q4 = 0 | |
259 | vld1.32 {q0-q1}, [r0]! | |
260 | vmov s16, r2 | |
261 | vld1.32 {q2-q3}, [r0]! | |
262 | ||
263 | // pre-decrement size by 128 (two 64-byte blocks are loaded before the loop) | |
264 | subs r3, r3, #0x80 | |
265 | ||
266 | vpadal.u32 q4, q0 // q4 already holds the partial sum, so accumulate into it | |
267 | vld1.32 {q0}, [r0]! | |
268 | vpaddl.u32 q5, q1 // vpaddl overwrites its destination, so q5-q7 need no explicit zeroing | |
269 | vld1.32 {q1}, [r0]! | |
270 | vpaddl.u32 q6, q2 | |
271 | vld1.32 {q2}, [r0]! | |
272 | vpaddl.u32 q7, q3 | |
273 | vld1.32 {q3}, [r0]! | |
274 | ||
275 | 0: | |
276 | subs r3, r3, #0x40 // decrement size by 64 | |
277 | ||
278 | vpadal.u32 q4, q0 | |
279 | vld1.32 {q0}, [r0]! | |
280 | vpadal.u32 q5, q1 | |
281 | vld1.32 {q1}, [r0]! | |
282 | vpadal.u32 q6, q2 | |
283 | vld1.32 {q2}, [r0]! | |
284 | vpadal.u32 q7, q3 | |
285 | vld1.32 {q3}, [r0]! | |
286 | ||
287 | bgt 0b // tests the flags from the subs at the top of the loop | |
288 | ||
289 | vpadal.u32 q4, q0 // accumulate the last 64 bytes that were loaded | |
290 | vpadal.u32 q5, q1 | |
291 | vpadal.u32 q6, q2 | |
292 | vpadal.u32 q7, q3 | |
293 | ||
294 | vpadal.u32 q4, q5 // merge the four accumulators into q4 | |
295 | vpadal.u32 q6, q7 | |
296 | vpadal.u32 q4, q6 | |
297 | vadd.i64 d8, d9 // add the two 64-bit halves of q4 | |
298 | ||
299 | vpaddl.u32 d8, d8 // each vpaddl adds the high 32 bits of d8 into the low 32 bits (carry fold) | |
300 | vpaddl.u32 d8, d8 | |
301 | vpaddl.u32 d8, d8 | |
302 | ||
303 | vmov r2, s16 // r2 = folded 32-bit partial sum | |
304 | ||
305 | vpop {q0-q7} | |
306 | ||
307 | ands r1, r1, #0x3f // residual bytes | |
308 | beq .Lin_cksum_next | |
309 | ||
310 | 9: | |
311 | ||
312 | #endif /* __ARM_VFP__ >= 3 */ | |
313 | ||
314 | subs r1, r1, #0x40 /* r1 = bytes left - 64; negative when fewer than 64 remain */ | |
315 | blt .Lcksumdata_bigloop_end | |
316 | ||
317 | .Lcksumdata_bigloop: /* sums 64 bytes per iteration, chaining the carry through adcs */ | |
318 | ldmia r0!, {r3, r4, r5, r6} | |
319 | adds r2, r2, r3 | |
320 | adcs r2, r2, r4 | |
321 | adcs r2, r2, r5 | |
322 | ldmia r0!, {r3, r4, r5, r7} | |
323 | adcs r2, r2, r6 | |
324 | adcs r2, r2, r3 | |
325 | adcs r2, r2, r4 | |
326 | adcs r2, r2, r5 | |
327 | ldmia r0!, {r3, r4, r5, r6} | |
328 | adcs r2, r2, r7 | |
329 | adcs r2, r2, r3 | |
330 | adcs r2, r2, r4 | |
331 | adcs r2, r2, r5 | |
332 | ldmia r0!, {r3, r4, r5, r7} | |
333 | adcs r2, r2, r6 | |
334 | adcs r2, r2, r3 | |
335 | adcs r2, r2, r4 | |
336 | adcs r2, r2, r5 | |
337 | adcs r2, r2, r7 | |
338 | adc r2, r2, #0x00 /* fold the final carry back into the sum */ | |
339 | subs r1, r1, #0x40 | |
340 | bge .Lcksumdata_bigloop | |
341 | .Lcksumdata_bigloop_end: | |
342 | ||
343 | adds r1, r1, #0x40 /* remove the -64 bias: r1 = bytes left (0-63) */ | |
344 | beq .Lin_cksum_next | |
345 | ||
346 | cmp r1, #0x20 | |
347 | ||
348 | blt .Lcksumdata_less_than_32 | |
349 | ldmia r0!, {r3, r4, r5, r6} /* sum one 32-byte block */ | |
350 | adds r2, r2, r3 | |
351 | adcs r2, r2, r4 | |
352 | adcs r2, r2, r5 | |
353 | ldmia r0!, {r3, r4, r5, r7} | |
354 | adcs r2, r2, r6 | |
355 | adcs r2, r2, r3 | |
356 | adcs r2, r2, r4 | |
357 | adcs r2, r2, r5 | |
358 | adcs r2, r2, r7 | |
359 | adc r2, r2, #0x00 | |
360 | subs r1, r1, #0x20 | |
361 | beq .Lin_cksum_next | |
362 | ||
363 | .Lcksumdata_less_than_32: | |
364 | /* There are less than 32 bytes left */ | |
365 | and r3, r1, #0x18 /* r3 = bytes left in whole 8-byte units (0, 8, 16 or 24) */ | |
366 | rsb r4, r3, #0x18 /* r4 = 24 - r3 = data bytes of the unrolled code below to skip */ | |
367 | sub r1, r1, r3 /* r1 = bytes left after the 8-byte units (0-7) */ | |
368 | adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */ | |
369 | addne pc, pc, r4 /* computed jump: r4 is now 12 code bytes per skipped 8-byte group (assumes ARM state, pc reads as this instruction + 8) */ | |
370 | ||
371 | /* | |
372 | * Note: We use ldm here, even on Xscale, since the combined issue/result | |
373 | * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs. | |
374 | */ | |
375 | /* At least 24 bytes remaining... */ | |
376 | ldmia r0!, {r4, r5} | |
377 | nop /* padding: keeps the computed-jump targets on the ldmia of each later group; do not remove */ | |
378 | adcs r2, r2, r4 | |
379 | adcs r2, r2, r5 | |
380 | ||
381 | /* At least 16 bytes remaining... */ | |
382 | ldmia r0!, {r4, r5} | |
383 | adcs r2, r2, r4 | |
384 | adcs r2, r2, r5 | |
385 | ||
386 | /* At least 8 bytes remaining... */ | |
387 | ldmia r0!, {r4, r5} | |
388 | adcs r2, r2, r4 | |
389 | adcs r2, r2, r5 | |
390 | ||
391 | /* Less than 8 bytes remaining... */ | |
392 | adc r2, r2, #0x00 | |
393 | subs r1, r1, #0x04 /* is a whole word left? */ | |
394 | blt .Lcksumdata_lessthan4 | |
395 | ||
396 | ldr r4, [r0], #0x04 | |
397 | sub r1, r1, #0x04 /* re-bias so both paths reach .Lcksumdata_lessthan4 with r1 = bytes left - 4 */ | |
398 | adds r2, r2, r4 | |
399 | adc r2, r2, #0x00 | |
400 | ||
401 | /* Deal with < 4 bytes remaining */ | |
402 | .Lcksumdata_lessthan4: | |
403 | adds r1, r1, #0x04 /* remove the -4 bias: r1 = bytes left (0-3) */ | |
404 | beq .Lin_cksum_next | |
405 | ||
406 | /* Deal with 1 to 3 remaining bytes, possibly misaligned */ | |
407 | .Lcksumdata_endgame: | |
408 | ldrb r3, [r0] /* Fetch first byte */ | |
409 | cmp r1, #0x02 | |
410 | ldrbge r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */ | |
411 | movlt r4, #0x00 | |
412 | ldrbgt r5, [r0, #0x02] | |
413 | movle r5, #0x00 | |
414 | /* Combine the three bytes depending on endianness and alignment */ | |
415 | tst r0, #0x01 /* EQ when the first byte sits at an even address */ | |
416 | #if BYTE_ORDER != LITTLE_ENDIAN | |
417 | orreq r3, r4, r3, lsl #8 | |
418 | orreq r3, r3, r5, lsl #24 | |
419 | orrne r3, r3, r4, lsl #8 | |
420 | orrne r3, r3, r5, lsl #16 | |
421 | #else | |
422 | orreq r3, r3, r4, lsl #8 | |
423 | orreq r3, r3, r5, lsl #16 | |
424 | orrne r3, r4, r3, lsl #8 | |
425 | orrne r3, r3, r5, lsl #24 | |
426 | #endif | |
427 | adds r2, r2, r3 | |
428 | adc r2, r2, #0x00 | |
429 | ||
430 | .Lin_cksum_next: /* merge this mbuf's partial sum (r2) into the running sum (r8) */ | |
431 | tst r11, #0x01 | |
432 | movne r2, r2, ror #8 /* parity mismatch recorded in r11: rotate the partial sum by one byte first */ | |
433 | adds r8, r8, r2 | |
434 | adc r8, r8, #0x00 /* add the carry back in */ | |
435 | cmp ip, #00 /* more mbufs in the chain? */ | |
436 | bne .Lin_cksum_loop | |
437 | ||
438 | mov r1, #0xff | |
439 | orr r1, r1, #0xff00 /* r1 = 0xffff */ | |
440 | and r0, r8, r1 /* r0 = low 16 bits of the sum */ | |
441 | add r0, r0, r8, lsr #16 /* ... plus the high 16 bits */ | |
442 | add r0, r0, r0, lsr #16 /* add the carry out of bit 15 back in */ | |
443 | and r0, r0, r1 /* keep the low 16 bits as the return value */ | |
444 | /* | |
445 | * If we were to 1's complement it (XOR with 0xffff): | |
446 | * | |
447 | * eor r0, r0, r1 | |
448 | */ | |
449 | ||
450 | ldmfd sp!, {r4-r11, pc} | |
451 | ||
452 | .Lin_cksum_whoops: /* error path: the chain ran out before the offset was reached */ | |
453 | adr r0, .Lin_cksum_whoops_str /* r0 = address of the message string */ | |
454 | bl #CKSUM_ERR | |
455 | mov r0, #-1 /* return -1 after reporting the error */ | |
456 | ||
457 | ldmfd sp!, {r4-r11, pc} | |
458 | ||
459 | .Lin_cksum_whoops_str: | |
460 | .asciz "os_cpu_in_cksum_mbuf: out of data\n" | |
461 | .align 5 |