/*
 * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#if defined __thumb2__ && defined __ARM_NEON__

// Use our tuned NEON implementation when it is available. Otherwise fall back
// on more generic ARM code.

#include "NEON/bzero.s"

#else // defined __thumb2__ && defined __ARM_NEON__

#include <mach/machine/asm.h>
#include <architecture/arm/asm_help.h>

/*
 * A reasonably well-optimized bzero/memset. Should work equally well on arm11 and arm9 based
 * cores.
 *
 * The algorithm is to align the destination pointer on a 32 byte boundary and then
 * blast data 64 bytes at a time, in two stores of 32 bytes per loop.
 */
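/*
 * Illustrative C-level sketch of that flow (not assembled as part of this
 * file; store32() is a stand-in for the eight-register stmia used below):
 *
 *	void *memset(void *ptr, int c, size_t len) {
 *		unsigned char *p = ptr;
 *		while (len >= 32 && ((uintptr_t)p & 0x1f)) {	// reach 32-byte alignment
 *			*p++ = (unsigned char)c; len--;
 *		}
 *		while (len >= 64) {				// main loop: 2 x 32 bytes
 *			store32(p, c); store32(p + 32, c);
 *			p += 64; len -= 64;
 *		}
 *		while (len--)					// trailing bytes
 *			*p++ = (unsigned char)c;
 *		return ptr;
 *	}
 */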
	.text
	.align 2

	.globl _memset
/* void *memset(void *ptr, int c, size_t len); */
_memset:
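	/* on entry r0 = ptr, r1 = c, r2 = len; the low byte of c is replicated
	   into all four bytes of r2 so whole-word stores write the fill pattern,
	   and len is shuffled back into r1 to match the bzero register layout */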
	/* move len into r1, unpack c into r2 */
	mov		r3, r2
	and		r1, r1, #0xff
	orr		r1, r1, r1, lsl #8
	orr		r2, r1, r1, lsl #16
	mov		r1, r3
	b		Lbzeroengine

	.globl _bzero
/* void bzero(void *ptr, size_t len); */
_bzero:
	/* zero out r2 so we can be just like memset(0) */
	mov		r2, #0

Lbzeroengine:
	/* move the base pointer into r12 and leave r0 alone so that we return the original pointer */
	mov		r12, r0

	/* copy r2 into r3 for 64-bit stores */
	mov		r3, r2

	/* check for zero len */
	cmp		r1, #0
	bxeq	lr

	/* fall back to a bytewise store for less than 32 bytes */
	cmp		r1, #32
	blt		L_bytewise

	/* check for 32 byte unaligned ptr */
	tst		r12, #0x1f
	bne		L_unaligned

	/* make sure we have at least 64 bytes to zero */
	cmp		r1, #64
	blt		L_lessthan64aligned

/* >= 64 bytes of len, 32 byte aligned */
L_64ormorealigned:

	/* we need some registers, avoid r7 (frame pointer) and r9 (thread register) */
	stmfd	sp!, { r4-r6, r8, r10-r11 }
	mov		r4, r2
	mov		r5, r2
	mov		r6, r2
	mov		r8, r2
	mov		r10, r2
	mov		r11, r2

	/* pre-subtract 64 from the len to avoid an extra compare in the loop */
	sub		r1, r1, #64

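	/* each pass writes 64 bytes as two eight-register stmia stores; the subs
	   sits between them, presumably so the count update and the branch do not
	   stall back to back on in-order cores */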
L_64loop:
	stmia	r12!, { r2-r6, r8, r10-r11 }
	subs	r1, r1, #64
	stmia	r12!, { r2-r6, r8, r10-r11 }
	bge		L_64loop

	/* restore the saved regs */
	ldmfd	sp!, { r4-r6, r8, r10-r11 }

	/* check for completion (had previously subtracted an extra 64 from len) */
	adds	r1, r1, #64
	bxeq	lr

L_lessthan64aligned:
	/* do we have 16 or more bytes left */
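	/* each pass conditionally stores 16 bytes (two 8-byte stmia) and loops
	   while more remains; an exact multiple of 16 returns from here, and a
	   1-15 byte tail falls through to L_lessthan16aligned */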
	cmp		r1, #16
	stmgeia	r12!, { r2-r3 }
	stmgeia	r12!, { r2-r3 }
	subges	r1, r1, #16
	bgt		L_lessthan64aligned
	bxeq	lr

L_lessthan16aligned:
	/* store 0 to 15 bytes */
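	/* the lsl #28 drops len bits [3:0] into the N, Z, C and V flags
	   (bit 3 -> N, bit 2 -> Z, bit 1 -> C, bit 0 -> V), so each conditional
	   store below handles one power-of-two chunk of the tail */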
	mov		r1, r1, lsl #28		/* move the remaining len bits [3:0] to the flags area of cpsr */
	msr		cpsr_f, r1

	stmmiia	r12!, { r2-r3 }		/* n is set, store 8 bytes */
	streq	r2, [r12], #4		/* z is set, store 4 bytes */
	strcsh	r2, [r12], #2		/* c is set, store 2 bytes */
	strvsb	r2, [r12], #1		/* v is set, store 1 byte */
	bx		lr

L_bytewise:
	/* bytewise store, 2 bytes at a time, alignment not guaranteed */
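	/* the second store uses pl so it is skipped on the final pass when only
	   one byte was left; bhi keeps looping while bytes remain */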
	subs	r1, r1, #2
	strb	r2, [r12], #1
	strplb	r2, [r12], #1
	bhi		L_bytewise
	bx		lr

L_unaligned:
	/* unaligned on 32 byte boundary, store 1-15 bytes until we're 16 byte aligned */
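	/* negating the low four address bits leaves the byte count needed to
	   reach 16-byte alignment in bits [31:28], i.e. the same N/Z/C/V encoding
	   as above; the two strmi stores together cover the 8-byte case */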
	mov		r3, r12, lsl #28
	rsb		r3, r3, #0x00000000
	msr		cpsr_f, r3

	strvsb	r2, [r12], #1		/* v is set, unaligned in the 1s column */
	strcsh	r2, [r12], #2		/* c is set, unaligned in the 2s column */
	streq	r2, [r12], #4		/* z is set, unaligned in the 4s column */
	strmi	r2, [r12], #4		/* n is set, unaligned in the 8s column */
	strmi	r2, [r12], #4

	subs	r1, r1, r3, lsr #28
	bxeq	lr

	/* we had previously trashed r3, restore it */
	mov		r3, r2

	/* now make sure we're 32 byte aligned */
	tst		r12, #(1 << 4)
	stmneia	r12!, { r2-r3 }
	stmneia	r12!, { r2-r3 }
	subnes	r1, r1, #16

	/* we're now aligned, check for >= 64 bytes left */
	cmp		r1, #64
	bge		L_64ormorealigned
	b		L_lessthan64aligned
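/* X_LEAF (from asm_help.h) exports ___bzero as an additional entry point that
   resolves to _bzero */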
X_LEAF(___bzero, _bzero)

#endif // defined __thumb2__ && defined __ARM_NEON__