/*
 * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#if defined __thumb2__ && defined __ARM_NEON__

// Use our tuned NEON implementation when it is available. Otherwise fall back
// on more generic ARM code.

#include "NEON/bcopy.s"

#else // defined __thumb2__ && defined __ARM_NEON__

/*****************************************************************************
 * ARMv5 and ARMv6 implementation                                            *
 *****************************************************************************/

#include <arm/arch.h>

	.text
	.align 2

	.globl _memcpy
	.globl _bcopy
	.globl _memmove

_bcopy:		/* void bcopy(const void *src, void *dest, size_t len); */
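	/* bcopy takes (src, dest, len); swap the first two arguments so we can fall straight through into _memcpy/_memmove, which expect (dest, src, len) */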
	mov		r3, r0
	mov		r0, r1
	mov		r1, r3

_memcpy:	/* void *memcpy(void *dest, const void *src, size_t len); */
_memmove:	/* void *memmove(void *dest, const void *src, size_t len); */
	/* check for zero len or if the pointers are the same */
	cmp		r2, #0
	cmpne	r0, r1
	bxeq	lr

	/* save r0 (return value), r4 (scratch), and r5 (scratch) */
	stmfd	sp!, { r0, r4, r5, r7, lr }
	add		r7, sp, #12
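	/* set up the frame pointer: r7 now points at the saved r7/lr pair */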

	/* check for overlap. r3 <- distance between src & dest */
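	/* the hs/lo conditions below still use the flags set by the cmpne r0, r1 above */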
	subhs	r3, r0, r1
	sublo	r3, r1, r0
	cmp		r3, r2			/* if distance(src, dest) < len, we have overlap */
	blo		Loverlap

Lnormalforwardcopy:
	/* are src and dest dissimilarly word aligned? */
	mov		r12, r0, lsl #30
	cmp		r12, r1, lsl #30
	bne		Lnonwordaligned_forward

	/* if len < 64, do a quick forward copy */
	cmp		r2, #64
	blt		Lsmallforwardcopy

	/* check for 16 byte src/dest unalignment */
	tst		r0, #0xf
	bne		Lsimilarlyunaligned

	/* check for 32 byte dest unalignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32

Lmorethan64_aligned:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop:
	/* copy 64 bytes at a time */
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
	pld		[r1, #32]
#endif
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmia	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
#ifdef _ARM_ARCH_6
	pld		[r1, #32]
#endif
	stmia	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Llessthan64_aligned:
	/* copy 16 bytes at a time until we have < 16 bytes */
	cmp		r2, #16
	ldmgeia	r1!, { r3, r4, r5, r12 }
	stmgeia	r0!, { r3, r4, r5, r12 }
	subges	r2, r2, #16
	bgt		Llessthan64_aligned
	beq		Lexit

Llessthan16_aligned:
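	/* put the remaining length (0-15) into the flags: bit 3 -> N, bit 2 -> Z, bit 1 -> C, bit 0 -> V, then use conditional loads/stores to copy the final 8/4/2/1 byte pieces */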
	mov		r2, r2, lsl #28
	msr		cpsr_f, r2

	ldmmiia	r1!, { r2, r3 }
	ldreq	r4, [r1], #4
	ldrcsh	r5, [r1], #2
	ldrvsb	r12, [r1], #1

	stmmiia	r0!, { r2, r3 }
	streq	r4, [r0], #4
	strcsh	r5, [r0], #2
	strvsb	r12, [r0], #1
	b		Lexit

Lsimilarlyunaligned:
	/* both src and dest are unaligned in similar ways, align to dest on 32 byte boundary */
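	/* r12 ends up holding (16 - (dest & 0xf)) << 28, so the flags select the 1/2/4/8 byte copies needed to bring dest to 16 byte alignment */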
	mov		r12, r0, lsl #28
	rsb		r12, r12, #0
	msr		cpsr_f, r12

	ldrvsb	r3, [r1], #1
	ldrcsh	r4, [r1], #2
	ldreq	r5, [r1], #4

	strvsb	r3, [r0], #1
	strcsh	r4, [r0], #2
	streq	r5, [r0], #4

	ldmmiia	r1!, { r3, r4 }
	stmmiia	r0!, { r3, r4 }

	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32:
	/* bring up to dest 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmneia	r1!, { r3, r4, r5, r12 }
	stmneia	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned
	b		Llessthan64_aligned

Lbytewise2:
	/* copy 2 bytes at a time */
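	/* subtract 2 up front: the second byte is copied only while the count has not gone negative (pl), and the loop repeats while more than two bytes remained (hi) */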
	subs	r2, r2, #2

	ldrb	r3, [r1], #1
	ldrplb	r4, [r1], #1

	strb	r3, [r0], #1
	strplb	r4, [r0], #1

	bhi		Lbytewise2
	b		Lexit

Lbytewise:
	/* simple bytewise forward copy */
	ldrb	r3, [r1], #1
	subs	r2, r2, #1
	strb	r3, [r0], #1
	bne		Lbytewise
	b		Lexit

Lsmallforwardcopy:
	/* src and dest are word aligned similarly, less than 64 bytes to copy */
	cmp		r2, #4
	blt		Lbytewise2

	/* bytewise copy until word aligned */
	tst		r1, #3
Lwordalignloop:
	ldrneb	r3, [r1], #1
	strneb	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop

	cmp		r2, #16
	bge		Llessthan64_aligned
	blt		Llessthan16_aligned

Loverlap:
	/* src and dest overlap in some way, len > 0 */
	cmp		r0, r1			/* if dest > src */
	bhi		Loverlap_srclower

Loverlap_destlower:
	/* dest < src, see if we can still do a fast forward copy or fall back to the slow forward copy */
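	/* with dest below src, a forward copy is safe as long as each load/store stride is no larger than the src/dest distance in r3 */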
	cmp		r3, #64
	bge		Lnormalforwardcopy	/* overlap is greater than one stride of the copy, use normal copy */

	cmp		r3, #2
	bge		Lbytewise2
	b		Lbytewise

/* the following routines deal with having to copy in the reverse direction */
Loverlap_srclower:
	/* src < dest, with overlap */

	/* src += len; dest += len; */
	add		r0, r0, r2
	add		r1, r1, r2

	/* we have to copy in reverse no matter what, test if we can use a large block reverse copy */
	cmp		r2, #64			/* less than 64 bytes to copy? */
	cmpgt	r3, #64			/* less than 64 bytes of nonoverlap? */
	blt		Lbytewise_reverse

	/* test if src and dest are nonword aligned differently */
	mov		r3, r0, lsl #30
	cmp		r3, r1, lsl #30
	bne		Lbytewise_reverse

	/* test if src and dest are non word aligned or dest is non 16 byte aligned */
	tst		r0, #0xf
	bne		Lunaligned_reverse_similarly

	/* test for dest 32 byte alignment */
	tst		r0, #(1<<4)
	bne		Lunaligned_32_reverse_similarly

/* 64 byte reverse block copy, src and dest aligned */
Lmorethan64_aligned_reverse:
	/* save some more registers to use in the copy */
	stmfd	sp!, { r6, r8, r10, r11 }

	/* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
	sub		r2, r2, #64

L64loop_reverse:
	/* copy 64 bytes at a time */
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
	pld		[r1, #-32]
#endif
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	ldmdb	r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	subs	r2, r2, #64
#ifdef _ARM_ARCH_6
	pld		[r1, #-32]
#endif
	stmdb	r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
	bge		L64loop_reverse

	/* restore the scratch registers we just saved */
	ldmfd	sp!, { r6, r8, r10, r11 }

	/* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
	adds	r2, r2, #64
	beq		Lexit

Lbytewise_reverse:
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	subs	r2, r2, #1
	bne		Lbytewise_reverse
	b		Lexit

Lunaligned_reverse_similarly:
	/* both src and dest are unaligned in similar ways, align to dest on 32 byte boundary */
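	/* copying backwards, the low four bits of dest are exactly the number of bytes needed to reach 16 byte alignment, so they go straight into the flags (no negation this time) */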
	mov		r12, r0, lsl #28
	msr		cpsr_f, r12

	ldrvsb	r3, [r1, #-1]!
	ldrcsh	r4, [r1, #-2]!
	ldreq	r5, [r1, #-4]!

	strvsb	r3, [r0, #-1]!
	strcsh	r4, [r0, #-2]!
	streq	r5, [r0, #-4]!

	ldmmidb	r1!, { r3, r4 }
	stmmidb	r0!, { r3, r4 }

	subs	r2, r2, r12, lsr #28
	beq		Lexit

Lunaligned_32_reverse_similarly:
	/* bring up to dest 32 byte alignment */
	tst		r0, #(1 << 4)
	ldmnedb	r1!, { r3, r4, r5, r12 }
	stmnedb	r0!, { r3, r4, r5, r12 }
	subne	r2, r2, #16

	/* we should now be aligned, see what copy method we should use */
	cmp		r2, #64
	bge		Lmorethan64_aligned_reverse
	b		Lbytewise_reverse

/* the following routines deal with non word aligned copies */
Lnonwordaligned_forward:
	cmp		r2, #8
	blt		Lbytewise2		/* not worth the word-merging effort for very short copies */

	/* bytewise copy until src word aligned */
	tst		r1, #3
Lwordalignloop2:
	ldrneb	r3, [r1], #1
	strneb	r3, [r0], #1
	subne	r2, r2, #1
	tstne	r1, #3
	bne		Lwordalignloop2

	/* figure out how the src and dest are unaligned */
	and		r3, r0, #3
	cmp		r3, #2
	blt		Lalign1_forward
	beq		Lalign2_forward
	bgt		Lalign3_forward

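/* in each of the three cases below src is already word aligned; dest is backed up to a word boundary and r4 is primed with the byte(s) already in place, so every load and store in the loop stays word aligned while the data is shifted into position */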
Lalign1_forward:
	/* the dest pointer is 1 byte off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #1

	/* prime the copy */
	ldrb	r4, [r0]				/* load D[7:0] */

Lalign1_forward_loop:
	ldr		r3, [r1], #4			/* load S */
	orr		r4, r4, r3, lsl #8		/* D[31:8] = S[23:0] */
	str		r4, [r0], #4			/* save D */
	mov		r4, r3, lsr #24			/* D[7:0] = S[31:24] */
	subs	r12, r12, #1
	bne		Lalign1_forward_loop

	/* finish the copy off */
	strb	r4, [r0], #1			/* save D[7:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lalign2_forward:
	/* the dest pointer is 2 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #2

	/* prime the copy */
	ldrh	r4, [r0]				/* load D[15:0] */

Lalign2_forward_loop:
	ldr		r3, [r1], #4			/* load S */
	orr		r4, r4, r3, lsl #16		/* D[31:16] = S[15:0] */
	str		r4, [r0], #4			/* save D */
	mov		r4, r3, lsr #16			/* D[15:0] = S[31:16] */
	subs	r12, r12, #1
	bne		Lalign2_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2			/* save D[15:0] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lalign3_forward:
	/* the dest pointer is 3 bytes off from src */
	mov		r12, r2, lsr #2		/* number of words we should copy */
	sub		r0, r0, #3

	/* prime the copy */
	ldr		r4, [r0]
	and		r4, r4, #0x00ffffff		/* load D[23:0] */

Lalign3_forward_loop:
	ldr		r3, [r1], #4			/* load S */
	orr		r4, r4, r3, lsl #24		/* D[31:24] = S[7:0] */
	str		r4, [r0], #4			/* save D */
	mov		r4, r3, lsr #8			/* D[23:0] = S[31:8] */
	subs	r12, r12, #1
	bne		Lalign3_forward_loop

	/* finish the copy off */
	strh	r4, [r0], #2			/* save D[15:0] */
	mov		r4, r4, lsr #16
	strb	r4, [r0], #1			/* save D[23:16] */

	ands	r2, r2, #3
	beq		Lexit
	b		Lbytewise2

Lexit:
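	/* reload the original dest pointer into r0 (the memcpy/memmove return value), restore the saved registers, and return */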
	ldmfd	sp!, {r0, r4, r5, r7, pc}

#endif // defined __thumb2__ && defined __ARM_NEON__