/*
 * Copyright (c) 2006, 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#if defined __thumb2__ && defined __ARM_NEON__

// Use our tuned NEON implementation when it is available. Otherwise fall back
// on more generic ARM code.

#include "NEON/bcopy.s"

#else // defined __thumb2__ && defined __ARM_NEON__

/*****************************************************************************
 * ARMv5 and ARMv6 implementation                                            *
 *****************************************************************************/

#include <arm/arch.h>

        .text
        .align 2

        .globl _memcpy
        .globl _bcopy
        .globl _memmove

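/*
 * bcopy's argument order is (src, dest, len); the entry point below swaps
 * the first two arguments and falls through into _memcpy/_memmove, which
 * take (dest, src, len).
 */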
_bcopy:         /* void bcopy(const void *src, void *dest, size_t len); */
        mov     r3, r0
        mov     r0, r1
        mov     r1, r3

_memcpy:        /* void *memcpy(void *dest, const void *src, size_t len); */
_memmove:       /* void *memmove(void *dest, const void *src, size_t len); */
        /* check for zero len or if the pointers are the same */
        cmp     r2, #0
        cmpne   r0, r1
        bxeq    lr

        /* save r0 (return value), r4 (scratch), and r5 (scratch) */
        stmfd   sp!, { r0, r4, r5, r7, lr }
        add     r7, sp, #12
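        /* r7 now points at the saved r7/lr pair, keeping the frame-pointer
         * chain intact (the saved r0/r4/r5 sit below it on the stack) */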

        /* check for overlap. r3 <- distance between src & dest */
        subhs   r3, r0, r1
        sublo   r3, r1, r0
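        /* hs/lo reuse the flags set by the cmpne r0, r1 above, so r3 gets
         * |dest - src| without a second compare */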
        cmp     r3, r2          /* if distance(src, dest) < len, we have overlap */
        blo     Loverlap

Lnormalforwardcopy:
        /* are src and dest dissimilarly word aligned? */
        mov     r12, r0, lsl #30
        cmp     r12, r1, lsl #30
        bne     Lnonwordaligned_forward

        /* if len < 64, do a quick forward copy */
        cmp     r2, #64
        blt     Lsmallforwardcopy

        /* check whether dest is 16 byte aligned (src shares its word alignment) */
        tst     r0, #0xf
        bne     Lsimilarlyunaligned

        /* check for 32 byte dest unalignment */
        tst     r0, #(1<<4)
        bne     Lunaligned_32

Lmorethan64_aligned:
        /* save some more registers to use in the copy */
        stmfd   sp!, { r6, r8, r10, r11 }

        /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
        sub     r2, r2, #64

L64loop:
        /* copy 64 bytes at a time */
        ldmia   r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
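        /* on ARMv6, prefetch ahead of the source pointer to hide memory
         * latency in the copy loop */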
#ifdef _ARM_ARCH_6
        pld     [r1, #32]
#endif
        stmia   r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        ldmia   r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        subs    r2, r2, #64
#ifdef _ARM_ARCH_6
        pld     [r1, #32]
#endif
        stmia   r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        bge     L64loop

        /* restore the scratch registers we just saved */
        ldmfd   sp!, { r6, r8, r10, r11 }

        /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
        adds    r2, r2, #64
        beq     Lexit

Llessthan64_aligned:
        /* copy 16 bytes at a time until we have < 16 bytes */
        cmp     r2, #16
        ldmgeia r1!, { r3, r4, r5, r12 }
        stmgeia r0!, { r3, r4, r5, r12 }
        subges  r2, r2, #16
        bgt     Llessthan64_aligned
        beq     Lexit

Llessthan16_aligned:
        mov     r2, r2, lsl #28
        msr     cpsr_f, r2
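        /* the low four bits of the remaining count are now in N/Z/C/V:
         * N = bit 3 (8 bytes), Z = bit 2 (4 bytes), C = bit 1 (2 bytes),
         * V = bit 0 (1 byte), so the conditional transfers below move
         * exactly the 0-15 bytes that remain */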

        ldmmiia r1!, { r2, r3 }
        ldreq   r4, [r1], #4
        ldrcsh  r5, [r1], #2
        ldrvsb  r12, [r1], #1

        stmmiia r0!, { r2, r3 }
        streq   r4, [r0], #4
        strcsh  r5, [r0], #2
        strvsb  r12, [r0], #1
        b       Lexit

Lsimilarlyunaligned:
        /* src and dest are unaligned in the same way; copy the 1-15 bytes
         * needed to bring dest up to 16 byte alignment, then fall through
         * to finish 32 byte alignment */
        mov     r12, r0, lsl #28
        rsb     r12, r12, #0
        msr     cpsr_f, r12
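        /* r12 now holds (16 - (dest & 0xf)) & 0xf in its top nibble; the
         * same N/Z/C/V encoding as above selects the 1/2/4/8 byte moves
         * that advance dest to the next 16 byte boundary */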

        ldrvsb  r3, [r1], #1
        ldrcsh  r4, [r1], #2
        ldreq   r5, [r1], #4

        strvsb  r3, [r0], #1
        strcsh  r4, [r0], #2
        streq   r5, [r0], #4

        ldmmiia r1!, { r3, r4 }
        stmmiia r0!, { r3, r4 }

        subs    r2, r2, r12, lsr #28
        beq     Lexit

Lunaligned_32:
        /* bring dest up to 32 byte alignment */
        tst     r0, #(1 << 4)
        ldmneia r1!, { r3, r4, r5, r12 }
        stmneia r0!, { r3, r4, r5, r12 }
        subne   r2, r2, #16

        /* we should now be aligned, see what copy method we should use */
        cmp     r2, #64
        bge     Lmorethan64_aligned
        b       Llessthan64_aligned

Lbytewise2:
        /* copy 2 bytes at a time */
        subs    r2, r2, #2
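        /* pl: a second byte exists only while len - 2 >= 0, so an odd
         * tail copies a single byte and the closing bhi falls through */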

        ldrb    r3, [r1], #1
        ldrplb  r4, [r1], #1

        strb    r3, [r0], #1
        strplb  r4, [r0], #1

        bhi     Lbytewise2
        b       Lexit

Lbytewise:
        /* simple bytewise forward copy */
        ldrb    r3, [r1], #1
        subs    r2, r2, #1
        strb    r3, [r0], #1
        bne     Lbytewise
        b       Lexit

Lsmallforwardcopy:
        /* src and dest are word aligned similarly, less than 64 bytes to copy */
        cmp     r2, #4
        blt     Lbytewise2

        /* bytewise copy until word aligned */
        tst     r1, #3
Lwordalignloop:
        ldrneb  r3, [r1], #1
        strneb  r3, [r0], #1
        subne   r2, r2, #1
        tstne   r1, #3
        bne     Lwordalignloop

        cmp     r2, #16
        bge     Llessthan64_aligned
        blt     Llessthan16_aligned

Loverlap:
        /* src and dest overlap in some way, len > 0 */
        cmp     r0, r1          /* if dest > src */
        bhi     Loverlap_srclower

Loverlap_destlower:
        /* dest < src; see if we can still do a fast forward copy or must
         * fall back to a slow forward copy */
        cmp     r3, #64
        bge     Lnormalforwardcopy      /* overlap is greater than one stride of the copy, use normal copy */
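        /* a forward copy is safe here because each 64 byte block is read
         * before any of it can be overwritten when the gap between the
         * buffers is at least one full stride */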

        cmp     r3, #2
        bge     Lbytewise2
        b       Lbytewise

/* the following routines deal with having to copy in the reverse direction */
Loverlap_srclower:
        /* src < dest, with overlap */

        /* src += len; dest += len; */
        add     r0, r0, r2
        add     r1, r1, r2

        /* we have to copy in reverse no matter what; test whether we can
         * use a large block reverse copy */
        cmp     r2, #64         /* less than 64 bytes to copy? */
        cmpgt   r3, #64         /* less than 64 bytes of nonoverlap? */
        blt     Lbytewise_reverse

        /* test whether src and dest are word aligned differently */
        mov     r3, r0, lsl #30
        cmp     r3, r1, lsl #30
        bne     Lbytewise_reverse

        /* test whether dest is 16 byte aligned */
        tst     r0, #0xf
        bne     Lunaligned_reverse_similarly

        /* test for dest 32 byte alignment */
        tst     r0, #(1<<4)
        bne     Lunaligned_32_reverse_similarly

/* 64 byte reverse block copy, src and dest aligned */
Lmorethan64_aligned_reverse:
        /* save some more registers to use in the copy */
        stmfd   sp!, { r6, r8, r10, r11 }

        /* pre-subtract 64 from the len counter to avoid an extra compare in the loop */
        sub     r2, r2, #64

L64loop_reverse:
        /* copy 64 bytes at a time */
        ldmdb   r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
#ifdef _ARM_ARCH_6
        pld     [r1, #-32]
#endif
        stmdb   r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        ldmdb   r1!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        subs    r2, r2, #64
#ifdef _ARM_ARCH_6
        pld     [r1, #-32]
#endif
        stmdb   r0!, { r3, r4, r5, r6, r8, r10, r11, r12 }
        bge     L64loop_reverse

        /* restore the scratch registers we just saved */
        ldmfd   sp!, { r6, r8, r10, r11 }

        /* fix up the len counter (previously subtracted an extra 64 from it) and test for completion */
        adds    r2, r2, #64
        beq     Lexit

Lbytewise_reverse:
        ldrb    r3, [r1, #-1]!
        strb    r3, [r0, #-1]!
        subs    r2, r2, #1
        bne     Lbytewise_reverse
        b       Lexit

Lunaligned_reverse_similarly:
        /* src and dest are unaligned in the same way; copy the 1-15 bytes
         * needed to bring dest down to 16 byte alignment */
        mov     r12, r0, lsl #28
        msr     cpsr_f, r12
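        /* copying downwards, so dest's own low nibble (now in the top bits
         * of r12) is the byte count to the 16 byte boundary below; the
         * N/Z/C/V trick again selects the matching 1/2/4/8 byte moves */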

        ldrvsb  r3, [r1, #-1]!
        ldrcsh  r4, [r1, #-2]!
        ldreq   r5, [r1, #-4]!

        strvsb  r3, [r0, #-1]!
        strcsh  r4, [r0, #-2]!
        streq   r5, [r0, #-4]!

        ldmmidb r1!, { r3, r4 }
        stmmidb r0!, { r3, r4 }

        subs    r2, r2, r12, lsr #28
        beq     Lexit

Lunaligned_32_reverse_similarly:
        /* bring dest down to 32 byte alignment */
        tst     r0, #(1 << 4)
        ldmnedb r1!, { r3, r4, r5, r12 }
        stmnedb r0!, { r3, r4, r5, r12 }
        subne   r2, r2, #16

        /* we should now be aligned, see what copy method we should use */
        cmp     r2, #64
        bge     Lmorethan64_aligned_reverse
        b       Lbytewise_reverse

/* the following routines deal with non word aligned copies */
Lnonwordaligned_forward:
        cmp     r2, #8
        blt     Lbytewise2      /* the shifting setup isn't worth it for very short copies */

        /* bytewise copy until src word aligned */
        tst     r1, #3
Lwordalignloop2:
        ldrneb  r3, [r1], #1
        strneb  r3, [r0], #1
        subne   r2, r2, #1
        tstne   r1, #3
        bne     Lwordalignloop2

        /* figure out how the src and dest are unaligned */
        and     r3, r0, #3
        cmp     r3, #2
        blt     Lalign1_forward
        beq     Lalign2_forward
        bgt     Lalign3_forward

Lalign1_forward:
        /* the dest pointer is 1 byte off from src */
        mov     r12, r2, lsr #2         /* number of words we should copy */
        sub     r0, r0, #1

        /* prime the copy */
        ldrb    r4, [r0]                /* load D[7:0] */
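        /* r0 was rewound to a word boundary, so the byte already there is
         * reloaded and re-stored unchanged with each full word; this keeps
         * every store in the loop word aligned */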

Lalign1_forward_loop:
        ldr     r3, [r1], #4            /* load S */
        orr     r4, r4, r3, lsl #8      /* D[31:8] = S[23:0] */
        str     r4, [r0], #4            /* save D */
        mov     r4, r3, lsr #24         /* D[7:0] = S[31:24] */
        subs    r12, r12, #1
        bne     Lalign1_forward_loop

        /* finish the copy off */
        strb    r4, [r0], #1            /* save D[7:0] */

        ands    r2, r2, #3
        beq     Lexit
        b       Lbytewise2

Lalign2_forward:
        /* the dest pointer is 2 bytes off from src */
        mov     r12, r2, lsr #2         /* number of words we should copy */
        sub     r0, r0, #2

        /* prime the copy */
        ldrh    r4, [r0]                /* load D[15:0] */
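        /* same seam trick as Lalign1_forward, but the carried piece is a
         * halfword */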

Lalign2_forward_loop:
        ldr     r3, [r1], #4            /* load S */
        orr     r4, r4, r3, lsl #16     /* D[31:16] = S[15:0] */
        str     r4, [r0], #4            /* save D */
        mov     r4, r3, lsr #16         /* D[15:0] = S[31:16] */
        subs    r12, r12, #1
        bne     Lalign2_forward_loop

        /* finish the copy off */
        strh    r4, [r0], #2            /* save D[15:0] */

        ands    r2, r2, #3
        beq     Lexit
        b       Lbytewise2

Lalign3_forward:
        /* the dest pointer is 3 bytes off from src */
        mov     r12, r2, lsr #2         /* number of words we should copy */
        sub     r0, r0, #3

        /* prime the copy */
        ldr     r4, [r0]
        and     r4, r4, #0x00ffffff     /* load D[23:0] */
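        /* same trick with a three byte seam: the three bytes below dest
         * are kept in D[23:0] and re-stored unchanged with each word */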

Lalign3_forward_loop:
        ldr     r3, [r1], #4            /* load S */
        orr     r4, r4, r3, lsl #24     /* D[31:24] = S[7:0] */
        str     r4, [r0], #4            /* save D */
        mov     r4, r3, lsr #8          /* D[23:0] = S[31:8] */
        subs    r12, r12, #1
        bne     Lalign3_forward_loop

        /* finish the copy off */
        strh    r4, [r0], #2            /* save D[15:0] */
        mov     r4, r4, lsr #16
        strb    r4, [r0], #1            /* save D[23:16] */

        ands    r2, r2, #3
        beq     Lexit
        b       Lbytewise2

Lexit:
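        /* restore the saved dest into r0 (the memcpy/memmove return value)
         * and return by popping the saved lr straight into pc */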
        ldmfd   sp!, {r0, r4, r5, r7, pc}

#endif // defined __thumb2__ && defined __ARM_NEON__