/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*****************************************************************************
 *  Cortex-A8 implementation                                                 *
 *****************************************************************************/

// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression.  Thus, we detect that particular case, and
// pass those copies through the ARM core registers.  All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009

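// A rough C sketch of that size dispatch (illustrative only: the helper names
// are hypothetical, and the exact thresholds and alignment handling live in
// the code below):
//
//     if (n <= 8)                        copy_bytes(dst, src, n);    // scalar loop
//     else if (n >= 1024 && n < 32768)   copy_ldm_stm(dst, src, n);  // ARM core registers
//     else                               copy_neon(dst, src, n);     // NEON vld1/vst1
//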
.text
.code 16
.syntax unified

// void bcopy(const void * source,
//            void * destination,
//            size_t length);
//
// void *memmove(void * destination,
//               const void * source,
//               size_t n);
//
// void *memcpy(void * restrict destination,
//              const void * restrict source,
//              size_t n);
//
// All three copy n successive bytes from source to destination.  memmove and
// memcpy return destination, whereas bcopy has no return value.  Copying takes
// place as if it were through a temporary buffer -- after return, destination
// contains exactly the bytes from source, even if the buffers overlap.

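// For example, in C (an illustrative sketch of the semantics stated above):
//
//     char buf[8] = "abcdef";
//     memmove(buf + 1, buf, 6);     // overlapping copy is safe:  "aabcdef"
//     memcpy(buf, "xy", 2);         // returns buf:               "xybcdef"
//     bcopy(buf + 2, buf, 2);       // note the swapped arguments: "bcbcdef"
//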
.thumb_func _bcopy
.globl _bcopy
.thumb_func _memmove
.globl _memmove
.thumb_func _memcpy
.globl _memcpy

.align 2
_bcopy:
    mov       r3, r0                    // swap the first and second arguments
    mov       r0, r1                    // and fall through into memmove
    mov       r1, r3                    //

.align 2
_memmove:
_memcpy:
    subs      r3, r0, r1                // offset = destination addr - source addr
    it        eq
    bxeq      lr                        // if source == destination, early out

// Our preference is for using a (faster) front-to-back copy.  However, if
// 0 < offset < length, it is necessary to copy back-to-front for correctness.
// We have already ruled out offset == 0, so we can use an unsigned compare
// with length -- if offset is higher, offset is either greater than length
// or negative.

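// Equivalently, in C (a sketch of the test performed by the cmp/bhs below;
// the function names are illustrative):
//
//     if ((size_t)(dst - src) < n)
//         copy_back_to_front();        // destination overlaps the source tail
//     else
//         copy_front_to_back();        // safe to copy forward
//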
    cmp       r3, r2
    bhs       L_copyFrontToBack

/*****************************************************************************
 *  back to front copy                                                       *
 *****************************************************************************/

    mov       ip, r0                    // copy destination pointer.
    add       r1, r2                    // move source pointer to end of source array
    add       ip, r2                    // move destination pointer to end of dest array

    subs      r2, $8                    // if length - 8 is negative (i.e. length
    blt       L_scalarReverseCopy       // is less than 8), jump to cleanup path.
    tst       ip, $7                    // if (destination + length) is doubleword
    beq       L_vectorReverseCopy       // aligned, jump to fast path.

0:  ldrb      r3, [r1, $-1]!            // load byte
    sub       r2, $1                    // decrement length
    strb      r3, [ip, $-1]!            // store byte
    tst       ip, $7                    // test alignment
    bne       0b

    cmp       r2, $0                    // if length - 8 is negative,
    blt       L_scalarReverseCopy       // jump to the cleanup code

/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorReverseCopy:
    ands      r3, r1, $3                // Extract the alignment of the source
    bic       r1, $3
    tbh       [pc, r3, lsl $1]          // Dispatch table on source alignment
0:
.short (L_reverseAligned0-0b)/2         // The NEON alignment hardware does not work
.short (L_reverseAligned1-0b)/2         // properly with sub 4-byte alignment and
.short (L_reverseAligned2-0b)/2         // buffers that are uncacheable, so we need
.short (L_reverseAligned3-0b)/2         // to have a software workaround.

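// The tbh above dispatches on (source & 3), roughly like this C switch
// (a sketch; the labels are the real ones defined below):
//
//     switch (source & 3) {
//     case 0: goto L_reverseAligned0;  // source also word aligned
//     case 1: goto L_reverseAligned1;  // software fixup, offset 1
//     case 2: goto L_reverseAligned2;  // software fixup, offset 2
//     case 3: goto L_reverseAligned3;  // software fixup, offset 3
//     }
//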
/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_reverseAligned0:
    subs      r2, $0x38                 // if length - 64 is negative, jump to
    blt       L_reverseVectorCleanup    // the cleanup path.
    tst       ip, $0x38                 // if (destination + length) is cacheline
    beq       L_reverseCachelineAligned // aligned, jump to the fast path.

0:  sub       r1, $8                    // copy eight bytes at a time until the
    vld1.32   {d0}, [r1]                // destination is 8 byte aligned.
    sub       ip, $8                    //
    sub       r2, $8                    //
    tst       ip, $0x38                 //
    vst1.64   {d0}, [ip, :64]           //
    bne       0b                        //

    cmp       r2, $0                    // if length - 64 is negative,
    blt       L_reverseVectorCleanup    // jump to the cleanup code

L_reverseCachelineAligned:
    sub       r3, r2, $0x3c0            // If 1024 < length < 32768, use core
    cmp       r3, $0x7c00               // register copies instead of NEON to
    blo       L_useSTMDB                // control energy usage.
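
// (Worked arithmetic for the check above: at this point r2 holds the remaining
// length minus 64, so r3 = remaining - 1024, and 0x7c00 = 32768 - 1024; the
// unsigned blo therefore takes the STMDB path exactly when the remaining
// length is in [1024, 32768).)
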
151
152 sub r1, $32 // decrement source
153 sub ip, $32 // decrement destination
154 mov r3, $-32 // load address increment
155 tst r1, $0x1f // if source shares 32 byte alignment
156 beq L_reverseSourceAligned// jump to loop with more alignment hints
157
158 vld1.32 {q2,q3}, [r1], r3 // This loop handles 4-byte aligned copies
159 vld1.32 {q0,q1}, [r1], r3 // as generally as possible.
160 subs r2, $64 //
161 vst1.64 {q2,q3}, [ip,:256], r3 // The Cortex-A8 NEON unit does not always
162 blt 1f // properly handle misalignment in vld1
163 .align 3 // with an element size of 8 or 16, so
164 0: vld1.32 {q2,q3}, [r1], r3 // this is the best we can do without
165 vst1.64 {q0,q1}, [ip,:256], r3 // handling alignment in software.
166 vld1.32 {q0,q1}, [r1], r3 //
167 subs r2, $64 //
168 vst1.64 {q2,q3}, [ip,:256], r3 //
169 bge 0b //
170 b 1f //
171
172 L_reverseSourceAligned:
173 vld1.64 {q2,q3}, [r1,:256], r3 // Identical to loop above except for
174 vld1.64 {q0,q1}, [r1,:256], r3 // additional alignment information; this
175 subs r2, $64 // gets an additional .5 bytes per cycle
176 vst1.64 {q2,q3}, [ip,:256], r3 // on Cortex-A8.
177 blt 1f //
178 .align 3 //
179 0: vld1.64 {q2,q3}, [r1,:256], r3 //
180 vst1.64 {q0,q1}, [ip,:256], r3 //
181 vld1.64 {q0,q1}, [r1,:256], r3 //
182 subs r2, $64 //
183 vst1.64 {q2,q3}, [ip,:256], r3 //
184 bge 0b //
185 1: vst1.64 {q0,q1}, [ip,:256], r3 // loop cleanup: final 32 byte store
186 add r1, $32 // point source at last element stored
187 add ip, $32 // point destination at last element stored
188
189 L_reverseVectorCleanup:
190 adds r2, $0x38 // If (length - 8) < 0, goto scalar cleanup
191 blt L_scalarReverseCopy //
192
193 0: sub r1, $8 // copy eight bytes at a time until
194 vld1.32 {d0}, [r1] // (length - 8) < 0.
195 sub ip, $8 //
196 subs r2, $8 //
197 vst1.64 {d0}, [ip, :64] //
198 bge 0b //
199
200 /*****************************************************************************
201 * sub-doubleword cleanup copies *
202 *****************************************************************************/
203
204 L_scalarReverseCopy:
205 adds r2, #0x8 // restore length
206 it eq // if this is zero
207 bxeq lr // early out
208
209 0: ldrb r3, [r1, #-1]! // load a byte from source
210 strb r3, [ip, #-1]! // store to destination
211 subs r2, #0x1 // subtract one from length
212 bne 0b // if non-zero, repeat
213 bx lr // return
214
/*****************************************************************************
 *  STMDB loop for 1k-32k buffers                                            *
 *****************************************************************************/

L_useSTMDB:
    push      {r4-r8,r10,r11}
.align 3
0:  ldmdb     r1!, {r3-r8,r10,r11}
    subs      r2, #0x40
    stmdb     ip!, {r3-r8,r10,r11}
    ldmdb     r1!, {r3-r8,r10,r11}
    pld       [r1, #-0x40]
    stmdb     ip!, {r3-r8,r10,r11}
    bge       0b
    pop       {r4-r8,r10,r11}
    b         L_reverseVectorCleanup

/*****************************************************************************
 *  Misaligned reverse vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.

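// To illustrate the extraction with offset == 1: if d2 holds bytes b0..b7
// (b0 at the lowest address) and d3 holds bytes b8..b15, then
//
//     vext.8  d0, d2, d3, #1
//
// produces d0 = b1..b8, i.e. the doubleword that starts one byte into d2.
//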
#define RCOPY_UNALIGNED(offset)        \
    subs    r2, $8                    ;\
    blt     2f                        ;\
    sub     r1, $8                    ;\
    sub     ip, $8                    ;\
    mov     r3, $-8                   ;\
    vld1.32 {d2,d3}, [r1], r3         ;\
    subs    r2, $8                    ;\
    blt     1f                        ;\
0:  vext.8  d0, d2, d3, $(offset)     ;\
    vmov    d3, d2                    ;\
    vld1.32 {d2}, [r1], r3            ;\
    subs    r2, $8                    ;\
    vst1.64 {d0}, [ip, :64], r3       ;\
    bge     0b                        ;\
1:  vext.8  d0, d2, d3, $(offset)     ;\
    add     r1, $8                    ;\
    vst1.64 {d0}, [ip, :64]           ;\
2:  add     r2, $8                    ;\
    add     r1, $(offset)             ;\
    b       L_scalarReverseCopy

L_reverseAligned1:
    RCOPY_UNALIGNED(1)
L_reverseAligned2:
    RCOPY_UNALIGNED(2)
L_reverseAligned3:
    RCOPY_UNALIGNED(3)

/*****************************************************************************
 *  front to back copy                                                       *
 *****************************************************************************/

L_copyFrontToBack:
    mov       ip, r0                    // copy destination pointer.
    subs      r2, $8                    // if length - 8 is negative (i.e. length
    blt       L_scalarCopy              // is less than 8), jump to cleanup path.
    tst       ip, $7                    // if the destination is doubleword
    beq       L_vectorCopy              // aligned, jump to fast path.

0:  ldrb      r3, [r1], $1              // load byte
    sub       r2, $1                    // decrement length
    strb      r3, [ip], $1              // store byte
    tst       ip, $7                    // test alignment
    bne       0b

    cmp       r2, $0                    // if length - 8 is negative,
    blt       L_scalarCopy              // jump to the cleanup code

/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorCopy:
    ands      r3, r1, $3                // Extract the alignment of the source
    bic       r1, $3
    tbh       [pc, r3, lsl $1]          // Dispatch table on source alignment
0:
.short (L_sourceAligned0-0b)/2          // The NEON alignment hardware does not work
.short (L_sourceAligned1-0b)/2          // properly with sub 4-byte alignment and
.short (L_sourceAligned2-0b)/2          // buffers that are uncacheable, so we need
.short (L_sourceAligned3-0b)/2          // to have a software workaround.

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_sourceAligned0:
    subs      r2, $0x38                 // If (length - 64) < 0
    blt       L_vectorCleanup           // jump to cleanup code
    tst       ip, $0x38                 // If destination is 64 byte aligned
    beq       L_cachelineAligned        // jump to main loop

0:  vld1.32   {d0}, [r1]!               // Copy one double word at a time until
    sub       r2, $8                    // the destination is 64-byte aligned.
    vst1.64   {d0}, [ip, :64]!          //
    tst       ip, $0x38                 //
    bne       0b                        //

    cmp       r2, $0                    // If (length - 64) < 0, goto cleanup
    blt       L_vectorCleanup           //

L_cachelineAligned:
    sub       r3, r2, $0x3c0            // If 1024 < length < 32768, use core
    cmp       r3, $0x7c00               // register copies instead of NEON to
    blo       L_useSTMIA                // control energy usage.
    tst       r1, $0x1f                 // If source has 32-byte alignment, use
    beq       L_sourceAligned32         // an optimized loop.

    vld1.32   {q2,q3}, [r1]!            // This is the most common path for small
    vld1.32   {q0,q1}, [r1]!            // copies, which are alarmingly frequent.
    subs      r2, #0x40                 // It requires 4-byte alignment on the
    vst1.64   {q2,q3}, [ip, :256]!      // source.  For ordinary malloc'd buffers,
    blt       1f                        // this path could handle only single-byte
.align 3                                // alignment at speed by using vld1.8
0:  vld1.32   {q2,q3}, [r1]!            // instead of vld1.32; however, the NEON
    vst1.64   {q0,q1}, [ip, :256]!      // alignment handler misbehaves for some
    vld1.32   {q0,q1}, [r1]!            // special copies if the element size is
    subs      r2, #0x40                 // 8 or 16, so we need to work around
    vst1.64   {q2,q3}, [ip, :256]!      // sub 4-byte alignment in software, in
    bge       0b                        // another code path.
    b         1f

L_sourceAligned32:
    vld1.64   {q2,q3}, [r1, :256]!      // When the source shares 32-byte alignment
    vld1.64   {q0,q1}, [r1, :256]!      // with the destination, we use this loop
    subs      r2, #0x40                 // instead, which specifies the maximum
    vst1.64   {q2,q3}, [ip, :256]!      // :256 alignment on all loads and stores.
    blt       1f                        //
.align 3                                // This gets an additional .5 bytes per
0:  vld1.64   {q2,q3}, [r1, :256]!      // cycle for in-cache copies, which is not
    vst1.64   {q0,q1}, [ip, :256]!      // insignificant for this (rather common)
    vld1.64   {q0,q1}, [r1, :256]!      // case.
    subs      r2, #0x40                 //
    vst1.64   {q2,q3}, [ip, :256]!      // This is identical to the above loop,
    bge       0b                        // except for the additional alignment.
1:  vst1.64   {q0,q1}, [ip, :256]!      //

L_vectorCleanup:
    adds      r2, $0x38                 // If (length - 8) < 0, goto scalar cleanup
    blt       L_scalarCopy              //

0:  vld1.32   {d0}, [r1]!               // Copy one doubleword at a time until
    subs      r2, $8                    // (length - 8) < 0.
    vst1.64   {d0}, [ip, :64]!          //
    bge       0b                        //

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarCopy:
    adds      r2, #0x8                  // restore length
    it        eq                        // if this is zero
    bxeq      lr                        // early out

0:  ldrb      r3, [r1], #1              // load a byte from source
    strb      r3, [ip], #1              // store to destination
    subs      r2, #1                    // subtract one from length
    bne       0b                        // if non-zero, repeat
    bx        lr                        // return

/*****************************************************************************
 *  STMIA loop for 1k-32k buffers                                            *
 *****************************************************************************/

L_useSTMIA:
    push      {r4-r8,r10,r11}
.align 3
0:  ldmia     r1!, {r3-r8,r10,r11}
    subs      r2, r2, #64
    stmia     ip!, {r3-r8,r10,r11}
    ldmia     r1!, {r3-r8,r10,r11}
    pld       [r1, #64]
    stmia     ip!, {r3-r8,r10,r11}
    bge       0b
    pop       {r4-r8,r10,r11}
    b         L_vectorCleanup

/*****************************************************************************
 *  Misaligned forward vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.

#define COPY_UNALIGNED(offset)         \
    subs    r2, $8                    ;\
    blt     2f                        ;\
    vld1.32 {d2,d3}, [r1]!            ;\
    subs    r2, $8                    ;\
    blt     1f                        ;\
0:  vext.8  d0, d2, d3, $(offset)     ;\
    vmov    d2, d3                    ;\
    vld1.32 {d3}, [r1]!               ;\
    subs    r2, $8                    ;\
    vst1.64 {d0}, [ip, :64]!          ;\
    bge     0b                        ;\
1:  vext.8  d0, d2, d3, $(offset)     ;\
    sub     r1, $8                    ;\
    vst1.64 {d0}, [ip, :64]!          ;\
2:  add     r1, $(offset)             ;\
    add     r2, $8                    ;\
    b       L_scalarCopy

L_sourceAligned1:
    COPY_UNALIGNED(1)
L_sourceAligned2:
    COPY_UNALIGNED(2)
L_sourceAligned3:
    COPY_UNALIGNED(3)