/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
* Cortex-A8 implementation *
*****************************************************************************/

// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression. Thus, we detect that particular case, and
// pass those copies through the ARM core registers. All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009

.text
.code 16
.syntax unified

// void bcopy(const void * source,
// void * destination,
// size_t length);
//
// void *memmove(void * destination,
// const void * source,
// size_t n);
//
// void *memcpy(void * restrict destination,
// const void * restrict source,
// size_t n);
//
// all copy n successive bytes from source to destination. memmove and memcpy
// return destination, whereas bcopy has no return value. copying takes place
// as if it were through a temporary buffer -- after return destination contains
// exactly the bytes from source, even if the buffers overlap.
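//
// As an informal C sketch only (not part of this file; temp_copy is a
// hypothetical name), the "as if through a temporary buffer" requirement can
// be pictured as:
//
//     #include <stdlib.h>
//     #include <string.h>
//
//     void *temp_copy(void *dst, const void *src, size_t n) {
//         unsigned char *tmp = malloc(n);     // conceptual temporary buffer
//         if (tmp) {
//             memcpy(tmp, src, n);            // read all of the source first
//             memcpy(dst, tmp, n);            // then write the destination
//             free(tmp);
//         }
//         return dst;                         // memcpy/memmove return dst
//     }
//
// bcopy(src, dst, n) behaves like memmove(dst, src, n), minus the return
// value; the code below gets the same effect with no temporary buffer by
// choosing the copy direction based on how the buffers overlap.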

.thumb_func _bcopy$VARIANT$CortexA8
.thumb_func _memmove$VARIANT$CortexA8
.thumb_func _memcpy$VARIANT$CortexA8
.globl _bcopy$VARIANT$CortexA8
.globl _memmove$VARIANT$CortexA8
.globl _memcpy$VARIANT$CortexA8

#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11}
#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11}

/*****************************************************************************
* entry points *
*****************************************************************************/

.align 2
_bcopy$VARIANT$CortexA8:

// bcopy has the first and second arguments in the opposite order from the C
// library functions memmove and memcpy. If bcopy is called, we swap these
// two arguments and then fall into memmove.

mov r3, r0
mov r0, r1
mov r1, r3

.align 2
_memmove$VARIANT$CortexA8:
_memcpy$VARIANT$CortexA8:

// At entry to memmove/memcpy, registers contain the following values:
//
// r0 pointer to the first byte of the destination buffer
// r1 pointer to the first byte of the source buffer
// r2 number of bytes to copy
//
// Our preference is to use a (faster and easier to understand) front-to-back
// copy of the buffer. However, memmove requires that copies take place as
// though through a temporary buffer. This means that if the buffers overlap,
// it may be necessary to copy the buffer in reverse order.
//
// To properly detect such overlap, we begin by computing the offset between
// the source and destination pointers. If the offset happens to be zero,
// then there is no work to be done, so we can early out.

subs r3, r0, r1
it eq
bxeq lr

// r3 now contains the offset between the buffers, (destination - source). If
// 0 < offset < length, then the high-addressed bytes of the source alias the
// low-addressed bytes of the destination. Thus, if we were to perform the
// copy in ascending address order, we would overwrite the high-addressed
// source bytes before we had a chance to copy them, and the data would be lost.
//
// Thus, we can use the front-to-back copy only if the offset is negative or at
// least as large as the length. This is the case precisely if offset compares
// unsigned higher than or the same as length.

cmp r3, r2
bhs L_copyFrontToBack
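
// In C terms, the two instructions above perform roughly the following test
// (a sketch assuming unsigned pointer arithmetic; the two copy_* names are
// just stand-ins for L_copyFrontToBack and the fall-through path):
//
//     uintptr_t offset = (uintptr_t)dst - (uintptr_t)src;   // r3
//     if (offset >= n)            // also true when dst < src (wraps high)
//         copy_front_to_back();   // ascending copy is safe
//     else
//         copy_back_to_front();   // destination overlaps the source's tail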

/*****************************************************************************
* back to front copy *
*****************************************************************************/

// Here we have fallen through into the back-to-front copy. We preserve the
// original destination pointer in r0 because it is the return value for the
// routine, and update the other registers as follows:
//
// r1 one byte beyond the end of the source buffer
// r2 number of bytes to copy
// ip one byte beyond the end of the destination buffer

mov ip, r0
add r1, r2
add ip, r2

// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.

subs r2, $8
blt L_scalarReverseCopy
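
// The "length minus 8" bias established here is used throughout the routine:
// keeping r2 equal to (bytes remaining - 8) lets blt/bge against zero stand
// in for "fewer than / at least 8 bytes remain". A rough C analogue (the
// copy_8_bytes helper is purely illustrative):
//
//     ptrdiff_t n8 = (ptrdiff_t)n - 8;    // r2 after the subs above
//     while (n8 >= 0) {                   // at least 8 bytes still to copy
//         copy_8_bytes();
//         n8 -= 8;
//     }
//     n8 += 8;                            // restore the true remaining count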

// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
// to move the data.

tst ip, $7
beq L_vectorReverseCopy

// Otherwise, we copy a single byte at a time, in order of descending memory
// address, until the destination is 8 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 8
// r3 temporary to hold the byte that is being copied
// ip pointer one byte past the destination of the next byte to be copied
//
//                              byte that will be copied in this iteration
//                              |   byte that was copied in the previous iteration
//  Source buffer:              v   v
//  ------------------------+---+---+-------------------------
//   bytes still to copy ...|   |   | ... bytes already copied
//  ------------------------+---+---+-------------------------
//                                  ^
//                                  r1 holds the address of this byte

0: ldrb r3, [r1, $-1]!
sub r2, $1
strb r3, [ip, $-1]!
tst ip, $7
bne 0b
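
// A rough C equivalent of the pre-alignment loop above (dst_end and src_end
// are hypothetical names for the pointers kept in ip and r1; n8 is the
// biased count in r2):
//
//     while ((uintptr_t)dst_end & 7) {    // until destination is 8-byte aligned
//         *--dst_end = *--src_end;        // one byte, descending addresses
//         n8 -= 1;
//     }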

// At this point, the destination pointer is 8 byte aligned. Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
// path.

cmp r2, $0
blt L_scalarReverseCopy

/*****************************************************************************
* destination is 8 byte aligned *
*****************************************************************************/

L_vectorReverseCopy:

// At this point, registers contain the following values:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to copy) - 8
// ip pointer one byte past the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is non-negative.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed). However,
// on some SoC designs, not all of the DMA busses support such access. Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
//
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to
// jump to the correct branch.

ands r3, r1, $3
tbh [pc, r3, lsl $1]
0:
.short (L_reverseAligned0-0b)/2
.short (L_reverseAligned1-0b)/2
.short (L_reverseAligned2-0b)/2
.short (L_reverseAligned3-0b)/2
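
// Conceptually, the tbh dispatch above is a switch on the source address
// modulo 4 (sketch only; the labels are the ones defined later in this file):
//
//     switch ((uintptr_t)src & 3) {
//     case 0: goto L_reverseAligned0;   // source is at least 4-byte aligned
//     case 1: goto L_reverseAligned1;   // fixed up with vext #1
//     case 2: goto L_reverseAligned2;   // fixed up with vext #2
//     case 3: goto L_reverseAligned3;   // fixed up with vext #3
//     }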

/*****************************************************************************
* source is also at least word aligned *
*****************************************************************************/

L_reverseAligned0:

// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64. If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.

subs r2, $0x38
blt L_reverseVectorCleanup

// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
// that copies whole cachelines.

tst ip, $0x38
beq L_reverseCachelineAligned

// Otherwise, we copy 8 bytes at a time, in order of descending memory
// address, until the destination is 64 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 64
// ip pointer one byte past the destination of the next byte to be copied
// d0 temporary storage for copy
//
//     bytes that will be copied after this iteration
//     |           8 byte block that will be copied in this iteration
//     v           v
//  --------------+-------------------------------+---------------------
//                |  0   1   2   3   4   5   6   7|  bytes already copied
//  --------------+-------------------------------+---------------------
//                                                ^
//                                                r1 points here

0: sub r1, $8
vld1.32 {d0}, [r1]
sub ip, $8
sub r2, $8
tst ip, $0x38
vst1.64 {d0}, [ip,:64]
bne 0b

// At this point, the destination pointer is 64 byte aligned. Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
// copy loop.

cmp r2, $0
blt L_reverseVectorCleanup

/*****************************************************************************
* destination is cacheline aligned *
*****************************************************************************/

L_reverseCachelineAligned:

// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop. This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage. In all other cases,
// NEON gives superior energy conservation.

sub r3, r2, $0x3c0
cmp r3, $0x7c00
blo L_useSTMDB
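
// Since r2 holds (bytes remaining - 64) here, the two instructions above form
// a branch-free range check; a C sketch with 32-bit unsigned arithmetic,
// where n is the remaining byte count and use_core_register_copy is just a
// stand-in for L_useSTMDB:
//
//     uint32_t t = n - 1024;          // r3 = r2 - 0x3c0, with r2 == n - 64
//     if (t < 0x7c00)                 // i.e. 1024 <= n < 32768 (1k..32k)
//         use_core_register_copy();   // prefer the GPR loop for this range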

// Pre-decrement the source (r1) and destination (ip) pointers so that they
// point to the first byte of the trailing 32-byte window of each buffer.
// Additionally, load the address increment of -32 into r3.

sub r1, $32
sub ip, $32
mov r3, $-32

// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores. Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.

tst r1, $0x1f
beq L_reverseSourceAligned

// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of descending memory address. Within
// this loop, registers are used as follows:
//
// r0 original destination pointer (unmodified)
// r1 pointer to the next 32-byte block to load
// r2 (number of bytes remaining to copy) - 64
// r3 address increment of -32.
// ip pointer to which the next 32-byte block is to be stored
// q0-q3 temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration. This occurs at label (1), and is
// shared between the unaligned and aligned loops.

vld1.32 {q2,q3}, [r1], r3
vld1.32 {q0,q1}, [r1], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
blt 1f
.align 3
0: vld1.32 {q2,q3}, [r1], r3
vst1.64 {q0,q1}, [ip,:256], r3
vld1.32 {q0,q1}, [r1], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
bge 0b
b 1f

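// Structurally, both this loop and the aligned variant below are software
// pipelined: loads run one 32-byte block ahead of the stores, so exactly one
// store is left over for the shared cleanup at label 1. In C-like pseudocode
// (load_32/store_32 and the names a, b are illustrative only; n is the
// biased count kept in r2):
//
//     a = load_32(src); src -= 32;    // prime the pipeline with two blocks
//     b = load_32(src); src -= 32;
//     n -= 64;
//     store_32(dst, a); dst -= 32;
//     while (n >= 0) {
//         a = load_32(src); src -= 32;
//         store_32(dst, b); dst -= 32;
//         b = load_32(src); src -= 32;
//         n -= 64;
//         store_32(dst, a); dst -= 32;
//     }
//     store_32(dst, b);               // the shared final store at label 1
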
L_reverseSourceAligned:

// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned. The two loops share cleanup code for the final iteration.

vld1.64 {q2,q3}, [r1,:256], r3
vld1.64 {q0,q1}, [r1,:256], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
blt 1f
.align 3
0: vld1.64 {q2,q3}, [r1,:256], r3
vst1.64 {q0,q1}, [ip,:256], r3
vld1.64 {q0,q1}, [r1,:256], r3
subs r2, $64
vst1.64 {q2,q3}, [ip,:256], r3
bge 0b

// Final vector store for both of the above loops.

1: vst1.64 {q0,q1}, [ip,:256], r3

// Adjust the source and destination pointers so that they once again point to
// the last byte that we used (which is one byte higher than the address that
// we will use next for any required cleanup).

add r1, $32
add ip, $32

L_reverseVectorCleanup:

// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8. A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.

adds r2, r2, $0x38
blt L_scalarReverseCopy

// This loop copies 8 bytes at a time in order of descending memory address,
// until fewer than 8 bytes remain to be copied. Within this loop, registers
// are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 8
// ip pointer one byte past the destination of the next byte to be copied
// d0 temporary storage for copy

0: sub r1, $8
vld1.32 {d0}, [r1]
sub ip, $8
subs r2, $8
vst1.64 {d0}, [ip,:64]
bge 0b

/*****************************************************************************
* sub-doubleword cleanup copies *
*****************************************************************************/

L_scalarReverseCopy:

// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.

adds r2, $8
it eq
bxeq lr

// Copy one byte at a time in descending address order until we reach the front
// of the buffer. Within this loop, registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 bytes remaining to be copied
// r3 temporary to hold the byte that is being copied
// ip pointer one byte past the destination of the next byte to be copied

0: ldrb r3, [r1, $-1]!
subs r2, $1
strb r3, [ip, $-1]!
bne 0b
bx lr

/*****************************************************************************
* STMDB loop for 1k-32k buffers *
*****************************************************************************/

// This loop copies 64 bytes each iteration in order of descending memory
// address, using the GPRs instead of NEON.
//
// r0 original destination pointer
// r1 pointer to one byte past the next element to be copied
// r2 (bytes remaining to be copied) - 64
// r3-r6, r8-r11 (COPY_REGISTERS) temporary registers used for moving data
// ip pointer to one byte past the next location to store to

L_useSTMDB:
push SAVE_REGISTERS
.align 3
0: ldmdb r1!, COPY_REGISTERS
subs r2, r2, $64
stmdb ip!, COPY_REGISTERS
ldmdb r1!, COPY_REGISTERS
pld [r1, $-64]
stmdb ip!, COPY_REGISTERS
bge 0b
pop SAVE_REGISTERS
b L_reverseVectorCleanup
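
// Each ldmdb/stmdb pair above moves the eight COPY_REGISTERS (32 bytes), so
// one trip around the loop moves 64 bytes through the integer register file.
// Schematically (sketch only; n is the biased count in r2):
//
//     do {
//         load_32_descending();  store_32_descending();   // first ldmdb/stmdb pair
//         load_32_descending();  store_32_descending();   // second pair (plus a pld
//         n -= 64;                                        //  prefetch of the source)
//     } while (n >= 0);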

/*****************************************************************************
* Misaligned reverse vld1 loop *
*****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//
//  |  8 bytes from this iteration  |  8 bytes from last iteration  |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//              ^8 bytes to store this iteration^                   |
//                                           could be a page boundary
//
// We need to be a little bit careful, however. Because the loads only have 4
// byte alignment, the very first load could slop over into a page that is not
// mapped readable. In order to prevent this scenario, we copy the first eight
// bytes one at a time before beginning the main loop.
//
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
// r0 original destination pointer
// r1 pointer to the next block of 8 bytes to load
// r2 (bytes remaining to copy) - 8
// ip pointer to the next block of 8 bytes to store
// d0 next 8 bytes to store
// d2 8 bytes loaded in the previous iteration
// d3 8 bytes loaded two iterations ago

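// A rough C model of the vext merge used in the macro below: two adjacent
// aligned 8-byte loads form a 16-byte window, and the 8 bytes to store sit
// at the fixed misalignment offset within it (new8, prev8 and out8 are
// illustrative names; memcpy stands in for the vector moves):
//
//     uint8_t win[16];
//     memcpy(win,     new8,  8);       // d2: the newer load (lower addresses)
//     memcpy(win + 8, prev8, 8);       // d3: the older load (higher addresses)
//     memcpy(out8, win + offset, 8);   // vext.8 d0, d2, d3, #offset
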
#define RCOPY_UNALIGNED(offset) \
0: ldrb r3, [r1,$-1]! ;\
strb r3, [ip,$-1]! ;\
subs r2, $1 ;\
blt L_scalarReverseCopy ;\
tst ip, $7 ;\
bne 0b ;\
bic r1, $3 ;\
sub r1, $8 ;\
sub ip, $8 ;\
mov r3, $-8 ;\
vld1.32 {d2,d3}, [r1], r3 ;\
subs r2, $8 ;\
blt 1f ;\
0: vext.8 d0, d2, d3, $(offset);\
vmov d3, d2 ;\
vld1.32 {d2}, [r1], r3 ;\
subs r2, $8 ;\
vst1.64 {d0}, [ip, :64], r3 ;\
bge 0b ;\
1: vext.8 d0, d2, d3, $(offset);\
add r1, $8 ;\
vst1.64 {d0}, [ip, :64] ;\
2: add r1, $(offset);\
b L_scalarReverseCopy

L_reverseAligned1:
RCOPY_UNALIGNED(1)
L_reverseAligned2:
RCOPY_UNALIGNED(2)
L_reverseAligned3:
RCOPY_UNALIGNED(3)

/*****************************************************************************
* front to back copy *
*****************************************************************************/

L_copyFrontToBack:

// Here the pointers are laid out such that we can use our preferred
// front-to-back copy. We preserve the original destination pointer in r0 because
// it is the return value for the routine, and copy it to ip to use in this
// routine.

mov ip, r0

// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.

subs r2, $8
blt L_scalarCopy

// If the destination pointer is 8-byte aligned we can use 8-byte NEON copies
// to move the data.

tst ip, $7
beq L_vectorCopy

// Otherwise, we copy a single byte at a time, in order of ascending memory
// address, until the destination is 8 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next byte to copy
// r2 (bytes remaining to be copied) - 8
// r3 temporary to hold the byte that is being copied
// ip pointer to the next byte to store to

0: ldrb r3, [r1], $1
sub r2, $1
strb r3, [ip], $1
tst ip, $7
bne 0b

// At this point, the destination pointer is 8 byte aligned. Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
// path.

cmp r2, $0
blt L_scalarCopy

/*****************************************************************************
* destination is doubleword aligned *
*****************************************************************************/

L_vectorCopy:

// At this point, registers contain the following values:
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to copy) - 8
// ip pointer to the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is non-negative.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed). However,
// on some SoC designs, not all of the DMA busses support such access. Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
//
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to
// jump to the correct branch.

ands r3, r1, $3
bic r1, $3
tbh [pc, r3, lsl $1]
0:
.short (L_sourceAligned0-0b)/2
.short (L_sourceAligned1-0b)/2
.short (L_sourceAligned2-0b)/2
.short (L_sourceAligned3-0b)/2

/*****************************************************************************
* source is also at least word aligned *
*****************************************************************************/

L_sourceAligned0:

// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64. If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.

subs r2, $0x38
blt L_vectorCleanup

// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
// that copies whole cachelines.

tst ip, $0x38
beq L_cachelineAligned

// Otherwise, we copy 8 bytes at a time, in order of ascending memory
// address, until the destination is 64 byte aligned. Within this loop,
// registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to be copied) - 64
// ip pointer to the destination of the next byte to be copied
// d0 temporary storage for copy

0: vld1.32 {d0}, [r1]!
sub r2, $8
vst1.64 {d0}, [ip,:64]!
tst ip, $0x38
bne 0b

// At this point, the destination pointer is 64 byte aligned. Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
// copy loop.

cmp r2, $0
blt L_vectorCleanup

/*****************************************************************************
* destination is cacheline aligned *
*****************************************************************************/

// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop. This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage. In all other cases,
// NEON gives superior energy conservation.

L_cachelineAligned:
sub r3, r2, $0x3c0
cmp r3, $0x7c00
blo L_useSTMIA

// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores. Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.

tst r1, $0x1f
beq L_sourceAligned32

// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of ascending memory address. Within
// this loop, registers are used as follows:
//
// r0 original destination pointer (unmodified)
// r1 pointer to the next 32-byte block to load
// r2 (number of bytes remaining to copy) - 64
// ip pointer to which the next 32-byte block is to be stored
// q0-q3 temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration. This occurs at label (1), and is
// shared between the unaligned and aligned loops.

vld1.32 {q2,q3}, [r1]!
vld1.32 {q0,q1}, [r1]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
blt 1f
.align 3
0: vld1.32 {q2,q3}, [r1]!
vst1.64 {q0,q1}, [ip,:256]!
vld1.32 {q0,q1}, [r1]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
bge 0b
b 1f

L_sourceAligned32:

// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned. The two loops share cleanup code for the final iteration.

vld1.64 {q2,q3}, [r1,:256]!
vld1.64 {q0,q1}, [r1,:256]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
blt 1f
.align 3
0: vld1.64 {q2,q3}, [r1,:256]!
vst1.64 {q0,q1}, [ip,:256]!
vld1.64 {q0,q1}, [r1,:256]!
subs r2, $64
vst1.64 {q2,q3}, [ip,:256]!
bge 0b

// Final vector store for both of the above loops.

1: vst1.64 {q0,q1}, [ip,:256]!

L_vectorCleanup:

// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8. A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.

adds r2, $0x38
blt L_scalarCopy

// This loop copies 8 bytes at a time in order of ascending memory address,
// until fewer than 8 bytes remain to be copied. Within this loop, registers
// are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to be copied) - 8
// ip pointer to the destination of the next byte to be copied
// d0 temporary storage for copy

0: vld1.32 {d0}, [r1]!
subs r2, $8
vst1.64 {d0}, [ip,:64]!
bge 0b

/*****************************************************************************
* sub-doubleword cleanup copies *
*****************************************************************************/

L_scalarCopy:

// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.

adds r2, $8
it eq
bxeq lr

// Copy one byte at a time in ascending address order until we reach the end
// of the buffer. Within this loop, registers are used as follows:
//
// r0 original destination pointer
// r1 pointer to the next byte to be copied
// r2 bytes remaining to be copied
// r3 temporary to hold the byte that is being copied
// ip pointer to the destination of the next byte to be copied

0: ldrb r3, [r1], $1
strb r3, [ip], $1
subs r2, $1
bne 0b
bx lr

/*****************************************************************************
* STMIA loop for 1k-32k buffers *
*****************************************************************************/

// This loop copies 64 bytes each iteration in order of ascending memory
// address, using the GPRs instead of NEON.
//
// r0 original destination pointer
// r1 pointer to the next element to be copied
// r2 (bytes remaining to be copied) - 64
// r3-r6, r8-r11 (COPY_REGISTERS) temporary registers used for moving data
// ip pointer to the next location to store to

L_useSTMIA:
push SAVE_REGISTERS
.align 3
0: ldmia r1!, COPY_REGISTERS
subs r2, r2, $64
stmia ip!, COPY_REGISTERS
ldmia r1!, COPY_REGISTERS
pld [r1, $64]
stmia ip!, COPY_REGISTERS
bge 0b
pop SAVE_REGISTERS
b L_vectorCleanup

/*****************************************************************************
* Misaligned forward vld1 loop *
*****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//
//  |  8 bytes from last iteration  |  8 bytes from this iteration  |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//              ^8 bytes to store this iteration^                   |
//                                           could be a page boundary
//
// We need to be a little bit careful, however. Because the loads only have 4
// byte alignment, if we used this approach all the way to the end of the
// buffer, the very last 8 byte load might slop over onto a new page by 4
// bytes, and that new page might not be mapped into our process. Thus, we
// terminate this copy loop when fewer than 12 bytes remain to be copied,
// instead of the more natural-seeming termination condition of "8 bytes
// remaining" (the illustration above shows the worst case and demonstrates
// why 12 is a sufficiently safe condition).
//
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
// r0 original destination pointer
// r1 pointer to the next block of 8 bytes to load
// r2 (bytes remaining to copy) - 12
// ip pointer to the next block of 8 bytes to store
// d0 next 8 bytes to store
// d2 8 bytes loaded two iterations ago
// d3 8 bytes loaded in the previous iteration

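// One way to see why 12 (rather than 8) is a safe cutoff, given that page
// boundaries are 4-byte aligned: another aligned 8-byte load, at some 4-byte
// aligned address p, is only issued while at least 12 bytes remain unstored.
// Those 12 include the 8 bytes the load helps to store, which end at
// p+offset-1, so at least 4 source bytes exist beyond p+offset-1 and the
// buffer is mapped through p+offset+3 >= p+4 (offset >= 1). Both words of
// the load, [p,p+4) and [p+4,p+8), therefore begin at mapped bytes, and a
// 4-byte aligned word cannot straddle a page boundary, so the load never
// touches an unmapped page.
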
#define COPY_UNALIGNED(offset) \
subs r2, $4 ;\
blt 2f ;\
vld1.32 {d2,d3}, [r1]! ;\
subs r2, $8 ;\
blt 1f ;\
0: vext.8 d0, d2, d3, $(offset);\
vmov d2, d3 ;\
vld1.32 {d3}, [r1]! ;\
subs r2, $8 ;\
vst1.64 {d0}, [ip, :64]! ;\
bge 0b ;\
1: vext.8 d0, d2, d3, $(offset);\
sub r1, $8 ;\
vst1.64 {d0}, [ip, :64]! ;\
2: add r1, $(offset);\
add r2, $4 ;\
b L_scalarCopy

L_sourceAligned1:
COPY_UNALIGNED(1)
L_sourceAligned2:
COPY_UNALIGNED(2)
L_sourceAligned3:
COPY_UNALIGNED(3)

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD