/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
 *  Cortex-A8 implementation                                                 *
 *****************************************************************************/

// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression. Thus, we detect that particular case, and
// pass those copies through the ARM core registers. All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009
// void bcopy(const void * source,
//            void * destination,
//            size_t length);
//
// void *memmove(void * destination,
//               const void * source,
//               size_t n);
//
// void *memcpy(void * restrict destination,
//              const void * restrict source,
//              size_t n);
//
// All three copy n successive bytes from source to destination. memmove and
// memcpy return destination, whereas bcopy has no return value. Copying takes
// place as if it were through a temporary buffer -- after return, destination
// contains exactly the bytes from source, even if the buffers overlap.
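//
// A C sketch of those semantics (illustrative only, not part of this file):
// copying through an explicit temporary buffer is what the code below must
// emulate without actually allocating one.
//
//     void *memmove_ref(void *dst, const void *src, size_t n) {
//         unsigned char *tmp = malloc(n);    // bounce buffer (error handling omitted)
//         memcpy(tmp, src, n);               // read every source byte first...
//         memcpy(dst, tmp, n);               // ...then write the destination
//         free(tmp);
//         return dst;
//     }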
.thumb_func _bcopy$VARIANT$CortexA8
.thumb_func _memmove$VARIANT$CortexA8
.thumb_func _memcpy$VARIANT$CortexA8
.globl _bcopy$VARIANT$CortexA8
.globl _memmove$VARIANT$CortexA8
.globl _memcpy$VARIANT$CortexA8

#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11}
#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11}

/*****************************************************************************
 *****************************************************************************/
_bcopy$VARIANT$CortexA8:

// bcopy has its first and second arguments in the opposite order from the C
// library functions memmove and memcpy. If bcopy is called, we swap these
// two arguments and then fall into memmove.
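//
// In C terms, this entry point simply forwards the call (a sketch):
//
//     void bcopy(const void *src, void *dst, size_t len) {
//         memmove(dst, src, len);            // same copy, arguments exchanged
//     }
//
// The swap below uses r3 as the scratch register; any register that is dead
// at this point would do equally well.

    mov       r3,      r0
    mov       r0,      r1
    mov       r1,      r3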
_memmove$VARIANT$CortexA8:
_memcpy$VARIANT$CortexA8:

// At entry to memmove/memcpy, registers contain the following values:
//
//  r0      pointer to the first byte of the destination buffer
//  r1      pointer to the first byte of the source buffer
//  r2      number of bytes to copy
// Our preference is to use a (faster and easier to understand) front-to-back
// copy of the buffer. However, memmove requires that copies take place as
// though through a temporary buffer. This means that if the buffers overlap,
// it may be necessary to copy the buffer in reverse order.
//
// To properly detect such overlap, we begin by computing the offset between
// the source and destination pointers. If the offset happens to be zero,
// then there is no work to be done, so we can early out.

    subs      r3,      r0, r1
    it        eq
    bxeq      lr

// r3 now contains the offset between the buffers, (destination - source). If
// 0 < offset < length, then the high-addressed bytes of the source alias the
// low-addressed bytes of the destination. Thus, if we were to perform the
// copy in ascending address order, we would overwrite the high-addressed
// source bytes before we had a chance to copy them, and the data would be lost.
//
// Thus, we can use the front-to-back copy only if offset is negative or
// greater than the length. This is the case precisely if offset compares
// unsigned higher than length.

    cmp       r3,      r2
    bhs       L_copyFrontToBack
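
// In C, the test just performed is (a sketch; note the unsigned comparison):
//
//     uintptr_t offset = (uintptr_t)dst - (uintptr_t)src;
//     if (offset == 0) return dst;         // buffers identical, nothing to do
//     if (offset >= n) { /* front-to-back copy is safe (L_copyFrontToBack) */ }
//     else             { /* buffers overlap; fall into the back-to-front copy */ }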
/*****************************************************************************
 *  back to front copy                                                       *
 *****************************************************************************/

// Here we have fallen through into the back-to-front copy. We preserve the
// original destination pointer in r0 because it is the return value for the
// routine, and update the other registers as follows:
//
//  r1      one byte beyond the end of the source buffer
//  r2      number of bytes to copy
//  ip      one byte beyond the end of the destination buffer

    mov       ip,      r0
    add       r1,      r2
    add       ip,      r2
// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.

    subs      r2,      $8
    blt       L_scalarReverseCopy

// If the destination pointer is 8-byte aligned, we can use 8-byte NEON copies
// right away.

    tst       ip,      $7
    beq       L_vectorReverseCopy
// Otherwise, we copy a single byte at a time, in order of descending memory
// address, until the destination is 8 byte aligned. Within this loop,
// registers are used as follows:
//
//  r0      original destination pointer
//  r1      pointer to one byte past the next element to be copied
//  r2      (bytes remaining to be copied) - 8
//  r3      temporary to hold the byte that is being copied
//  ip      pointer one byte past the destination of the next byte to be copied
//
//                              byte that will be copied in this iteration
//                              |   byte that was copied in the previous iteration
//  Source buffer:              v   v
//  ----------------------------+---+---+-------------------------
//       bytes still to copy ...|   |   |... bytes already copied
//  ----------------------------+---+---+-------------------------
//                                  ^
//                                  r1 holds the address of this byte

0:  ldrb      r3,      [r1, $-1]!
    sub       r2,      $1
    strb      r3,      [ip, $-1]!
    tst       ip,      $7
    bne       0b
// At this point, the destination pointer is 8 byte aligned. Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
// path.

    cmp       r2,      $0
    blt       L_scalarReverseCopy
/*****************************************************************************
 *  destination is 8 byte aligned                                            *
 *****************************************************************************/

L_vectorReverseCopy:

// At this point, registers contain the following values:
//
//  r0      original destination pointer
//  r1      pointer to one byte past the next element to be copied
//  r2      (bytes remaining to copy) - 8
//  ip      pointer one byte past the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is positive.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed). However,
// on some SoC designs, not all of the DMA busses support such access. Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to
// jump to the correct branch.

.short (L_reverseAligned0-0b)/2
.short (L_reverseAligned1-0b)/2
.short (L_reverseAligned2-0b)/2
.short (L_reverseAligned3-0b)/2
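
// In C, the dispatch described above is just a switch on the low two bits of
// the source address (a sketch; srcaddr stands for the source address being
// tested, and each case corresponds to one table entry above):
//
//     switch (srcaddr & 3) {
//     case 0: /* L_reverseAligned0: word aligned source, plain NEON loop */ break;
//     case 1: /* L_reverseAligned1: VEXT fixup loop, misalignment of 1   */ break;
//     case 2: /* L_reverseAligned2: VEXT fixup loop, misalignment of 2   */ break;
//     case 3: /* L_reverseAligned3: VEXT fixup loop, misalignment of 3   */ break;
//     }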
/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_reverseAligned0:

// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64. If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.

    subs      r2,      $56
    blt       L_reverseVectorCleanup

// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
// that copies whole cachelines.

    tst       ip,      $0x3f
    beq       L_reverseCachelineAligned
// Otherwise, we copy 8 bytes at a time, in order of descending memory
// address, until the destination is 64 byte aligned. Within this loop,
// registers are used as follows:
//
//  r0      original destination pointer
//  r1      pointer to one byte past the next element to be copied
//  r2      (bytes remaining to be copied) - 64
//  ip      pointer one byte past the destination of the next byte to be copied
//  d0      temporary storage for copy
//
//            bytes that will be copied after this iteration
//            |          8 byte block that will be copied in this iteration
//            v               v
//  ----------+-------------------------------+---------------------
//        ... |  0  1  2  3  4  5  6  7       | bytes already copied
//  ----------+-------------------------------+---------------------

    vst1.64   {d0},    [ip,:64]

// At this point, the destination pointer is 64 byte aligned. Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
// copy loop.

    cmp       r2,      $0
    blt       L_reverseVectorCleanup
/*****************************************************************************
 *  destination is cacheline aligned                                         *
 *****************************************************************************/

L_reverseCachelineAligned:

// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop. This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage. In all other cases,
// NEON gives superior energy conservation.
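//
// In C terms, the choice made here is roughly the following (a sketch; the
// exact comparison sequence used by the assembly is not shown in this excerpt):
//
//     if (length >= 1024 && length < 32 * 1024) {
//         /* use the LDM/STM (GPR) loop below: lower energy for warm-source, */
//         /* cold-destination copies in this size range                      */
//     } else {
//         /* use the NEON cacheline loop: faster, and lower energy otherwise */
//     }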
// Pre-decrement the source (r1) and destination (ip) pointers so that they
// point to the first byte of the trailing 32-byte window of each buffer.
// Additionally, load the address increment of -32 into r3.

    sub       r1,      $32
    sub       ip,      $32
    mvn       r3,      $31        // r3 <- -32

// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores. Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.

    tst       r1,      $0x1f
    beq       L_reverseSourceAligned
// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of descending memory address. Within
// this loop, registers are used as follows:
//
//  r0      original destination pointer (unmodified)
//  r1      pointer to the next 32-byte block to load
//  r2      (number of bytes remaining to copy) - 64
//  r3      address increment of -32
//  ip      pointer to which the next 32-byte block is to be stored
//  q0-q3   temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration. This occurs at label (1), and is
// shared between the unaligned and aligned loops.
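//
// The same software pipelining, sketched in C (structural sketch only; s and d
// stand in for r1 and ip, n is the byte count this phase must handle, n >= 64,
// and the real loop folds the count update into a flag-setting subtract):
//
//     long rem = n - 64;
//     unsigned char a[32], b[32];
//     s -= 32; memcpy(a, s, 32);        // prologue: loads run one block
//     s -= 32; memcpy(b, s, 32);        //   ahead of the stores
//     rem -= 64;
//     d -= 32; memcpy(d, a, 32);
//     while (rem >= 0) {
//         s -= 32; memcpy(a, s, 32);    // load the next "A" block
//         d -= 32; memcpy(d, b, 32);    // store the previous "B" block
//         s -= 32; memcpy(b, s, 32);    // load the next "B" block
//         rem -= 64;
//         d -= 32; memcpy(d, a, 32);    // store the "A" block just loaded
//     }
//     d -= 32; memcpy(d, b, 32);        // single cleanup store (label 1 below)
//     // whatever is left (n mod 64 bytes) is handled by the cleanup loops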
    vld1.32   {q2,q3}, [r1], r3
    vld1.32   {q0,q1}, [r1], r3
    subs      r2,      $64
    vst1.64   {q2,q3}, [ip,:256], r3
    blt       1f

0:  vld1.32   {q2,q3}, [r1], r3
    vst1.64   {q0,q1}, [ip,:256], r3
    vld1.32   {q0,q1}, [r1], r3
    subs      r2,      $64
    vst1.64   {q2,q3}, [ip,:256], r3
    bge       0b
    b         1f
L_reverseSourceAligned:

// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned. The two loops share cleanup code for the final iteration.

    vld1.64   {q2,q3}, [r1,:256], r3
    vld1.64   {q0,q1}, [r1,:256], r3
    subs      r2,      $64
    vst1.64   {q2,q3}, [ip,:256], r3
    blt       1f

0:  vld1.64   {q2,q3}, [r1,:256], r3
    vst1.64   {q0,q1}, [ip,:256], r3
    vld1.64   {q0,q1}, [r1,:256], r3
    subs      r2,      $64
    vst1.64   {q2,q3}, [ip,:256], r3
    bge       0b
// Final vector store for both of the above loops.

1:  vst1.64   {q0,q1}, [ip,:256], r3

// Adjust the source and destination pointers so that they once again point to
// the last byte that we used (which is one byte higher than the address that
// we will use next for any required cleanup).

    add       r1,      $32
    add       ip,      $32
L_reverseVectorCleanup:

// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8. A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.

    adds      r2,      $56
    blt       L_scalarReverseCopy
// This loop copies 8 bytes at a time in order of descending memory address,
// until fewer than 8 bytes remain to be copied. Within this loop, registers
// are used as follows:
//
//  r0      original destination pointer
//  r1      pointer to one byte past the next element to be copied
//  r2      (bytes remaining to be copied) - 8
//  ip      pointer one byte past the destination of the next byte to be copied
//  d0      temporary storage for copy

0:  sub       r1,      $8
    vld1.32   {d0},    [r1]
    sub       ip,      $8
    subs      r2,      $8
    vst1.64   {d0},    [ip,:64]
    bge       0b
/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarReverseCopy:

// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.

    adds      r2,      $8
    it        eq
    bxeq      lr
// Copy one byte at a time in descending address order until we reach the front
// of the buffer. Within this loop, registers are used as follows:
//
//  r0      original destination pointer
//  r1      pointer to one byte past the next element to be copied
//  r2      bytes remaining to be copied
//  r3      temporary to hold the byte that is being copied
//  ip      pointer one byte past the destination of the next byte to be copied

0:  ldrb      r3,      [r1, $-1]!
    subs      r2,      $1
    strb      r3,      [ip, $-1]!
    bne       0b
    bx        lr
/*****************************************************************************
 *  STMDB loop for 1k-32k buffers                                            *
 *****************************************************************************/

// This loop copies 64 bytes each iteration in order of descending memory
// address, using the GPRs instead of NEON.
//
//  r0      original destination pointer
//  r1      pointer to one byte past the next element to be copied
//  r2      (bytes remaining to be copied) - 64
//  r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
//  ip      pointer to one byte past the next location to store to
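//
// In C terms, each LDMDB/STMDB pair below moves one 32-byte block through the
// eight COPY_REGISTERS, so a full iteration moves 64 bytes (a sketch; s and d
// stand in for r1 and ip, and the explicit temporary mirrors the fact that a
// whole block is read into registers before any of it is written back):
//
//     unsigned char blk[32];
//     do {
//         s -= 32; memcpy(blk, s, 32);   // LDMDB: 32 bytes into registers
//         d -= 32; memcpy(d, blk, 32);   // STMDB: registers out to destination
//         s -= 32; memcpy(blk, s, 32);   // second pair
//         d -= 32; memcpy(d, blk, 32);
//         remaining -= 64;
//     } while (remaining >= 64);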
    push      SAVE_REGISTERS

0:  ldmdb     r1!,     COPY_REGISTERS
    subs      r2,      $64
    stmdb     ip!,     COPY_REGISTERS
    ldmdb     r1!,     COPY_REGISTERS
    stmdb     ip!,     COPY_REGISTERS
    bge       0b

    pop       SAVE_REGISTERS
    b         L_reverseVectorCleanup
/*****************************************************************************
 *  Misaligned reverse vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//
//  | 8 bytes from this iteration   | 8 bytes from last iteration   |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//      ^  8 bytes to store this iteration  ^       |
//                                                  could be a page boundary
//
// We need to be a little bit careful, however. Because the loads only have 4
// byte alignment, the very first load could slop over into a page that is not
// mapped readable. In order to prevent this scenario, we copy the first eight
// bytes one byte at a time before beginning the main loop.
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
//  r0      original destination pointer
//  r1      pointer to the next block of 8 bytes to load
//  r2      (bytes remaining to copy) - 8
//  ip      pointer to the next block of 8 bytes to store
//  d0      next 8 bytes to store
//  d2      8 bytes loaded in the previous iteration
//  d3      8 bytes loaded two iterations ago
#define RCOPY_UNALIGNED(offset)                  \
0:  ldrb      r3,      [r1,$-1]!                ;\
    strb      r3,      [ip,$-1]!                ;\
    blt       L_scalarReverseCopy               ;\
    vld1.32   {d2,d3}, [r1], r3                 ;\
0:  vext.8    d0, d2, d3, $(offset)             ;\
    vld1.32   {d2},    [r1], r3                 ;\
    vst1.64   {d0},    [ip, :64], r3            ;\
1:  vext.8    d0, d2, d3, $(offset)             ;\
    vst1.64   {d0},    [ip, :64]                ;\
2:  add       r1,      $(offset)                ;\
    b         L_scalarReverseCopy
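
// The VEXT step above, in C (a sketch): newer8[] holds the 8 bytes loaded in
// the previous iteration (lower addresses, since the copy moves backwards),
// older8[] the 8 bytes loaded before that, and offset is the macro parameter:
//
//     for (int i = 0; i < 8; ++i)
//         out[i] = (i + offset < 8) ? newer8[i + offset]
//                                   : older8[i + offset - 8];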
/*****************************************************************************
 *  front to back copy                                                       *
 *****************************************************************************/

L_copyFrontToBack:

// Here the pointers are laid out such that we can use our preferred
// front-to-back copy. We preserve the original destination pointer in r0
// because it is the return value for the routine, and copy it to ip to use
// in this copy.

    mov       ip,      r0
// Subtract 8 from the buffer length; if this is negative, then we will use
// only single-byte copies, and we jump directly to a scalar copy loop.

// If the destination pointer is 8-byte aligned, we can use 8-byte NEON copies
// right away.
// Otherwise, we copy a single byte at a time, in order of ascending memory
// address, until the destination is 8 byte aligned. Within this loop,
// registers are used as follows:
//
//  r0      original destination pointer
//  r1      pointer to the next byte to copy
//  r2      (bytes remaining to be copied) - 8
//  r3      temporary to hold the byte that is being copied
//  ip      pointer to the next byte to store to
// At this point, the destination pointer is 8 byte aligned. Check again that
// there are at least 8 bytes remaining to copy by comparing the remaining
// length minus 8 to zero. If fewer than 8 bytes remain, jump to the cleanup
// path.
/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

// At this point, registers contain the following values:
//
//  r0      original destination pointer
//  r1      pointer to the next element to be copied
//  r2      (bytes remaining to copy) - 8
//  ip      pointer to the destination of the next byte to be copied
//
// Furthermore, it is known that ip is 8 byte aligned, and that r2 is positive.
// NEON has really excellent alignment handling in hardware, so we would like
// to use that to handle cases where the source is not similarly aligned to the
// destination (it supports even single-byte misalignment at speed). However,
// on some SoC designs, not all of the DMA busses support such access. Thus,
// we must unfortunately use a software workaround in those cases.
//
// Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
// we only need to handle the different possible source alignments modulo 4.
// Here we have a dispatch table to jump to the correct copy implementation
// for the given source alignment.
//
// The tbh instruction loads the address offset of the correct implementation
// from the data table that immediately follows it and adds it to the pc to
// jump to the correct branch.
.short (L_sourceAligned0-0b)/2
.short (L_sourceAligned1-0b)/2
.short (L_sourceAligned2-0b)/2
.short (L_sourceAligned3-0b)/2
/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_sourceAligned0:

// Subtract 56 from r2, so that it contains the number of bytes remaining to
// copy minus 64. If this result is negative, then we jump into a loop that
// copies 8 bytes at a time.

// Check if the destination pointer is 64-byte aligned. If so, jump to a loop
// that copies whole cachelines.

    tst       ip,      $0x3f
    beq       L_cachelineAligned
// Otherwise, we copy 8 bytes at a time, in order of ascending memory
// address, until the destination is 64 byte aligned. Within this loop,
// registers are used as follows:
//
//  r0      original destination pointer
//  r1      pointer to the next element to be copied
//  r2      (bytes remaining to be copied) - 64
//  ip      pointer to the destination of the next byte to be copied
//  d0      temporary storage for copy

0:  vld1.32   {d0},    [r1]!
    sub       r2,      $8
    vst1.64   {d0},    [ip,:64]!
    tst       ip,      $0x3f
    bne       0b
// At this point, the destination pointer is 64 byte aligned. Check again that
// there are at least 64 bytes remaining to copy by comparing the remaining
// length minus 64 to zero. If fewer than 64 bytes remain, skip over the main
// copy loop.
/*****************************************************************************
 *  destination is cacheline aligned                                         *
 *****************************************************************************/

L_cachelineAligned:

// In the special case that we are copying a buffer of between 1k and 32k bytes
// we do not use a NEON copy for the main loop. This is because if we happen
// to be doing a copy from a source in cache to a destination that is not in
// cache, this will result in an increase in energy usage. In all other cases,
// NEON gives superior energy conservation.
// The destination pointer is known to be 64-byte aligned, so we can use the
// maximal alignment hint (:256) for our vector stores. Detect if the source
// is also at least 32-byte aligned and jump to a loop that uses maximal
// alignment hints for the loads as well if possible.

    tst       r1,      $0x1f
    beq       L_sourceAligned32
// This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
// 64-byte aligned destination, in order of ascending memory address. Within
// this loop, registers are used as follows:
//
//  r0      original destination pointer (unmodified)
//  r1      pointer to the next 32-byte block to load
//  r2      (number of bytes remaining to copy) - 64
//  ip      pointer to which the next 32-byte block is to be stored
//  q0-q3   temporary registers used for copies
//
// Note that the loop is arranged in such a way that a single cleanup store is
// necessary after the final loop iteration. This occurs at label (1), and is
// shared between the unaligned and aligned loops.
    vld1.32   {q2,q3}, [r1]!
    vld1.32   {q0,q1}, [r1]!
    subs      r2,      $64
    vst1.64   {q2,q3}, [ip,:256]!
    blt       1f

0:  vld1.32   {q2,q3}, [r1]!
    vst1.64   {q0,q1}, [ip,:256]!
    vld1.32   {q0,q1}, [r1]!
    subs      r2,      $64
    vst1.64   {q2,q3}, [ip,:256]!
    bge       0b
    b         1f
L_sourceAligned32:

// This loop is identical to the immediately preceding loop, except that it
// uses the additional alignment hint that the source pointer (r1) is 32-byte
// aligned. The two loops share cleanup code for the final iteration.
    vld1.64   {q2,q3}, [r1,:256]!
    vld1.64   {q0,q1}, [r1,:256]!
    subs      r2,      $64
    vst1.64   {q2,q3}, [ip,:256]!
    blt       1f

0:  vld1.64   {q2,q3}, [r1,:256]!
    vst1.64   {q0,q1}, [ip,:256]!
    vld1.64   {q0,q1}, [r1,:256]!
    subs      r2,      $64
    vst1.64   {q2,q3}, [ip,:256]!
    bge       0b

// Final vector store for both of the above loops.

1:  vst1.64   {q0,q1}, [ip,:256]!
// Add 56 to r2, so that it contains the number of bytes remaining to copy minus
// 8. A comparison of this value with zero tells us if any more whole 8-byte
// blocks need to be copied.
// This loop copies 8 bytes at a time in order of ascending memory address,
// until fewer than 8 bytes remain to be copied. Within this loop, registers
// are used as follows:
//
//  r0      original destination pointer
//  r1      pointer to the next element to be copied
//  r2      (bytes remaining to be copied) - 8
//  ip      pointer to the destination of the next byte to be copied
//  d0      temporary storage for copy

0:  vld1.32   {d0},    [r1]!
    subs      r2,      $8
    vst1.64   {d0},    [ip,:64]!
    bge       0b
/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

// Add 8 to r2, so that it contains the number of bytes remaining to copy, and
// return to the calling routine if zero bytes remain.
// Copy one byte at a time in ascending address order until we reach the end
// of the buffer. Within this loop, registers are used as follows:
//
//  r0      original destination pointer
//  r1      pointer to the next byte to copy
//  r2      bytes remaining to be copied
//  r3      temporary to hold the byte that is being copied
//  ip      pointer to the next byte to store to
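//
// In C, this cleanup is simply the following (a sketch; count, dst and src
// stand in for r2, ip and r1):
//
//     while (count--) { *dst++ = *src++; }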
/*****************************************************************************
 *  STMIA loop for 1k-32k buffers                                            *
 *****************************************************************************/

// This loop copies 64 bytes each iteration in order of ascending memory
// address, using the GPRs instead of NEON.
//
//  r0      original destination pointer
//  r1      pointer to the next element to be copied
//  r2      (bytes remaining to be copied) - 64
//  r3-6,r8-11 (COPY_REGISTERS) temporary registers used for moving data
//  ip      pointer to the next location to store to
    push      SAVE_REGISTERS

0:  ldmia     r1!,     COPY_REGISTERS
    subs      r2,      $64
    stmia     ip!,     COPY_REGISTERS
    ldmia     r1!,     COPY_REGISTERS
    stmia     ip!,     COPY_REGISTERS
    bge       0b

    pop       SAVE_REGISTERS
/*****************************************************************************
 *  Misaligned forward vld1 loop                                             *
 *****************************************************************************/
// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.
//
// The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
// which we combine with the 8 bytes loaded in the previous iteration to get a
// 16 byte field; the next 8 bytes to be stored to the destination buffer are
// somewhere in that field, and we get them using the VEXT instruction:
//
//  | 8 bytes from last iteration   | 8 bytes from this iteration   |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//      ^  8 bytes to store this iteration  ^       |
//                                                  could be a page boundary
//
// We need to be a little bit careful, however. Because the loads only have 4
// byte alignment, if we used this approach all the way to the end of the
// buffer, the very last 8 byte load might slop over onto a new page by 4
// bytes, and that new page might not be mapped into our process. Thus, we
// terminate this copy loop when fewer than 12 bytes remain to be copied,
// instead of the more natural-seeming termination condition of "8 bytes
// remaining" (the illustration above shows the worst case and demonstrates
// why 12 is a sufficiently safe condition).
// At the beginning of each iteration through this loop, registers are used
// as follows:
//
//  r0      original destination pointer
//  r1      pointer to the next block of 8 bytes to load
//  r2      (bytes remaining to copy) - 12
//  ip      pointer to the next block of 8 bytes to store
//  d0      next 8 bytes to store
//  d2      8 bytes loaded in the previous iteration
//  d3      8 bytes loaded two iterations ago
#define COPY_UNALIGNED(offset)                   \
    vld1.32   {d2,d3}, [r1]!                    ;\
0:  vext.8    d0, d2, d3, $(offset)             ;\
    vld1.32   {d3},    [r1]!                    ;\
    vst1.64   {d0},    [ip, :64]!               ;\
1:  vext.8    d0, d2, d3, $(offset)             ;\
    vst1.64   {d0},    [ip, :64]!               ;\
2:  add       r1,      $(offset)
#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD