/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <arm/arch.h>

#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
 *  Cortex-A8 implementation                                                 *
 *****************************************************************************/

//  Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
//  Our tests have shown that NEON is always a performance win for memcpy( ).
//  However, for the specific case of copies from a warm source to a cold
//  destination when the buffer size is between 1k and 32k, it is not enough
//  of a performance win to offset the increased power footprint, resulting
//  in an energy usage regression.  Thus, we detect that particular case, and
//  pass those copies through the ARM core registers.  All other copies larger
//  than 8 bytes are handled on NEON.
//
//  Stephen Canon, August 2009

.text
.code 16
.syntax unified

//  void bcopy(const void * source,
//             void * destination,
//             size_t length);
//
//  void *memmove(void * destination,
//                const void * source,
//                size_t n);
//
//  void *memcpy(void * restrict destination,
//               const void * restrict source,
//               size_t n);
//
//  All three copy n successive bytes from source to destination.  memmove and
//  memcpy return destination, whereas bcopy has no return value.  Copying takes
//  place as if it were through a temporary buffer -- after return, destination
//  contains exactly the bytes from source, even if the buffers overlap.

.thumb_func _bcopy$VARIANT$CortexA8
.thumb_func _memmove$VARIANT$CortexA8
.thumb_func _memcpy$VARIANT$CortexA8
.globl _bcopy$VARIANT$CortexA8
.globl _memmove$VARIANT$CortexA8
.globl _memcpy$VARIANT$CortexA8

#define SAVE_REGISTERS {r4,r5,r6,r8,r10,r11}
#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r11}

/*****************************************************************************
 *  entry points                                                             *
 *****************************************************************************/

.align 2
_bcopy$VARIANT$CortexA8:

//  bcopy has the first and second arguments in the opposite order from the C
//  library functions memmove and memcpy.  If bcopy is called, we swap these
//  two arguments and then fall into memmove.

    mov     r3,     r0
    mov     r0,     r1
    mov     r1,     r3

.align 2
_memmove$VARIANT$CortexA8:
_memcpy$VARIANT$CortexA8:

//  At entry to memmove/memcpy, registers contain the following values:
//
//  r0  pointer to the first byte of the destination buffer
//  r1  pointer to the first byte of the source buffer
//  r2  number of bytes to copy
//
//  Our preference is to use a (faster and easier to understand) front-to-back
//  copy of the buffer.  However, memmove requires that copies take place as
//  though through a temporary buffer.  This means that if the buffers overlap,
//  it may be necessary to copy the buffer in reverse order.
//
//  To properly detect such overlap, we begin by computing the offset between
//  the source and destination pointers.  If the offset happens to be zero,
//  then there is no work to be done, so we can early out.

    subs    r3,     r0, r1
    it      eq
    bxeq    lr

//  r3 now contains the offset between the buffers, (destination - source).  If
//  0 < offset < length, then the high-addressed bytes of the source alias the
//  low-addressed bytes of the destination.  Thus, if we were to perform the
//  copy in ascending address order, we would overwrite the high-addressed
//  source bytes before we had a chance to copy them, and the data would be
//  lost.
//
//  Thus, we can use the front-to-back copy only if offset is negative or
//  greater than the length.  This is the case precisely if offset compares
//  unsigned higher than length.

    cmp     r3,     r2
    bhs     L_copyFrontToBack
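
//  The direction choice above reduces to a single unsigned comparison.  A C
//  sketch of the same test (the helper name below is ours, not part of this
//  file; the offset == 0 early-out is handled by the bxeq above):
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      // Nonzero when a front-to-back copy would overwrite source bytes
//      // before they are read, i.e. when 0 < (dst - src) < n.
//      static int must_copy_backward(const void *dst, const void *src, size_t n)
//      {
//          uintptr_t offset = (uintptr_t)dst - (uintptr_t)src;
//          return offset != 0 && offset < n;
//      }
//
//  bhs is taken when offset >= n as an unsigned quantity, which covers both
//  "destination below source" (the subtraction wraps to a huge value) and
//  non-overlapping forward layouts.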

/*****************************************************************************
 *  back to front copy                                                       *
 *****************************************************************************/

//  Here we have fallen through into the back-to-front copy.  We preserve the
//  original destination pointer in r0 because it is the return value for the
//  routine, and update the other registers as follows:
//
//  r1  one byte beyond the end of the source buffer
//  r2  number of bytes to copy
//  ip  one byte beyond the end of the destination buffer

    mov     ip,     r0
    add     r1,     r2
    add     ip,     r2

//  Subtract 8 from the buffer length; if this is negative, then we will use
//  only single-byte copies, and we jump directly to a scalar copy loop.

    subs    r2,     $8
    blt     L_scalarReverseCopy

//  If the destination pointer is 8-byte aligned, we can use 8-byte NEON copies
//  to move the data.

    tst     ip,     $7
    beq     L_vectorReverseCopy

//  Otherwise, we copy a single byte at a time, in order of descending memory
//  address, until the destination is 8 byte aligned.  Within this loop,
//  registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to be copied) - 8
//  r3  temporary to hold the byte that is being copied
//  ip  pointer one byte past the destination of the next byte to be copied
//
//                            byte that will be copied in this iteration
//                            |   byte that was copied in the previous iteration
//  Source buffer:            v   v
//  ------------------------+---+---+-------------------------
//   bytes still to copy ...|   |   | ... bytes already copied
//  ------------------------+---+---+-------------------------
//                                ^
//                                r1 holds the address of this byte

0:  ldrb    r3,    [r1, $-1]!
    sub     r2,     $1
    strb    r3,    [ip, $-1]!
    tst     ip,     $7
    bne     0b

//  At this point, the destination pointer is 8 byte aligned.  Check again that
//  there are at least 8 bytes remaining to copy by comparing the remaining
//  length minus 8 to zero.  If fewer than 8 bytes remain, jump to the cleanup
//  path.

    cmp     r2,     $0
    blt     L_scalarReverseCopy

/*****************************************************************************
 *  destination is 8 byte aligned                                            *
 *****************************************************************************/

L_vectorReverseCopy:

//  At this point, registers contain the following values:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to copy) - 8
//  ip  pointer one byte past the destination of the next byte to be copied
//
//  Furthermore, it is known that ip is 8 byte aligned, and that r2 is positive.
//  NEON has really excellent alignment handling in hardware, so we would like
//  to use that to handle cases where the source is not similarly aligned to the
//  destination (it supports even single-byte misalignment at speed).  However,
//  on some SoC designs, not all of the DMA busses support such access.  Thus,
//  we must unfortunately use a software workaround in those cases.
//
//  Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
//  we only need to handle the different possible source alignments modulo 4.
//  Here we have a dispatch table to jump to the correct copy implementation
//  for the given source alignment.
//
//  The tbh instruction loads the address offset of the correct implementation
//  from the data table that immediately follows it and adds it to the pc to
//  jump to the correct branch.

    ands    r3,     r1, $3
    tbh    [pc, r3, lsl $1]
0:
    .short  (L_reverseAligned0-0b)/2
    .short  (L_reverseAligned1-0b)/2
    .short  (L_reverseAligned2-0b)/2
    .short  (L_reverseAligned3-0b)/2
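
//  In C terms, the table branch above is just an indexed jump on the source
//  address modulo 4 (sketch only; the function and table below are
//  illustrative stand-ins for the tbh sequence and its branch targets):
//
//      #include <stdint.h>
//
//      typedef void (*copy_variant)(void);
//
//      // Index 0 selects the word-aligned fast path (L_reverseAligned0);
//      // indices 1-3 select the RCOPY_UNALIGNED fixup paths.
//      static copy_variant pick_reverse_variant(const void *src,
//                                               copy_variant table[4])
//      {
//          return table[(uintptr_t)src & 3];
//      }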

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_reverseAligned0:

//  Subtract 56 from r2, so that it contains the number of bytes remaining to
//  copy minus 64.  If this result is negative, then we jump into a loop that
//  copies 8 bytes at a time.

    subs    r2,     $0x38
    blt     L_reverseVectorCleanup

//  Check if the destination pointer is 64-byte aligned.  If so, jump to a loop
//  that copies whole cachelines.

    tst     ip,     $0x38
    beq     L_reverseCachelineAligned

//  Otherwise, we copy 8 bytes at a time, in order of descending memory
//  address, until the destination is 64 byte aligned.  Within this loop,
//  registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to be copied) - 64
//  ip  pointer one byte past the destination of the next byte to be copied
//  d0  temporary storage for copy
//
//      bytes that will be copied after this iteration
//      |             8 byte block that will be copied in this iteration
//      v             v
//  --------------+-------------------------------+---------------------
//                | 0   1   2   3   4   5   6   7 | bytes already copied
//  --------------+-------------------------------+---------------------
//                                                ^
//                                                r1 points here

0:  sub     r1,     $8
    vld1.32 {d0},  [r1]
    sub     ip,     $8
    sub     r2,     $8
    tst     ip,     $0x38
    vst1.64 {d0},  [ip,:64]
    bne     0b

//  At this point, the destination pointer is 64 byte aligned.  Check again that
//  there are at least 64 bytes remaining to copy by comparing the remaining
//  length minus 64 to zero.  If fewer than 64 bytes remain, skip over the main
//  copy loop.

    cmp     r2,     $0
    blt     L_reverseVectorCleanup

/*****************************************************************************
 *  destination is cacheline aligned                                         *
 *****************************************************************************/

L_reverseCachelineAligned:

//  In the special case that we are copying a buffer of between 1k and 32k bytes
//  we do not use a NEON copy for the main loop.  This is because if we happen
//  to be doing a copy from a source in cache to a destination that is not in
//  cache, this will result in an increase in energy usage.  In all other cases,
//  NEON gives superior energy conservation.

    sub     r3,     r2, $0x3c0
    cmp     r3,     $0x7c00
    blo     L_useSTMDB
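
//  The two instructions above perform the 1k-32k window test with a single
//  unsigned comparison.  Since r2 holds (bytes remaining - 64) here,
//  subtracting a further 0x3c0 (960) forms (bytes remaining - 1024), and the
//  cmp/blo against 0x7c00 (31744) branches to the core-register loop exactly
//  when 1024 <= bytes remaining < 32768.  A C sketch of the same check (the
//  helper name is illustrative):
//
//      #include <stdbool.h>
//      #include <stddef.h>
//
//      // True when the copy should go through the core registers (the
//      // STMDB/STMIA loops) instead of NEON, i.e. 1024 <= remaining < 32768.
//      static bool prefer_core_registers(size_t remaining)
//      {
//          return remaining - 1024 < 31744;   // unsigned wrap makes this a range test
//      }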

//  Pre-decrement the source (r1) and destination (ip) pointers so that they
//  point to the first byte of the trailing 32-byte window of each buffer.
//  Additionally, load the address increment of -32 into r3.

    sub     r1,     $32
    sub     ip,     $32
    mov     r3,     $-32

//  The destination pointer is known to be 64-byte aligned, so we can use the
//  maximal alignment hint (:256) for our vector stores.  Detect if the source
//  is also at least 32-byte aligned and jump to a loop that uses maximal
//  alignment hints for the loads as well if possible.

    tst     r1,     $0x1f
    beq     L_reverseSourceAligned

//  This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
//  64-byte aligned destination, in order of descending memory address.  Within
//  this loop, registers are used as follows:
//
//  r0     original destination pointer (unmodified)
//  r1     pointer to the next 32-byte block to load
//  r2     (number of bytes remaining to copy) - 64
//  r3     address increment of -32
//  ip     pointer to which the next 32-byte block is to be stored
//  q0-q3  temporary registers used for copies
//
//  Note that the loop is arranged in such a way that a single cleanup store is
//  necessary after the final loop iteration.  This occurs at label (1), and is
//  shared between the unaligned and aligned loops.

    vld1.32 {q2,q3}, [r1], r3
    vld1.32 {q0,q1}, [r1], r3
    subs    r2,     $64
    vst1.64 {q2,q3}, [ip,:256], r3
    blt     1f
.align 3
0:  vld1.32 {q2,q3}, [r1], r3
    vst1.64 {q0,q1}, [ip,:256], r3
    vld1.32 {q0,q1}, [r1], r3
    subs    r2,     $64
    vst1.64 {q2,q3}, [ip,:256], r3
    bge     0b
    b       1f

L_reverseSourceAligned:

//  This loop is identical to the immediately preceding loop, except that it
//  uses the additional alignment hint that the source pointer (r1) is 32-byte
//  aligned.  The two loops share cleanup code for the final iteration.

    vld1.64 {q2,q3}, [r1,:256], r3
    vld1.64 {q0,q1}, [r1,:256], r3
    subs    r2,     $64
    vst1.64 {q2,q3}, [ip,:256], r3
    blt     1f
.align 3
0:  vld1.64 {q2,q3}, [r1,:256], r3
    vst1.64 {q0,q1}, [ip,:256], r3
    vld1.64 {q0,q1}, [r1,:256], r3
    subs    r2,     $64
    vst1.64 {q2,q3}, [ip,:256], r3
    bge     0b

//  Final vector store for both of the above loops.

1:  vst1.64 {q0,q1}, [ip,:256], r3

//  Adjust the source and destination pointers so that they once again point to
//  the last byte that we used (which is one byte higher than the address that
//  we will use next for any required cleanup).

    add     r1,     $32
    add     ip,     $32
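
//  The two loops above are software pipelined: each trip through the loop
//  stores the pair of quadwords loaded on the previous trip while loading the
//  next pair, which is why exactly one store is left to drain at label (1).
//  A simplified C sketch of that shape (illustrative only; the real loops work
//  on two 32-byte halves per 64-byte block and run in descending order):
//
//      #include <stdint.h>
//      #include <string.h>
//
//      // Copy nblocks 64-byte blocks, overlapping each store with the next load.
//      static void pipelined_block_copy(uint8_t *dst, const uint8_t *src,
//                                       size_t nblocks)
//      {
//          uint8_t prev[64], next[64];
//          if (nblocks == 0) return;
//          memcpy(prev, src, 64); src += 64;          // prologue load
//          while (--nblocks) {
//              memcpy(next, src, 64); src += 64;      // load block i+1
//              memcpy(dst, prev, 64); dst += 64;      // store block i
//              memcpy(prev, next, 64);                // rotate the pipeline
//          }
//          memcpy(dst, prev, 64);                     // the single cleanup store
//      }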

L_reverseVectorCleanup:

//  Add 56 to r2, so that it contains the number of bytes remaining to copy
//  minus 8.  A comparison of this value with zero tells us if any more whole
//  8-byte blocks need to be copied.

    adds    r2,     r2, $0x38
    blt     L_scalarReverseCopy

//  This loop copies 8 bytes at a time in order of descending memory address,
//  until fewer than 8 bytes remain to be copied.  Within this loop, registers
//  are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to be copied) - 8
//  ip  pointer one byte past the destination of the next byte to be copied
//  d0  temporary storage for copy

0:  sub     r1,     $8
    vld1.32 {d0},  [r1]
    sub     ip,     $8
    subs    r2,     $8
    vst1.64 {d0},  [ip,:64]
    bge     0b

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarReverseCopy:

//  Add 8 to r2, so that it contains the number of bytes remaining to copy, and
//  return to the calling routine if zero bytes remain.

    adds    r2,     $8
    it      eq
    bxeq    lr

//  Copy one byte at a time in descending address order until we reach the front
//  of the buffer.  Within this loop, registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  number of bytes remaining to be copied
//  r3  temporary to hold the byte that is being copied
//  ip  pointer one byte past the destination of the next byte to be copied

0:  ldrb    r3,    [r1, $-1]!
    subs    r2,     $1
    strb    r3,    [ip, $-1]!
    bne     0b
    bx      lr

/*****************************************************************************
 *  STMDB loop for 1k-32k buffers                                            *
 *****************************************************************************/

//  This loop copies 64 bytes each iteration in order of descending memory
//  address, using the GPRs instead of NEON.
//
//  r0  original destination pointer
//  r1  pointer to one byte past the next element to be copied
//  r2  (bytes remaining to be copied) - 64
//  r3-r6, r8-r11 (COPY_REGISTERS)  temporary registers used for moving data
//  ip  pointer to one byte past the next location to store to

L_useSTMDB:
    push    SAVE_REGISTERS
.align 3
0:  ldmdb   r1!,    COPY_REGISTERS
    subs    r2,     r2, $64
    stmdb   ip!,    COPY_REGISTERS
    ldmdb   r1!,    COPY_REGISTERS
    pld    [r1, $-64]
    stmdb   ip!,    COPY_REGISTERS
    bge     0b
    pop     SAVE_REGISTERS
    b       L_reverseVectorCleanup
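
//  Each trip through the STMDB loop above moves 64 bytes as two ldmdb/stmdb
//  pairs of the eight COPY_REGISTERS.  A rough C analogue of one such pair
//  (names hypothetical; the real loop also prefetches with pld and walks the
//  buffers in descending address order):
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      // Copy n bytes (n a multiple of 32) through eight integer registers
//      // at a time, mirroring one ldm/stm pair of COPY_REGISTERS.
//      static void core_register_copy(uint32_t *dst, const uint32_t *src, size_t n)
//      {
//          for (size_t i = 0; i < n / 4; i += 8) {
//              uint32_t a = src[i],     b = src[i + 1], c = src[i + 2], d = src[i + 3];
//              uint32_t e = src[i + 4], f = src[i + 5], g = src[i + 6], h = src[i + 7];
//              dst[i]     = a; dst[i + 1] = b; dst[i + 2] = c; dst[i + 3] = d;
//              dst[i + 4] = e; dst[i + 5] = f; dst[i + 6] = g; dst[i + 7] = h;
//          }
//      }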

/*****************************************************************************
 *  Misaligned reverse vld1 loop                                             *
 *****************************************************************************/

//  Software alignment fixup to handle source and dest that are relatively
//  misaligned mod 4 bytes.
//
//  The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
//  which we combine with the 8 bytes loaded in the previous iteration to get a
//  16 byte field; the next 8 bytes to be stored to the destination buffer are
//  somewhere in that field, and we get them using the VEXT instruction:
//
//  | 8 bytes from this iteration   | 8 bytes from last iteration   |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//      ^  8 bytes to store this iteration  ^       |
//                                                  could be a page boundary
//
//  We need to be a little bit careful, however.  Because the loads only have 4
//  byte alignment, the very first load could slop over into a page that is not
//  mapped readable.  In order to prevent this scenario, we copy eight bytes
//  one at a time before beginning the main loop.
//
//  At the beginning of each iteration through this loop, registers are used
//  as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next block of 8 bytes to load
//  r2  (bytes remaining to copy) - 8
//  ip  pointer to the next block of 8 bytes to store
//  d0  next 8 bytes to store
//  d2  8 bytes loaded in the previous iteration
//  d3  8 bytes loaded two iterations ago

#define RCOPY_UNALIGNED(offset)         \
0:  ldrb    r3,    [r1,$-1]!           ;\
    strb    r3,    [ip,$-1]!           ;\
    subs    r2,     $1                 ;\
    blt     L_scalarReverseCopy        ;\
    tst     ip,     $7                 ;\
    bne     0b                         ;\
    bic     r1,     $3                 ;\
    sub     r1,     $8                 ;\
    sub     ip,     $8                 ;\
    mov     r3,     $-8                ;\
    vld1.32 {d2,d3}, [r1], r3          ;\
    subs    r2,     $8                 ;\
    blt     1f                         ;\
0:  vext.8  d0,     d2, d3, $(offset)  ;\
    vmov    d3,     d2                 ;\
    vld1.32 {d2},  [r1], r3            ;\
    subs    r2,     $8                 ;\
    vst1.64 {d0},  [ip, :64], r3       ;\
    bge     0b                         ;\
1:  vext.8  d0,     d2, d3, $(offset)  ;\
    add     r1,     $8                 ;\
    vst1.64 {d0},  [ip, :64]           ;\
2:  add     r1,     $(offset)          ;\
    b       L_scalarReverseCopy

L_reverseAligned1: RCOPY_UNALIGNED(1)
L_reverseAligned2: RCOPY_UNALIGNED(2)
L_reverseAligned3: RCOPY_UNALIGNED(3)
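
//  The key step in the fixup above is the VEXT extraction: two adjacent
//  4-byte-aligned 8-byte loads form a 16-byte window, and the eight bytes to
//  store begin "offset" bytes into that window.  A small C model of that one
//  step (the function name is illustrative; little-endian byte order, as on
//  these targets, is assumed):
//
//      #include <stdint.h>
//      #include <string.h>
//
//      // Models vext.8 d0, dLo, dHi, #offset for offset in {1, 2, 3}, where
//      // dLo holds the lower-addressed 8 bytes and dHi the next 8 bytes.
//      static uint64_t extract_window(uint64_t lo, uint64_t hi, unsigned offset)
//      {
//          uint8_t window[16];
//          uint64_t out;
//          memcpy(window, &lo, 8);             // bytes 0-7:  lower aligned load
//          memcpy(window + 8, &hi, 8);         // bytes 8-15: upper aligned load
//          memcpy(&out, window + offset, 8);   // bytes offset .. offset+7
//          return out;
//      }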

/*****************************************************************************
 *  front to back copy                                                       *
 *****************************************************************************/

L_copyFrontToBack:

//  Here the pointers are laid out such that we can use our preferred
//  front-to-back copy.  We preserve the original destination pointer in r0
//  because it is the return value for the routine, and copy it to ip to use
//  in this routine.

    mov     ip,     r0

//  Subtract 8 from the buffer length; if this is negative, then we will use
//  only single-byte copies, and we jump directly to a scalar copy loop.

    subs    r2,     $8
    blt     L_scalarCopy

//  If the destination pointer is 8-byte aligned, we can use 8-byte NEON copies
//  to move the data.

    tst     ip,     $7
    beq     L_vectorCopy

//  Otherwise, we copy a single byte at a time, in order of ascending memory
//  address, until the destination is 8 byte aligned.  Within this loop,
//  registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next byte to copy
//  r2  (bytes remaining to be copied) - 8
//  r3  temporary to hold the byte that is being copied
//  ip  pointer to the next byte to store to

0:  ldrb    r3,    [r1], $1
    sub     r2,     $1
    strb    r3,    [ip], $1
    tst     ip,     $7
    bne     0b

//  At this point, the destination pointer is 8 byte aligned.  Check again that
//  there are at least 8 bytes remaining to copy by comparing the remaining
//  length minus 8 to zero.  If fewer than 8 bytes remain, jump to the cleanup
//  path.

    cmp     r2,     $0
    blt     L_scalarCopy

/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorCopy:

//  At this point, registers contain the following values:
//
//  r0  original destination pointer
//  r1  pointer to the next element to be copied
//  r2  (bytes remaining to copy) - 8
//  ip  pointer to the destination of the next byte to be copied
//
//  Furthermore, it is known that ip is 8 byte aligned, and that r2 is positive.
//  NEON has really excellent alignment handling in hardware, so we would like
//  to use that to handle cases where the source is not similarly aligned to the
//  destination (it supports even single-byte misalignment at speed).  However,
//  on some SoC designs, not all of the DMA busses support such access.  Thus,
//  we must unfortunately use a software workaround in those cases.
//
//  Fortunately, 4-byte aligned loads are supported even on the DMA busses, so
//  we only need to handle the different possible source alignments modulo 4.
//  Here we have a dispatch table to jump to the correct copy implementation
//  for the given source alignment.
//
//  The tbh instruction loads the address offset of the correct implementation
//  from the data table that immediately follows it and adds it to the pc to
//  jump to the correct branch.

    ands    r3,     r1, $3
    bic     r1,     $3
    tbh    [pc, r3, lsl $1]
0:
    .short  (L_sourceAligned0-0b)/2
    .short  (L_sourceAligned1-0b)/2
    .short  (L_sourceAligned2-0b)/2
    .short  (L_sourceAligned3-0b)/2

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_sourceAligned0:

//  Subtract 56 from r2, so that it contains the number of bytes remaining to
//  copy minus 64.  If this result is negative, then we jump into a loop that
//  copies 8 bytes at a time.

    subs    r2,     $0x38
    blt     L_vectorCleanup

//  Check if the destination pointer is 64-byte aligned.  If so, jump to a loop
//  that copies whole cachelines.

    tst     ip,     $0x38
    beq     L_cachelineAligned

//  Otherwise, we copy 8 bytes at a time, in order of ascending memory
//  address, until the destination is 64 byte aligned.  Within this loop,
//  registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next element to be copied
//  r2  (bytes remaining to be copied) - 64
//  ip  pointer to the destination of the next byte to be copied
//  d0  temporary storage for copy

0:  vld1.32 {d0},  [r1]!
    sub     r2,     $8
    vst1.64 {d0},  [ip,:64]!
    tst     ip,     $0x38
    bne     0b

//  At this point, the destination pointer is 64 byte aligned.  Check again that
//  there are at least 64 bytes remaining to copy by comparing the remaining
//  length minus 64 to zero.  If fewer than 64 bytes remain, skip over the main
//  copy loop.

    cmp     r2,     $0
    blt     L_vectorCleanup

/*****************************************************************************
 *  destination is cacheline aligned                                         *
 *****************************************************************************/

//  In the special case that we are copying a buffer of between 1k and 32k bytes
//  we do not use a NEON copy for the main loop.  This is because if we happen
//  to be doing a copy from a source in cache to a destination that is not in
//  cache, this will result in an increase in energy usage.  In all other cases,
//  NEON gives superior energy conservation.

L_cachelineAligned:
    sub     r3,     r2, $0x3c0
    cmp     r3,     $0x7c00
    blo     L_useSTMIA

//  The destination pointer is known to be 64-byte aligned, so we can use the
//  maximal alignment hint (:256) for our vector stores.  Detect if the source
//  is also at least 32-byte aligned and jump to a loop that uses maximal
//  alignment hints for the loads as well if possible.

    tst     r1,     $0x1f
    beq     L_sourceAligned32

//  This loop copies 64 bytes per iteration, from a 4-byte aligned source to a
//  64-byte aligned destination, in order of ascending memory address.  Within
//  this loop, registers are used as follows:
//
//  r0     original destination pointer (unmodified)
//  r1     pointer to the next 32-byte block to load
//  r2     (number of bytes remaining to copy) - 64
//  ip     pointer to which the next 32-byte block is to be stored
//  q0-q3  temporary registers used for copies
//
//  Note that the loop is arranged in such a way that a single cleanup store is
//  necessary after the final loop iteration.  This occurs at label (1), and is
//  shared between the unaligned and aligned loops.

    vld1.32 {q2,q3}, [r1]!
    vld1.32 {q0,q1}, [r1]!
    subs    r2,     $64
    vst1.64 {q2,q3}, [ip,:256]!
    blt     1f
.align 3
0:  vld1.32 {q2,q3}, [r1]!
    vst1.64 {q0,q1}, [ip,:256]!
    vld1.32 {q0,q1}, [r1]!
    subs    r2,     $64
    vst1.64 {q2,q3}, [ip,:256]!
    bge     0b
    b       1f

L_sourceAligned32:

//  This loop is identical to the immediately preceding loop, except that it
//  uses the additional alignment hint that the source pointer (r1) is 32-byte
//  aligned.  The two loops share cleanup code for the final iteration.

    vld1.64 {q2,q3}, [r1,:256]!
    vld1.64 {q0,q1}, [r1,:256]!
    subs    r2,     $64
    vst1.64 {q2,q3}, [ip,:256]!
    blt     1f
.align 3
0:  vld1.64 {q2,q3}, [r1,:256]!
    vst1.64 {q0,q1}, [ip,:256]!
    vld1.64 {q0,q1}, [r1,:256]!
    subs    r2,     $64
    vst1.64 {q2,q3}, [ip,:256]!
    bge     0b

//  Final vector store for both of the above loops.

1:  vst1.64 {q0,q1}, [ip,:256]!

L_vectorCleanup:

//  Add 56 to r2, so that it contains the number of bytes remaining to copy
//  minus 8.  A comparison of this value with zero tells us if any more whole
//  8-byte blocks need to be copied.

    adds    r2,     $0x38
    blt     L_scalarCopy

//  This loop copies 8 bytes at a time in order of ascending memory address,
//  until fewer than 8 bytes remain to be copied.  Within this loop, registers
//  are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next element to be copied
//  r2  (bytes remaining to be copied) - 8
//  ip  pointer to the destination of the next byte to be copied
//  d0  temporary storage for copy

0:  vld1.32 {d0},  [r1]!
    subs    r2,     $8
    vst1.64 {d0},  [ip,:64]!
    bge     0b

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarCopy:

//  Add 8 to r2, so that it contains the number of bytes remaining to copy, and
//  return to the calling routine if zero bytes remain.

    adds    r2,     $8
    it      eq
    bxeq    lr

//  Copy one byte at a time in ascending address order until we reach the end
//  of the buffer.  Within this loop, registers are used as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next byte to copy
//  r2  number of bytes remaining to be copied
//  r3  temporary to hold the byte that is being copied
//  ip  pointer to the next byte to store to

0:  ldrb    r3,    [r1], $1
    strb    r3,    [ip], $1
    subs    r2,     $1
    bne     0b
    bx      lr

/*****************************************************************************
 *  STMIA loop for 1k-32k buffers                                            *
 *****************************************************************************/

//  This loop copies 64 bytes each iteration in order of ascending memory
//  address, using the GPRs instead of NEON.
//
//  r0  original destination pointer
//  r1  pointer to the next element to be copied
//  r2  (bytes remaining to be copied) - 64
//  r3-r6, r8-r11 (COPY_REGISTERS)  temporary registers used for moving data
//  ip  pointer to the next location to store to

L_useSTMIA:
    push    SAVE_REGISTERS
.align 3
0:  ldmia   r1!,    COPY_REGISTERS
    subs    r2,     r2, $64
    stmia   ip!,    COPY_REGISTERS
    ldmia   r1!,    COPY_REGISTERS
    pld    [r1, $64]
    stmia   ip!,    COPY_REGISTERS
    bge     0b
    pop     SAVE_REGISTERS
    b       L_vectorCleanup
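
//  Taken together, the word-aligned forward path above has roughly the
//  following shape.  This is an illustrative C outline only (it ignores the
//  NEON-versus-core-register choice and the misaligned-source fixup that
//  follows below); the function name is not part of this file:
//
//      #include <stddef.h>
//      #include <stdint.h>
//      #include <string.h>
//
//      static void *forward_copy_outline(void *dst, const void *src, size_t n)
//      {
//          uint8_t *d = dst;
//          const uint8_t *s = src;
//
//          while (n >= 8 && ((uintptr_t)d & 7)) { *d++ = *s++; n--; }   // align dst to 8
//          if (n >= 64) {
//              while ((uintptr_t)d & 0x38) {                            // align dst to 64
//                  memcpy(d, s, 8); d += 8; s += 8; n -= 8;
//              }
//              while (n >= 64) {                                        // bulk 64-byte blocks
//                  memcpy(d, s, 64); d += 64; s += 64; n -= 64;
//              }
//          }
//          while (n >= 8) { memcpy(d, s, 8); d += 8; s += 8; n -= 8; }  // 8-byte cleanup
//          while (n) { *d++ = *s++; n--; }                              // byte tail
//          return dst;                                                  // memcpy/memmove return value
//      }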

/*****************************************************************************
 *  Misaligned forward vld1 loop                                             *
 *****************************************************************************/

//  Software alignment fixup to handle source and dest that are relatively
//  misaligned mod 4 bytes.
//
//  The basic idea is to use 4-byte aligned loads to load 8 bytes per iteration,
//  which we combine with the 8 bytes loaded in the previous iteration to get a
//  16 byte field; the next 8 bytes to be stored to the destination buffer are
//  somewhere in that field, and we get them using the VEXT instruction:
//
//  | 8 bytes from last iteration   | 8 bytes from this iteration   |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a | b | c | d | e | f |
//  +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
//      ^  8 bytes to store this iteration  ^       |
//                                                  could be a page boundary
//
//  We need to be a little bit careful, however.  Because the loads only have 4
//  byte alignment, if we used this approach all the way to the end of the
//  buffer, the very last 8 byte load might slop over onto a new page by 4
//  bytes, and that new page might not be mapped into our process.  Thus, we
//  terminate this copy loop when fewer than 12 bytes remain to be copied,
//  instead of the more natural-seeming termination condition of "8 bytes
//  remaining" (the illustration above shows the worst case and demonstrates
//  why 12 is a sufficiently safe condition).
//
//  At the beginning of each iteration through this loop, registers are used
//  as follows:
//
//  r0  original destination pointer
//  r1  pointer to the next block of 8 bytes to load
//  r2  (bytes remaining to copy) - 12
//  ip  pointer to the next block of 8 bytes to store
//  d0  next 8 bytes to store
//  d2  8 bytes loaded in the previous iteration
//  d3  8 bytes loaded two iterations ago

#define COPY_UNALIGNED(offset)          \
    subs    r2,     $4                 ;\
    blt     2f                         ;\
    vld1.32 {d2,d3}, [r1]!             ;\
    subs    r2,     $8                 ;\
    blt     1f                         ;\
0:  vext.8  d0,     d2, d3, $(offset)  ;\
    vmov    d2,     d3                 ;\
    vld1.32 {d3},  [r1]!               ;\
    subs    r2,     $8                 ;\
    vst1.64 {d0},  [ip, :64]!          ;\
    bge     0b                         ;\
1:  vext.8  d0,     d2, d3, $(offset)  ;\
    sub     r1,     $8                 ;\
    vst1.64 {d0},  [ip, :64]!          ;\
2:  add     r1,     $(offset)          ;\
    add     r2,     $4                 ;\
    b       L_scalarCopy

L_sourceAligned1: COPY_UNALIGNED(1)
L_sourceAligned2: COPY_UNALIGNED(2)
L_sourceAligned3: COPY_UNALIGNED(3)

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD