/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*****************************************************************************
 *  Cortex-A8 implementation                                                 *
 *****************************************************************************/

// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression.  Thus, we detect that particular case, and
// pass those copies through the ARM core registers.  All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009

.text
.code 16
.syntax unified

// void bcopy(const void * source,
//            void * destination,
//            size_t length);
//
// void *memmove(void * destination,
//               const void * source,
//               size_t n);
//
// void *memcpy(void * restrict destination,
//              const void * restrict source,
//              size_t n);
//
// All copy n successive bytes from source to destination.  memmove and memcpy
// return destination, whereas bcopy has no return value.  Copying takes place
// as if it were through a temporary buffer -- after return, destination
// contains exactly the bytes from source, even if the buffers overlap.

.thumb_func _bcopy
.globl _bcopy
.thumb_func _memmove
.globl _memmove
.thumb_func _memcpy
.globl _memcpy

.align 2
_bcopy:
    mov       r3,      r0               // swap the first and second arguments
    mov       r0,      r1               // and fall through into memmove
    mov       r1,      r3               //

.align 2
_memmove:
_memcpy:
    subs      r3,      r0,  r1          // offset = destination addr - source addr
    it        eq
    bxeq      lr                        // if source == destination, early out

// Our preference is for using a (faster) front-to-back copy.  However, if
// 0 < offset < length, it is necessary to copy back-to-front for correctness.
// We have already ruled out offset == 0, so we can use an unsigned compare
// with length -- if offset is higher, offset is either greater than length
// or negative.

    cmp       r3,      r2
    bhs       L_copyFrontToBack
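
// In rough C, the entry logic above is (illustrative sketch only, never
// assembled; the two helper names simply stand for the code paths below):
//
//     ptrdiff_t offset = (char *)destination - (const char *)source;
//     if (offset == 0) return destination;
//     if ((size_t)offset >= n) copy_front_to_back();   // no harmful overlap
//     else                     copy_back_to_front();   // dst overlaps end of src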

/*****************************************************************************
 *  back to front copy                                                       *
 *****************************************************************************/

    mov       ip,      r0               // copy destination pointer.
    add       r1,      r2               // move source pointer to end of source array
    add       ip,      r2               // move destination pointer to end of dest array

    subs      r2,      $8               // if length - 8 is negative (i.e. length
    blt       L_scalarReverseCopy       // is less than 8), jump to cleanup path.
    tst       ip,      $7               // if (destination + length) is doubleword
    beq       L_vectorReverseCopy       // aligned, jump to fast path.

0:  ldrb      r3,     [r1, $-1]!        // load byte
    sub       r2,      $1               // decrement length
    strb      r3,     [ip, $-1]!        // store byte
    tst       ip,      $7               // test alignment
    bne       0b

    cmp       r2,      $0               // if length - 8 is negative,
    blt       L_scalarReverseCopy       // jump to the cleanup code

/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorReverseCopy:
    ands      r3,      r1,  $3          // Extract the alignment of the source
    bic       r1,      $3
    tbh      [pc, r3, lsl $1]           // Dispatch table on source alignment

0:
.short (L_reverseAligned0-0b)/2         // The NEON alignment hardware does not work
.short (L_reverseAligned1-0b)/2         // properly with sub 4-byte alignment and
.short (L_reverseAligned2-0b)/2         // buffers that are uncacheable, so we need
.short (L_reverseAligned3-0b)/2         // to have a software workaround.

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_reverseAligned0:
    subs      r2,      $0x38            // if length - 64 is negative, jump to
    blt       L_reverseVectorCleanup    // the cleanup path.
    tst       ip,      $0x38            // if (destination + length) is cacheline
    beq       L_reverseCachelineAligned // aligned, jump to the fast path.

0:  sub       r1,      $8               // copy eight bytes at a time until the
    vld1.32  {d0},    [r1]              // destination is cacheline (64-byte) aligned.
    sub       ip,      $8               //
    sub       r2,      $8               //
    tst       ip,      $0x38            //
    vst1.64  {d0},    [ip, :64]         //
    bne       0b                        //

    cmp       r2,      $0               // if length - 64 is negative,
    blt       L_reverseVectorCleanup    // jump to the cleanup code

L_reverseCachelineAligned:
    sub       r3,      r2,  $0x3c0      // If 1024 < length < 32768, use core
    cmp       r3,      $0x7c00          // register copies instead of NEON to
    blo       L_useSTMDB                // control energy usage.

    sub       r1,      $32              // decrement source
    sub       ip,      $32              // decrement destination
    mov       r3,      $-32             // load address increment
    tst       r1,      $0x1f            // if source shares 32 byte alignment
    beq       L_reverseSourceAligned    // jump to loop with more alignment hints

    vld1.32  {q2,q3}, [r1], r3          // This loop handles 4-byte aligned copies
    vld1.32  {q0,q1}, [r1], r3          // as generally as possible.
    subs      r2,      $64              //
    vst1.64  {q2,q3}, [ip,:256], r3     // The Cortex-A8 NEON unit does not always
    blt       1f                        // properly handle misalignment in vld1
.align 3                                // with an element size of 8 or 16, so
0:  vld1.32  {q2,q3}, [r1], r3          // this is the best we can do without
    vst1.64  {q0,q1}, [ip,:256], r3     // handling alignment in software.
    vld1.32  {q0,q1}, [r1], r3          //
    subs      r2,      $64              //
    vst1.64  {q2,q3}, [ip,:256], r3     //
    bge       0b                        //
    b         1f                        //

L_reverseSourceAligned:
    vld1.64  {q2,q3}, [r1,:256], r3     // Identical to loop above except for
    vld1.64  {q0,q1}, [r1,:256], r3     // additional alignment information; this
    subs      r2,      $64              // gets an additional .5 bytes per cycle
    vst1.64  {q2,q3}, [ip,:256], r3     // on Cortex-A8.
    blt       1f                        //
.align 3                                //
0:  vld1.64  {q2,q3}, [r1,:256], r3     //
    vst1.64  {q0,q1}, [ip,:256], r3     //
    vld1.64  {q0,q1}, [r1,:256], r3     //
    subs      r2,      $64              //
    vst1.64  {q2,q3}, [ip,:256], r3     //
    bge       0b                        //

1:  vst1.64  {q0,q1}, [ip,:256], r3     // loop cleanup: final 32 byte store
    add       r1,      $32              // point source at last element stored
    add       ip,      $32              // point destination at last element stored
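
// Note on length bookkeeping: throughout these paths r2 holds a biased count
// (length - 8 in the doubleword loops, length - 64 in the cacheline loops),
// so each "subs ... / bge" pair is, in effect (illustrative only):
//
//     while ((remaining -= step) >= 0) { copy step bytes; }
//
// The adds at the cleanup labels below restore the smaller bias, and finally
// the true byte count, before the next stage runs.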

L_reverseVectorCleanup:
    adds      r2,      $0x38            // If (length - 8) < 0, goto scalar cleanup
    blt       L_scalarReverseCopy       //

0:  sub       r1,      $8               // copy eight bytes at a time until
    vld1.32  {d0},    [r1]              // (length - 8) < 0.
    sub       ip,      $8               //
    subs      r2,      $8               //
    vst1.64  {d0},    [ip, :64]         //
    bge       0b                        //

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarReverseCopy:
    adds      r2,      #0x8             // restore length
    it        eq                        // if this is zero
    bxeq      lr                        // early out

0:  ldrb      r3,     [r1, #-1]!        // load a byte from source
    strb      r3,     [ip, #-1]!        // store to destination
    subs      r2,      #0x1             // subtract one from length
    bne       0b                        // if non-zero, repeat
    bx        lr                        // return

/*****************************************************************************
 *  STMDB loop for 1k-32k buffers                                            *
 *****************************************************************************/

L_useSTMDB:
    push     {r4-r8,r10,r11}
.align 3
0:  ldmdb     r1!,    {r3-r8,r10,r11}
    subs      r2,      #0x40
    stmdb     ip!,    {r3-r8,r10,r11}
    ldmdb     r1!,    {r3-r8,r10,r11}
    pld      [r1, #-0x40]
    stmdb     ip!,    {r3-r8,r10,r11}
    bge       0b
    pop      {r4-r8,r10,r11}
    b         L_reverseVectorCleanup

/*****************************************************************************
 *  Misaligned reverse vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.

#define RCOPY_UNALIGNED(offset)          \
    subs    r2,      $8                 ;\
    blt     2f                          ;\
    sub     r1,      $8                 ;\
    sub     ip,      $8                 ;\
    mov     r3,      $-8                ;\
    vld1.32 {d2,d3}, [r1], r3           ;\
    subs    r2,      $8                 ;\
    blt     1f                          ;\
0:  vext.8  d0,      d2, d3, $(offset)  ;\
    vmov    d3,      d2                 ;\
    vld1.32 {d2},   [r1], r3            ;\
    subs    r2,      $8                 ;\
    vst1.64 {d0},   [ip, :64], r3       ;\
    bge     0b                          ;\
1:  vext.8  d0,      d2, d3, $(offset)  ;\
    add     r1,      $8                 ;\
    vst1.64 {d0},   [ip, :64]           ;\
2:  add     r2,      $8                 ;\
    add     r1,      $(offset)          ;\
    b       L_scalarReverseCopy

L_reverseAligned1:
    RCOPY_UNALIGNED(1)
L_reverseAligned2:
    RCOPY_UNALIGNED(2)
L_reverseAligned3:
    RCOPY_UNALIGNED(3)

/*****************************************************************************
 *  front to back copy                                                       *
 *****************************************************************************/

L_copyFrontToBack:
    mov       ip,      r0               // copy destination pointer.
    subs      r2,      $8               // if length - 8 is negative (i.e. length
    blt       L_scalarCopy              // is less than 8), jump to cleanup path.
    tst       ip,      $7               // if the destination is doubleword
    beq       L_vectorCopy              // aligned, jump to fast path.

0:  ldrb      r3,     [r1], $1          // load byte
    sub       r2,      $1               // decrement length
    strb      r3,     [ip], $1          // store byte
    tst       ip,      $7               // test alignment
    bne       0b

    cmp       r2,      $0               // if length - 8 is negative,
    blt       L_scalarCopy              // jump to the cleanup code

/*****************************************************************************
 *  destination is doubleword aligned                                        *
 *****************************************************************************/

L_vectorCopy:
    ands      r3,      r1,  $3          // Extract the alignment of the source
    bic       r1,      $3
    tbh      [pc, r3, lsl $1]           // Dispatch table on source alignment

0:
.short (L_sourceAligned0-0b)/2          // The NEON alignment hardware does not work
.short (L_sourceAligned1-0b)/2          // properly with sub 4-byte alignment and
.short (L_sourceAligned2-0b)/2          // buffers that are uncacheable, so we need
.short (L_sourceAligned3-0b)/2          // to have a software workaround.
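
// The tbh dispatch above amounts to (illustrative sketch only):
//
//     switch (source & 3) {
//         case 0: goto L_sourceAligned0;   // source at least word aligned
//         case 1: goto L_sourceAligned1;   // software fixup, vext.8 offset 1
//         case 2: goto L_sourceAligned2;   // software fixup, vext.8 offset 2
//         case 3: goto L_sourceAligned3;   // software fixup, vext.8 offset 3
//     }
//
// (and likewise for the L_reverseAligned table earlier).  Each .short entry is
// (target - table)/2 because tbh branches forward by twice the selected
// halfword, and pc points at the table itself here.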

/*****************************************************************************
 *  source is also at least word aligned                                     *
 *****************************************************************************/

L_sourceAligned0:
    subs      r2,      $0x38            // If (length - 64) < 0
    blt       L_vectorCleanup           // jump to cleanup code
    tst       ip,      $0x38            // If destination is 64 byte aligned
    beq       L_cachelineAligned        // jump to main loop

0:  vld1.32  {d0},    [r1]!             // Copy one double word at a time until
    sub       r2,      $8               // the destination is 64-byte aligned.
    vst1.64  {d0},    [ip, :64]!        //
    tst       ip,      $0x38            //
    bne       0b                        //

    cmp       r2,      $0               // If (length - 64) < 0, goto cleanup
    blt       L_vectorCleanup           //

L_cachelineAligned:
    sub       r3,      r2,  $0x3c0      // If 1024 < length < 32768, use core
    cmp       r3,      $0x7c00          // register copies instead of NEON to
    blo       L_useSTMIA                // control energy usage.

    tst       r1,      $0x1f            // If source has 32-byte alignment, use
    beq       L_sourceAligned32         // an optimized loop.

    vld1.32  {q2,q3}, [r1]!             // This is the most common path for small
    vld1.32  {q0,q1}, [r1]!             // copies, which are alarmingly frequent.
    subs      r2,      #0x40            // It requires 4-byte alignment on the
    vst1.64  {q2,q3}, [ip, :256]!       // source.  For ordinary malloc'd buffers,
    blt       1f                        // this path could handle sources with only
.align 3                                // single-byte alignment at speed by using
0:  vld1.32  {q2,q3}, [r1]!             // vld1.8 instead of vld1.32; however, the
    vst1.64  {q0,q1}, [ip, :256]!       // NEON alignment handler misbehaves for
    vld1.32  {q0,q1}, [r1]!             // some special copies if the element size
    subs      r2,      #0x40            // is 8 or 16, so we need to work around
    vst1.64  {q2,q3}, [ip, :256]!       // sub 4-byte alignment in software, in
    bge       0b                        // another code path.
    b         1f

L_sourceAligned32:
    vld1.64  {q2,q3}, [r1, :256]!       // When the source shares 32-byte alignment
    vld1.64  {q0,q1}, [r1, :256]!       // with the destination, we use this loop
    subs      r2,      #0x40            // instead, which specifies the maximum
    vst1.64  {q2,q3}, [ip, :256]!       // :256 alignment on all loads and stores.
    blt       1f                        //
.align 3                                // This gets an additional .5 bytes per
0:  vld1.64  {q2,q3}, [r1, :256]!       // cycle for in-cache copies, which is not
    vst1.64  {q0,q1}, [ip, :256]!       // insignificant for this (rather common)
    vld1.64  {q0,q1}, [r1, :256]!       // case.
    subs      r2,      #0x40            //
    vst1.64  {q2,q3}, [ip, :256]!       // This is identical to the above loop,
    bge       0b                        // except for the additional alignment.

1:  vst1.64  {q0,q1}, [ip, :256]!       //

L_vectorCleanup:
    adds      r2,      $0x38            // If (length - 8) < 0, goto scalar cleanup
    blt       L_scalarCopy              //

0:  vld1.32  {d0},    [r1]!             // Copy one doubleword at a time until
    subs      r2,      $8               // (length - 8) < 0.
    vst1.64  {d0},    [ip, :64]!        //
    bge       0b                        //
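
// The scalar cleanup below copies whatever tail remains -- never more than
// fifteen bytes on any path that reaches it -- one byte at a time.  In effect
// (illustrative only):
//
//     while (remaining--) *destination++ = *source++;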

/*****************************************************************************
 *  sub-doubleword cleanup copies                                            *
 *****************************************************************************/

L_scalarCopy:
    adds      r2,      #0x8             // restore length
    it        eq                        // if this is zero
    bxeq      lr                        // early out

0:  ldrb      r3,     [r1], #1          // load a byte from source
    strb      r3,     [ip], #1          // store to destination
    subs      r2,      #1               // subtract one from length
    bne       0b                        // if non-zero, repeat
    bx        lr                        // return

/*****************************************************************************
 *  STMIA loop for 1k-32k buffers                                            *
 *****************************************************************************/

L_useSTMIA:
    push     {r4-r8,r10,r11}
.align 3
0:  ldmia     r1!,    {r3-r8,r10,r11}
    subs      r2,      r2,  #64
    stmia     ip!,    {r3-r8,r10,r11}
    ldmia     r1!,    {r3-r8,r10,r11}
    pld      [r1, #64]
    stmia     ip!,    {r3-r8,r10,r11}
    bge       0b
    pop      {r4-r8,r10,r11}
    b         L_vectorCleanup

/*****************************************************************************
 *  Misaligned forward vld1 loop                                             *
 *****************************************************************************/

// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes.  Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.

#define COPY_UNALIGNED(offset)           \
    subs    r2,      $8                 ;\
    blt     2f                          ;\
    vld1.32 {d2,d3}, [r1]!              ;\
    subs    r2,      $8                 ;\
    blt     1f                          ;\
0:  vext.8  d0,      d2, d3, $(offset)  ;\
    vmov    d2,      d3                 ;\
    vld1.32 {d3},   [r1]!               ;\
    subs    r2,      $8                 ;\
    vst1.64 {d0},   [ip, :64]!          ;\
    bge     0b                          ;\
1:  vext.8  d0,      d2, d3, $(offset)  ;\
    sub     r1,      $8                 ;\
    vst1.64 {d0},   [ip, :64]!          ;\
2:  add     r1,      $(offset)          ;\
    add     r2,      $8                 ;\
    b       L_scalarCopy

L_sourceAligned1:
    COPY_UNALIGNED(1)
L_sourceAligned2:
    COPY_UNALIGNED(2)
L_sourceAligned3:
    COPY_UNALIGNED(3)
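
// Note on the RCOPY_UNALIGNED/COPY_UNALIGNED macros above: vext.8 d0, d2, d3,
// $(offset) treats d2:d3 as a 16-byte vector (d2 supplying the lower-addressed
// doubleword in both macros) and extracts the eight consecutive bytes starting
// at byte index "offset".  For example, with offset = 1, d0 receives bytes 1-7
// of d2 followed by byte 0 of d3 -- i.e. the misaligned source doubleword that
// begins one byte into the aligned pair.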