X-Git-Url: https://git.saurik.com/apple/libc.git/blobdiff_plain/1f2f436a38f7ae2d39a943ad2898d8fed4ed2e58..a28bf75d63c6a64e4c3b417c6052e45f42c6cedd:/arm/string/bzero_CortexA8.s

diff --git a/arm/string/bzero_CortexA8.s b/arm/string/bzero_CortexA8.s
index e69de29..bcbfea0 100644
--- a/arm/string/bzero_CortexA8.s
+++ b/arm/string/bzero_CortexA8.s
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include <arm/arch.h>
+#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
+
+/**********************************************************************
+ *  Cortex-A8 implementation                                          *
+ **********************************************************************/
+
+// Cortex-A8 implementations of memset( ) and bzero( ).  Main loop is 64-byte
+// NEON stores, unless the buffer length is > 1k.  Beyond that point, there is
+// little to no speed advantage with NEON (and a slight regression in some
+// measured cases), so we switch to the GPRs.
+//
+// The crossover point should be reevaluated for future architectures.
+//
+// -- Stephen Canon, August 2009
+
+.text
+.syntax unified
+.code 16
+
+// void bzero(void * destination,
+//            size_t length);
+//
+// zeros out a buffer length bytes long, beginning at the address destination.
+.thumb_func ___bzero$VARIANT$CortexA8
+.globl ___bzero$VARIANT$CortexA8
+.thumb_func _bzero$VARIANT$CortexA8
+.globl _bzero$VARIANT$CortexA8
+.align 2
+___bzero$VARIANT$CortexA8:
+_bzero$VARIANT$CortexA8:
+    mov     r2,     r1          // match the API to memset(dest, 0, length)
+    eor     r1,     r1          // and fall through into memset
+
+// void *memset(void * destination,
+//              int value, size_t n);
+//
+// writes value converted to an unsigned char to n successive bytes, beginning
+// at destination.
+
+// Notes on register usage:
+//
+// Throughout this function, registers have nearly constant usage; the pattern
+// is:
+//
+//  r0      holds the original destination pointer, unmodified.  This value
+//          must be returned by the routine, so it is easiest to just leave it
+//          in place.
+//  r1      holds the value that is being copied into the buffer, in some stage
+//          of splattedness.  The low byte is guaranteed to always have the
+//          value, but the higher bytes may or may not contain copies of it.
+//  r2      holds the length minus some offset, where the offset is always the
+//          number of bytes that the current loop stores per iteration.
+//  r3-r6,r8,r10,r11 are used with stmia, and will only ever contain splatted
+//          copies of the value to be stored.
+//  ip      holds a pointer to the lowest byte in the array that has not yet
+//          been set to hold value.
+//  q0 and q1 hold splatted copies of the value in the vector path, and are
+//          otherwise unused.
+
+.thumb_func _memset$VARIANT$CortexA8
+.globl _memset$VARIANT$CortexA8
+.align 2
+_memset$VARIANT$CortexA8:
+    mov     ip,     r0          // copy destination pointer.
+    subs    r2,     #0x8        // if length - 8 is negative (i.e. length
+    and     r1,     #0xff       // is less than 8), jump to cleanup path.
+    blt     L_scalarCleanup     //
+
+    tst     ip,     #0x7        // if the destination is doubleword
+    beq     L_vectorCopy        // aligned, jump to fast path.
+
+0:  strb    r1,    [ip], #1     // store one byte at a time until
+    sub     r2,     #1          // destination pointer is 8 byte aligned.
+    tst     ip,     #7          //
+    bne     0b                  //
+
+    cmp     r2,     #0x0        // if length - 8 is negative,
+    blt     L_scalarCleanup     // jump to the cleanup code
+
+L_vectorCopy:
+    vdup.8  q0,     r1          // splat the byte to be stored across
+    subs    r2,     #0x38       // q0 and q1, and check if length - 64
+    vmov    q1,     q0          // is negative; if so, jump to the
+    blt     L_vectorCleanup     // cleanup code.
+
+    tst     ip,     #0x38       // if the destination is cacheline
+    beq     L_cachelineAligned  // aligned, jump to the fast path.
+
+0:  vst1.64 {d0}, [ip, :64]!    // store one doubleword at a time until
+    sub     r2,     #8          // the destination is 64-byte aligned
+    tst     ip,     #0x38       //
+    bne     0b
+
+    cmp     r2,     #0x0        // if length - 64 is negative,
+    blt     L_vectorCleanup     // jump to the cleanup code
+
+L_cachelineAligned:
+    cmp     r2,     #0x3c0      // if length >= 1024,
+    bge     L_useSTMIA          // we use stmia instead
+
+.align 4                        // main loop
+0:  vst1.64 {q0,q1}, [ip, :256]!    // store 32 bytes
+    subs    r2,     #0x40       // decrement length by 64
+    vst1.64 {q0,q1}, [ip, :256]!    // store 32 bytes
+    bge     0b                  // if length - 64 >= 0, continue
+
+L_vectorCleanup:
+    adds    r2,     #0x38       // if (length - 8) < 0, goto scalar cleanup
+    blt     L_scalarCleanup     //
+
+0:  subs    r2,     #8          // store one doubleword at a time until
+    vst1.64 {d0}, [ip, :64]!    // (length - 8) < 0.
+    bge     0b
+
+L_scalarCleanup:
+    adds    r2,     #8          // restore length
+    beq     1f                  // early out if zero.
+
+0:  strb    r1,    [ip], #1     // store one byte at a time until length
+    subs    r2,     #1          // is zero.
+    bne     0b                  //
+1:  bx      lr                  // return.
+
+// STMIA loop for large buffers
+//
+// For stores larger than 1024 bytes, we use STMIA because we can't get enough
+// of a speedup from NEON to offset the higher power draw of the NEON unit.
+//
+// This crossover should be reevaluated on future architectures.
+//
+// We avoid using r7 and r9 even though it's not strictly necessary.
+
+L_useSTMIA:
+    push    {r4,r5,r6,r8,r10,r11}
+    orr     r1,     r1, r1, lsl #8
+    orr     r1,     r1, r1, lsl #16
+    mov     r3,     r1
+    mov     r4,     r1
+    mov     r5,     r1
+    mov     r6,     r1
+    mov     r8,     r1
+    mov     r10,    r1
+    mov     r11,    r1
+.align 4
+0:  stmia   ip!,   {r1,r3,r4,r5,r6,r8,r10,r11}
+    subs    r2,     #0x40
+    stmia   ip!,   {r1,r3,r4,r5,r6,r8,r10,r11}
+    bge     0b
+    pop     {r4,r5,r6,r8,r10,r11}
+    b       L_vectorCleanup
+
+#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD
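
For readers who want the control flow without the Thumb-2 details, the routine
can be summarized in portable C. The following is a minimal sketch of the same
three-phase structure (byte stores to reach 8-byte alignment, 64-byte chunks
for the bulk, scalar cleanup for the tail). The names splat8, memset_sketch,
and bzero_sketch are hypothetical and not part of libc, and the sketch
deliberately omits the NEON/STMIA crossover, which affects only performance,
not behavior.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Splat the low byte of c across a 64-bit word -- the same trick the STMIA
 * path performs with "orr r1, r1, r1, lsl #8" / "orr r1, r1, r1, lsl #16",
 * extended here to 64 bits.  (Hypothetical helper, not part of libc.) */
static uint64_t splat8(int c)
{
    uint64_t v = (uint8_t)c;
    v |= v << 8;                        /* 0x00000000000000xx -> 0x000000000000xxxx */
    v |= v << 16;                       /* -> 0x00000000xxxxxxxx */
    v |= v << 32;                       /* -> 0xxxxxxxxxxxxxxxxx (all 8 bytes) */
    return v;
}

void *memset_sketch(void *dst, int c, size_t n)
{
    uint8_t  *p = dst;
    uint64_t  v = splat8(c);

    /* Phase 1: byte stores until p is 8-byte aligned (cf. the strb loop). */
    while (n && ((uintptr_t)p & 7)) {
        *p++ = (uint8_t)c;
        n--;
    }
    /* Phase 2: 64 bytes per iteration (cf. the paired vst1.64 stores). */
    while (n >= 64) {
        for (int i = 0; i < 8; i++)
            memcpy(p + 8 * i, &v, 8);   /* eight aligned 8-byte stores */
        p += 64;
        n -= 64;
    }
    /* Phase 3: leftover doublewords, then leftover bytes (cf. the
     * L_vectorCleanup and L_scalarCleanup loops). */
    while (n >= 8) {
        memcpy(p, &v, 8);
        p += 8;
        n -= 8;
    }
    while (n--)
        *p++ = (uint8_t)c;
    return dst;
}

/* bzero is memset with the arguments shuffled, exactly as the assembly's
 * "mov r2, r1 / eor r1, r1" prologue arranges before falling through. */
void bzero_sketch(void *dst, size_t n)
{
    memset_sketch(dst, 0, n);
}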
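
The sketch can be sanity-checked against the system memset across a range of
lengths and starting offsets, which exercises the same alignment cases the
assembly distinguishes. This harness is likewise hypothetical (it assumes
memset_sketch from the sketch above) and is not part of the libc test suite.

#include <assert.h>
#include <stddef.h>
#include <string.h>

void *memset_sketch(void *dst, int c, size_t n);   /* from the sketch above */

int main(void)
{
    unsigned char a[256], b[256];
    for (size_t off = 0; off < 16; off++) {        /* vary initial alignment */
        for (size_t len = 0; off + len <= sizeof a; len += 7) {
            memset(a, 0xa5, sizeof a);             /* poison both buffers */
            memset(b, 0xa5, sizeof b);
            memset_sketch(a + off, 0x3c, len);     /* sketch under test */
            memset(b + off, 0x3c, len);            /* reference behavior */
            assert(memcmp(a, b, sizeof a) == 0);   /* must match exactly */
        }
    }
    return 0;
}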