arm/string/bzero_CortexA8.s

   1 /*
   2  * Copyright (c) 2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 #include <arm/arch.h>
  25 #if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
  26
  27 /**********************************************************************
  28  * Cortex-A8 implementation                                           *
  29  **********************************************************************/
  30
  31 // Cortex-A8 implementations of memset( ) and bzero( ).  Main loop is 64-byte
  32 // NEON stores, unless 1k or more remains once the destination is cacheline
  33 // aligned.  At that size there is little to no speed advantage with NEON (and
  34 // a slight regression in some measured cases), so we switch to the GPRs.
  35 //
  36 // The crossover point should be reevaluated for future architectures.
  37 //
  38 // -- Stephen Canon, August 2009
  39
  40 .text
  41 .syntax unified
  42 .code 16
  43
  44 // void bzero(void * destination,
  45 //            size_t length);
  46 //
  47 // Zeros out a buffer length bytes long, beginning at the address destination.
     //
     // In:  r0 = destination, r1 = length.
     //
     // This entry point has no return of its own: it rewrites the arguments into
     // memset(destination, 0, length) form and runs straight on into
     // _memset$VARIANT$CortexA8, which therefore has to stay directly below it.
     //
     // NOTE(review): the .align 2 in front of the memset label lies on this
     // fall-through path, so any padding it emits gets executed -- confirm the
     // assembler fills code sections with nops.
  48 .thumb_func ___bzero$VARIANT$CortexA8
  49 .globl ___bzero$VARIANT$CortexA8
  50 .thumb_func _bzero$VARIANT$CortexA8
  51 .globl _bzero$VARIANT$CortexA8
  52 .align 2
  53 ___bzero$VARIANT$CortexA8:
  54 _bzero$VARIANT$CortexA8:
  55     mov     r2,     r1              // r2 = length (memset's third argument)
  56     eor     r1,     r1              // r1 = 0 (memset's fill value); fall through
  57
  58 // void *memset(void * destination,
  59 //              int value, size_t n);
  60 //
  61 // writes value converted to an unsigned char to n successive bytes, beginning
  62 // at destination.
  63
  64 // Notes on register usage:
  65 //
  66 // Throughout this function, registers have nearly constant usage; the pattern
  67 // is:
  68 //
  69 //     r0 holds the original destination pointer, unmodified.  This value
  70 //        must be returned by the routine, so it is easiest to just leave it
  71 //        in place.
  72 //     r1 holds the value that is being copied into the buffer, in some stage
  73 //        of splattedness.  The low byte is guaranteed to always have the value
  74 //        but the higher bytes may or may not contain copies of it.
  75 //     r2 holds the length minus some offset, where the offset is always the
  76 //        number of bytes that the current loop stores per iteration.
  77 //     r3-r6,r8,r10,r11 are used with stmia, and will only ever contain splatted
  78 //        copies of the value to be stored.
  79 //     ip holds a pointer to the lowest byte in the array that has not yet been
  80 //        set to hold value.
  81 //     q0 and q1 hold splatted copies of the value in the vector path, and are
  82 //        otherwise unused.
  83
  84 .thumb_func _memset$VARIANT$CortexA8
  85 .globl _memset$VARIANT$CortexA8
  86 .align 2
  87 _memset$VARIANT$CortexA8:
     // In:   r0 = destination, r1 = value, r2 = n.
     // Out:  r0 = destination (never written by this routine).
     // Uses: r1, r2, ip, q0, q1 and the flags on this path; the L_useSTMIA path
     //       also writes r3 and saves/restores r4-r6, r8, r10, r11.
     //
     // In the comments below "remaining" means the number of bytes not yet
     // stored.  r2 is kept biased: remaining - 8 while working in 8-byte units,
     // remaining - 64 while working in 64-byte units, so a negative r2 means
     // "less than one more unit is left".
     //
     // NOTE(review): the length tests use signed conditions (blt/bge), so an n
     // of 2GB or more drops straight to the byte-at-a-time loop.  Still correct,
     // just slow.
  88     mov       ip,      r0           // ip = write cursor; r0 stays intact.
  89     subs      r2,           #0x8    // r2 = length - 8.  The flags set here are
  90     and       r1,           #0xff   // consumed by the blt; the and in between
  91     blt       L_scalarCleanup       // (no s suffix) only masks value to a byte.
  92
  93     tst       ip,           #0x7    // destination already 8-byte aligned?
  94     beq       L_vectorCopy          // then skip the byte-wise alignment loop.
  95
  96 0:  strb      r1,     [ip], #1      // store single bytes (at most 7) until
  97     sub       r2,           #1      // the cursor is 8-byte aligned, keeping
  98     tst       ip,           #7      // r2 = remaining - 8 as we go.
  99     bne       0b                    //
 100
 101     cmp       r2,           #0x0    // fewer than 8 bytes remaining after
 102     blt       L_scalarCleanup       // aligning: finish with the byte loop.
 103
 104 L_vectorCopy:
 105     vdup.8    q0,      r1           // q0 = 16 copies of the fill byte.
 106     subs      r2,           #0x38   // rebias r2 to remaining - 64.
 107     vmov      q1,      q0           // q1 = q0 (does not disturb the flags).
 108     blt       L_vectorCleanup       // fewer than 64 bytes: 8-byte cleanup.
 109
 110     tst       ip,           #0x38   // bits 3-5 clear on an 8-byte aligned
 111     beq       L_cachelineAligned    // cursor means it is 64-byte aligned.
 112
 113 0:  vst1.64  {d0},    [ip, :64]!    // store 8 bytes at a time (at most 56)
 114     sub       r2,           #8      // until the cursor is 64-byte aligned,
 115     tst       ip,           #0x38   // keeping r2 = remaining - 64.
 116     bne       0b                    //
 117
 118     cmp       r2,           #0x0    // fewer than 64 bytes remaining after
 119     blt       L_vectorCleanup       // aligning: go to the 8-byte cleanup.
 120
 121 L_cachelineAligned:
 122     cmp       r2,           #0x3c0  // remaining - 64 >= 960, i.e. remaining
 123     bge       L_useSTMIA            // >= 1024: use the stmia loop instead.
 124
 125 .align 4                            // align the top of the main loop
 126 0:  vst1.64  {q0,q1}, [ip, :256]!   // store 32 bytes
 127     subs      r2,           #0x40   // r2 -= 64; flags survive the next store
 128     vst1.64  {q0,q1}, [ip, :256]!   // store 32 more bytes (64 per pass)
 129     bge       0b                    // another full 64 bytes remaining: loop
 130
 131 L_vectorCleanup:
 132     adds      r2,           #0x38   // rebias r2 from remaining - 64 back to
 133     blt       L_scalarCleanup       // remaining - 8; negative: bytes only.
 134
 135 0:  subs      r2,           #8      // store 8 bytes at a time for as long as
 136     vst1.64  {d0},    [ip, :64]!    // at least 8 bytes were remaining.
 137     bge       0b                    //
 138
 139 L_scalarCleanup:
 140     adds      r2,           #8      // remove the bias: r2 = remaining (0..7)
 141     beq       1f                    // nothing left to store: return.
 142
 143 0:  strb      r1,     [ip], #1      // store the last few bytes one at a
 144     subs      r2,           #1      // time until remaining reaches zero.
 145     bne       0b                    //
 146 1:  bx        lr                    // return; r0 still holds destination.
 147
 148 //  STMIA loop for large buffers
 149 //
 150 //  When 1024 bytes or more remain, we use STMIA because we can't get enough
 151 //  of a speedup from NEON to offset the higher power draw of the NEON unit.
 152 //
 153 //  This crossover should be reevaluated on future architectures.
 154 //
 155 //  We avoid using r7 and r9 even though it's not strictly necessary.
     //
     //  Entered only from L_cachelineAligned, so on entry ip is 64-byte aligned,
     //  r2 = remaining - 64 (at least 0x3c0) and r1 holds the fill byte in its
     //  low 8 bits with the upper bits clear.
     //
     //  Leaves through L_vectorCleanup with r2 = remaining - 64 (now negative),
     //  the same state in which the NEON main loop falls into that label.
 156
 157 L_useSTMIA:
 158     push     {r4,r5,r6,r8,r10,r11}                  // preserve the registers borrowed below
 159     orr       r1,      r1,  r1, lsl #8              // fill byte now in the low two bytes
 160     orr       r1,      r1,  r1, lsl #16             // fill byte now in all four bytes
 161     mov       r3,      r1                           // copy the splatted word into seven more
 162     mov       r4,      r1                           // registers, so that each stmia below
 163     mov       r5,      r1                           // writes 8 registers = 32 bytes
 164     mov       r6,      r1
 165     mov       r8,      r1
 166     mov       r10,     r1
 167     mov       r11,     r1
 168 .align 4
 169 0:  stmia     ip!,  {r1,r3,r4,r5,r6,r8,r10,r11}    // store 32 bytes
 170     subs      r2,           #0x40                  // r2 -= 64; flags survive the next store
 171     stmia     ip!,  {r1,r3,r4,r5,r6,r8,r10,r11}    // store 32 more bytes (64 per pass)
 172     bge       0b                                   // another full 64 bytes remaining: loop
 173     pop      {r4,r5,r6,r8,r10,r11}                  // restore the borrowed registers
 174     b         L_vectorCleanup                      // finish the sub-64-byte tail
 175
 176 #endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD
 177