apple/libc.git (Libc-391): ppc/string/memset.s
/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
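
// cpu_capabilities.h supplies the fixed commpage addresses referenced below
// (_COMM_PAGE_BZERO, _COMM_PAGE_MEMSET_PATTERN, _COMM_PAGE_VERSION); the kernel
// maps CPU-tuned implementations of those routines at these addresses in every
// process, so the code below can branch to them directly.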

/* We use mode-independent "g" opcodes such as "srgi". These expand
 * into word operations when targeting __ppc__, and into doubleword
 * operations when targeting __ppc64__.
 */
#include <architecture/ppc/mode_independent_asm.h>
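
// For example, a mode-independent "srgi" below is expected to assemble as
// "srwi" (shift right word immediate) when building for __ppc__, and as
// "srdi" (shift right doubleword immediate) when building for __ppc64__.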


        .text
#define kShort  128                     // threshold for calling commpage


/* ***************
 * * M E M S E T *
 * ***************
 *
 * Registers we use:
 *      r3 = original ptr, not changed since memset returns it
 *      r4 = count of bytes to set
 *      r7 = value to set
 *      r8 = working operand ptr
 */

        .globl _memset
        .align 5
_memset:                                // void * memset(void *b, int c, size_t len);
        andi.   r7,r4,0xFF              // copy value to working register, test for 0
        mr      r4,r5                   // move length to working register
        cmplgi  cr1,r5,kShort           // long enough to bother with _COMM_PAGE_MEMSET_PATTERN?
        beqa++  _COMM_PAGE_BZERO        // if (c==0), map to bzero()
        rlwimi  r7,r7,8,16,23           // replicate nonzero value to low 2 bytes
        neg     r5,r3                   // start to compute #bytes to align
        mr      r8,r3                   // make working copy of operand ptr
        rlwimi  r7,r7,16,0,15           // value now in all 4 bytes
        blt     cr1,Lmemset3            // too short to use commpage
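
// Sketch (C, for illustration only): the andi./rlwimi sequence above replicates
// the low byte of the fill value into all four bytes of r7, roughly:
//
//      uint32_t v = c & 0xFF;          // keep low byte only
//      v |= v << 8;                    // value now in low 2 bytes
//      v |= v << 16;                   // value now in all 4 bytes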

// TEMPORARY HACK
// Operand is long enough to use _COMM_PAGE_MEMSET_PATTERN. During Tiger
// development, B&I uses Panther kernels on their builders but runs Tiger
// apps on them. So _COMM_PAGE_MEMSET_PATTERN may not be on this machine.
// Rather than patch the build fleet kernels, we just test to see if it is there
// and use the short-operand case if not. We can remove the hack when Tiger ships.

        lhz     r10,_COMM_PAGE_VERSION(0)   // REMOVE THIS LINE WHEN TIGER SHIPS
        andi.   r0,r5,0xF               // r0 <- #bytes to align on quadword

// Align ptr and store enough so that we have an aligned 16-byte pattern.

        stw     r7,0(r8)
        stw     r7,4(r8)
        stw     r7,8(r8)
        stw     r7,12(r8)
        cmpwi   cr1,r10,1               // REMOVE THIS LINE WHEN TIGER SHIPS
        beq     Lmemset1                // skip if (r0==0), ie if r8 is 16-byte aligned
        add     r8,r8,r0                // 16-byte align ptr
        sub     r4,r4,r0                // adjust length
        stw     r7,0(r8)                // now we can store an aligned 16-byte pattern
        stw     r7,4(r8)
        stw     r7,8(r8)
        stw     r7,12(r8)

// Call machine-specific commpage routine, which expects:
//      r4 = count (>=32)
//      r8 = ptr (16-byte aligned) to memory to store
//      r9 = ptr (16-byte aligned) to 16-byte pattern to store
// When it returns:
//      r3, r7, and r12 are preserved
//      r4 and r8 are updated to reflect a residual count of 0..31 bytes

Lmemset1:
        mflr    r12                     // save return address
        mr      r9,r8                   // point to 16-byte-aligned 16-byte pattern
        addi    r8,r8,16                // point to first unstored byte
        subi    r4,r4,16                // account for the aligned bytes we have stored
        bnela++ cr1,_COMM_PAGE_MEMSET_PATTERN   // CHANGE THIS LINE WHEN TIGER SHIPS
        mtlr    r12

// Here for short nonzero memset.
//      r4 = count (<= kShort bytes)
//      r7 = pattern in all four bytes
//      r8 = ptr
Lmemset3:
        srgi.   r0,r4,4                 // any 16-byte chunks?
        mtcrf   0x01,r4                 // move length remaining to cr7 so we can test bits
        beq     Lmemset5                // fewer than 16 bytes
        mtctr   r0
        b       Lmemset4                // enter loop

        .align  5
Lmemset4:                               // loop over 16-byte chunks
        stw     r7,0(r8)
        stw     r7,4(r8)
        stw     r7,8(r8)
        stw     r7,12(r8)
        addi    r8,r8,16
        bdnz++  Lmemset4

// Handle last 0..15 bytes.
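// The mtcrf above moved the low 4 bits of the length into cr7, so the bf
// (branch-if-false) tests below amount to this sketch:
//
//      if (len & 8)  store 8 bytes
//      if (len & 4)  store 4 bytes
//      if (len & 2)  store 2 bytes
//      if (len & 1)  store 1 byte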
Lmemset5:
        bf      28,2f
        stw     r7,0(r8)
        stw     r7,4(r8)
        addi    r8,r8,8
2:
        bf      29,3f
        stw     r7,0(r8)
        addi    r8,r8,4
3:
        bf      30,4f
        sth     r7,0(r8)
        addi    r8,r8,2
4:
        bflr    31
        stb     r7,0(r8)
        blr


/* *************************************
 * * _ M E M S E T _ P A T T E R N 1 6 *
 * *************************************
 *
 * Used to store a 16-byte pattern in memory:
 *
 *      void _memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Where c16 points to the 16-byte pattern. None of the parameters need be aligned.
 */
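
// Hypothetical usage sketch (C, not part of this file; fill_demo and the pattern
// values are made up). Note that the length need not be a multiple of 16: the
// leftover code below stores a truncated final copy of the pattern.
//
//      #include <stddef.h>
//      extern void _memset_pattern16(void *b, const void *c16, size_t len);
//
//      void fill_demo(unsigned char buf[250]) {
//          static const unsigned char pat[16] =
//              { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
//          _memset_pattern16(buf, pat, 250);   // repeats pat; last copy is truncated
//      }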

        .globl __memset_pattern16
        .align 5
__memset_pattern16:
        cmplgi  cr1,r5,kShort           // check length
        lwz     r7,0(r4)                // load pattern into r7,r9,r10,r11 (these remain lwz in 64-bit mode)
        lwz     r9,4(r4)
        neg     r6,r3                   // start to compute ptr alignment
        lwz     r10,8(r4)
        lwz     r11,12(r4)
        b       __memset_pattern_common


/* ***********************************
 * * _ M E M S E T _ P A T T E R N 8 *
 * ***********************************
 *
 * Used to store an 8-byte pattern in memory:
 *
 *      void _memset_pattern8(void *b, const void *c8, size_t len);
 *
 * Where c8 points to the 8-byte pattern. None of the parameters need be aligned.
 */

        .globl __memset_pattern8
        .align 5
__memset_pattern8:
        lwz     r7,0(r4)                // load pattern (these remain lwz in 64-bit mode)
        lwz     r9,4(r4)
        cmplgi  cr1,r5,kShort           // check length
        neg     r6,r3                   // start to compute ptr alignment
        mr      r10,r7                  // replicate into 16-byte pattern
        mr      r11,r9
        b       __memset_pattern_common


/* ***********************************
 * * _ M E M S E T _ P A T T E R N 4 *
 * ***********************************
 *
 * Used to store a 4-byte pattern in memory:
 *
 *      void _memset_pattern4(void *b, const void *c4, size_t len);
 *
 * Where c4 points to the 4-byte pattern. None of the parameters need be aligned.
 */
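
// Hypothetical usage sketch (C, not part of this file): flood a pixel buffer
// with a single 32-bit color; clear_to_color and its parameters are illustrative only.
//
//      #include <stddef.h>
//      #include <stdint.h>
//      extern void _memset_pattern4(void *b, const void *c4, size_t len);
//
//      void clear_to_color(uint32_t *pixels, size_t count, uint32_t argb) {
//          _memset_pattern4(pixels, &argb, count * sizeof(uint32_t));
//      }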

        .globl __memset_pattern4
        .align 5
__memset_pattern4:
        lwz     r7,0(r4)                // load pattern
        cmplgi  cr1,r5,kShort           // check length
        neg     r6,r3                   // start to compute ptr alignment
        mr      r9,r7                   // replicate into 16-byte pattern
        mr      r10,r7
        mr      r11,r7
        b       __memset_pattern_common // don't fall through because of scatter-loading


/* ***********************************************
 * * _ M E M S E T _ P A T T E R N _ C O M M O N *
 * ***********************************************
 *
 * This is the common code used by _memset_pattern16, 8, and 4. They all get here via
 * a long branch (ie, "b") in case the routines are re-ordered, with:
 *      r3 = ptr to memory to store pattern into (unaligned)
 *      r5 = length in bytes
 *      r6 = neg(r3), used to compute #bytes to align
 *      r7, r9, r10, r11 = 16-byte pattern to store
 *      cr1 = ble if (r5 <= kShort)
 */
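
// The alignment dance below is easier to see in C. A rough sketch (illustrative
// only; store16() stands in for the four stw instructions, and len > kShort is
// guaranteed by the callers):
//
//      uint8_t *p = b;
//      size_t skew = (16 - ((uintptr_t)p & 0xF)) & 0xF;    // bytes to next 16-byte boundary
//      store16(p, pattern);            // cover the unaligned head...
//      if (skew) {
//          store16(p + 16, pattern);   // ...and leave a second copy past it
//          p += skew;                  // p is now 16-byte aligned, and p[0..15] holds
//          len -= skew;                //   the pattern rotated left by skew bytes
//      }
//      // the commpage routine then replicates the aligned 16 bytes at p using
//      // aligned 16-byte stores; the rotated pattern continues the original one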

        .globl __memset_pattern_common
        .align 5
__memset_pattern_common:
        andi.   r0,r6,0xF               // get #bytes to 16-byte align ptr
        ble--   cr1,LShort              // if short operand skip out

// Align ptr and store enough of pattern so we have an aligned
// 16-byte chunk of it (this effectively rotates the incoming pattern
// if the original ptr was not aligned).

        stw     r7,0(r3)
        stw     r9,4(r3)
        stw     r10,8(r3)
        stw     r11,12(r3)
        beq     Laligned                // skip if (r0==0), ie if r3 is 16-byte aligned
        stw     r7,16(r3)
        stw     r9,20(r3)
        stw     r10,24(r3)
        stw     r11,28(r3)
        add     r3,r3,r0                // 16-byte align ptr
        sub     r5,r5,r0                // adjust length

// We're ready to call the machine-specific commpage routine
// to do the heavy lifting. When called, _COMM_PAGE_MEMSET_PATTERN expects:
//      r4 = length (>= 32)
//      r8 = ptr (16-byte aligned)
//      r9 = ptr to 16-byte pattern (16-byte aligned)
// When it returns:
//      r3, r7, and r12 are preserved
//      r4 and r8 are updated to reflect a residual count of 0..31 bytes

Laligned:
        mflr    r12                     // save return across commpage call
        mr      r9,r3                   // point to 16-byte-aligned 16-byte pattern
        addi    r8,r3,16                // point to first unstored byte (r8 is 16-byte aligned)
        subi    r4,r5,16                // account for the aligned bytes we have stored
        bla     _COMM_PAGE_MEMSET_PATTERN
        mr.     r5,r4                   // move length (0..31) back to original reg and test for 0
        mtlr    r12
        beqlr                           // done if residual length == 0
        lwz     r7,-16(r8)              // load aligned pattern into r7,r9,r10, and r11
        lwz     r9,-12(r8)
        mr      r3,r8                   // move destination ptr back
        lwz     r10,-8(r8)
        lwz     r11,-4(r8)

// Handle short operands and leftovers.
//      r3 = dest
//      r5 = length
//      r7,r9,r10,r11 = pattern
LShort:
        srgi.   r0,r5,4                 // at least 16 bytes?
        mtcrf   0x01,r5                 // move leftover count to cr7
        beq     Lleftovers              // fewer than 16 bytes
        mtctr   r0
LShortLoop:
        stw     r7,0(r3)                // replicate the pattern
        stw     r9,4(r3)
        stw     r10,8(r3)
        stw     r11,12(r3)
        addi    r3,r3,16
        bdnz    LShortLoop              // store 16 more bytes

// Fewer than 16 bytes remaining.
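// As in the memset tail above, cr7 holds the low 4 bits of the remaining length.
// The extra twist here is that after each partial store the pattern words are
// shifted down (r10/r11 into r7/r9, then r9 into r7) so that r7 always holds the
// next bytes of the pattern that belong in memory.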
Lleftovers:
        bf      28,1f
        stw     r7,0(r3)                // store next 8 bytes
        stw     r9,4(r3)
        addi    r3,r3,8
        mr      r7,r10                  // shift pattern over
        mr      r9,r11
1:
        bf      29,2f
        stw     r7,0(r3)
        addi    r3,r3,4
        mr      r7,r9
2:
        bf      30,3f
        rlwinm  r7,r7,16,0,31           // position leftmost 2 bytes for store
        sth     r7,0(r3)
        addi    r3,r3,2
3:
        bflr    31
        srwi    r7,r7,24                // position leftmost byte for store
        stb     r7,0(r3)
        blr