[apple/xnu.git] / osfmk / ppc / bzero.s  (xnu-792)
/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align  2
        .globl  _memset
        .globl  _bzero
        .globl  _bzero_nc
        .globl  _bzero_phys


// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
// void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5. We leave cache on.
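//
// At C level the sequence below is roughly the following (hypothetical C
// prototypes for the EXT() helpers; the real routines pass their state in
// registers, so treat this purely as an illustrative sketch):
//
//      void bzero_phys(addr64_t phys_addr, uint32_t length)
//      {
//          unsigned int old_msr = ml_set_physical_get_ffs(); // DR off, SF on
//          bzero((void *)(uintptr_t)phys_addr, length);      // ordinary bzero
//          ml_restore(old_msr);                              // DR on, SF off
//      }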

        .align  5
LEXT(bzero_phys)
        mflr    r12                         // save return address
        rlwinm  r3,r3,0,1,0                 // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                       // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)                  // use normal bzero() routine
        mtlr    r12                         // restore return
        b       EXT(ml_restore)             // restore MSR, turning DR on and SF off


// *******************
// * B Z E R O _ N C *
// *******************
//
// void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory. Doesn't seem to be used at all, so probably not
// performance critical. NB: we must avoid unaligned stores, because some
// machines (e.g., 970) take alignment exceptions on _any_ unaligned op to uncached
// memory. Of course, we must also avoid dcbz.
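//
// Conceptually this is the following (illustrative C only; the real code
// falls through into bzero_tail rather than looping byte-by-byte once the
// pointer is doubleword aligned):
//
//      void bzero_nc(char *addr, unsigned int length)
//      {
//          if (length < 20) {                          // too short to bother aligning
//              while (length--) *addr++ = 0;
//              return;
//          }
//          unsigned int pad = (unsigned int)(-(uintptr_t)addr) & 7;
//          length -= pad;
//          while (pad--) *addr++ = 0;                  // doubleword align
//          // ...remaining bytes are stored with aligned std/stw etc (bzero_tail)
//      }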

LEXT(bzero_nc)
        cmplwi  cr1,r4,20                   // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                    // check for (len==0)
        li      r6,0                        // get a 0
        bge     cr1,bznc1                   // skip if length >=20
        mtctr   r4                          // set up byte loop
        beqlr-- cr7                         // done if len=0

// Short operands, loop over bytes.

bznc0:
        stb     r6,0(r3)
        addi    r3,r3,1
        bdnz    bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.

bznc1:
        neg     r7,r3                       // start to compute #bytes to align
        mfsprg  r10,2                       // get feature flags
        andi.   r0,r7,7                     // get #bytes to doubleword align
        mr      r5,r3                       // make copy of operand ptr as bzero expects
        mtcrf   0x02,r10                    // put pf64Bitb etc in cr6
        beq     bzero_tail                  // already doubleword aligned
        sub     r4,r4,r0                    // adjust count
        mtctr   r0                          // set up loop
bznc2:                                      // zero bytes until doubleword aligned
        stb     r6,0(r5)
        addi    r5,r5,1
        bdnz    bznc2
        b       bzero_tail                  // join bzero, now that r5 is aligned


// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
// void * memset(void *b, int c, size_t len);
// void bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode. Lengths (size_t) are always 32 bits.
//
// Register use:
//      r0 = temp
//      r2 = temp
//      r3 = original ptr, not changed since memset returns it
//      r4 = count of bytes to set
//      r5 = working operand ptr ("rp")
//      r6 = value to store (usually 0)
//      r7-r9 = temps
//      r10 = feature flags
//      r11 = old MSR (if bzero_phys)
//      r12 = return address (if bzero_phys)
//      cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
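//
// Overall shape of the zero path (illustrative C, not a literal translation;
// cache_line_size() and dcbz() stand in for the pf32Byte/pf128Byte feature
// test and the dcbz/dcbz128 instructions):
//
//      void bzero(void *b, size_t len)
//      {
//          unsigned char *p = b;
//          size_t line = cache_line_size();            // 32 or 128
//          if (len >= line) {
//              size_t pad = (size_t)(-(uintptr_t)p) & (line - 1);
//              for (len -= pad; pad; pad--) *p++ = 0;  // align to a cache line
//              for (; len >= line; len -= line, p += line)
//                  dcbz(p);                            // zero a whole line
//          }
//          while (len--) *p++ = 0;                     // bzero_tail, byte-level view
//      }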

        .align  5
LEXT(memset)                                // void * memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF                  // copy value to working register, test for 0
        mr      r4,r5                       // move length to working register
        bne--   memset1                     // skip if nonzero
LEXT(bzero)                                 // void bzero(void *b, size_t len);
        dcbtst  0,r3                        // touch in 1st cache block
        mfsprg  r10,2                       // get features
        li      r6,0                        // get a 0
        neg     r7,r3                       // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte   // get cache line size
        mtcrf   0x02,r10                    // put pf128Byte etc in cr6
        cmplw   r4,r0                       // operand length >= cache line size?
        mr      r5,r3                       // make copy of operand ptr (can't change r3)
        blt     bzero_tail                  // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F                // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F                // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128        // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//      r0 = #bytes to 32-byte align
//      r4 = length
//      r5 = ptr to operand
//      r6 = 0

        sub     r2,r4,r0                    // adjust length
        cmpwi   cr1,r0,0                    // already 32-byte aligned?
        srwi.   r8,r2,5                     // get #32-byte chunks
        beq     bzero_tail                  // not long enough to dcbz
        mtctr   r8                          // set up loop count
        rlwinm  r4,r2,0,27,31               // mask down to leftover byte count
        beq     cr1,bz_dcbz32               // skip if already 32-byte aligned

// 32-byte align. We just store 32 0s, rather than test and use conditional
// branches. This is usually faster, because there are no mispredicts.

        stw     r6,0(r5)                    // zero next 32 bytes
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        stw     r6,16(r5)
        stw     r6,20(r5)
        stw     r6,24(r5)
        stw     r6,28(r5)
        add     r5,r5,r0                    // now r5 is 32-byte aligned
        b       bz_dcbz32

// Loop doing 32-byte version of DCBZ instruction.

        .align  4                           // align the inner loop
bz_dcbz32:
        dcbz    0,r5                        // zero another 32 bytes
        addi    r5,r5,32
        bdnz    bz_dcbz32

// Store trailing bytes. This routine is used both by bzero and memset.
//      r4 = #bytes to store (may be large if memset)
//      r5 = address
//      r6 = value to store (in all 8 bytes)
//      cr6 = pf64Bit etc flags
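//
// Conceptually (illustrative C; the mtcrf below copies the low four bits of
// r4 into cr7, so the "len & 8/4/2/1" tests become the bf 28..31 branches,
// and storeN() is just shorthand for N bytes of stw/std/sth/stb):
//
//      for (n = len >> 4; n; n--) { store16(p, val); p += 16; }
//      if (len & 8) { store8(p, val); p += 8; }
//      if (len & 4) { store4(p, val); p += 4; }
//      if (len & 2) { store2(p, val); p += 2; }
//      if (len & 1) { store1(p, val); }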

bzero_tail:
        srwi.   r0,r4,4                     // get #(16-byte-chunks)
        mtcrf   0x01,r4                     // remaining byte count to cr7
        beq     bzt3                        // no 16-byte chunks
        mtctr   r0                          // set up loop count
        bt++    pf64Bitb,bzt2               // skip if 64-bit processor
        b       bzt1
        .align  5
bzt1:                                       // loop over 16-byte chunks on 32-bit processor
        stw     r6,0(r5)
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        addi    r5,r5,16
        bdnz    bzt1
        b       bzt3
        .align  5
bzt2:                                       // loop over 16-byte chunks on 64-bit processor
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bdnz    bzt2
        bf      28,bzt4                     // 8-byte chunk?
        std     r6,0(r5)
        addi    r5,r5,8
        b       bzt4
bzt3:
        bf      28,bzt4                     // 8-byte chunk?
        stw     r6,0(r5)
        stw     r6,4(r5)
        addi    r5,r5,8
bzt4:
        bf      29,bzt5                     // word?
        stw     r6,0(r5)
        addi    r5,r5,4
bzt5:
        bf      30,bzt6                     // halfword?
        sth     r6,0(r5)
        addi    r5,r5,2
bzt6:
        bflr    31                          // byte?
        stb     r6,0(r5)
        blr

// Operand length is >=128 and cache line size is 128. We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//      r4 = length
//      r5 = ptr to operand
//      r6 = 0
//      r7 = neg(r5)
//      r9 = #bytes to 128-byte align
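//
// Shape of this path (illustrative C; storing 16 bytes unconditionally and
// then bumping the pointer replaces a byte loop for the 16-byte-align step):
//
//      lines = (len - ((-(uintptr_t)p) & 127)) >> 7;   // r8: whole cache lines
//      store16(p, 0);                                  // the two std below
//      pad = (-(uintptr_t)p) & 15;  p += pad;  len -= pad;
//      if (lines) {
//          while ((uintptr_t)p & 127) { store16(p, 0); p += 16; len -= 16; }
//          do { dcbz128(p); p += 128; len -= 128; } while (--lines);
//      }
//      // the 0..127 leftover bytes fall through to bzero_tail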

        .align  5
bzero_128:
        sub     r2,r4,r9                    // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF                 // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                     // r8 <- number of cache lines to 0
        std     r6,0(r5)                    // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                    // ...even if too short for dcbz128
        add     r5,r5,r0                    // 16-byte align ptr
        sub     r4,r4,r0                    // adjust count
        beq     bzero_tail                  // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                    // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F                // r4 <- length remaining after dcbz128'ing
        mtctr   r8                          // set up dcbz128 loop
        beq     bz_dcbz128                  // already 128-byte aligned
        b       bz_align                    // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align  5
bz_align:                                   // loop over 16-byte chunks
        subic.  r7,r7,16                    // more to go?
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bgt     bz_align

        b       bz_dcbz128                  // enter dcbz128 loop

// Loop over 128-byte cache lines.
//      r4 = length remaining after cache lines (0..127)
//      r5 = ptr (128-byte aligned)
//      r6 = 0
//      ctr = count of cache lines to 0

        .align  5
bz_dcbz128:
        dcbz128 0,r5                        // zero a 128-byte cache line
        addi    r5,r5,128
        bdnz    bz_dcbz128

        b       bzero_tail                  // handle leftovers


// Handle memset() for nonzero values. This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//      r3 = ptr
//      r4 = count
//      r6 = value in lower byte (nonzero)
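//
// The two rlwimi and the rlwinm below replicate the byte through the
// register; for example, with c = 0xAB (the 64-bit step only matters when
// the std path of bzero_tail is used):
//
//      0x000000AB  --rlwimi 8-->  0x0000ABAB
//                  --rlwimi 16--> 0xABABABAB
//                  --rlwinm-->    0xABABABABABABABAB   (low word copied high)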

memset1:
        cmplwi  r4,16                       // too short to bother aligning?
        rlwimi  r6,r6,8,16,23               // replicate value to low 2 bytes
        mr      r5,r3                       // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15               // value now in all 4 bytes
        blt     bzero_tail                  // length<16, we won't be using "std"
        mfsprg  r10,2                       // get feature flags
        neg     r7,r5                       // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0                 // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7                     // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                    // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                    // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                    // adjust count
        add     r5,r5,r0                    // doubleword align ptr
        b       bzero_tail