/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <ppc/exception.h>

// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
// void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5. We leave cache on.
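//
// Roughly equivalent C-level sketch (illustrative only; it assumes, per the comments
// below, that ml_set_physical_get_ffs() hands back the old MSR after turning data
// relocation off and 64-bit addressing on, and that ml_restore() takes that MSR back):
//
//      void bzero_phys(addr64_t phys_addr, uint32_t length)
//      {
//          unsigned int old_msr = ml_set_physical_get_ffs();  // DR off, SF on
//          bzero((void *)(uintptr_t)phys_addr, length);       // ordinary cached bzero
//          ml_restore(old_msr);                               // DR back on, SF off
//      }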
        mflr    r12                     // save return address
        rlwinm  r3,r3,0,1,0             // coalesce long-long in (r3,r4) into reg64_t in r3
        mr      r4,r5                   // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs)  // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)              // use normal bzero() routine
        mtlr    r12                     // restore return address
        b       EXT(ml_restore)         // restore MSR, turning DR on and SF off

// *******************
// * B Z E R O _ N C *
// *******************
//
// void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory. Doesn't seem to be used at all, so probably not
// performance critical. NB: we must avoid unaligned stores, because some
// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
// memory. Of course, we must also avoid dcbz.
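//
// Illustrative sketch of the strategy used below (it mirrors the 20-byte cutoff and
// the doubleword-alignment step visible in the code):
//
//      if (len < 20) {                         // short: simple byte loop
//          while (len--) *p++ = 0;
//      } else {
//          while ((uintptr_t)p & 7) {          // bytes until doubleword aligned
//              *p++ = 0;
//              len--;
//          }
//          /* fall into bzero_tail, which uses aligned stores only */
//      }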

        cmplwi  cr1,r4,20               // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                // check for (len==0)
        bge     cr1,bznc1               // skip if length >=20
        mtctr   r4                      // set up byte loop
        beqlr-- cr7                     // done if len=0

// Short operands, loop over bytes.

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.
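//
// The "neg/andi." pair below computes the byte count needed to reach an 8-byte
// boundary; in C terms (illustrative): nalign = (0 - (uintptr_t)addr) & 7.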

        neg     r7,r3                   // start to compute #bytes to align
        mfsprg  r10,2                   // get feature flags
        andi.   r0,r7,7                 // get #bytes to doubleword align
        mr      r5,r3                   // make copy of operand ptr, as bzero_tail expects
        mtcrf   0x02,r10                // put pf64Bitb etc in cr6
        beq     bzero_tail              // already doubleword aligned
        sub     r4,r4,r0                // adjust count
        mtctr   r0                      // set up loop
bznc2:                                  // zero bytes until doubleword aligned
        b       bzero_tail              // join bzero, now that r5 is aligned

// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
// void * memset(void *b, int c, size_t len);
// void   bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32- and
// 64-bit mode. Lengths (size_t) are always 32 bits.
//
// Register use:
//      r3  = original ptr, not changed since memset returns it
//      r4  = count of bytes to set
//      r5  = working operand ptr ("rp")
//      r6  = value to store (usually 0)
//      r10 = feature flags
//      r11 = old MSR (if bzero_phys)
//      r12 = return address (if bzero_phys)
//      cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
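//
// Illustrative outline of the zero path that follows (a sketch of the flow only; the
// cache-line size of 32 or 128 bytes comes from the feature flags, as in the code):
//
//      if (len < linesize)
//          goto bzero_tail;                    // too short for dcbz
//      store zeros up to the next cache-line boundary;
//      while (a full cache line remains)
//          dcbz(p), p += linesize;             // zero whole lines in the cache
//      bzero_tail:                             // 0..linesize-1 leftover bytes
//          store 16/8/4/2/1-byte pieces selected by the low bits of the count;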

LEXT(memset)                            // void * memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF              // copy value to working register, test for 0
        mr      r4,r5                   // move length to working register
        bne--   memset1                 // skip to memset1 if value is nonzero
LEXT(bzero)                             // void bzero(void *b, size_t len);
        dcbtst  0,r3                    // touch in 1st cache block
        mfsprg  r10,2                   // get features
        neg     r7,r3                   // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte // get cache line size
        mtcrf   0x02,r10                // put pf128Byte etc in cr6
        cmplw   r4,r0                   // operand length >= cache line size?
        mr      r5,r3                   // make copy of operand ptr (can't change r3)
        blt     bzero_tail              // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F            // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F            // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128    // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//      r0 = #bytes to 32-byte align
//      r5 = ptr to operand

        sub     r2,r4,r0                // adjust length
        cmpwi   cr1,r0,0                // already 32-byte aligned?
        srwi.   r8,r2,5                 // get #32-byte chunks
        beq     bzero_tail              // not long enough to dcbz
        mtctr   r8                      // set up loop count
        rlwinm  r4,r2,0,27,31           // mask down to leftover byte count
        beq     cr1,bz_dcbz32           // skip if already 32-byte aligned

// 32-byte align. We just store 32 0s, rather than test and use conditional
// branches. This is usually faster, because there are no mispredicts.
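//
// Note that the 32 zero bytes are stored unconditionally and r5 is then advanced by
// only r0 (0..31 bytes): this path is taken only when at least r0+32 bytes remain,
// so the stores stay in bounds, and the overlapped bytes are simply re-zeroed by the
// dcbz loop.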

        stw     r6,0(r5)                // zero next 32 bytes
        add     r5,r5,r0                // now r5 is 32-byte aligned

// Loop doing 32-byte version of DCBZ instruction.

        .align  4                       // align the inner loop
        dcbz    0,r5                    // zero another 32 bytes
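//
// dcbz establishes the addressed 32-byte block in the data cache as zeros without
// first reading it from memory; the loop amounts to (illustrative):
//      while (lines--) { dcbz(p); p += 32; }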

// Store trailing bytes. This routine is used both by bzero and memset.
//      r4  = #bytes to store (may be large if memset)
//      r6  = value to store (in all 8 bytes)
//      cr6 = pf64Bit etc flags
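//
// Illustrative sketch of the tail logic (cr7 receives the low bits of the count, so
// CR bits 28..31 select the 8/4/2/1-byte leftovers):
//
//      for (i = 0; i < len >> 4; i++)  store 16 bytes;     // stw x4 or std x2
//      if (len & 8)  store 8 bytes;
//      if (len & 4)  store 4 bytes;
//      if (len & 2)  store 2 bytes;
//      if (len & 1)  store the last byte;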

        srwi.   r0,r4,4                 // get #(16-byte-chunks)
        mtcrf   0x01,r4                 // remaining byte count to cr7
        beq     bzt3                    // no 16-byte chunks
        mtctr   r0                      // set up loop count
        bt++    pf64Bitb,bzt2           // skip if 64-bit processor

bzt1:                                   // loop over 16-byte chunks on 32-bit processor

bzt2:                                   // loop over 16-byte chunks on 64-bit processor

        bf      28,bzt4                 // 8-byte chunk?

        bf      28,bzt4                 // 8-byte chunk?

        bf      30,bzt6                 // halfword?

// Operand length is >=128 and cache line size is 128. We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//      r5 = ptr to operand
//      r9 = #bytes to 128-byte align
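//
// Outline of the 128-byte path below (illustrative): a pair of "std" stores reaches
// the next 16-byte boundary, a short loop of 16-byte chunks reaches the 128-byte
// boundary, dcbz128 then zeros whole cache lines, and bzero_tail finishes the
// remaining 0..127 bytes.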

        sub     r2,r4,r9                // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF             // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                 // r8 <- number of cache lines to 0
        std     r6,0(r5)                // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                // ...even if too short for dcbz128
        add     r5,r5,r0                // 16-byte align ptr
        sub     r4,r4,r0                // adjust count
        beq     bzero_tail              // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F            // r4 <- length remaining after dcbz128'ing
        mtctr   r8                      // set up dcbz128 loop
        beq     bz_dcbz128              // already 128-byte aligned
        b       bz_align                // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

bz_align:                               // loop over 16-byte chunks
        subic.  r7,r7,16                // more to go?
        b       bz_dcbz128              // enter dcbz128 loop

// Loop over 128-byte cache lines.
//      r4  = length remaining after cache lines (0..127)
//      r5  = ptr (128-byte aligned)
//      ctr = count of cache lines to 0
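//
// As in the 32-byte case, each dcbz128 zeros an entire 128-byte line in the data
// cache without first reading it from memory; roughly (illustrative):
//      while (lines--) { dcbz128(p); p += 128; }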

        dcbz128 0,r5                    // zero a 128-byte cache line
        b       bzero_tail              // handle leftovers

// Handle memset() for nonzero values. This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//      r6 = value in lower byte (nonzero)
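//
// The rlwimi/rlwinm sequence below replicates that byte into every byte of the
// register; an equivalent C-level sketch (illustrative, with v a 64-bit unsigned):
//
//      v  = c & 0xFF;
//      v |= v << 8;            // value in low 2 bytes  (rlwimi r6,r6,8,16,23)
//      v |= v << 16;           // value in all 4 bytes  (rlwimi r6,r6,16,0,15)
//      v |= v << 32;           // value in all 8 bytes  (rlwinm r6,r6,0,1,0)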

        cmplwi  r4,16                   // too short to bother aligning?
        rlwimi  r6,r6,8,16,23           // replicate value to low 2 bytes
        mr      r5,r3                   // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15           // value now in all 4 bytes
        blt     bzero_tail              // length<16, we won't be using "std"
        mfsprg  r10,2                   // get feature flags
        neg     r7,r5                   // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0             // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7                 // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                // adjust count
        add     r5,r5,r0                // doubleword align ptr
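
// Like the zero path, this deliberately overshoots: 8 bytes of the value are stored
// before the pointer is bumped by the 0..7 alignment bytes, so the overlapped bytes
// are simply written twice with the same value; length >= 16 here, so the stores
// stay in bounds.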