/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align  2
        .globl  _memset
        .globl  _bzero
        .globl  _bzero_nc
        .globl  _bzero_phys
        .globl  _bzero_phys_nc


// *****************************
// * B Z E R O _ P H Y S _ N C *
// *****************************
//
// void bzero_phys_nc(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5. NO CACHING
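//
// Both this routine and bzero_phys merge the (r3,r4) register pair into a
// single 64-bit physical address in r3.  "rlwinm r3,r3,0,1,0" uses a
// wrap-around mask (MB > ME) that keeps every bit while replicating r3's low
// word into its upper half, and "rlwimi r3,r4,0,0,31" then inserts r4 into
// the low 32 bits, so r3 ends up holding (high32 << 32) | low32.  On 32-bit
// processors the same two instructions simply leave the low word, r4, in r3.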

        .align  5
LEXT(bzero_phys_nc)
        mflr    r12                         // save return address
        rlwinm  r3,r3,0,1,0                 // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                       // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero_nc)               // use normal bzero() routine
        mtlr    r12                         // restore return
        b       EXT(ml_restore)             // restore MSR, turning DR on and SF off


// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
// void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5. We leave cache on.

        .align  5
LEXT(bzero_phys)
        mflr    r12                         // save return address
        rlwinm  r3,r3,0,1,0                 // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                       // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)                  // use normal bzero() routine
        mtlr    r12                         // restore return
        b       EXT(ml_restore)             // restore MSR, turning DR on and SF off


// *******************
// * B Z E R O _ N C *
// *******************
//
// void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory. Doesn't seem to be used at all, so probably not
// performance critical. NB: we must avoid unaligned stores, because some
// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
// memory. Of course, we must also avoid dcbz.
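// (dcbz establishes and zeros an entire cache block; on caching-inhibited
// storage it is not guaranteed to work and may take an alignment interrupt,
// so the code below sticks to ordinary aligned stores.)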

LEXT(bzero_nc)
        cmplwi  cr1,r4,20                   // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                    // check for (len==0)
        li      r6,0                        // get a 0
        bge     cr1,bznc1                   // skip if length >=20
        mtctr   r4                          // set up byte loop
        beqlr-- cr7                         // done if len=0

// Short operands, loop over bytes.

bznc0:
        stb     r6,0(r3)
        addi    r3,r3,1
        bdnz    bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.
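// The count is computed as r0 = (-addr) & 7, i.e. the number of bytes from
// addr up to the next 8-byte boundary (0 if already aligned).  For example,
// addr = 0x1003 gives (-0x1003) & 7 = 5, and 0x1003 + 5 = 0x1008.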

bznc1:
        neg     r7,r3                       // start to compute #bytes to align
        mfsprg  r10,2                       // get feature flags
        andi.   r0,r7,7                     // get #bytes to doubleword align
        mr      r5,r3                       // make copy of operand ptr as bcopy expects
        mtcrf   0x02,r10                    // put pf64Bitb etc in cr6
        beq     bzero_tail                  // already doubleword aligned
        sub     r4,r4,r0                    // adjust count
        mtctr   r0                          // set up loop
bznc2:                                      // zero bytes until doubleword aligned
        stb     r6,0(r5)
        addi    r5,r5,1
        bdnz    bznc2
        b       bzero_tail                  // join bzero, now that r5 is aligned


// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
// void * memset(void *b, int c, size_t len);
// void bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode. Lengths (size_t) are always 32 bits.
//
// Register use:
//    r0 = temp
//    r2 = temp
//    r3 = original ptr, not changed since memset returns it
//    r4 = count of bytes to set
//    r5 = working operand ptr ("rp")
//    r6 = value to store (usually 0)
//    r7-r9 = temps
//    r10 = feature flags
//    r11 = old MSR (if bzero_phys)
//    r12 = return address (if bzero_phys)
//    cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
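//
// The feature flags come from sprg2 (mfsprg rX,2); "mtcrf 0x02,rX" copies the
// nibble of that word corresponding to CR field 6 into cr6 (mtcrf's 0x02 mask
// selects field 6), so bits such as pf64Bitb and pf128Byteb can be tested
// directly with bt++/bf.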

        .align  5
LEXT(memset)                                // void * memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF                  // copy value to working register, test for 0
        mr      r4,r5                       // move length to working register
        bne--   memset1                     // skip if nonzero
LEXT(bzero)                                 // void bzero(void *b, size_t len);
        dcbtst  0,r3                        // touch in 1st cache block
        mfsprg  r10,2                       // get features
        li      r6,0                        // get a 0
        neg     r7,r3                       // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte   // get cache line size
        mtcrf   0x02,r10                    // put pf128Byte etc in cr6
        cmplw   r4,r0                       // operand length >= cache line size?
        mr      r5,r3                       // make copy of operand ptr (can't change r3)
        blt     bzero_tail                  // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F                // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F                // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128        // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//    r0 = #bytes to 32-byte align
//    r4 = length
//    r5 = ptr to operand
//    r6 = 0

        sub     r2,r4,r0                    // adjust length
        cmpwi   cr1,r0,0                    // already 32-byte aligned?
        srwi.   r8,r2,5                     // get #32-byte chunks
        beq     bzero_tail                  // not long enough to dcbz
        mtctr   r8                          // set up loop count
        rlwinm  r4,r2,0,27,31               // mask down to leftover byte count
        beq     cr1,bz_dcbz32               // skip if already 32-byte aligned

// 32-byte align. We just store 32 0s, rather than test and use conditional
// branches. This is usually faster, because there are no mispredicts.
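// (Only r0 (1..31) of these 32 zero bytes are needed to reach the next
// 32-byte boundary; the rest land in storage the dcbz loop is about to zero
// anyway, so the overlap is harmless and cheaper than branching on the exact
// alignment.)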

        stw     r6,0(r5)                    // zero next 32 bytes
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        stw     r6,16(r5)
        stw     r6,20(r5)
        stw     r6,24(r5)
        stw     r6,28(r5)
        add     r5,r5,r0                    // now r5 is 32-byte aligned
        b       bz_dcbz32

// Loop doing 32-byte version of DCBZ instruction.
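// dcbz zeros the entire 32-byte cache block containing its effective address,
// so r5 must be block-aligned here for the byte accounting above to be exact.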

        .align  4                           // align the inner loop
bz_dcbz32:
        dcbz    0,r5                        // zero another 32 bytes
        addi    r5,r5,32
        bdnz    bz_dcbz32

// Store trailing bytes. This routine is used both by bzero and memset.
//    r4 = #bytes to store (may be large if memset)
//    r5 = address
//    r6 = value to store (in all 8 bytes)
//    cr6 = pf64Bit etc flags
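//
// "mtcrf 0x01,r4" copies the low nibble of the byte count into cr7, so the
// leftover 8-, 4-, 2-, and 1-byte pieces can be handled with bf/bflr tests of
// cr7 bits 28, 29, 30, and 31 instead of shifting and masking r4 again.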

bzero_tail:
        srwi.   r0,r4,4                     // get #(16-byte-chunks)
        mtcrf   0x01,r4                     // remaining byte count to cr7
        beq     bzt3                        // no 16-byte chunks
        mtctr   r0                          // set up loop count
        bt++    pf64Bitb,bzt2               // skip if 64-bit processor
        b       bzt1
        .align  5
bzt1:                                       // loop over 16-byte chunks on 32-bit processor
        stw     r6,0(r5)
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        addi    r5,r5,16
        bdnz    bzt1
        b       bzt3
        .align  5
bzt2:                                       // loop over 16-byte chunks on 64-bit processor
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bdnz    bzt2
        bf      28,bzt4                     // 8-byte chunk?
        std     r6,0(r5)
        addi    r5,r5,8
        b       bzt4
bzt3:
        bf      28,bzt4                     // 8-byte chunk?
        stw     r6,0(r5)
        stw     r6,4(r5)
        addi    r5,r5,8
bzt4:
        bf      29,bzt5                     // word?
        stw     r6,0(r5)
        addi    r5,r5,4
bzt5:
        bf      30,bzt6                     // halfword?
        sth     r6,0(r5)
        addi    r5,r5,2
bzt6:
        bflr    31                          // byte?
        stb     r6,0(r5)
        blr

// Operand length is >=128 and cache line size is 128. We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//    r4 = length
//    r5 = ptr to operand
//    r6 = 0
//    r7 = neg(r5)
//    r9 = #bytes to 128-byte align
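//
// Strategy: store 16 zero bytes unconditionally and advance r5 by r0 (0..15)
// to reach a 16-byte boundary (safe, since the length is at least 128 here),
// loop 16 bytes at a time until 128-byte aligned, then let dcbz128 (presumably
// the 970's full-cache-line variant of dcbz) zero whole 128-byte lines.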

        .align  5
bzero_128:
        sub     r2,r4,r9                    // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF                 // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                     // r8 <- number of cache lines to 0
        std     r6,0(r5)                    // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                    // ...even if too short for dcbz128
        add     r5,r5,r0                    // 16-byte align ptr
        sub     r4,r4,r0                    // adjust count
        beq     bzero_tail                  // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                    // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F                // r4 <- length remaining after dcbz128'ing
        mtctr   r8                          // set up dcbz128 loop
        beq     bz_dcbz128                  // already 128-byte aligned
        b       bz_align                    // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align  5
bz_align:                                   // loop over 16-byte chunks
        subic.  r7,r7,16                    // more to go?
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bgt     bz_align

        b       bz_dcbz128                  // enter dcbz128 loop

// Loop over 128-byte cache lines.
//    r4 = length remaining after cache lines (0..127)
//    r5 = ptr (128-byte aligned)
//    r6 = 0
//    ctr = count of cache lines to 0

        .align  5
bz_dcbz128:
        dcbz128 0,r5                        // zero a 128-byte cache line
        addi    r5,r5,128
        bdnz    bz_dcbz128

        b       bzero_tail                  // handle leftovers


// Handle memset() for nonzero values. This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//    r3 = ptr
//    r4 = count
//    r6 = value in lower byte (nonzero)
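//
// The three rotate-and-insert instructions below fan the byte out to every
// byte lane of r6.  For example, with c = 0xAB:
//    rlwimi  r6,r6,8,16,23       0x000000AB -> 0x0000ABAB
//    rlwimi  r6,r6,16,0,15       0x0000ABAB -> 0xABABABAB
//    rlwinm  r6,r6,0,1,0         0xABABABAB -> 0xABABABABABABABAB on 64-bit
//                                registers (a no-op on 32-bit processors,
//                                which never reach the "std" paths)
// The pair of stw's is done unconditionally; the pointer then advances by only
// r0 (0..7) bytes, and any bytes stored twice get the same value, so the
// overlap is harmless.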

memset1:
        cmplwi  r4,16                       // too short to bother aligning?
        rlwimi  r6,r6,8,16,23               // replicate value to low 2 bytes
        mr      r5,r3                       // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15               // value now in all 4 bytes
        blt     bzero_tail                  // length<16, we won't be using "std"
        mfsprg  r10,2                       // get feature flags
        neg     r7,r5                       // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0                 // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7                     // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                    // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                    // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                    // adjust count
        add     r5,r5,r0                    // doubleword align ptr
        b       bzero_tail