/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#define __APPLE_API_PRIVATE
#include <machine/cpu_capabilities.h>
#undef __APPLE_API_PRIVATE

// Bzero has migrated to the comm page.
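// _COMM_PAGE_BZERO (defined in cpu_capabilities.h) is a fixed address in the
// "commpage", a region the kernel maps into every process and populates with
// routines tuned for the CPU it is actually running on.  Branching there with
// an absolute branch lets this one libc binary use the best bzero for the
// machine at run time.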

        .text
        .globl  _bzero
        .globl  _memset

        .align  5
_bzero:                             // void bzero(void *b, size_t len);
        ba      _COMM_PAGE_BZERO

        .align  5
_memset:                            // void * memset(void *b, int c, size_t len);
        andi.   r9,r4,0xFF          // copy "c" and test for 0
        mr      r4,r5               // move length down to where bzero() expects it
        beqa++  _COMM_PAGE_BZERO    // c==0, so treat like bzero()
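// ("beqa" is a conditional branch to an absolute address; the "++" suffix is
// an assembler hint that the branch is expected to be taken.)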

// The nonzero memset() case is uncommon.

        cmplwi  r5,8                // too short to align?
        rlwimi  r9,r9,8,16,23       // replicate c to all 4 bytes
        neg     r7,r3               // start to compute #bytes to word align
        mr      r8,r3               // copy ptr so we can preserve r3
        rlwimi  r9,r9,16,0,15
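// (Worked example: with c = 0xAB, the first rlwimi turns 0x000000AB into
// 0x0000ABAB, and the second copies that halfword into the upper half,
// leaving 0xABABABAB in r9.)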
        blt     4f                  // fewer than 8 bytes
        andi.   r0,r7,3             // get #bytes to word align
        mtcrf   0x01,r7             // set up #bytes to word align
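// (mtcrf 0x01 loads the low 4 bits of r7 into cr7, so the "bf 31" below
// tests the 1s bit of the alignment count and "bf 30" the 2s bit.)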
        sub     r5,r5,r0            // adjust length for word alignment
        srwi    r6,r5,3             // get #8-byte chunks to memset()
        cmplwi  cr1,r6,0            // any chunks?
        mtctr   r6
        beq     3f                  // already word aligned (r6!=0, since length>=8)

        bf      31,1f               // odd byte?
        stb     r9,0(r8)
        addi    r8,r8,1
1:
        bf      30,2f               // halfword?
        sth     r9,0(r8)
        addi    r8,r8,2
2:
        bne     cr1,3f              // handle 8-byte chunks
        b       4f                  // no chunks

        .align  5
3:
        stw     r9,0(r8)
        stw     r9,4(r8)
        addi    r8,r8,8
        bdnz    3b

// Store up to 8 leftover bytes.
//      r9 = value in all 4 bytes
//      r8 = ptr
//      r5 = length

4:
        mtcrf   0x01,r5             // move remaining length to cr7
        bf      29,6f
        stw     r9,0(r8)
        addi    r8,r8,4
6:
        bf      30,7f
        sth     r9,0(r8)
        addi    r8,r8,2
7:
        bflr    31
        stb     r9,0(r8)
        blr

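// For reference, a rough C model of the nonzero path above.  This is an
// illustrative sketch only: "memset_sketch" is a hypothetical name, the
// real code dispatches c==0 to the commpage bzero, and the real tail uses
// word/halfword stores instead of a byte loop.
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      void *memset_sketch(void *b, int c, size_t len) {
//          uint8_t *p = b;
//          uint32_t v = (uint8_t)c;            // replicate c into all 4
//          v |= v << 8;                        // bytes, like the two
//          v |= v << 16;                       // rlwimi instructions
//          if (len >= 8) {
//              while ((uintptr_t)p & 3) {      // word-align the pointer
//                  *p++ = (uint8_t)v;
//                  len--;
//              }
//              for (; len >= 8; len -= 8, p += 8) {    // 8-byte chunks
//                  ((uint32_t *)p)[0] = v;
//                  ((uint32_t *)p)[1] = v;
//              }
//          }
//          while (len--)                       // 0..7 leftover bytes
//              *p++ = (uint8_t)v;
//          return b;
//      }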

#if 0
//
// =============================
// BZERO and MEMSET FOR Mac OS X
// =============================
//
// We use DCBZ, and therefore are dependent on the cache block size (32 bytes).
// Bzero and memset need to be in the same file since they are tightly
// coupled, so we can use bzero for memset of 0 without incurring extra
// overhead.  (The issue is that bzero must preserve r3 for memset.)
//
// Registers we use:
//      r3  = original ptr, not changed since memset returns it
//      r4  = count of bytes to set ("rc")
//      r11 = working operand ptr ("rp")
//      r10 = value to set ("rv")

#define rc  r4
#define rp  r11
#define rv  r10

#include <architecture/ppc/asm_help.h>

        .text
        .align  5
        .globl  _bzero
        .globl  _memset

// *************
// * B Z E R O *
// *************

_bzero:                             // void bzero(void *b, size_t len);
        cmplwi  cr1,rc,32           // too short for DCBZ?
        li      rv,0                // get a 0
Lbzero1:                            // enter from memset with cr1 and rv set up
        neg     r5,r3               // start to compute bytes to align
        mr      rp,r3               // make copy of operand ptr
        andi.   r6,r5,0x1F          // r6 <- bytes to align on cache block
        blt-    cr1,Ltail           // <32, so skip DCBZs
        beq-    cr0,Ldcbz           // already aligned

// align on 32-byte boundary

        mtcrf   0x01,r6             // move align count to cr7 (faster if only 1 cr)
        andi.   r7,r6,16            // test bit 27 by hand
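// (mtcrf 0x01 captured only the low 4 bits of r6 in cr7; the 16s bit does
// not fit there, so it is tested separately and consumed by the "beq" at 4:.)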
        sub     rc,rc,r6            // adjust length
        bf      31,1f               // test bits of count
        stb     rv,0(rp)
        addi    rp,rp,1
1:
        bf      30,2f
        sth     rv,0(rp)
        addi    rp,rp,2
2:
        bf      29,3f
        stw     rv,0(rp)
        addi    rp,rp,4
3:
        bf      28,4f
        stw     rv,0(rp)
        stw     rv,4(rp)
        addi    rp,rp,8
4:
        beq     Ldcbz
        stw     rv,0(rp)
        stw     rv,4(rp)
        stw     rv,8(rp)
        stw     rv,12(rp)
        addi    rp,rp,16

// DCBZ 32-byte cache blocks
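// (dcbz establishes a 32-byte cache block and zeroes it in a single
// instruction, without first reading the line from memory; that is why the
// code above works to reach 32-byte alignment before entering this loop.)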
Ldcbz:
        srwi.   r5,rc,5             // r5 <- number of cache blocks to zero
        beq     Ltail               // none
        mtctr   r5                  // set up loop count
        andi.   rc,rc,0x1F          // will there be leftovers?
1:
        dcbz    0,rp                // zero 32 bytes
        addi    rp,rp,32
        bdnz    1b
        beqlr                       // no leftovers so done

// store up to 31 trailing bytes
//      rv = value to store (in all 4 bytes)
//      rc = #bytes to store (0..31)

Ltail:
        andi.   r5,rc,16            // bit 27 set in length?
        mtcrf   0x01,rc             // low 4 bits of length to cr7
        beq     1f                  // test bits of length
        stw     rv,0(rp)
        stw     rv,4(rp)
        stw     rv,8(rp)
        stw     rv,12(rp)
        addi    rp,rp,16
1:
        bf      28,2f
        stw     rv,0(rp)
        stw     rv,4(rp)
        addi    rp,rp,8
2:
        bf      29,3f
        stw     rv,0(rp)
        addi    rp,rp,4
3:
        bf      30,4f
        sth     rv,0(rp)
        addi    rp,rp,2
4:
        bflr    31
        stb     rv,0(rp)
        blr


// ***************
// * M E M S E T *
// ***************

        .align  5
_memset:                            // void * memset(void *b, int c, size_t len);
        andi.   rv,r4,0xFF          // copy value to working register, test for 0
        mr      rc,r5               // move length to working register
        cmplwi  cr1,r5,32           // length < 32 ?
        beq     Lbzero1             // memset of 0 is just a bzero
        rlwimi  rv,rv,8,16,23       // replicate value to low 2 bytes
        mr      rp,r3               // make working copy of operand ptr
        rlwimi  rv,rv,16,0,15       // value now in all 4 bytes
        blt     cr1,Ltail           // length<32, so use common tail routine
        neg     r5,rp               // start to compute #bytes to align
        andi.   r6,r5,0x7           // r6 <- #bytes to align on dw
        beq-    Lmemset1            // already aligned

// align on 8-byte boundary

        mtcrf   0x01,r6             // move count to cr7 (faster if only 1 cr)
        sub     rc,rc,r6            // adjust length
        bf      31,1f
        stb     rv,0(rp)
        addi    rp,rp,1
1:
        bf      30,2f
        sth     rv,0(rp)
        addi    rp,rp,2
2:
        bf      29,Lmemset1
        stw     rv,0(rp)
        addi    rp,rp,4

// loop on 16-byte blocks
Lmemset1:
        stw     rv,0(rp)            // store first 8 bytes from rv
        stw     rv,4(rp)
        srwi    r5,rc,4             // r5 <- #blocks (>=1)
        mtcrf   0x01,rc             // leftover length to cr7
        mtctr   r5                  // set up loop count
        lfd     f0,0(rp)            // pick up in a fp register
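// (The two stw's above left an 8-byte copy of the pattern in memory; lfd
// reloads it into a 64-bit FP register so each stfd in the loop stores
// 8 bytes, twice the width of a 32-bit integer store.  The loop is entered
// at 2: because bytes 0..7 of the first block were already stored.)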
        b       2f                  // enter loop in middle
        .align  4
1:                                  // loop on 16-byte blocks
        stfd    f0,0(rp)
2:
        stfd    f0,8(rp)
        addi    rp,rp,16
        bdnz    1b

// store up to 16 trailing bytes (count in cr7)

        bf      28,3f
        stfd    f0,0(rp)
        addi    rp,rp,8
3:
        bf      29,4f
        stw     rv,0(rp)
        addi    rp,rp,4
4:
        bf      30,5f
        sth     rv,0(rp)
        addi    rp,rp,2
5:
        bflr    31
        stb     rv,0(rp)
        blr
#endif /* 0 */