/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G3.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *
 *   r0    = "w7" or temp
 *   r2    = "w8"
 *   r3    = not used, as memcpy and memmove return 1st parameter as a value
 *   r4    = source ptr ("rs")
 *   r5    = count of bytes to move ("rc")
 *   r6    = "w1"
 *   r7    = "w2"
 *   r8    = "w3"
 *   r9    = "w4"
 *   r10   = "w5"
 *   r11   = "w6"
 *   r12   = destination ptr ("rd")
 *   f0-f3 = used for moving 8-byte aligned data
 */
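
/*
 * Overview: this is the commpage bcopy/memcpy/memmove variant tuned for the
 * G3, which has no vector unit.  Operands of 32 bytes or less are moved with
 * a single lswx/stswx pair; longer operands are moved in 32-byte chunks,
 * forward or in reverse depending on overlap, using either GPRs or (when the
 * operands are mutually word-aligned) FPRs.
 */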
#define rs      r4      // NB: we depend on rs==r4 in "lswx" instructions
#define rd      r12
#define rc      r5

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text


#define kLong   33      // too long for string ops
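
// Note on the string-op fast path: lswx/stswx move the byte count held in
// XER through consecutive GPRs, and r5..r12 give 8 registers * 4 bytes =
// 32 bytes, so anything up to 32 bytes fits in one lswx/stswx pair.  Because
// the load completes into registers before the store begins, this short path
// is also safe for overlapping operands, which is why bcopy and memmove can
// share it.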


// Main entry points.

        .align  5
bcopy_g3:                           // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong            // length > 32 bytes?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        mr      rd,r4               // start to move source & dest to canonic spot
        bge     LLong0              // skip if long operand
        mtxer   rc                  // set length for string ops
        lswx    r5,0,r3             // load bytes into r5-r12
        stswx   r5,0,r4             // store them
        blr

// NB: memcpy() and memmove() must follow bcopy() by 32 bytes, for comm page.

        .align  5
Lmemcpy_g3:                         // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g3:                        // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong            // length > 32 bytes?
        sub     w1,r3,rs            // must move in reverse if (rd-rs)<rc
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bge     LLong1              // longer than 32 bytes
        mtxer   rc                  // set length for string ops
        lswx    r5,0,r4             // load bytes into r5-r12
        stswx   r5,0,r3             // store them
        blr

// Long operands (more than 32 bytes.)
//      w1 = (rd-rs), used to check for alignment
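//      Three cases are distinguished below: if the operands overlap such that
//      a forward copy would trash not-yet-read source bytes (cr1 blt), the
//      move is done in reverse; otherwise, if source and destination are
//      mutually word-aligned (cr5 beq), 32-byte chunks are moved through
//      FPRs; otherwise they are moved through GPRs in word-sized pieces.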

LLong0:                             // enter from bcopy()
        mr      rs,r3               // must leave r3 alone (it is return value for memcpy)
LLong1:                             // enter from memcpy() and memmove()
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        rlwinm  r0,w1,0,0x3         // are operands relatively word-aligned?
        neg     w2,rd               // prepare to align destination
        cmpwi   cr5,r0,0            // set cr5 beq if relatively word aligned
        blt     cr1,LLongReverse    // handle reverse move
        andi.   w4,w2,3             // w4 <- #bytes to word align destination
        beq     cr5,LLongFloat      // relatively aligned so use FPRs
        sub     rc,rc,w4            // adjust count for alignment
        srwi    r0,rc,5             // get #chunks to xfer (>=1)
        rlwinm  rc,rc,0,0x1F        // mask down to leftover bytes
        mtctr   r0                  // set up loop count
        beq     1f                  // dest already word aligned

// Word align the destination.

        mtxer   w4                  // byte count to xer
        cmpwi   r0,0                // any chunks to xfer?
        lswx    w1,0,rs             // move w4 bytes to align dest
        add     rs,rs,w4
        stswx   w1,0,rd
        add     rd,rd,w4
        beq-    2f                  // pathological case, no chunks to xfer

// Forward, unaligned loop.

1:
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        lwz     w5,16(rs)
        lwz     w6,20(rs)
        lwz     w7,24(rs)
        lwz     w8,28(rs)
        addi    rs,rs,32
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        stw     w5,16(rd)
        stw     w6,20(rd)
        stw     w7,24(rd)
        stw     w8,28(rd)
        addi    rd,rd,32
        bdnz    1b
2:                                  // rc = remaining bytes (0-31)
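        // lswx with a count of up to 31 fills r5..r12, which would overwrite
        // rd (r12), so the dest ptr is parked in r0 first.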
        mtxer   rc                  // set up count for string ops
        mr      r0,rd               // move dest ptr out of the way
        lswx    r5,0,rs             // load xer bytes into r5-r12 (rs==r4)
        stswx   r5,0,r0             // store them
        blr



// Forward, aligned loop.  We use FPRs.
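// (The G3 has no AltiVec, so 8-byte FPR loads/stores are the widest transfers
// available; this path is used only when the operands are mutually
// word-aligned, as checked via cr5 above.)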

LLongFloat:
        andi.   w4,w2,7             // w4 <- #bytes to doubleword-align destination
        sub     rc,rc,w4            // adjust count for alignment
        srwi    r0,rc,5             // number of 32-byte chunks to xfer
        rlwinm  rc,rc,0,0x1F        // mask down to leftover bytes
        mtctr   r0                  // set up loop count
        beq     1f                  // dest already doubleword aligned

// Doubleword align the destination.

        mtxer   w4                  // byte count to xer
        cmpwi   r0,0                // any chunks to xfer?
        lswx    w1,0,rs             // move w4 bytes to align dest
        add     rs,rs,w4
        stswx   w1,0,rd
        add     rd,rd,w4
        beq-    2f                  // pathological case, no chunks to xfer
1:                                  // loop over 32-byte chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        addi    rs,rs,32
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        addi    rd,rd,32
        bdnz    1b
2:                                  // rc = remaining bytes (0-31)
        mtxer   rc                  // set up count for string ops
        mr      r0,rd               // move dest ptr out of the way
        lswx    r5,0,rs             // load xer bytes into r5-r12 (rs==r4)
        stswx   r5,0,r0             // store them
        blr


// Long, reverse moves.
//      cr5 = beq if relatively word aligned
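//      Here the destination overlaps the end of the source ((rd-rs) < rc), so
//      a forward copy would overwrite source bytes before they are read; both
//      pointers are advanced past the end of the operands and the copy runs
//      from high addresses to low.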

LLongReverse:
        add     rd,rd,rc            // point to end of operands + 1
        add     rs,rs,rc
        beq     cr5,LReverseFloat   // aligned operands so can use FPRs
        srwi    r0,rc,5             // get chunk count
        rlwinm  rc,rc,0,0x1F        // mask down to leftover bytes
        mtctr   r0                  // set up loop count
        mtxer   rc                  // set up for trailing bytes
1:
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwz     w4,-16(rs)
        stw     w1,-4(rd)
        lwz     w5,-20(rs)
        stw     w2,-8(rd)
        lwz     w6,-24(rs)
        stw     w3,-12(rd)
        lwz     w7,-28(rs)
        stw     w4,-16(rd)
        lwzu    w8,-32(rs)
        stw     w5,-20(rd)
        stw     w6,-24(rd)
        stw     w7,-28(rd)
        stwu    w8,-32(rd)
        bdnz    1b

        sub     r4,rs,rc            // point to 1st (leftmost) leftover byte (0..31)
        sub     r0,rd,rc            // move dest ptr out of way
        lswx    r5,0,r4             // load xer bytes into r5-r12
        stswx   r5,0,r0             // store them
        blr


// Long, reverse aligned moves.  We use FPRs.

LReverseFloat:
        andi.   w4,rd,7             // w4 <- #bytes to doubleword-align destination
        sub     rc,rc,w4            // adjust count for alignment
        srwi    r0,rc,5             // number of 32-byte chunks to xfer
        rlwinm  rc,rc,0,0x1F        // mask down to leftover bytes
        mtctr   r0                  // set up loop count
        beq     1f                  // dest already doubleword aligned

// Doubleword align the destination.

        mtxer   w4                  // byte count to xer
        cmpwi   r0,0                // any chunks to xfer?
        sub     rs,rs,w4            // point to 1st bytes to xfer
        sub     rd,rd,w4
        lswx    w1,0,rs             // move w4 bytes to align dest
        stswx   w1,0,rd
        beq-    2f                  // pathological case, no chunks to xfer
1:
        lfd     f0,-8(rs)
        lfd     f1,-16(rs)
        lfd     f2,-24(rs)
        lfdu    f3,-32(rs)
        stfd    f0,-8(rd)
        stfd    f1,-16(rd)
        stfd    f2,-24(rd)
        stfdu   f3,-32(rd)
        bdnz    1b
2:                                  // rc = remaining bytes (0-31)
        mtxer   rc                  // set up count for string ops
        sub     r4,rs,rc            // point to 1st (leftmost) leftover byte (0..31)
        sub     r0,rd,rc            // move dest ptr out of way
        lswx    r5,0,r4             // load xer bytes into r5-r12
        stswx   r5,0,r0             // store them
        blr

        COMMPAGE_DESCRIPTOR(bcopy_g3,_COMM_PAGE_BCOPY,0,k64Bit+kHasAltivec,kCommPage32)