/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G3.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 *   f0-f3 = used for moving 8-byte aligned data
 */
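
/*
 * Rough flow of the code below: operands of 32 bytes or less are moved with a
 * single lswx/stswx pair, driven by the byte count placed in XER.  Longer
 * operands are copied backwards when the destination begins inside the source
 * (so a forward copy would overwrite bytes not yet read), and forwards
 * otherwise.  When source and destination are relatively word-aligned, the
 * destination is doubleword-aligned and 32-byte chunks are moved with the
 * FPRs; otherwise the destination is word-aligned and the chunks are moved
 * with GPRs.  The 0..31 leftover bytes are finished off with string ops.
 */
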
#define rs      r4      // NB: we depend on rs==r4 in "lswx" instructions
#define rd      r12
#define rc      r5

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text


#define kLong   33      // too long for string ops
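// Note: 32 bytes is the most the string-op path can move without touching
// registers above r12, since lswx/stswx starting at r5 consume one GPR per
// 4 bytes and 32 bytes exactly fills r5-r12.  That is presumably why kLong
// is 33.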


// Main entry points.

        .align  5
bcopy_g3:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong                // length > 32 bytes?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // start to move source & dest to canonical spot
        bge     LLong0                  // skip if long operand
        mtxer   rc                      // set length for string ops
        lswx    r5,0,r3                 // load bytes into r5-r12
        stswx   r5,0,r4                 // store them
        blr

// NB: memcpy() and memmove() must follow bcopy() by 32 bytes, for comm page.

        .align  5
Lmemcpy_g3:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g3:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong                // length > 32 bytes?
        sub     w1,r3,rs                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong1                  // longer than 32 bytes
        mtxer   rc                      // set length for string ops
        lswx    r5,0,r4                 // load bytes into r5-r12
        stswx   r5,0,r3                 // store them
        blr

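// How the short paths above work: mtxer places the byte count (0-32 here) in
// the low-order bits of XER, and lswx/stswx then transfer exactly that many
// bytes through r5 and up.  A count of zero moves nothing, so one load/store
// pair handles every operand of 32 bytes or less, including zero-length.
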
// Long operands (more than 32 bytes.)
//      w1 = (rd-rs), used to check for alignment

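// The unsigned compare of w1 = (rd-rs) against rc picks the direction: if
// (unsigned)(rd-rs) < rc, the destination begins inside the source, so a
// forward copy would overwrite source bytes before they are read and we must
// copy backwards.  (A destination below the source yields a huge unsigned
// difference and is always safe to copy forwards.)  The low two bits of
// (rd-rs) tell us whether both pointers can be word-aligned at the same time;
// if so, the FPR path, which doubleword-aligns the destination, is used.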
LLong0:                                 // enter from bcopy()
        mr      rs,r3                   // must leave r3 alone (it is return value for memcpy)
LLong1:                                 // enter from memcpy() and memmove()
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        rlwinm  r0,w1,0,0x3             // are operands relatively word-aligned?
        neg     w2,rd                   // prepare to align destination
        cmpwi   cr5,r0,0                // set cr5 beq if relatively word aligned
        blt     cr1,LLongReverse        // handle reverse move
        andi.   w4,w2,3                 // w4 <- #bytes to word align destination
        beq     cr5,LLongFloat          // relatively aligned so use FPRs
        sub     rc,rc,w4                // adjust count for alignment
        srwi    r0,rc,5                 // get #chunks to xfer (can be 0 after the alignment adjust)
        rlwinm  rc,rc,0,0x1F            // mask down to leftover bytes
        mtctr   r0                      // set up loop count
        beq     1f                      // dest already word aligned

// Word align the destination.

        mtxer   w4                      // byte count to xer
        cmpwi   r0,0                    // any chunks to xfer?
        lswx    w1,0,rs                 // move w4 bytes to align dest
        add     rs,rs,w4
        stswx   w1,0,rd
        add     rd,rd,w4
        beq-    2f                      // pathologic case, no chunks to xfer

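// The block above moves the 1-3 alignment bytes with a small string op and
// advances both pointers.  The chunk count (already computed from the reduced
// rc) is tested for zero because subtracting those alignment bytes can leave
// fewer than 32 bytes, in which case there are no chunks and only the
// string-op cleanup remains.
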
// Forward, unaligned loop.

1:
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        lwz     w5,16(rs)
        lwz     w6,20(rs)
        lwz     w7,24(rs)
        lwz     w8,28(rs)
        addi    rs,rs,32
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        stw     w5,16(rd)
        stw     w6,20(rd)
        stw     w7,24(rd)
        stw     w8,28(rd)
        addi    rd,rd,32
        bdnz    1b
2:                                      // rc = remaining bytes (0-31)
        mtxer   rc                      // set up count for string ops
        mr      r0,rd                   // move dest ptr out of the way
        lswx    r5,0,rs                 // load xer bytes into r5-r12 (rs==r4)
        stswx   r5,0,r0                 // store them
        blr

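// The cleanup above copies the destination pointer into r0 before the string
// ops: lswx loads the leftover bytes into r5 and up, which can reach r12 and
// so would overwrite rd, so the store must address the destination through r0
// instead.  The source pointer is safe only because rs is r4, just below the
// loaded range; this is the rs==r4 dependency noted at the #define above.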


// Forward, aligned loop.  We use FPRs.

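// When the operands are relatively word-aligned, the 8-byte lfd/stfd pairs
// below move twice as much per instruction as the word loop above, which is
// presumably why this path exists on the G3.  The destination is
// doubleword-aligned first; since (rd-rs) is a multiple of 4 on this path,
// the source is then at least word-aligned.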
LLongFloat:
        andi.   w4,w2,7                 // w4 <- #bytes to doubleword-align destination
        sub     rc,rc,w4                // adjust count for alignment
        srwi    r0,rc,5                 // number of 32-byte chunks to xfer
        rlwinm  rc,rc,0,0x1F            // mask down to leftover bytes
        mtctr   r0                      // set up loop count
        beq     1f                      // dest already doubleword aligned

// Doubleword align the destination.

        mtxer   w4                      // byte count to xer
        cmpwi   r0,0                    // any chunks to xfer?
        lswx    w1,0,rs                 // move w4 bytes to align dest
        add     rs,rs,w4
        stswx   w1,0,rd
        add     rd,rd,w4
        beq-    2f                      // pathologic case, no chunks to xfer
1:                                      // loop over 32-byte chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        addi    rs,rs,32
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        addi    rd,rd,32
        bdnz    1b
2:                                      // rc = remaining bytes (0-31)
        mtxer   rc                      // set up count for string ops
        mr      r0,rd                   // move dest ptr out of the way
        lswx    r5,0,rs                 // load xer bytes into r5-r12 (rs==r4)
        stswx   r5,0,r0                 // store them
        blr


// Long, reverse moves.
//      cr5 = beq if relatively word aligned

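// Reverse moves copy downward from the top of the operands, which is what
// makes an overlapping move with the destination above the source safe.  The
// loads in the loop below are scheduled ahead of the matching stores,
// presumably to cover load latency on the G3.  The 0-31 leftover bytes sit at
// the low end of the operands and are finished with string ops after the loop.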
LLongReverse:
        add     rd,rd,rc                // point to end of operands + 1
        add     rs,rs,rc
        beq     cr5,LReverseFloat       // aligned operands so can use FPRs
        srwi    r0,rc,5                 // get chunk count
        rlwinm  rc,rc,0,0x1F            // mask down to leftover bytes
        mtctr   r0                      // set up loop count
        mtxer   rc                      // set up for trailing bytes
1:
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwz     w4,-16(rs)
        stw     w1,-4(rd)
        lwz     w5,-20(rs)
        stw     w2,-8(rd)
        lwz     w6,-24(rs)
        stw     w3,-12(rd)
        lwz     w7,-28(rs)
        stw     w4,-16(rd)
        lwzu    w8,-32(rs)
        stw     w5,-20(rd)
        stw     w6,-24(rd)
        stw     w7,-28(rd)
        stwu    w8,-32(rd)
        bdnz    1b

        sub     r4,rs,rc                // point to 1st (leftmost) leftover byte (0..31)
        sub     r0,rd,rc                // move dest ptr out of way
        lswx    r5,0,r4                 // load xer bytes into r5-r12
        stswx   r5,0,r0                 // store them
        blr


// Long, reverse aligned moves.  We use FPRs.

LReverseFloat:
        andi.   w4,rd,7                 // w4 <- #bytes to doubleword-align destination
        sub     rc,rc,w4                // adjust count for alignment
        srwi    r0,rc,5                 // number of 32-byte chunks to xfer
        rlwinm  rc,rc,0,0x1F            // mask down to leftover bytes
        mtctr   r0                      // set up loop count
        beq     1f                      // dest already doubleword aligned

// Doubleword align the destination.

        mtxer   w4                      // byte count to xer
        cmpwi   r0,0                    // any chunks to xfer?
        sub     rs,rs,w4                // point to 1st bytes to xfer
        sub     rd,rd,w4
        lswx    w1,0,rs                 // move w4 bytes to align dest
        stswx   w1,0,rd
        beq-    2f                      // pathologic case, no chunks to xfer
1:
        lfd     f0,-8(rs)
        lfd     f1,-16(rs)
        lfd     f2,-24(rs)
        lfdu    f3,-32(rs)
        stfd    f0,-8(rd)
        stfd    f1,-16(rd)
        stfd    f2,-24(rd)
        stfdu   f3,-32(rd)
        bdnz    1b
2:                                      // rc = remaining bytes (0-31)
        mtxer   rc                      // set up count for string ops
        sub     r4,rs,rc                // point to 1st (leftmost) leftover byte (0..31)
        sub     r0,rd,rc                // move dest ptr out of way
        lswx    r5,0,r4                 // load xer bytes into r5-r12
        stswx   r5,0,r0                 // store them
        blr

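// The descriptor below registers this routine for the commpage.  The zero
// "must have" mask and the k64Bit+kHasAltivec "can't have" mask mean it is
// installed only on processors with neither 64-bit mode nor Altivec (i.e. a
// G3), and kCommPage32 restricts it to the 32-bit commpage.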
        COMMPAGE_DESCRIPTOR(bcopy_g3,_COMM_PAGE_BCOPY,0,k64Bit+kHasAltivec,kCommPage32)