osfmk/ppc/commpage/bcopy_64.s

   1 /*
   2  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /* =======================================
  23  * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
  24  * =======================================
  25  *
  26  * Version of 2/20/2003, for a hypothetic 64-bit processor without Altivec.
  27  * This version might be used bringing up new processors, with known
  28  * Altivec bugs that need to be worked around.  It is not particularly well
  29  * optimized.
  30  *
  31  * For 64-bit processors with a 128-byte cache line, running in either
  32  * 32- or 64-bit mode.  This is written for 32-bit execution, the kernel
  33  * will translate to 64-bit code when it compiles the 64-bit commpage.
  34  *
  35  * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
  36  * environment.
  37  *   r0  = "w7" or temp
  38  *   r2  = "w8"
  39  *   r3  = not used, as memcpy and memmove return 1st parameter as a value
  40  *   r4  = source ptr ("rs")
  41  *   r5  = count of bytes to move ("rc")
  42  *   r6  = "w1"
  43  *   r7  = "w2"
  44  *   r8  = "w3"
  45  *   r9  = "w4"
  46  *   r10 = "w5"
  47  *   r11 = "w6"
  48  *   r12 = destination ptr ("rd")
  49  */
  50 #define rs      r4
  51 #define rd      r12
  52 #define rc      r5
  53 #define rv      r2
  54
  55 #define w1      r6
  56 #define w2      r7
  57 #define w3      r8
  58 #define w4      r9
  59 #define w5      r10
  60 #define w6      r11
  61 #define w7      r0
  62 #define w8      r2
  63
  64 #define ASSEMBLER
  65 #include <sys/appleapiopts.h>
  66 #include <ppc/asm.h>
  67 #include <machine/cpu_capabilities.h>
  68 #include <machine/commpage.h>
  69
  70         .text
  71
  72 #define kLong           64                              // too long for inline loopless code
  73
  74
  75 // Main entry points.
  76
  77         .align  5
  78 bcopy_64:                                                       // void bcopy(const void *src, void *dst, size_t len)
  79         cmplwi  rc,kLong                        // short or long?
  80         sub             w1,r4,r3                        // must move in reverse if (rd-rs)<rc
  81         cmplw   cr1,w1,rc                       // set cr1 blt iff we must move reverse
  82         mr              rd,r4                           // start to move registers to canonic spot
  83         mr              rs,r3
  84         blt             LShort                          // handle short operands
  85         dcbt    0,r3                            // touch in destination
  86         b               LLong                           // join medium/long operand code
  87
  88 // NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
  89
  90         .align  5
  91 Lmemcpy_g4:                                                     // void* memcpy(void *dst, void *src, size_t len)
  92 Lmemmove_g4:                                            // void* memmove(void *dst, const void *src, size_t len)
  93         cmplwi  rc,kLong                        // short or long?
  94         sub             w1,r3,r4                        // must move in reverse if (rd-rs)<rc
  95         dcbt    0,r4                            // touch in the first line of source
  96         cmplw   cr1,w1,rc                       // set cr1 blt iff we must move reverse
  97         mr              rd,r3                           // must leave r3 alone, it is return value for memcpy etc
  98         bge             LLong                           // handle medium or long operands
  99
 100 // Handle short operands.
 101
 102 LShort:
 103         mtcrf   0x02,rc                         // put length bits 26-27 in cr6 (faster one cr at a time)
 104         mtcrf   0x01,rc                         // put length bits 28-31 in cr7
 105         blt             cr1,LShortReverse
 106
 107 // Forward short operands.  This is the most frequent case, so it is inline.
 108
 109 LShort64:                                                       // enter to xfer last 64 bytes
 110         bf              26,0f                           // 64-byte chunk to xfer?
 111         ld              w1,0(rs)
 112         ld              w2,8(rs)
 113         ld              w3,16(rs)
 114         ld              w4,24(rs)
 115         addi    rs,rs,32
 116         std             w1,0(rd)
 117         std             w2,8(rd)
 118         std             w3,16(rd)
 119         std             w4,24(rd)
 120         addi    rd,rd,32
 121 0:
 122         bf              27,1f                           // quadword to move?
 123         ld              w1,0(rs)
 124         ld              w2,8(rs)
 125         addi    rs,rs,16
 126         std             w1,0(rd)
 127         std             w2,8(rd)
 128         addi    rd,rd,16
 129 1:
 130         bf              28,2f                           // doubleword?
 131         ld              w1,0(rs)
 132         addi    rs,rs,8
 133         std             w1,0(rd)
 134         addi    rd,rd,8
 135 2:
 136         bf              29,3f                           // word?
 137         lwz             w1,0(rs)
 138         addi    rs,rs,4
 139         stw             w1,0(rd)
 140         addi    rd,rd,4
 141 3:
 142         bf              30,4f                           // halfword to move?
 143         lhz             w1,0(rs)
 144         addi    rs,rs,2
 145         sth             w1,0(rd)
 146         addi    rd,rd,2
 147 4:
 148         bflr    31                                      // skip if no odd byte
 149         lbz             w1,0(rs)
 150         stb             w1,0(rd)
 151         blr
 152
 153
 154 // Handle short reverse operands.
 155 //              cr6 = bits 26-27 of length
 156 //              cr7 = bits 28-31 of length
 157
 158 LShortReverse:
 159         add             rs,rs,rc                        // adjust ptrs for reverse move
 160         add             rd,rd,rc
 161 LShortReverse64:                                        // enter to xfer last 64 bytes
 162         bf              26,0f                           // 64-byte chunk to xfer?
 163         ld              w1,-8(rs)
 164         ld              w2,-16(rs)
 165         ld              w3,-24(rs)
 166         ldu             w4,-32(rs)
 167         std             w1,-8(rd)
 168         std             w2,-16(rd)
 169         std             w3,-24(rd)
 170         stdu    w4,-32(rd)
 171 0:
 172         bf              27,1f                           // quadword to move?
 173         ld              w1,-8(rs)
 174         ldu             w2,-16(rs)
 175         std             w1,-8(rd)
 176         stdu    w2,-16(rd)
 177 1:
 178         bf              28,2f                           // doubleword?
 179         ldu             w1,-8(rs)
 180         stdu    w1,-8(rd)
 181 2:
 182         bf              29,3f                           // word?
 183         lwzu    w1,-4(rs)
 184         stwu    w1,-4(rd)
 185 3:
 186         bf              30,4f                           // halfword to move?
 187         lhzu    w1,-2(rs)
 188         sthu    w1,-2(rd)
 189 4:
 190         bflr    31                                      // done if no odd byte
 191         lbz     w1,-1(rs)                       // no update
 192         stb     w1,-1(rd)
 193         blr
 194
 195
 196 // Long operands.
 197 //     cr1 = blt iff we must move reverse
 198
 199         .align  4
 200 LLong:
 201         dcbtst  0,rd                            // touch in destination
 202         neg             w3,rd                           // start to compute #bytes to align destination
 203         andi.   w6,w3,7                         // w6 <- #bytes to 8-byte align destination
 204         blt             cr1,LLongReverse        // handle reverse moves
 205         mtctr   w6                                      // set up for loop to align destination
 206         sub             rc,rc,w6                        // adjust count
 207         beq             LAligned                        // destination already 8-byte aligned
 208 1:
 209         lbz             w1,0(rs)
 210         addi    rs,rs,1
 211         stb             w1,0(rd)
 212         addi    rd,rd,1
 213         bdnz    1b
 214
 215 // Destination is 8-byte aligned.
 216
 217 LAligned:
 218         srwi.   w2,rc,6                         // w2 <- count of 64-byte chunks
 219         mtcrf   0x02,rc                         // leftover byte count to cr (faster one cr at a time)
 220         mtcrf   0x01,rc                         // put length bits 28-31 in cr7
 221         beq             LShort64                        // no 64-byte chunks
 222         mtctr   w2
 223         b               1f
 224
 225 // Loop moving 64-byte chunks.
 226
 227         .align  5
 228 1:
 229         ld              w1,0(rs)
 230         ld              w2,8(rs)
 231         ld              w3,16(rs)
 232         ld              w4,24(rs)
 233         ld              w5,32(rs)
 234         ld              w6,40(rs)
 235         ld              w7,48(rs)
 236         ld              w8,56(rs)
 237         addi    rs,rs,64
 238         std             w1,0(rd)
 239         std             w2,8(rd)
 240         std             w3,16(rd)
 241         std             w4,24(rd)
 242         std             w5,32(rd)
 243         std             w6,40(rd)
 244         std             w7,48(rd)
 245         std             w8,56(rd)
 246         addi    rd,rd,64
 247         bdnz    1b
 248
 249         b               LShort64
 250
 251
 252 // Handle reverse moves.
 253
 254 LLongReverse:
 255         add             rd,rd,rc                                // point to end of operands
 256         add             rs,rs,rc
 257         andi.   r0,rd,7                                 // is destination 8-byte aligned?
 258         sub             rc,rc,r0                                // adjust count
 259         mtctr   r0                                              // set up for byte loop
 260         beq             LRevAligned                             // already aligned
 261
 262 1:
 263         lbzu    w1,-1(rs)
 264         stbu    w1,-1(rd)
 265         bdnz    1b
 266
 267 // Destination is 8-byte aligned.
 268
 269 LRevAligned:
 270         srwi.   w2,rc,6                         // w2 <- count of 64-byte chunks
 271         mtcrf   0x02,rc                         // leftover byte count to cr (faster one cr at a time)
 272         mtcrf   0x01,rc                         // put length bits 28-31 in cr7
 273         beq             LShortReverse64         // no 64-byte chunks
 274         mtctr   w2
 275         b               1f
 276
 277 // Loop over 64-byte chunks (reverse).
 278
 279         .align  5
 280 1:
 281         ld              w1,-8(rs)
 282         ld              w2,-16(rs)
 283         ld              w3,-24(rs)
 284         ld              w4,-32(rs)
 285         ld              w5,-40(rs)
 286         ld              w6,-48(rs)
 287         ld              w7,-56(rs)
 288         ldu             w8,-64(rs)
 289         std             w1,-8(rd)
 290         std             w2,-16(rd)
 291         std             w3,-24(rd)
 292         std             w4,-32(rd)
 293         std             w5,-40(rd)
 294         std             w6,-48(rd)
 295         std             w7,-56(rd)
 296         stdu    w8,-64(rd)
 297         bdnz    1b
 298
 299         b               LShortReverse64
 300
 301         COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)