osfmk/ppc/commpage/bcopy_64.s

   1 /*
   2  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License.  The rights granted to you under the
  10  * License may not be used to create, or enable the creation or
  11  * redistribution of, unlawful or unlicensed copies of an Apple operating
  12  * system, or to circumvent, violate, or enable the circumvention or
  13  * violation of, any terms of an Apple operating system software license
  14  * agreement.
  15  *
  16  * Please obtain a copy of the License at
  17  * http://www.opensource.apple.com/apsl/ and read it before using this
  18  * file.
  19  *
  20  * The Original Code and all software distributed under the License are
  21  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  22  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  23  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  24  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  25  * Please see the License for the specific language governing rights and
  26  * limitations under the License.
  27  *
  28  * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  29  */
  30 /* =======================================
  31  * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
  32  * =======================================
  33  *
   34  * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
  35  * This version might be used bringing up new processors, with known
  36  * Altivec bugs that need to be worked around.  It is not particularly well
  37  * optimized.
  38  *
  39  * For 64-bit processors with a 128-byte cache line, running in either
  40  * 32- or 64-bit mode.  This is written for 32-bit execution, the kernel
  41  * will translate to 64-bit code when it compiles the 64-bit commpage.
  42  *
  43  * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
  44  * environment.
  45  *   r0  = "w7" or temp
  46  *   r2  = "w8"
  47  *   r3  = not used, as memcpy and memmove return 1st parameter as a value
  48  *   r4  = source ptr ("rs")
  49  *   r5  = count of bytes to move ("rc")
  50  *   r6  = "w1"
  51  *   r7  = "w2"
  52  *   r8  = "w3"
  53  *   r9  = "w4"
  54  *   r10 = "w5"
  55  *   r11 = "w6"
  56  *   r12 = destination ptr ("rd")
  57  */
   58 #define rs      r4              /* source ptr */
   59 #define rd      r12             /* destination ptr; r3 is left untouched because memcpy/memmove return it */
   60 #define rc      r5              /* count of bytes to move */
   61 #define rv      r2              /* same register as w8; not referenced by the code visible in this file */
   62
   63 #define w1      r6              /* w1-w8: work registers for data in flight and temporaries */
   64 #define w2      r7
   65 #define w3      r8
   66 #define w4      r9
   67 #define w5      r10
   68 #define w6      r11
   69 #define w7      r0              /* r0 is also used directly (not via w7) as a temp in LLongReverse */
   70 #define w8      r2              /* use of r2 is why the header says this cannot run in a PEF/CFM environment */
  71
  72 #define ASSEMBLER
  73 #include <sys/appleapiopts.h>
  74 #include <ppc/asm.h>
  75 #include <machine/cpu_capabilities.h>
  76 #include <machine/commpage.h>
  77
  78         .text
  79
   80 #define kLong           64                              // operands of kLong bytes or more go to LLong; shorter ones use the inline loopless LShort code
  81
  82
  83 // Main entry points.
  84
   85         .align  5
   86 bcopy_64:                                                       // void bcopy(const void *src, void *dst, size_t len): r3=src, r4=dst, r5=len
   87         cmplwi  rc,kLong                        // cr0 <- short or long? (consumed by the blt below)
   88         sub             w1,r4,r3                        // w1 <- dst-src; must move in reverse if (rd-rs)<rc
   89         cmplw   cr1,w1,rc                       // set cr1 blt iff we must move reverse (unsigned compare)
   90         mr              rd,r4                           // start to move registers to canonic spot: rd <- dst
   91         mr              rs,r3                           // rs <- src (r3 itself still holds src)
   92         blt             LShort                          // handle short operands (rc < kLong)
   93         dcbt    0,r3                            // touch in the first line of source (r3 is src for bcopy)
   94         b               LLong                           // join medium/long operand code
  95
  96 // NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
  97
   98         .align  5
   99 Lmemcpy_g4:                                                     // void* memcpy(void *dst, const void *src, size_t len) -- shares memmove's overlap-safe code
  100 Lmemmove_g4:                                            // void* memmove(void *dst, const void *src, size_t len)  NOTE(review): "_g4" suffix looks inherited from another variant -- confirm
  101         cmplwi  rc,kLong                        // cr0 <- short or long? (consumed by the bge below)
  102         sub             w1,r3,r4                        // w1 <- dst-src; must move in reverse if (rd-rs)<rc
  103         dcbt    0,r4                            // touch in the first line of source
  104         cmplw   cr1,w1,rc                       // set cr1 blt iff we must move reverse (unsigned compare)
  105         mr              rd,r3                           // must leave r3 alone, it is return value for memcpy etc
  106         bge             LLong                           // handle medium or long operands; short ones fall through into LShort
 107
  108 // Handle short operands (rc < kLong).  On entry: cr1 = blt iff we must move reverse.
  109
  110 LShort:
  111         mtcrf   0x02,rc                         // put length bits 26-27 in cr6 (faster one cr at a time)
  112         mtcrf   0x01,rc                         // put length bits 28-31 in cr7
  113         blt             cr1,LShortReverse       // operands overlap the wrong way: move backwards; else fall into LShort64
 114
  115 // Forward short operands (0-63 bytes, length bits already in cr6/cr7).  This is the most frequent case, so it is inline.
  116
  117 LShort64:                                                       // also entered from LAligned to xfer the last 0-63 bytes
  118         bf              26,0f                           // 32-byte chunk to xfer? (length bit 26 has weight 32)
  119         ld              w1,0(rs)
  120         ld              w2,8(rs)
  121         ld              w3,16(rs)
  122         ld              w4,24(rs)
  123         addi    rs,rs,32
  124         std             w1,0(rd)
  125         std             w2,8(rd)
  126         std             w3,16(rd)
  127         std             w4,24(rd)
  128         addi    rd,rd,32
  129 0:
  130         bf              27,1f                           // quadword (16 bytes) to move?
  131         ld              w1,0(rs)
  132         ld              w2,8(rs)
  133         addi    rs,rs,16
  134         std             w1,0(rd)
  135         std             w2,8(rd)
  136         addi    rd,rd,16
  137 1:
  138         bf              28,2f                           // doubleword (8 bytes)?
  139         ld              w1,0(rs)
  140         addi    rs,rs,8
  141         std             w1,0(rd)
  142         addi    rd,rd,8
  143 2:
  144         bf              29,3f                           // word (4 bytes)?
  145         lwz             w1,0(rs)
  146         addi    rs,rs,4
  147         stw             w1,0(rd)
  148         addi    rd,rd,4
  149 3:
  150         bf              30,4f                           // halfword to move?
  151         lhz             w1,0(rs)
  152         addi    rs,rs,2
  153         sth             w1,0(rd)
  154         addi    rd,rd,2
  155 4:
  156         bflr    31                                      // return now if no odd byte
  157         lbz             w1,0(rs)                        // last byte; pointers need no further update
  158         stb             w1,0(rd)
  159         blr
 160
 161
  162 // Handle short reverse operands: copy 0-63 bytes from the high addresses downward.
  163 //              cr6 = bits 26-27 of length
  164 //              cr7 = bits 28-31 of length
  165
  166 LShortReverse:
  167         add             rs,rs,rc                        // adjust ptrs for reverse move: point one past the end
  168         add             rd,rd,rc
  169 LShortReverse64:                                        // also entered from LRevAligned to xfer the last 0-63 bytes
  170         bf              26,0f                           // 32-byte chunk to xfer? (length bit 26 has weight 32)
  171         ld              w1,-8(rs)
  172         ld              w2,-16(rs)
  173         ld              w3,-24(rs)
  174         ldu             w4,-32(rs)                      // update form: last load of the group also backs rs up by 32
  175         std             w1,-8(rd)
  176         std             w2,-16(rd)
  177         std             w3,-24(rd)
  178         stdu    w4,-32(rd)                              // update form: backs rd up by 32
  179 0:
  180         bf              27,1f                           // quadword (16 bytes) to move?
  181         ld              w1,-8(rs)
  182         ldu             w2,-16(rs)                      // backs rs up by 16
  183         std             w1,-8(rd)
  184         stdu    w2,-16(rd)                              // backs rd up by 16
  185 1:
  186         bf              28,2f                           // doubleword (8 bytes)?
  187         ldu             w1,-8(rs)
  188         stdu    w1,-8(rd)
  189 2:
  190         bf              29,3f                           // word (4 bytes)?
  191         lwzu    w1,-4(rs)
  192         stwu    w1,-4(rd)
  193 3:
  194         bf              30,4f                           // halfword to move?
  195         lhzu    w1,-2(rs)
  196         sthu    w1,-2(rd)
  197 4:
  198         bflr    31                                      // done if no odd byte
  199         lbz     w1,-1(rs)                       // no update: this is the final byte
  200         stb     w1,-1(rd)
  201         blr
 202
 203
  204 // Long operands (rc >= kLong).  On entry: rs/rd/rc are set up and
  205 //     cr1 = blt iff we must move reverse
  206
  207         .align  4
  208 LLong:
  209         dcbtst  0,rd                            // touch in destination
  210         neg             w3,rd                           // start to compute #bytes to align destination
  211         andi.   w6,w3,7                         // w6 <- #bytes to 8-byte align destination; cr0 eq iff already aligned
  212         blt             cr1,LLongReverse        // handle reverse moves (tests cr1, leaves cr0 for the beq below)
  213         mtctr   w6                                      // set up for loop to align destination
  214         sub             rc,rc,w6                        // adjust count for the bytes the alignment loop will move
  215         beq             LAligned                        // destination already 8-byte aligned (cr0 from the andi. above)
  216 1:
  217         lbz             w1,0(rs)                        // move one byte at a time until rd is 8-byte aligned
  218         addi    rs,rs,1
  219         stb             w1,0(rd)
  220         addi    rd,rd,1
  221         bdnz    1b
 222
  223 // Destination is 8-byte aligned (forward move).  rc = bytes remaining.
  224
  225 LAligned:
  226         srwi.   w2,rc,6                         // w2 <- count of 64-byte chunks; cr0 eq iff there are none
  227         mtcrf   0x02,rc                         // leftover byte count to cr (faster one cr at a time)
  228         mtcrf   0x01,rc                         // put length bits 28-31 in cr7
  229         beq             LShort64                        // no 64-byte chunks
  230         mtctr   w2                                      // loop count = number of 64-byte chunks
  231         b               1f                              // branch to the aligned loop entry below (over any .align padding)
  232
  233 // Loop moving 64-byte chunks: eight doubleword loads, then eight doubleword stores.
  234
  235         .align  5
  236 1:
  237         ld              w1,0(rs)
  238         ld              w2,8(rs)
  239         ld              w3,16(rs)
  240         ld              w4,24(rs)
  241         ld              w5,32(rs)
  242         ld              w6,40(rs)
  243         ld              w7,48(rs)
  244         ld              w8,56(rs)
  245         addi    rs,rs,64
  246         std             w1,0(rd)
  247         std             w2,8(rd)
  248         std             w3,16(rd)
  249         std             w4,24(rd)
  250         std             w5,32(rd)
  251         std             w6,40(rd)
  252         std             w7,48(rd)
  253         std             w8,56(rd)
  254         addi    rd,rd,64
  255         bdnz    1b
  256
  257         b               LShort64                        // finish the 0-63 leftover bytes using cr6/cr7 set above
 258
 259
  260 // Handle reverse moves of long operands: copy from the high addresses downward.
  261
  262 LLongReverse:
  263         add             rd,rd,rc                                // point to end of operands
  264         add             rs,rs,rc
  265         andi.   r0,rd,7                                 // is destination 8-byte aligned? r0 <- #bytes above the 8-byte boundary
  266         sub             rc,rc,r0                                // adjust count
  267         mtctr   r0                                              // set up for byte loop
  268         beq             LRevAligned                             // already aligned (cr0 from the andi. above)
  269
  270 1:
  271         lbzu    w1,-1(rs)                               // move one byte at a time, pre-decrementing both pointers
  272         stbu    w1,-1(rd)
  273         bdnz    1b
 274
  275 // Destination end is 8-byte aligned (reverse move).  rc = bytes remaining.
  276
  277 LRevAligned:
  278         srwi.   w2,rc,6                         // w2 <- count of 64-byte chunks; cr0 eq iff there are none
  279         mtcrf   0x02,rc                         // leftover byte count to cr (faster one cr at a time)
  280         mtcrf   0x01,rc                         // put length bits 28-31 in cr7
  281         beq             LShortReverse64         // no 64-byte chunks
  282         mtctr   w2                                      // loop count = number of 64-byte chunks
  283         b               1f                              // branch to the aligned loop entry below (over any .align padding)
  284
  285 // Loop over 64-byte chunks (reverse): eight doubleword loads, then eight doubleword stores.
  286
  287         .align  5
  288 1:
  289         ld              w1,-8(rs)
  290         ld              w2,-16(rs)
  291         ld              w3,-24(rs)
  292         ld              w4,-32(rs)
  293         ld              w5,-40(rs)
  294         ld              w6,-48(rs)
  295         ld              w7,-56(rs)
  296         ldu             w8,-64(rs)                      // update form: last load also backs rs up by 64
  297         std             w1,-8(rd)
  298         std             w2,-16(rd)
  299         std             w3,-24(rd)
  300         std             w4,-32(rd)
  301         std             w5,-40(rd)
  302         std             w6,-48(rd)
  303         std             w7,-56(rd)
  304         stdu    w8,-64(rd)                              // update form: last store also backs rd up by 64
  305         bdnz    1b
  306
  307         b               LShortReverse64                 // finish the 0-63 leftover bytes using cr6/cr7 set above
 308
  309         COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)     // NOTE(review): presumably installs bcopy_64 at _COMM_PAGE_BCOPY for 64-bit CPUs without Altivec, with 32-to-64 porting as the file header describes -- confirm argument meanings against the macro in machine/commpage.h