/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
 * This version might be used when bringing up new processors that have known
 * Altivec bugs needing to be worked around. It is not particularly well
 * optimized.
 *
 * For 64-bit processors with a 128-byte cache line, running in either
 * 32- or 64-bit mode. This is written for 32-bit execution; the kernel
 * will translate it to 64-bit code when it compiles the 64-bit commpage.
 *
 * Register usage. Note we use r2, so this code will not run in a PEF/CFM
 * environment.
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kLong   64              // too long for inline loopless code

// Main entry points.

        .align  5
bcopy_64:                       // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong        // short or long?
        sub     w1,r4,r3        // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r4           // start to move registers to canonical spots
        mr      rs,r3
        blt     LShort          // handle short operands
        dcbt    0,r3            // touch in first line of source
        b       LLong           // join medium/long operand code
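
// How the one-compare overlap test works: w1 = rd-rs is computed with
// unsigned wraparound, so a single unsigned compare (w1 < rc) is true
// exactly when the destination starts inside the source operand, ie when
// a forward copy would overwrite source bytes not yet read.  A minimal C
// sketch of the same dispatch (illustrative only, not part of this file):
//
//      #include <stdint.h>
//      #include <stddef.h>
//
//      static void copy_dispatch(void *dst, const void *src, size_t len) {
//          // unsigned difference wraps when dst < src, making one test safe
//          if ((uintptr_t)dst - (uintptr_t)src < (uintptr_t)len)
//              ; /* copy backward: dst overlaps src from above */
//          else
//              ; /* copy forward: no harmful overlap */
//      }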

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                     // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                    // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong        // short or long?
        sub     w1,r3,r4        // must move in reverse if (rd-rs)<rc
        dcbt    0,r4            // touch in the first line of source
        cmplw   cr1,w1,rc       // set cr1 blt iff we must move reverse
        mr      rd,r3           // must leave r3 alone, it is return value for memcpy etc
        bge     LLong           // handle medium or long operands
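
// Note that bcopy and memcpy/memmove compute the same rd-rs difference with
// the sub operands swapped, because bcopy takes (src,dst) while memcpy and
// memmove take (dst,src).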

// Handle short operands.

LShort:
        mtcrf   0x02,rc         // put length bits 26-27 in cr6 (setting one CR field at a time is faster)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        blt     cr1,LShortReverse
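
// With a 32-bit length in rc, bit 26 (value 32), bit 27 (16), bit 28 (8),
// bit 29 (4), bit 30 (2), and bit 31 (1) fully describe any residual count
// below 64.  After the two mtcrf's above, the "bf 26,..." etc below test
// those bits directly, so the short path needs no compare instructions.
// Example: rc = 45 = 0b101101 moves 32+8+4+1 bytes, skipping the 16- and
// 2-byte steps.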

// Forward short operands. This is the most frequent case, so it is inline.

LShort64:                       // enter to xfer last 0..63 bytes
        bf      26,0f           // 32-byte chunk to xfer?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
        bf      27,1f           // quadword to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w2,8(rd)
        addi    rd,rd,16
1:
        bf      28,2f           // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f           // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f           // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31              // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr6 = bits 26-27 of length
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc        // adjust ptrs for reverse move
        add     rd,rd,rc
LShortReverse64:                // enter to xfer last 0..63 bytes
        bf      26,0f           // 32-byte chunk to xfer?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f           // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
        bf      28,2f           // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f           // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f           // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31              // done if no odd byte
        lbz     w1,-1(rs)       // no update
        stb     w1,-1(rd)
        blr
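
// Note the load/store-with-update forms (ldu/stdu, lwzu/stwu, ...) above:
// each folds the pointer decrement into the access, so the reverse path
// needs no separate addi's.  The final odd byte deliberately uses the
// non-update lbz/stb, since rs and rd are dead after it.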


// Long operands.
//      cr1 = blt iff we must move reverse

        .align  4
LLong:
        dcbtst  0,rd            // touch in destination
        neg     w3,rd           // start to compute #bytes to align destination
        andi.   w6,w3,7         // w6 <- #bytes to 8-byte align destination
        blt     cr1,LLongReverse // handle reverse moves
        mtctr   w6              // set up for loop to align destination
        sub     rc,rc,w6        // adjust count
        beq     LAligned        // destination already 8-byte aligned
1:
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
        bdnz    1b
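
// A worked example of the alignment math: neg w3,rd computes 0-rd, whose
// low three bits are (8 - rd) mod 8.  If rd ends in ...5, w3 ends in ...3,
// so andi. w6,w3,7 yields 3: exactly the number of bytes the byte loop
// must copy before rd is 8-byte aligned.  The andi. also sets cr0, which
// the beq uses to skip the loop entirely when rd is already aligned.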

// Destination is 8-byte aligned.

LAligned:
        srwi.   w2,rc,6         // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc         // put leftover length bits 26-27 in cr6 (one CR field at a time is faster)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        beq     LShort64        // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop moving 64-byte chunks.

        .align  5
1:
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        ld      w5,32(rs)
        ld      w6,40(rs)
        ld      w7,48(rs)
        ld      w8,56(rs)
        addi    rs,rs,64
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        std     w5,32(rd)
        std     w6,40(rd)
        std     w7,48(rd)
        std     w8,56(rd)
        addi    rd,rd,64
        bdnz    1b

        b       LShort64
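
// The chunk loop issues all eight loads before any of the stores.  On the
// in-order 64-bit parts this code targets, grouping the loads is a common
// idiom to keep them pipelined and to avoid stalling a load behind a
// just-issued store; the actual benefit depends on the (here hypothetical)
// processor's load/store queues.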


// Handle reverse moves.

LLongReverse:
        add     rd,rd,rc        // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,7         // r0 <- #bytes to 8-byte align destination
        sub     rc,rc,r0        // adjust count
        mtctr   r0              // set up for byte loop
        beq     LRevAligned     // already aligned

1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Destination is 8-byte aligned.

LRevAligned:
        srwi.   w2,rc,6         // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc         // put leftover length bits 26-27 in cr6 (one CR field at a time is faster)
        mtcrf   0x01,rc         // put length bits 28-31 in cr7
        beq     LShortReverse64 // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop over 64-byte chunks (reverse).

        .align  5
1:
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ld      w4,-32(rs)
        ld      w5,-40(rs)
        ld      w6,-48(rs)
        ld      w7,-56(rs)
        ldu     w8,-64(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        std     w4,-32(rd)
        std     w5,-40(rd)
        std     w6,-48(rd)
        std     w7,-56(rd)
        stdu    w8,-64(rd)
        bdnz    1b

        b       LShortReverse64

        COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)
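
// Reading the descriptor (per the COMMPAGE_DESCRIPTOR convention used by
// the other commpage routines): this routine is installed at
// _COMM_PAGE_BCOPY only on processors whose capability bits include k64Bit
// and exclude kHasAltivec.  kCommPageBoth requests both the 32- and 64-bit
// commpages, and kPort32to64 asks the kernel to translate the 32-bit
// instructions to their 64-bit forms when building the 64-bit page,
// matching the note in the header comment.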