osfmk/ppc/commpage/bcopy_64.s
/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
 * This version might be used when bringing up new processors with known
 * Altivec bugs that need to be worked around. It is not particularly well
 * optimized.
 *
 * For 64-bit processors with a 128-byte cache line, running in either
 * 32- or 64-bit mode. This is written for 32-bit execution; the kernel
 * will translate it to 64-bit code when it compiles the 64-bit commpage.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment.
 *   r0  = "w7" or temp
 *   r2  = "w8"
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1"
 *   r7  = "w2"
 *   r8  = "w3"
 *   r9  = "w4"
 *   r10 = "w5"
 *   r11 = "w6"
 *   r12 = destination ptr ("rd")
 */
#define rs  r4
#define rd  r12
#define rc  r5
#define rv  r2

#define w1  r6
#define w2  r7
#define w3  r8
#define w4  r9
#define w5  r10
#define w6  r11
#define w7  r0
#define w8  r2

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kLong   64                      // too long for inline loopless code


// Main entry points.

        .align  5
bcopy_64:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kLong                // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonical spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,r3                    // touch in destination
        b       LLong                   // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
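// (The .align 5 directives put bcopy_64 on a 32-byte boundary, and its body
// above is exactly 8 instructions, so the memcpy/memmove entry below lands
// 8 words (32 bytes) after bcopy as required.)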

        .align  5
Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kLong                // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong                   // handle medium or long operands
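// The overlap test is a single unsigned compare: with w1 = rd-rs, a reverse
// move is needed only when 0 <= (rd-rs) < rc, i.e. the destination starts
// inside the source. For example, rd=0x1010, rs=0x1000, rc=0x40 gives
// w1=0x10 < 0x40, so we copy backwards; if rs > rd, the subtraction wraps to
// a huge unsigned value and the forward path is taken.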

// Handle short operands.

LShort:
        mtcrf   0x02,rc                 // put length bits 26-27 in cr6 (faster one cr at a time)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt     cr1,LShortReverse
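// cr6 and cr7 now hold bits 26-31 of the length, so each bf test in the short
// paths below checks one power of two: bit 26 = 32 bytes, bit 27 = 16,
// bit 28 = 8, bit 29 = 4, bit 30 = 2, bit 31 = 1.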

// Forward short operands. This is the most frequent case, so it is inline.

LShort64:                               // enter to xfer last 64 bytes
        bf      26,0f                   // 32-byte chunk to xfer?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w2,8(rd)
        addi    rd,rd,16
1:
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr6 = bits 26-27 of length
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
LShortReverse64:                        // enter to xfer last 64 bytes
        bf      26,0f                   // 32-byte chunk to xfer?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands.
//      cr1 = blt iff we must move reverse

        .align  4
LLong:
        dcbtst  0,rd                    // touch in destination
        neg     w3,rd                   // start to compute #bytes to align destination
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LLongReverse        // handle reverse moves
        mtctr   w6                      // set up for loop to align destination
        sub     rc,rc,w6                // adjust count
        beq     LAligned                // destination already 8-byte aligned
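// w6 = (-rd) & 7 is exactly the number of bytes needed to reach the next
// 8-byte boundary (e.g. a destination ending in ...5 gives w6 = 3), and the
// byte loop below copies that many.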
1:
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
        bdnz    1b

// Destination is 8-byte aligned.

LAligned:
        srwi.   w2,rc,6                 // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc                 // leftover byte count to cr (faster one cr at a time)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        beq     LShort64                // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop moving 64-byte chunks.
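// All eight doubleword loads are issued before the stores and the pointers
// are bumped only once per iteration, presumably to keep the loads well ahead
// of the stores; there is no Altivec or dcbz use here, per the header note
// that this version is not highly optimized.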

        .align  5
1:
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        ld      w5,32(rs)
        ld      w6,40(rs)
        ld      w7,48(rs)
        ld      w8,56(rs)
        addi    rs,rs,64
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        std     w5,32(rd)
        std     w6,40(rd)
        std     w7,48(rd)
        std     w8,56(rd)
        addi    rd,rd,64
        bdnz    1b

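// Finish the leftover 0..63 bytes with the inline short-operand code; cr6 and
// cr7 were already loaded with the residual length bits at LAligned.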
        b       LShort64


// Handle reverse moves.

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,7                 // is destination 8-byte aligned?
        sub     rc,rc,r0                // adjust count
        mtctr   r0                      // set up for byte loop
        beq     LRevAligned             // already aligned

1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Destination is 8-byte aligned.

LRevAligned:
        srwi.   w2,rc,6                 // w2 <- count of 64-byte chunks
        mtcrf   0x02,rc                 // leftover byte count to cr (faster one cr at a time)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        beq     LShortReverse64         // no 64-byte chunks
        mtctr   w2
        b       1f

// Loop over 64-byte chunks (reverse).

        .align  5
1:
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ld      w4,-32(rs)
        ld      w5,-40(rs)
        ld      w6,-48(rs)
        ld      w7,-56(rs)
        ldu     w8,-64(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        std     w4,-32(rd)
        std     w5,-40(rd)
        std     w6,-48(rd)
        std     w7,-56(rd)
        stdu    w8,-64(rd)
        bdnz    1b

        b       LShortReverse64

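// Commpage descriptor: per the usual (musthave, canthave, special) argument
// convention, this routine is used only on 64-bit processors (k64Bit) that
// lack usable Altivec (kHasAltivec prohibited), and is installed in both the
// 32- and 64-bit commpages with the 32-to-64-bit translation noted in the
// header applied (kCommPageBoth+kPort32to64).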
        COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)