2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* =======================================
29 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
30 * =======================================
32 * Version of 2/20/2003, for a hypothetical 64-bit processor without Altivec.
33 * This version might be used when bringing up new processors with known
34 * Altivec bugs that need to be worked around. It is not particularly well
37 * For 64-bit processors with a 128-byte cache line, running in either
38 * 32- or 64-bit mode. This is written for 32-bit execution, the kernel
39 * will translate to 64-bit code when it compiles the 64-bit commpage.
41 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
45 * r3 = not used, as memcpy and memmove return 1st parameter as a value
46 * r4 = source ptr ("rs")
47 * r5 = count of bytes to move ("rc")
54 * r12 = destination ptr ("rd")
71 #include <sys/appleapiopts.h>
73 #include <machine/cpu_capabilities.h>
74 #include <machine/commpage.h>
78 #define kLong 64 // too long for inline loopless code
84 bcopy_64: // void bcopy(const void *src, void *dst, size_t len)
85 cmplwi rc,kLong // short or long?
86 sub w1,r4,r3 // must move in reverse if (rd-rs)<rc
87 cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
88 mr rd,r4 // start to move registers to canonic spot
90 blt LShort // handle short operands
91 dcbt 0,r3 // touch in destination
92 b LLong // join medium/long operand code
94 // NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
97 Lmemcpy_g4: // void* memcpy(void *dst, void *src, size_t len)
98 Lmemmove_g4: // void* memmove(void *dst, const void *src, size_t len)
99 cmplwi rc,kLong // short or long?
100 sub w1,r3,r4 // must move in reverse if (rd-rs)<rc
101 dcbt 0,r4 // touch in the first line of source
102 cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
103 mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
104 bge LLong // handle medium or long operands
106 // Handle short operands.
109 mtcrf 0x02,rc // put length bits 26-27 in cr6 (faster one cr at a time)
110 mtcrf 0x01,rc // put length bits 28-31 in cr7
111 blt cr1,LShortReverse
113 // Forward short operands. This is the most frequent case, so it is inline.
115 LShort64: // enter to xfer last 64 bytes
116 bf 26,0f // 64-byte chunk to xfer?
128 bf 27,1f // quadword to move?
136 bf 28,2f // doubleword?
148 bf 30,4f // halfword to move?
154 bflr 31 // skip if no odd byte
160 // Handle short reverse operands.
161 // cr6 = bits 26-27 of length
162 // cr7 = bits 28-31 of length
165 add rs,rs,rc // adjust ptrs for reverse move
167 LShortReverse64: // enter to xfer last 64 bytes
168 bf 26,0f // 64-byte chunk to xfer?
178 bf 27,1f // quadword to move?
184 bf 28,2f // doubleword?
192 bf 30,4f // halfword to move?
196 bflr 31 // done if no odd byte
197 lbz w1,-1(rs) // no update
203 // cr1 = blt iff we must move reverse
207 dcbtst 0,rd // touch in destination
208 neg w3,rd // start to compute #bytes to align destination
209 andi. w6,w3,7 // w6 <- #bytes to 8-byte align destination
210 blt cr1,LLongReverse // handle reverse moves
211 mtctr w6 // set up for loop to align destination
212 sub rc,rc,w6 // adjust count
213 beq LAligned // destination already 8-byte aligned
221 // Destination is 8-byte aligned.
224 srwi. w2,rc,6 // w2 <- count of 64-byte chunks
225 mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
226 mtcrf 0x01,rc // put length bits 28-31 in cr7
227 beq LShort64 // no 64-byte chunks
231 // Loop moving 64-byte chunks.
258 // Handle reverse moves.
261 add rd,rd,rc // point to end of operands
263 andi. r0,rd,7 // is destination 8-byte aligned?
264 sub rc,rc,r0 // adjust count
265 mtctr r0 // set up for byte loop
266 beq LRevAligned // already aligned
273 // Destination is 8-byte aligned.
276 srwi. w2,rc,6 // w2 <- count of 64-byte chunks
277 mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time)
278 mtcrf 0x01,rc // put length bits 28-31 in cr7
279 beq LShortReverse64 // no 64-byte chunks
283 // Loop over 64-byte chunks (reverse).
307 COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)