/*
 * Provenance (was web-viewer residue): apple/xnu.git — osfmk/ppc/commpage/bigcopy_970.s
 * blob c7b033a5ff06c61ecce0fe50b964cbbe2f3de0e3 (git.saurik.com mirror of Apple XNU)
 */
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /* ====================================
31 * Very Long Operand BCOPY for Mac OS X
32 * ====================================
33 *
34 * Version of 2/21/2004, tuned for the IBM 970. This is for operands at
35 * least several pages long. It is called from bcopy()/memcpy()/memmove(),
36 * and runs both in 32 and 64-bit mode.
37 *
38 * We use the following additional strategies not used by the shorter
39 * operand paths. Mostly, we try to optimize for memory bandwidth:
40 * 1. Use DCBZ128 to avoid reading destination lines. Because this code
41 *    resides on the commpage, it can use a private interface with the
42 * kernel to minimize alignment exceptions if the destination is
43 * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
44 * DCBZ128 on the commpage. Thus we take at most one exception per call,
45 * which is amortized across the very long operand.
46 * 2. Copy larger chunks per iteration to minimize R/W bus turnaround
47 * and maximize DRAM page locality (opening a new page is expensive.)
48 * We use 256-byte chunks.
49 * 3. Touch in one source chunk ahead with DCBT. This is probably the
50 * least important change, and probably only helps restart the
51 * hardware stream at the start of each source page.
52 */
53
54 #define rs r13
55 #define rd r14
56 #define rc r15
57 #define rx r16
58
59 #define c16 r3
60 #define c32 r4
61 #define c48 r5
62 #define c64 r6
63 #define c80 r7
64 #define c96 r8
65 #define c112 r9
66 #define c256 r10
67 #define c384 r11
68 #define rv r12 // vrsave
69
70 // Offsets within the "red zone" (which is 224 bytes long):
71
72 #define rzR3 -8
73 #define rzR13 -16
74 #define rzR14 -24
75 #define rzR15 -32
76 #define rzR16 -40
77
78 #define rzV20 -64
79 #define rzV21 -80
80 #define rzV22 -96
81 #define rzV23 -112
82
83
84 #include <sys/appleapiopts.h>
85 #include <ppc/asm.h>
86 #include <machine/cpu_capabilities.h>
87 #include <machine/commpage.h>
88
89 .text
90 /*
91 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
92 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
93 * simple transformations:
94 * - all word compares are changed to doubleword
95 * - all "srwi[.]" opcodes are changed to "srdi[.]"
96 * Nothing else is done. For this to work, the following rules must be
97 * carefully followed:
98 * - do not use carry or overflow
99 * - only use record mode if you are sure the results are mode-invariant
100 * for example, all "andi." and almost all "rlwinm." are fine
101 * - do not use "slwi", "slw", or "srw"
102 * An imaginative programmer could break the porting model in other ways, but the above
103 * are the most likely problem areas. It is perhaps surprising how well in practice
104 * this simple method works.
105 */
106
107 // Entry point. This is a subroutine of bcopy(). When called:
108 // r0 = return address (also stored in caller's SF)
109 // r4 = source ptr
110 // r5 = length (at least several pages)
111 // r12 = dest ptr
112 //
113 // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles
114 // and r3 preserved.
115
116 .align 5
117 bigcopy_970:
118 neg r2,r12 // is destination cache-line-aligned?
119 std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy()
120 std r13,rzR13(r1) // spill non-volatile regs we use to redzone
121 std r14,rzR14(r1)
122 std r15,rzR15(r1)
123 andi. r2,r2,0x7F // #bytes to align
124 std r16,rzR16(r1)
125 mr rs,r4 // copy parameters into nonvolatile registers
126 mr rd,r12
127 mr rc,r5
128 mr rx,r0 // also save return address
129 beq 1f // skip if already aligned
130
131 // Cache-line-align destination.
132
133 mr r3,rd // set up dest ptr for memcpy()
134 mr r5,r2 // number of bytes to copy
135 add rs,rs,r2 // then bump our parameters past initial copy
136 add rd,rd,r2
137 sub rc,rc,r2
138 bla _COMM_PAGE_MEMCPY // 128-byte-align destination
139
140
141 // Load constant offsets and check whether source is 16-byte aligned.
142 // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
143 // and we dcbz only if cr7 beq is set.
144
145 1:
146 dcbt 0,rs // touch in 1st line of source
147 andi. r0,rs,15 // check source alignment
148 mfspr rv,vrsave // save caller's bitmask
149 li c16,16 // load the constant offsets for x-form ops
150 li c32,32
151 srwi r2,rc,8 // get number of 256-byte chunks to xfer
152 li r0,-256 // we use 24 VRs (ie, 0-23)
153 li c48,48
154 li c64,64
155 li c80,80
156 or r0,r0,rv // add our bits to caller's
157 li c96,96
158 mtctr r2 // set up loop count
159 li c112,112
160 cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128
161 mtspr vrsave,r0 // say we use vr0..vr23
162 li c256,256
163 li c384,384
164 beq LalignedLoop // handle aligned sources
165
166
167 // Set up for unaligned loop.
168
169 lvsl v0,0,rs // get permute vector for left shift
170 lvxl v1,0,rs // prime the loop
171 li r0,rzV20 // save non-volatile VRs in redzone
172 stvx v20,r1,r0
173 li r0,rzV21
174 stvx v21,r1,r0
175 li r0,rzV22
176 stvx v22,r1,r0
177 li r0,rzV23
178 stvx v23,r1,r0
179 b LunalignedLoop // enter unaligned loop
180
181
182 // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines).
183 // Destination is 128-byte aligned, source is unaligned.
184
185 .align 5
186 LunalignedLoop:
187 dcbt c256,rs // touch in next chunk
188 dcbt c384,rs
189 addi r2,rs,128 // point to 2nd 128 bytes of source
190 lvxl v2,c16,rs
191 lvxl v3,c32,rs
192 lvxl v4,c48,rs
193 lvxl v5,c64,rs
194 lvxl v6,c80,rs
195 lvxl v7,c96,rs
196 lvxl v8,c112,rs
197 lvxl v9,0,r2
198 addi rs,rs,256 // point to next source chunk
199 lvxl v10,c16,r2
200 lvxl v11,c32,r2
201 vperm v17,v1,v2,v0
202 lvxl v12,c48,r2
203 lvxl v13,c64,r2
204 vperm v18,v2,v3,v0
205 lvxl v14,c80,r2
206 lvxl v15,c96,r2
207 vperm v19,v3,v4,v0
208 lvxl v16,c112,r2
209 lvxl v1,0,rs // peek ahead at first source quad in next chunk
210 vperm v20,v4,v5,v0
211 addi r2,rd,128 // point to 2nd 128 bytes of dest
212 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
213 dcbz128 0,rd
214 dcbz128 0,r2
215 1:
216 vperm v21,v5,v6,v0
217 stvxl v17,0,rd
218 vperm v22,v6,v7,v0
219 stvxl v18,c16,rd
220 vperm v23,v7,v8,v0
221 stvxl v19,c32,rd
222 vperm v17,v8,v9,v0
223 stvxl v20,c48,rd
224 vperm v18,v9,v10,v0
225 stvxl v21,c64,rd
226 vperm v19,v10,v11,v0
227 stvxl v22,c80,rd
228 vperm v20,v11,v12,v0
229 stvxl v23,c96,rd
230 vperm v21,v12,v13,v0
231 stvxl v17,c112,rd
232 vperm v22,v13,v14,v0
233 addi rd,rd,256 // point to next dest chunk
234 stvxl v18,0,r2
235 vperm v23,v14,v15,v0
236 stvxl v19,c16,r2
237 vperm v17,v15,v16,v0
238 stvxl v20,c32,r2
239 vperm v18,v16,v1,v0
240 stvxl v21,c48,r2
241 stvxl v22,c64,r2
242 stvxl v23,c80,r2
243 stvxl v17,c96,r2
244 stvxl v18,c112,r2
245 bdnz++ LunalignedLoop // loop if another 256 bytes to go
246
247 li r6,rzV20 // restore non-volatile VRs
248 li r7,rzV21
249 li r8,rzV22
250 li r9,rzV23
251 lvx v20,r1,r6
252 lvx v21,r1,r7
253 lvx v22,r1,r8
254 lvx v23,r1,r9
255 b Ldone
256
257
258 // Aligned loop. Destination is 128-byte aligned, and source is 16-byte
259 // aligned. Loop over 256-byte chunks (2 cache lines.)
260
261 .align 5
262 LalignedLoop:
263 dcbt c256,rs // touch in next chunk
264 dcbt c384,rs
265 addi r2,rs,128 // point to 2nd 128 bytes of source
266 lvxl v1,0,rs
267 lvxl v2,c16,rs
268 lvxl v3,c32,rs
269 lvxl v4,c48,rs
270 lvxl v5,c64,rs
271 lvxl v6,c80,rs
272 lvxl v7,c96,rs
273 lvxl v8,c112,rs
274 lvxl v9,0,r2
275 lvxl v10,c16,r2
276 lvxl v11,c32,r2
277 lvxl v12,c48,r2
278 lvxl v13,c64,r2
279 lvxl v14,c80,r2
280 lvxl v15,c96,r2
281 lvxl v16,c112,r2
282 addi r2,rd,128 // point to 2nd 128 bytes of dest
283 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
284 dcbz128 0,rd
285 dcbz128 0,r2
286 1:
287 addi rs,rs,256 // point to next source chunk
288 stvxl v1,0,rd
289 stvxl v2,c16,rd
290 stvxl v3,c32,rd
291 stvxl v4,c48,rd
292 stvxl v5,c64,rd
293 stvxl v6,c80,rd
294 stvxl v7,c96,rd
295 stvxl v8,c112,rd
296 addi rd,rd,256 // point to next dest chunk
297 stvxl v9,0,r2
298 stvxl v10,c16,r2
299 stvxl v11,c32,r2
300 stvxl v12,c48,r2
301 stvxl v13,c64,r2
302 stvxl v14,c80,r2
303 stvxl v15,c96,r2
304 stvxl v16,c112,r2
305 bdnz++ LalignedLoop // loop if another 256 bytes to go
306
307
308 // Done, except for 0..255 leftover bytes at end.
309 // rs = source ptr
310 // rd = dest ptr
311 // rc = remaining count in low 7 bits
312 // rv = caller's vrsave
313 // rx = caller's return address
314
315 Ldone:
316 andi. r5,rc,0xFF // any leftover bytes? (0..255)
317 mtspr vrsave,rv // restore bitmap of live vr's
318
319 mr r3,rd
320 mr r4,rs
321 bnela _COMM_PAGE_MEMCPY // copy leftover bytes
322
323 mtlr rx // restore return address
324 ld r3,rzR3(r1) // restore non-volatile GPRs from redzone
325 ld r13,rzR13(r1)
326 ld r14,rzR14(r1)
327 ld r15,rzR15(r1)
328 ld r16,rzR16(r1)
329 blr
330
331
332 COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
333