/* osfmk/ppc/commpage/bigcopy_970.s (Apple xnu-1228) */
/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 2/21/2004, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove(),
 * and runs both in 32 and 64-bit mode.
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *     We use 256-byte chunks.
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 */
51
52 #define rs r13
53 #define rd r14
54 #define rc r15
55 #define rx r16
56
57 #define c16 r3
58 #define c32 r4
59 #define c48 r5
60 #define c64 r6
61 #define c80 r7
62 #define c96 r8
63 #define c112 r9
64 #define c256 r10
65 #define c384 r11
66 #define rv r12 // vrsave
67
68 // Offsets within the "red zone" (which is 224 bytes long):
69
70 #define rzR3 -8
71 #define rzR13 -16
72 #define rzR14 -24
73 #define rzR15 -32
74 #define rzR16 -40
75
76 #define rzV20 -64
77 #define rzV21 -80
78 #define rzV22 -96
79 #define rzV23 -112
80
81
82 #include <sys/appleapiopts.h>
83 #include <ppc/asm.h>
84 #include <machine/cpu_capabilities.h>
85 #include <machine/commpage.h>
86
87 .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */

105 // Entry point. This is a subroutine of bcopy(). When called:
106 // r0 = return address (also stored in caller's SF)
107 // r4 = source ptr
108 // r5 = length (at least several pages)
109 // r12 = dest ptr
110 //
111 // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles
112 // and r3 preserved.
113
114 .align 5
115 bigcopy_970:
116 neg r2,r12 // is destination cache-line-aligned?
117 std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy()
118 std r13,rzR13(r1) // spill non-volatile regs we use to redzone
119 std r14,rzR14(r1)
120 std r15,rzR15(r1)
121 andi. r2,r2,0x7F // #bytes to align
122 std r16,rzR16(r1)
123 mr rs,r4 // copy parameters into nonvolatile registers
124 mr rd,r12
125 mr rc,r5
126 mr rx,r0 // also save return address
127 beq 1f // skip if already aligned
128
129 // Cache-line-align destination.
130
131 mr r3,rd // set up dest ptr for memcpy()
132 mr r5,r2 // number of bytes to copy
133 add rs,rs,r2 // then bump our parameters past initial copy
134 add rd,rd,r2
135 sub rc,rc,r2
136 bla _COMM_PAGE_MEMCPY // 128-byte-align destination
137
138
139 // Load constant offsets and check whether source is 16-byte aligned.
140 // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
141 // and we dcbz only if cr7 beq is set.
142
143 1:
144 dcbt 0,rs // touch in 1st line of source
145 andi. r0,rs,15 // check source alignment
146 mfspr rv,vrsave // save caller's bitmask
147 li c16,16 // load the constant offsets for x-form ops
148 li c32,32
149 srwi r2,rc,8 // get number of 256-byte chunks to xfer
150 li r0,-256 // we use 24 VRs (ie, 0-23)
151 li c48,48
152 li c64,64
153 li c80,80
154 or r0,r0,rv // add our bits to caller's
155 li c96,96
156 mtctr r2 // set up loop count
157 li c112,112
158 cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128
159 mtspr vrsave,r0 // say we use vr0..vr23
160 li c256,256
161 li c384,384
162 beq LalignedLoop // handle aligned sources
163
164
165 // Set up for unaligned loop.
166
167 lvsl v0,0,rs // get permute vector for left shift
168 lvxl v1,0,rs // prime the loop
169 li r0,rzV20 // save non-volatile VRs in redzone
170 stvx v20,r1,r0
171 li r0,rzV21
172 stvx v21,r1,r0
173 li r0,rzV22
174 stvx v22,r1,r0
175 li r0,rzV23
176 stvx v23,r1,r0
177 b LunalignedLoop // enter unaligned loop
178
179
180 // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines).
181 // Destination is 128-byte aligned, source is unaligned.
182
183 .align 5
184 LunalignedLoop:
185 dcbt c256,rs // touch in next chunk
186 dcbt c384,rs
187 addi r2,rs,128 // point to 2nd 128 bytes of source
188 lvxl v2,c16,rs
189 lvxl v3,c32,rs
190 lvxl v4,c48,rs
191 lvxl v5,c64,rs
192 lvxl v6,c80,rs
193 lvxl v7,c96,rs
194 lvxl v8,c112,rs
195 lvxl v9,0,r2
196 addi rs,rs,256 // point to next source chunk
197 lvxl v10,c16,r2
198 lvxl v11,c32,r2
199 vperm v17,v1,v2,v0
200 lvxl v12,c48,r2
201 lvxl v13,c64,r2
202 vperm v18,v2,v3,v0
203 lvxl v14,c80,r2
204 lvxl v15,c96,r2
205 vperm v19,v3,v4,v0
206 lvxl v16,c112,r2
207 lvxl v1,0,rs // peek ahead at first source quad in next chunk
208 vperm v20,v4,v5,v0
209 addi r2,rd,128 // point to 2nd 128 bytes of dest
210 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
211 dcbz128 0,rd
212 dcbz128 0,r2
213 1:
214 vperm v21,v5,v6,v0
215 stvxl v17,0,rd
216 vperm v22,v6,v7,v0
217 stvxl v18,c16,rd
218 vperm v23,v7,v8,v0
219 stvxl v19,c32,rd
220 vperm v17,v8,v9,v0
221 stvxl v20,c48,rd
222 vperm v18,v9,v10,v0
223 stvxl v21,c64,rd
224 vperm v19,v10,v11,v0
225 stvxl v22,c80,rd
226 vperm v20,v11,v12,v0
227 stvxl v23,c96,rd
228 vperm v21,v12,v13,v0
229 stvxl v17,c112,rd
230 vperm v22,v13,v14,v0
231 addi rd,rd,256 // point to next dest chunk
232 stvxl v18,0,r2
233 vperm v23,v14,v15,v0
234 stvxl v19,c16,r2
235 vperm v17,v15,v16,v0
236 stvxl v20,c32,r2
237 vperm v18,v16,v1,v0
238 stvxl v21,c48,r2
239 stvxl v22,c64,r2
240 stvxl v23,c80,r2
241 stvxl v17,c96,r2
242 stvxl v18,c112,r2
243 bdnz++ LunalignedLoop // loop if another 256 bytes to go
244
245 li r6,rzV20 // restore non-volatile VRs
246 li r7,rzV21
247 li r8,rzV22
248 li r9,rzV23
249 lvx v20,r1,r6
250 lvx v21,r1,r7
251 lvx v22,r1,r8
252 lvx v23,r1,r9
253 b Ldone
254
255
256 // Aligned loop. Destination is 128-byte aligned, and source is 16-byte
257 // aligned. Loop over 256-byte chunks (2 cache lines.)
258
259 .align 5
260 LalignedLoop:
261 dcbt c256,rs // touch in next chunk
262 dcbt c384,rs
263 addi r2,rs,128 // point to 2nd 128 bytes of source
264 lvxl v1,0,rs
265 lvxl v2,c16,rs
266 lvxl v3,c32,rs
267 lvxl v4,c48,rs
268 lvxl v5,c64,rs
269 lvxl v6,c80,rs
270 lvxl v7,c96,rs
271 lvxl v8,c112,rs
272 lvxl v9,0,r2
273 lvxl v10,c16,r2
274 lvxl v11,c32,r2
275 lvxl v12,c48,r2
276 lvxl v13,c64,r2
277 lvxl v14,c80,r2
278 lvxl v15,c96,r2
279 lvxl v16,c112,r2
280 addi r2,rd,128 // point to 2nd 128 bytes of dest
281 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
282 dcbz128 0,rd
283 dcbz128 0,r2
284 1:
285 addi rs,rs,256 // point to next source chunk
286 stvxl v1,0,rd
287 stvxl v2,c16,rd
288 stvxl v3,c32,rd
289 stvxl v4,c48,rd
290 stvxl v5,c64,rd
291 stvxl v6,c80,rd
292 stvxl v7,c96,rd
293 stvxl v8,c112,rd
294 addi rd,rd,256 // point to next dest chunk
295 stvxl v9,0,r2
296 stvxl v10,c16,r2
297 stvxl v11,c32,r2
298 stvxl v12,c48,r2
299 stvxl v13,c64,r2
300 stvxl v14,c80,r2
301 stvxl v15,c96,r2
302 stvxl v16,c112,r2
303 bdnz++ LalignedLoop // loop if another 256 bytes to go
304
305
306 // Done, except for 0..255 leftover bytes at end.
307 // rs = source ptr
308 // rd = dest ptr
309 // rc = remaining count in low 7 bits
310 // rv = caller's vrsave
311 // rx = caller's return address
312
313 Ldone:
314 andi. r5,rc,0xFF // any leftover bytes? (0..255)
315 mtspr vrsave,rv // restore bitmap of live vr's
316
317 mr r3,rd
318 mr r4,rs
319 bnela _COMM_PAGE_MEMCPY // copy leftover bytes
320
321 mtlr rx // restore return address
322 ld r3,rzR3(r1) // restore non-volatile GPRs from redzone
323 ld r13,rzR13(r1)
324 ld r14,rzR14(r1)
325 ld r15,rzR15(r1)
326 ld r16,rzR16(r1)
327 blr
328
329
330 COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
331