/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 2/21/2004, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove(),
 * and runs in both 32- and 64-bit mode.
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *     We use 256-byte chunks.
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 */
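
// In outline, the routine below proceeds roughly as follows:
//      1. memcpy() the first 1..127 bytes, if any, so that the destination
//         becomes 128-byte (cache line) aligned.
//      2. Loop over 256-byte chunks (2 cache lines per iteration):
//         DCBZ128 both destination lines (unless the kernel has turned
//         cr7_eq off), then move the chunk with 16 vector loads and
//         16 vector stores, vperm-merging if the source is unaligned.
//      3. memcpy() the 0..255 leftover bytes at the end.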

#define rs      r13
#define rd      r14
#define rc      r15
#define rx      r16

#define c16     r3
#define c32     r4
#define c48     r5
#define c64     r6
#define c80     r7
#define c96     r8
#define c112    r9
#define c256    r10
#define c384    r11
#define rv      r12     // vrsave

// Offsets within the "red zone" (which is 224 bytes long):

#define rzR3    -8
#define rzR13   -16
#define rzR14   -24
#define rzR15   -32
#define rzR16   -40

#define rzV20   -64
#define rzV21   -80
#define rzV22   -96
#define rzV23   -112
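
// These are negative offsets from the stack pointer (r1).  The PowerPC
// ABI guarantees a "red zone" below the stack pointer that a leaf-like
// routine may use without allocating a frame, which is presumably why
// this code can spill r3, r13-r16, and v20-v23 there with no stwu/stdu.
// GPRs get 8-byte slots (std/ld) and VRs 16-byte slots (stvx/lvx).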

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
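
// To illustrate the porting model: the "srwi r2,rc,8" below computes the
// chunk count from the low 32 bits of the length, which suffices in 32-bit
// mode; in the 64-bit commpage the kernel's port rewrites it to
// "srdi r2,rc,8" so that lengths wider than 32 bits are handled too.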

// Entry point.  This is a subroutine of bcopy().  When called:
//      r0 = return address (also stored in caller's SF)
//      r4 = source ptr
//      r5 = length (at least several pages)
//      r12 = dest ptr
//
// We only do "forward" moves, ie non-overlapping or toward 0.  We return with non-volatiles
// and r3 preserved.
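//
// Viewed from C, the register contract is roughly
//      void bigcopy_970(/* r12 */ void *dst, /* r4 */ const void *src, /* r5 */ size_t len);
// (an illustrative signature, not a real symbol), with the caller's
// return address passed in r0.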

        .align  5
bigcopy_970:
        neg     r2,r12              // is destination cache-line-aligned?
        std     r3,rzR3(r1)         // save caller's r3, which must be preserved for memcpy()
        std     r13,rzR13(r1)       // spill non-volatile regs we use to redzone
        std     r14,rzR14(r1)
        std     r15,rzR15(r1)
        andi.   r2,r2,0x7F          // #bytes to align
        std     r16,rzR16(r1)
        mr      rs,r4               // copy parameters into nonvolatile registers
        mr      rd,r12
        mr      rc,r5
        mr      rx,r0               // also save return address
        beq     1f                  // skip if already aligned

// Cache-line-align destination.

        mr      r3,rd               // set up dest ptr for memcpy()
        mr      r5,r2               // number of bytes to copy
        add     rs,rs,r2            // then bump our parameters past initial copy
        add     rd,rd,r2
        sub     rc,rc,r2
        bla     _COMM_PAGE_MEMCPY   // 128-byte-align destination
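
// "bla" is a branch-absolute-and-link; the commpage lives at a fixed
// address reachable through the sign-extended absolute branch field,
// which is presumably why memcpy can be called this way.  The call
// clobbers LR, which is why the return address was parked in rx above
// and is restored with "mtlr rx" before we return.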

// Load constant offsets and check whether source is 16-byte aligned.
// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set.

1:
        dcbt    0,rs                // touch in 1st line of source
        andi.   r0,rs,15            // check source alignment
        mfspr   rv,vrsave           // save caller's bitmask
        li      c16,16              // load the constant offsets for x-form ops
        li      c32,32
        srwi    r2,rc,8             // get number of 256-byte chunks to xfer
        li      r0,-256             // we use 24 VRs (ie, 0-23)
        li      c48,48
        li      c64,64
        li      c80,80
        or      r0,r0,rv            // add our bits to caller's
        li      c96,96
        mtctr   r2                  // set up loop count
        li      c112,112
        cmpd    cr7,r2,r2           // initialize cr7_eq to "on", so we dcbz128
        mtspr   vrsave,r0           // say we use vr0..vr23
        li      c256,256
        li      c384,384
        beq     LalignedLoop        // handle aligned sources


// Set up for unaligned loop.

        lvsl    v0,0,rs             // get permute vector for left shift
        lvxl    v1,0,rs             // prime the loop
        li      r0,rzV20            // save non-volatile VRs in redzone
        stvx    v20,r1,r0
        li      r0,rzV21
        stvx    v21,r1,r0
        li      r0,rzV22
        stvx    v22,r1,r0
        li      r0,rzV23
        stvx    v23,r1,r0
        b       LunalignedLoop      // enter unaligned loop


// Main loop for unaligned operands.  We loop over 256-byte chunks (2 cache lines).
// Destination is 128-byte aligned, source is unaligned.
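// The realignment scheme is the classic AltiVec one: lvsl computes a
// permute control vector from the low four bits of the source address,
// and each "vperm vD,vA,vB,v0" selects its 16 result bytes from the
// 32-byte concatenation vA:vB, in effect shifting the misaligned stream
// into alignment.  Merging 16 stores' worth of data takes 17 source
// quadwords; v1 carries the 17th across iterations ("peek ahead" below).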

        .align  5
LunalignedLoop:
        dcbt    c256,rs             // touch in next chunk
        dcbt    c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        addi    rs,rs,256           // point to next source chunk
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        vperm   v17,v1,v2,v0
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        vperm   v18,v2,v3,v0
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        vperm   v19,v3,v4,v0
        lvxl    v16,c112,r2
        lvxl    v1,0,rs             // peek ahead at first source quad in next chunk
        vperm   v20,v4,v5,v0
        addi    r2,rd,128           // point to 2nd 128 bytes of dest
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd
        dcbz128 0,r2
1:
        vperm   v21,v5,v6,v0
        stvxl   v17,0,rd
        vperm   v22,v6,v7,v0
        stvxl   v18,c16,rd
        vperm   v23,v7,v8,v0
        stvxl   v19,c32,rd
        vperm   v17,v8,v9,v0
        stvxl   v20,c48,rd
        vperm   v18,v9,v10,v0
        stvxl   v21,c64,rd
        vperm   v19,v10,v11,v0
        stvxl   v22,c80,rd
        vperm   v20,v11,v12,v0
        stvxl   v23,c96,rd
        vperm   v21,v12,v13,v0
        stvxl   v17,c112,rd
        vperm   v22,v13,v14,v0
        addi    rd,rd,256           // point to next dest chunk
        stvxl   v18,0,r2
        vperm   v23,v14,v15,v0
        stvxl   v19,c16,r2
        vperm   v17,v15,v16,v0
        stvxl   v20,c32,r2
        vperm   v18,v16,v1,v0
        stvxl   v21,c48,r2
        stvxl   v22,c64,r2
        stvxl   v23,c80,r2
        stvxl   v17,c96,r2
        stvxl   v18,c112,r2
        bdnz++  LunalignedLoop      // loop if another 256 bytes to go

        li      r6,rzV20            // restore non-volatile VRs
        li      r7,rzV21
        li      r8,rzV22
        li      r9,rzV23
        lvx     v20,r1,r6
        lvx     v21,r1,r7
        lvx     v22,r1,r8
        lvx     v23,r1,r9
        b       Ldone


// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 256-byte chunks (2 cache lines.)
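// Note that both loops use the "LRU" forms lvxl/stvxl, which (on 970-class
// hardware) hint that the lines will not be reused, so a multi-megabyte
// copy should not sweep useful data out of the caches.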

        .align  5
LalignedLoop:
        dcbt    c256,rs             // touch in next chunk
        dcbt    c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl    v1,0,rs
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        lvxl    v16,c112,r2
        addi    r2,rd,128           // point to 2nd 128 bytes of dest
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd
        dcbz128 0,r2
1:
        addi    rs,rs,256           // point to next source chunk
        stvxl   v1,0,rd
        stvxl   v2,c16,rd
        stvxl   v3,c32,rd
        stvxl   v4,c48,rd
        stvxl   v5,c64,rd
        stvxl   v6,c80,rd
        stvxl   v7,c96,rd
        stvxl   v8,c112,rd
        addi    rd,rd,256           // point to next dest chunk
        stvxl   v9,0,r2
        stvxl   v10,c16,r2
        stvxl   v11,c32,r2
        stvxl   v12,c48,r2
        stvxl   v13,c64,r2
        stvxl   v14,c80,r2
        stvxl   v15,c96,r2
        stvxl   v16,c112,r2
        bdnz++  LalignedLoop        // loop if another 256 bytes to go


// Done, except for 0..255 leftover bytes at end.
//      rs = source ptr
//      rd = dest ptr
//      rc = remaining count in low 8 bits
//      rv = caller's vrsave
//      rx = caller's return address

Ldone:
        andi.   r5,rc,0xFF          // any leftover bytes? (0..255)
        mtspr   vrsave,rv           // restore bitmap of live vr's

        mr      r3,rd
        mr      r4,rs
        bnela   _COMM_PAGE_MEMCPY   // copy leftover bytes
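
// bnela branches-and-links (absolute) only if cr0 is non-zero, ie only if
// there actually are leftover bytes; memcpy's r3 result is harmless since
// the caller's r3 is reloaded from the red zone just below.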

        mtlr    rx                  // restore return address
        ld      r3,rzR3(r1)         // restore non-volatile GPRs from redzone
        ld      r13,rzR13(r1)
        ld      r14,rzR14(r1)
        ld      r15,rzR15(r1)
        ld      r16,rzR16(r1)
        blr


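// The descriptor below registers this routine with the commpage populator;
// kPort32to64 requests the 32-to-64-bit opcode rewriting described above,
// and kCommPageBoth presumably selects both the 32- and 64-bit commpages.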
        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)