2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
22 /* ====================================
23 * Very Long Operand BCOPY for Mac OS X
24 * ====================================
26 * Version of 6/11/2003, tuned for the IBM 970. This is for operands at
27 * least several pages long. It is called from bcopy()/memcpy()/memmove().
29 * We use the following additional strategies not used by the shorter
30 * operand paths. Mostly, we try to optimize for memory bandwidth:
31 * 1. Use DCBZ128 to avoid reading destination lines. Because this code
32 * resides on the commpage, it can use a private interface with the
33 * kernel to minimize alignment exceptions if the destination is
34 * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
35 * DCBZ128 on the commpage. Thus we take at most one exception per call,
36 * which is amortized across the very long operand.
37 * 2. Copy larger chunks per iteration to minimize R/W bus turnaround
38 * and maximize DRAM page locality (opening a new page is expensive.)
39 * 3. Touch in one source chunk ahead with DCBT. This is probably the
40 * least important change, and probably only helps restart the
41 * hardware stream at the start of each source page.
43 * Register usage. Note the rather delicate way we assign multiple uses
44 * to the same register. Beware.
45 * r0 = temp (NB: cannot use r0 for any constant such as "c16")
46 * r3 = not used, as memcpy and memmove return 1st parameter as a value
47 * r4 = source ptr ("rs")
48 * r5 = count of bytes to move ("rc")
49 * r6 = constant 16 ("c16")
50 * r7 = constant 32 ("c32")
51 * r8 = constant 48 ("c48")
52 * r9 = constant 128 ("c128")
54 * r11 = constant 256 ("c256")
55 * r12 = destination ptr ("rd")
56 * r13 = constant 384 ("c384")
74 // Offsets within the "red zone" (which is 224 bytes long):
93 #include <sys/appleapiopts.h>
95 #include <machine/cpu_capabilities.h>
96 #include <machine/commpage.h>
99 .globl EXT(bigcopy_970)

// NOTE(review): the leading integer on each line below is an extraction
// artifact (the original file's line numbers). Where those numbers are not
// consecutive, interior instructions have been elided from this excerpt, so
// this fragment will NOT assemble as-is — treat it as annotated reference
// only, and consult the full original before changing any instruction.

102 // Entry point. This is a subroutine of bcopy(). When called:
103 // r4 = source ptr (aka "rs")
104 // r12 = dest ptr (aka "rd")
105 // r5 = length (>= 16K bytes) (aka "rc")
107 // We only do "forward" moves, ie non-overlapping or toward 0.
109 // We return with non-volatiles and r3 preserved.
113 stw r13,rzR13(r1) // spill non-volatile regs we use to redzone
117 neg rt,rd // start to cache-line-align destination
118 stvx v20,r1,r0 // we use all 32 VRs
126 andi. rt,rt,127 // get #bytes to 128-byte align
131 sub rc,rc,rt // adjust length by #bytes to align destination
136 mtctr rt // #bytes to align destination
144 beq 2f // dest already 128-byte aligned
148 // Cache-line-align destination.
159 // Is source 16-byte aligned? Load constant offsets.
162 andi. r0,rs,15 // check source alignment
163 mfspr rv,vrsave // save caller's bitmask
164 li r0,-1 // we use all 32 VRs
165 li c16,16 // load the constant offsets for x-form ops
173 // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
174 // and we dcbz only if cr7 beq is set. We check to be sure the dcbz's
175 // won't zero source bytes before we load them, since we zero before
176 // loading as this is faster than zeroing after loading and before storing.
178 cmpw cr7,r0,r0 // initialize cr7 beq to use dcbz128
179 sub rt,rs,rd // get (rs-rd)
180 cmplwi cr1,rt,512 // are we moving down less than 512 bytes?
182 // Start fetching in source cache lines.
184 dcbt c128,rs // first line already touched in
188 bge++ cr1,3f // skip if not moving down less than 512 bytes
189 cmpw cr7,c16,c32 // 16 != 32, so this forces cr7 "beq" off, disabling the dcbz's (they would zero source bytes)
191 beq LalignedLoop // handle aligned sources
192 lvsl v0,0,rs // get permute vector for left shift
193 lvxl v1,0,rs // prime the loop
194 b LunalignedLoop // enter unaligned loop
197 // Main loop for unaligned operands. We loop over 384-byte chunks (3 cache lines)
198 // since we need a few VRs for permuted destination QWs and the permute vector.
202 subi rc,rc,384 // decrement byte count
203 addi rx,rs,384 // get address of next chunk
206 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
207 dcbz128 0,rd // (also skip if moving down less than 512 bytes)
208 bne-- cr7,1f // re-test: catch the kernel clearing cr7 the first time a dcbz128 is emulated
213 dcbt 0,rx // touch in next chunk
219 cmplwi rc,384 // another chunk to go?
252 lvx v1,0,rs // get 1st qw of next chunk
299 bge++ LunalignedLoop // loop if another 384 bytes to go
301 // End of unaligned main loop. Handle up to 384 leftover bytes.
303 srwi. r0,rc,5 // get count of 32-byte chunks remaining
305 rlwinm rc,rc,0,0x1F // mask count down to 0..31 leftover bytes
307 1: // loop over 32-byte chunks
313 vor v1,v3,v3 // v1 <- v3
322 // Aligned loop. Destination is 128-byte aligned, and source is 16-byte
323 // aligned. Loop over 512-byte chunks (4 cache lines.)
327 subi rc,rc,512 // decrement count
328 addi rx,rs,512 // address of next chunk
331 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
332 dcbz128 0,rd // (also skip if moving down less than 512 bytes)
333 bne-- cr7,1f // re-test: catch the kernel clearing cr7 the first time a dcbz128 is emulated
339 dcbt 0,rx // touch in next chunk
347 cmplwi rc,512 // another chunk to go?
422 bge++ LalignedLoop // loop if another 512 bytes to go
424 // End of aligned main loop. Handle up to 511 leftover bytes.
426 srwi. r0,rc,5 // get count of 32-byte chunks remaining
428 rlwinm rc,rc,0,0x1F // mask count down to 0..31 leftover bytes
430 1: // loop over 32-byte chunks
440 // Done, except for 0..31 leftovers at end. Restore non-volatiles.
443 // rc = count (0..31)
444 // rv = caller's vrsave
447 cmpwi rc,0 // any leftover bytes?
448 lwz r13,rzR13(r1) // restore non-volatiles from redzone
475 mtspr vrsave,rv // restore caller's bitmask
476 beqlr // done if no leftover bytes (cr0 still set from the cmpwi above)
479 // Handle 1..31 leftover bytes at end.
481 mtctr rc // set up loop count

// Register the routine in the commpage jump table (flags 0,0,0: no special
// CPU-capability requirements — loaded on all machines for now).
495 COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0) // load on all machines for now