/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove().
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth
 * (a C sketch of the overall approach follows this list):
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per
 *     call, which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important optimization; it mainly helps restart the hardware
 *     prefetch stream at the start of each source page.
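 *
 * As a rough, illustrative-only C sketch of strategies 1-3 (not part of
 * the build; the function name bigcopy_sketch is hypothetical):
 * __builtin_prefetch stands in for DCBT, memcpy stands in for the vector
 * load/store sequence, and a comment marks where DCBZ128 would establish
 * the destination lines without reading them.
 *
 *      #include <stddef.h>
 *      #include <string.h>
 *
 *      static void bigcopy_sketch(void *dst, const void *src, size_t len)
 *      {
 *          unsigned char *d = dst;
 *          const unsigned char *s = src;
 *          while (len >= 512) {                    // one 4-line chunk per pass
 *              __builtin_prefetch(s + 512, 0, 0);  // touch in next source chunk (DCBT)
 *              // real code: DCBZ128 the four destination lines here, when safe
 *              memcpy(d, s, 512);                  // stand-in for the vector copy
 *              d += 512;  s += 512;  len -= 512;
 *          }
 *          if (len)
 *              memcpy(d, s, len);                  // leftovers
 *      }
 *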
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = constant 16 ("c16")
 *   r7  = constant 32 ("c32")
 *   r8  = constant 48 ("c48")
 *   r9  = constant 128 ("c128")
 *   r11 = constant 256 ("c256")
 *   r12 = destination ptr ("rd")
 *   r13 = constant 384 ("c384")
 */

// Offsets within the "red zone" (which is 224 bytes long):

#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .globl  EXT(bigcopy_970)

// Entry point.  This is a subroutine of bcopy().  When called:
//      r4  = source ptr (aka "rs")
//      r12 = dest ptr (aka "rd")
//      r5  = length (>= 16K bytes) (aka "rc")
//
// We only do "forward" moves, i.e. non-overlapping or moving toward address 0.
//
// We return with non-volatiles and r3 preserved.
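//
// In C terms, the forward-only restriction amounts to the caller having
// verified the predicate sketched below before choosing this path
// (illustrative only; forward_copy_is_safe is a hypothetical name):
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      // A low-to-high copy cannot overwrite unread source bytes unless the
//      // destination starts strictly inside the source buffer.
//      static int forward_copy_is_safe(const void *dst, const void *src, size_t len)
//      {
//          uintptr_t d = (uintptr_t)dst, s = (uintptr_t)src;
//          return d <= s || d >= s + len;
//      }
//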

        stw     r13,rzR13(r1)   // spill non-volatile regs we use to redzone
        neg     rt,rd           // start to cache-line-align destination
        stvx    v20,r1,r0       // we use all 32 VRs
        andi.   rt,rt,127       // get #bytes to 128-byte align
        sub     rc,rc,rt        // adjust length by #bytes to align destination
        mtctr   rt              // #bytes to align destination
        beq     2f              // dest already 128-byte aligned

// Cache-line-align destination.

// Is source 16-byte aligned?  Load constant offsets.

        andi.   r0,rs,15        // check source alignment
        mfspr   rv,vrsave       // save caller's bitmask
        li      r0,-1           // we use all 32 VRs
        li      c16,16          // load the constant offsets for x-form ops

// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set.  We check to be sure the dcbz's
// won't zero source bytes before we load them; we zero before loading
// because that is faster than zeroing between the load and the store.

        cmpw    cr7,r0,r0       // initialize cr7 beq to use dcbz128
        sub     rt,rs,rd        // get (rs-rd)
        cmplwi  cr1,rt,512      // are we moving down less than 512 bytes?
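
// In C terms the sub/cmplwi pair above computes the predicate sketched
// below (illustrative only; dcbz_is_safe is a hypothetical name).  The
// unsigned compare means a destination above the source also counts as
// "not moving down by less than 512 bytes".
//
//      #include <stdint.h>
//
//      // dcbz128 on a destination chunk is unsafe only if it could clear
//      // source bytes that have not been loaded yet, i.e. if the destination
//      // lies less than one chunk (512 bytes) below the source.
//      static int dcbz_is_safe(uint32_t rs, uint32_t rd)
//      {
//          return (uint32_t)(rs - rd) >= 512;
//      }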

// Start fetching in source cache lines.

        dcbt    c128,rs         // first line already touched in
        bge++   cr1,3f          // skip if not moving down less than 512 bytes
        cmpw    cr7,c16,c32     // cannot dcbz since it would zero source bytes
        beq     LalignedLoop    // handle aligned sources
        lvsl    v0,0,rs         // get permute vector for left shift
        lvxl    v1,0,rs         // prime the loop
        b       LunalignedLoop  // enter unaligned loop

// Main loop for unaligned operands.  We loop over 384-byte chunks (3 cache
// lines) since we need a few VRs for permuted destination QWs and the
// permute vector.
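//
// At word granularity, the lvsl/vperm technique used here corresponds to the
// usual software-alignment trick sketched below (illustrative only;
// align_copy_word is a hypothetical name).  It assumes big-endian byte order,
// as on the 970, and a misalignment of 1..3 bytes; it performs only aligned
// loads and stores, reading n+1 aligned source words to produce n output words.
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      static void align_copy_word(uint32_t *dst, const uint32_t *src_aligned,
//                                  unsigned shift, size_t n)
//      {
//          uint32_t prev = src_aligned[0];                 // prime the loop
//          for (size_t i = 0; i < n; i++) {
//              uint32_t next = src_aligned[i + 1];
//              dst[i] = (prev << (8 * shift)) | (next >> (8 * (4 - shift)));
//              prev = next;
//          }
//      }
//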

        subi    rc,rc,384       // decrement byte count
        addi    rx,rs,384       // get address of next chunk
        bne--   cr7,1f          // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd            // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f          // catch it first time through
        dcbt    0,rx            // touch in next chunk
        cmplwi  rc,384          // another chunk to go?
        lvx     v1,0,rs         // get 1st qw of next chunk
        bge++   LunalignedLoop  // loop if another 384 bytes to go

// End of unaligned main loop.  Handle up to 383 leftover bytes.

        srwi.   r0,rc,5         // get count of 32-byte chunks remaining
        rlwinm  rc,rc,0,0x1F    // mask count down to 0..31 leftover bytes
1:                              // loop over 32-byte chunks
        vor     v1,v3,v3        // v1 <- v3
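
// The shift/mask pair above splits the remainder the same way as this C
// sketch (illustrative only; copy_tail is a hypothetical helper): drain the
// 32-byte (two-quadword) pieces, then finish byte by byte.
//
//      #include <stddef.h>
//      #include <string.h>
//
//      static void copy_tail(unsigned char *d, const unsigned char *s, size_t remaining)
//      {
//          size_t chunks32 = remaining >> 5;     // like srwi.  r0,rc,5
//          remaining &= 31;                      // like rlwinm rc,rc,0,0x1F
//          for (; chunks32; chunks32--, d += 32, s += 32)
//              memcpy(d, s, 32);                 // two quadword loads/stores
//          while (remaining--)
//              *d++ = *s++;                      // final 0..31 bytes
//      }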

// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 512-byte chunks (4 cache lines.)
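//
// The count bookkeeping matches the C shape below (illustrative only;
// aligned_chunks is a hypothetical name, and memcpy stands in for the vector
// copy): the chunk size is subtracted at the top of the loop, and the loop
// repeats while at least one more whole chunk remains, so it exits with
// 0..511 bytes left for the leftover code further down.  It assumes at least
// 512 bytes remain on entry, which the preceding code guarantees here.
//
//      #include <stddef.h>
//      #include <string.h>
//
//      static size_t aligned_chunks(unsigned char *d, const unsigned char *s,
//                                   size_t remaining)
//      {
//          do {
//              remaining -= 512;               // subi   rc,rc,512
//              memcpy(d, s, 512);
//              d += 512;  s += 512;
//          } while (remaining >= 512);         // cmplwi rc,512 ; bge++ LalignedLoop
//          return remaining;                   // 0..511 bytes left over
//      }
//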

        subi    rc,rc,512       // decrement count
        addi    rx,rs,512       // address of next chunk
        bne--   cr7,1f          // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd            // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f          // catch it first time through
        dcbt    0,rx            // touch in next chunk
        cmplwi  rc,512          // another chunk to go?
        bge++   LalignedLoop    // loop if another 512 bytes to go

// End of aligned main loop.  Handle up to 511 leftover bytes.

        srwi.   r0,rc,5         // get count of 32-byte chunks remaining
        rlwinm  rc,rc,0,0x1F    // mask count down to 0..31 leftover bytes
1:                              // loop over 32-byte chunks

// Done, except for 0..31 leftovers at end.  Restore non-volatiles.
//      rc = count (0..31)
//      rv = caller's vrsave

        cmpwi   rc,0            // any leftover bytes?
        lwz     r13,rzR13(r1)   // restore non-volatiles from redzone
        mtspr   vrsave,rv       // restore caller's bitmask
        beqlr                   // done if no leftover bytes

// Handle 1..31 leftover bytes at end.

        mtctr   rc              // set up loop count

        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0)  // load on all machines for now