/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        (for example, all "andi." and almost all "rlwinm." are fine)
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well this simple
 * method works in practice.
 */
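// A concrete illustration of the porting model above (an editor's sketch applying the
// listed transformations to instructions that appear later in this file): in the 64-bit
// commpage, a word compare such as
//      cmplwi  rc,kShort
// becomes the doubleword form
//      cmpldi  rc,kShort
// and a shift such as "srwi r0,rc,7" becomes "srdi r0,rc,7", while record-form masks
// like "andi. w4,w3,0xF" are left alone because their CR0 result is mode-invariant.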
#define kVeryLong       (128*1024)

// Main entry points.
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands
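// In C terms, the reverse-vs-forward test above is roughly (an illustrative sketch,
// not part of the commpage sources):
//
//      static int must_copy_backwards(const void *dst, const void *src, size_t len) {
//          return (size_t)((const char *)dst - (const char *)src) < len;
//      }
//
// Unsigned wraparound does the overlap analysis: the difference is < len only when the
// destination starts inside the source buffer; a destination below the source wraps to
// a huge unsigned value, compares >= len, and the forward path is taken.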
// Handle short operands.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse
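// With the low-order length bits now in cr6/cr7, the short paths below test one CR bit
// per power of two (PPC numbers bit 31 as the least-significant length bit): bit 26
// selects a 32-byte chunk, 27 a quadword, 28 a doubleword, 29 a word, 30 a halfword,
// and 31 a final odd byte, so a short move never needs a loop.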
// Forward short operands.  This is the most frequent case, so it is inline.

LShort32:
        bf      26,0f                   // 32-byte chunk to move?
        bf      27,1f                   // quadword to move?
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // skip if no odd byte
// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        bf      27,1f                   // quadword to move?
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned?  (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,LFwdAligned          // doubleword?
// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as operands this long are likely to be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
//     (The dispatch among these cases is sketched just below.)
// Registers at this point:
//  r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//     cr5 = beq if source is also 16-byte aligned
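// In rough C terms, the dispatch over the five cases above is (an illustrative sketch;
// the label names are invented for the sketch, not labels used in this file):
//
//      if (len >= kVeryLong)                                   goto bigcopy;           // case 1
//      else if (len >= 128) {
//          if ((((uintptr_t)dst - (uintptr_t)src) & 0xF) == 0) goto vec_aligned_128;   // case 2
//          else                                                goto vec_vperm_128;     // case 3
//      } else {
//          if ((((uintptr_t)dst - (uintptr_t)src) & 0x7) == 0) goto ld_std_32;         // case 4
//          else                                                goto vec_vperm_32;      // case 5
//      }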
LFwdAligned:                            // dest is 16-byte aligned
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left
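// The lvsl/lvx pair set up above is the usual AltiVec idiom for reading a misaligned
// stream: lvx ignores the low 4 bits of its address, so each iteration keeps the
// previously loaded quadword, loads the next aligned one, and merges the two through
// the permute vector, roughly (register names illustrative):
//      lvx     vNext,c16,rs            // next aligned source quadword
//      vperm   vOut,vPrev,vNext,vp     // the 16 misaligned bytes spanning both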
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        vor     v1,v3,v3                // v1 <- v3
        mtspr   vrsave,rv               // restore bitmap of live vr's

// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                         // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//   c16,c32,c48 = loaded

LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
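// (Both rlwinm forms above are plain bit-field extracts: "rlwinm w3,rc,0,28,31" is
// rc & 0xF, the final 0-15 bytes, and "rlwinm. r0,rc,28,29,31" is (rc >> 4) & 7, the
// number of leftover 16-byte quadwords after the last full 128-byte chunk.)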
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any

// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                        // loop over 128-byte chunks
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Long, reverse moves.
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx
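// In the reverse path the "constants" are negative offsets (cm1 here, and cm17/cm33/cm49
// in the register map at the top of the file), so each lvx/stvx indexed by them addresses
// the quadword just below the pointer and the loops walk down through memory.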
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over 1-3 quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Very Big Copy Path.  Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r5 = length (at least several pages)

LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
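// (The add.-to-itself trick above relies on 0x40000000 + 0x40000000 = 0x80000000: in
// 32-bit mode the record form sees a negative 32-bit result and sets cr0_lt, while in
// 64-bit mode the 64-bit sum is positive, so lt stays clear and we fall through to the
// 64-bit store below.)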
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)