/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 * r0 = temp (NB: cannot use r0 for any constant such as "c16")
 * r3 = not used, as memcpy and memmove return 1st parameter as a value
 * r4 = source ptr ("rs")
 * r5 = count of bytes to move ("rc")
 * r6 = "w1", "c16", or "cm17"
 * r7 = "w2", "c32", or "cm33"
 * r8 = "w3", "c48", or "cm49"
 * r12 = destination ptr ("rd")
 * v0 = permute vector ("vp")
 * v1-v8 = qw's loaded from source
 * v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done. For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas. It is perhaps surprising how well this simple
 * method works in practice.
 */
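/*
 * Illustrative note (not part of the original source): a minimal C sketch of the
 * mode-invariance rule above, using a hypothetical value x held in a GPR. Masking
 * low-order bits ("andi.") gives the same result whether the register is viewed as
 * 32 or 64 bits wide, but a logical right shift ("srw"/"srwi") does not once high
 * bits are set, which is why the porting step must rewrite "srwi[.]" as "srdi[.]".
 *
 *   #include <assert.h>
 *   #include <stdint.h>
 *
 *   int main(void) {
 *       uint64_t x = 0x100000030ULL;             // high bits visible only in 64-bit mode
 *       // low-order mask: identical in both register widths (safe to record)
 *       assert((x & 0xF) == ((uint32_t)x & 0xF));
 *       // logical right shift: differs once high bits participate (not mode-invariant)
 *       assert((x >> 4) != (uint64_t)((uint32_t)x >> 4));
 *       return 0;
 *   }
 */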
#define kVeryLong       (128*1024)


// Main entry points.
bcopy_970:                          // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort           // short or long?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        mr      rd,r4               // move registers to canonic spot
        blt     LShort              // handle short operands
        dcbt    0,rs                // touch in the first line of source
        dcbtst  0,rd                // touch in destination
        b       LLong1              // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
Lmemcpy_970:                        // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                       // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort           // short or long?
        sub     w1,r3,r4            // must move in reverse if (rd-rs)<rc
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0              // handle long operands
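/*
 * Illustrative note (not part of the original source): a minimal C sketch, with
 * hypothetical names, of the overlap test used above. Computing (rd-rs) as an
 * unsigned quantity and comparing it against the length detects the one dangerous
 * kind of overlap (destination starting inside the source) in a single compare,
 * because a destination below the source wraps around to a huge unsigned difference.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *
 *   // returns nonzero if the copy must run backwards (high address to low)
 *   static int must_copy_reverse(const void *dst, const void *src, size_t len) {
 *       uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;    // sub w1,r3,r4
 *       return diff < len;                                   // cmplw cr1,w1,rc ; blt
 *   }
 */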
// Handle short operands.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

        cmplw   cr1,w1,rc           // set cr1 blt if we must move reverse
        mtcrf   0x02,rc             // move length to cr6 and cr7 one at a time
        blt--   cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        bf      26,0f               // 32-byte chunk to move?
        bf      27,1f               // quadword to move?
LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // skip if no odd byte
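/*
 * Illustrative note (not part of the original source): a minimal C model, with
 * hypothetical names, of the bit-driven short copy above. mtcrf moves the low
 * bits of the length into cr6/cr7, and each "bf" skips the chunk whose size
 * corresponds to that bit (CR bit 26 is the 32's bit, bit 31 the 1's bit), so a
 * 0-63 byte move is fully unrolled with no loop.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *   #include <string.h>
 *
 *   static void copy_short_model(uint8_t *dst, const uint8_t *src, size_t n) {
 *       if (n & 32) { memcpy(dst, src, 32); dst += 32; src += 32; }  // bit 26: 32-byte chunk
 *       if (n & 16) { memcpy(dst, src, 16); dst += 16; src += 16; }  // bit 27: quadword
 *       if (n & 8)  { memcpy(dst, src, 8);  dst += 8;  src += 8;  }  // bit 28: doubleword
 *       if (n & 4)  { memcpy(dst, src, 4);  dst += 4;  src += 4;  }  // bit 29: word
 *       if (n & 2)  { memcpy(dst, src, 2);  dst += 2;  src += 2;  }  // bit 30: halfword
 *       if (n & 1)  { *dst = *src; }                                 // bit 31: odd byte
 *   }
 */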
// Handle short reverse operands.
//      cr = length in bits 26-31

        add     rs,rs,rc            // adjust ptrs for reverse move
        bf      26,0f               // 32 bytes to move?
        bf      27,1f               // quadword to move?
LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                             // entry from memmove()
        dcbt    0,rs                // touch in source
        dcbtst  0,rd                // touch in destination
LLong1:                             // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        neg     w3,rd               // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF         // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF           // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse    // handle reverse moves
        sub     rc,rc,w4            // adjust length for aligning destination
        srwi    r0,rc,7             // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0            // set cr1 on #chunks
        beq     LFwdAligned         // dest is already aligned
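/*
 * Illustrative note (not part of the original source): a minimal C sketch, with
 * hypothetical names, of the alignment arithmetic above. Negate-and-mask computes
 * how many bytes must be copied before the destination reaches a 16-byte boundary,
 * and masking (rd-rs) tells whether source and destination are *relatively*
 * 16-byte aligned, i.e. whether the source will be aligned too once the
 * destination is.
 *
 *   #include <stdint.h>
 *
 *   static unsigned bytes_to_align16(const void *dst) {
 *       return (unsigned)(-(uintptr_t)dst & 0xF);       // neg w3,rd ; andi. w4,w3,0xF
 *   }
 *
 *   static int relatively_aligned16(const void *dst, const void *src) {
 *       return (((uintptr_t)dst - (uintptr_t)src) & 0xF) == 0;  // rlwinm w2,w1,0,0xF ; cmpwi cr5,w2,0
 *   }
 */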
// 16-byte align destination.

        mtcrf   0x01,w4             // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,LFwdAligned      // doubleword?
// Forward, destination is 16-byte aligned. There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops. This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks. This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
// (A C sketch of this dispatch follows the register summary below.)
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned
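/*
 * Illustrative note (not part of the original source): a minimal C sketch of the
 * five-way dispatch described above, with hypothetical names. It only chooses a
 * strategy; the actual copy loops are the assembly below. Because the destination
 * is already 16-byte aligned here, the relative alignment (rd-rs) also gives the
 * source alignment.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *
 *   #define kVeryLong (128*1024)
 *
 *   enum strategy { BIGCOPY, LVX_STVX_128, LVX_VPERM_STVX_128, LD_STD_32, LVX_VPERM_STVX_32 };
 *
 *   static enum strategy choose_forward(const void *dst, const void *src, size_t len) {
 *       size_t rel = (uintptr_t)dst - (uintptr_t)src;                 // relative alignment
 *       if (len >= kVeryLong)  return BIGCOPY;                                    // case 1
 *       if (len >= 128)
 *           return (rel & 0xF) == 0 ? LVX_STVX_128 : LVX_VPERM_STVX_128;          // cases 2, 3
 *       return (rel & 0x7) == 0 ? LD_STD_32 : LVX_VPERM_STVX_32;                  // cases 4, 5
 *   }
 */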
        andi.   w3,w2,7             // is source at least 8-byte aligned?
        mtcrf   0x01,rc             // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5             // get 32-byte chunk count
        mtcrf   0x02,rc             // move bit 27 of length to cr6 for LShort32
        mtctr   w1                  // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned      // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave           // get bitmap of live vector registers
        oris    w4,rv,0xFFF8        // we use v0-v12
        li      c16,16              // get constant used in lvx
        mtspr   vrsave,w4           // update mask
        lvx     v1,0,rs             // prefetch 1st source quadword
        lvsl    vp,0,rs             // get permute vector to shift left
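/*
 * Illustrative note (not part of the original source): a minimal C model, with
 * hypothetical names, of the lvx/lvsl/vperm technique used by the unaligned loops
 * below. Only 16-byte aligned quadwords are ever loaded (lvx ignores the low 4
 * address bits); vperm then selects 16 consecutive bytes spanning two adjacent
 * aligned quadwords, steered by the lvsl-generated permute vector, so the store
 * side can always use aligned stvx. Assumes len is a multiple of 16 and tolerates
 * the same one-quadword lookahead read as the real loops.
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static void copy_unaligned_src_model(uint8_t *dst16, const uint8_t *src, size_t len) {
 *       const uint8_t *aligned = (const uint8_t *)((uintptr_t)src & ~(uintptr_t)0xF);
 *       unsigned shift = (unsigned)((uintptr_t)src & 0xF);   // what lvsl encodes in vp
 *       uint8_t prev[16], next[16], out[16];
 *       memcpy(prev, aligned, 16);                           // lvx v1,0,rs (prefetch)
 *       for (size_t off = 0; off < len; off += 16) {
 *           memcpy(next, aligned + off + 16, 16);            // lvx: next aligned quadword
 *           for (unsigned i = 0; i < 16; i++)                // vperm: select across the pair
 *               out[i] = (shift + i < 16) ? prev[shift + i] : next[shift + i - 16];
 *           memcpy(dst16 + off, out, 16);                    // stvx to aligned destination
 *           memcpy(prev, next, 16);                          // vor v1,v3,v3 (carry ahead)
 *       }
 *   }
 */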
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                  // loop over 32-byte chunks
        vor     v1,v3,v3            // v1 <- v3
        mtspr   vrsave,rv           // restore bitmap of live vr's

// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                     // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors. When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded
        mfspr   rv,vrsave           // get bitmap of live vector registers
        lis     w3,kVeryLong>>16    // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3           // very long operand?
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy        // handle big copies separately
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8        // we use v0-v12
        rlwinm. r0,rc,28,29,31      // get number of quadword leftovers (0-7) and set cr0
        li      c16,16              // get constants used in lvx/stvx
        mtspr   vrsave,w4           // update mask
        beq     cr5,LFwdLongAligned // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,0,rs             // prefetch 1st source quadword
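/*
 * Illustrative note (not part of the original source): the rlwinm decodes above
 * split the remaining length three ways. A minimal C sketch with hypothetical
 * names:
 *
 *   #include <stddef.h>
 *
 *   static void split_forward_length(size_t rc, size_t *chunks128,
 *                                    size_t *leftover_qws, size_t *leftover_bytes) {
 *       *chunks128      = rc >> 7;          // srwi r0,rc,7 (computed earlier)
 *       *leftover_qws   = (rc >> 4) & 7;    // rlwinm. r0,rc,28,29,31
 *       *leftover_bytes = rc & 0xF;         // rlwinm w3,rc,0,28,31
 *   }
 */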
// Forward, long, unaligned vector loop.

        .align  5                   // align inner loops
LFwdLongUnaligned:                  // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned

        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                    // loop over 128-byte chunks
        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Long, reverse moves.
//     cr5 = beq if relatively 16-byte aligned

        add     rd,rd,rc            // point to end of operands
        andi.   r0,rd,0xF           // #bytes to 16-byte align destination
        beq     2f                  // already aligned

// 16-byte align destination.

        mtctr   r0                  // set up for loop
// Prepare for reverse vector loop. When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
        mfspr   rv,vrsave           // get bitmap of live vector registers
        srwi    r0,rc,6             // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8        // we use v0-v12
        mtcrf   0x01,rc             // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0            // set cr1 on chunk count
        mtspr   vrsave,w1           // update mask
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1              // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal // handle unaligned operands
        beq     cr1,2f              // no chunks (if no chunks, must be leftover QWs)
// Long, reverse 16-byte-aligned vector loop.

        .align  5                   // align inner loops
1:                                  // loop over 64-byte chunks
        beq     4f                  // no leftover quadwords
2:                                  // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
3:                                  // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,cm1,rs           // v1 always looks ahead
        beq     cr1,2f              // no chunks (if no chunks, must be leftover QWs)

        .align  5                   // align the inner loops
1:                                  // loop over 64-byte chunks
        beq     4f                  // no leftover quadwords
2:                                  // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
3:                                  // loop over 1-3 quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Very Big Copy Path. Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r5 = length (at least several pages)

        lis     r2,0x4000           // r2 <- 0x40000000
        mflr    r0                  // get our return address
        add.    r2,r2,r2            // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)            // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY  // 32-bit mode, join big operand copy
        std     r0,16(r1)           // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY  // then join big operand code
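/*
 * Illustrative note (not part of the original source): the add.-based mode test
 * above relies on 0x40000000 + 0x40000000 = 0x80000000 having its sign bit set
 * only when the register is 32 bits wide. A minimal C sketch of the same idea,
 * with a hypothetical register-width parameter (the 32-bit cast is the usual
 * two's-complement wrap):
 *
 *   #include <stdint.h>
 *
 *   // returns nonzero when the doubled value is negative in the given width,
 *   // i.e. when we would take the blta (32-bit) path above
 *   static int cr0_lt_after_double(int reg_is_32bit) {
 *       if (reg_is_32bit) {
 *           int32_t r2 = (int32_t)((uint32_t)0x40000000 + (uint32_t)0x40000000);  // 0x80000000
 *           return r2 < 0;                       // sign bit set: 32-bit mode detected
 *       } else {
 *           int64_t r2 = 0x40000000LL + 0x40000000LL;   // 0x80000000, still positive
 *           return r2 < 0;                       // fall through to the 64-bit path
 *       }
 *   }
 */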
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)