/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8 = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done. For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant,
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas. It is perhaps surprising how well in practice
 * this simple method works.
 */
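/*
 * For illustration only (a descriptive sketch, not part of the build): under the
 * rules above, the kernel's mechanical port would rewrite, for example,
 *      cmplw   cr1,w1,rc   ->   cmpld   cr1,w1,rc     (word compare -> doubleword)
 *      srwi    r0,rc,7     ->   srdi    r0,rc,7       ("srwi[.]"    -> "srdi[.]")
 * while mode-invariant record forms such as "andi." and "rlwinm." pass through unchanged.
 */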
#define kVeryLong       (128*1024)
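// Note: kVeryLong is 128 KB (131072 bytes), i.e. several pages; operands at least this
// long take the _COMM_PAGE_BIGCOPY path (see LBigCopy below).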
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is the return value for memcpy etc.
        bge     LLong0                  // handle long operands
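// Note on the overlap test (a C-like sketch for illustration only):
//      if ((unsigned)(dst - src) < len)        // dst lies inside [src, src+len)
//          copy backwards;                     //   ...take the ...Reverse paths
//      else
//          copy forwards;
// A single unsigned compare suffices: when dst is below src, (dst-src) wraps to a huge
// unsigned value, so the forward path is chosen, which is safe for that kind of overlap.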
// Handle short operands.
//      w1 = (rd-rs), must move in reverse if (rd-rs)<rc

        cmplw   cr1,w1,rc               // set cr1 blt if we must move in reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        blt--   cr1,LShortReverse
// Forward short operands. This is the most frequent case, so it is inline.

        bf      26,0f                   // 32-byte chunk to move?
        bf      27,1f                   // quadword to move?
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // skip if no odd byte
// Handle short reverse operands.
//      cr = length in bits 26-31

        add     rs,rs,rc                // adjust ptrs for reverse move
        bf      26,0f                   // 32 bytes to move?
        bf      27,1f                   // quadword to move?
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move in reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move in reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned
// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,LFwdAligned          // doubleword?
// Forward, destination is 16-byte aligned. There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops. This is the fastest
//     case for cold-cache operands, as operands this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks. This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned
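// The five cases above, as a C-like sketch (illustrative only; the actual code tests
// the chunk count first and checks kVeryLong inside LFwdLongVectors):
//      if      (len >= kVeryLong)                   -> LBigCopy                       (case 1)
//      else if (len >= 128 && src 16-byte aligned)  -> LFwdLongAligned                (case 2)
//      else if (len >= 128)                         -> LFwdLongUnaligned              (case 3)
//      else if (src 8-byte aligned)                 -> LFwdMedAligned                 (case 4)
//      else                                         -> lvx/vperm/stvx 32-byte loop    (case 5)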
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left
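// A note on the misaligned-source technique used here and in LFwdLongUnaligned
// (descriptive comment only): "lvsl" builds a permute control vector from the low
// four bits of the source address, and each "vperm" (e.g. vperm vw,v1,v2,vp) then
// selects the 16 in-order source bytes that straddle the two aligned quadwords v1
// and v2. Carrying the last quadword forward (the "vor v1,v3,v3" style copies) lets
// each iteration issue only aligned loads, with no unaligned accesses at all.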
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        vor     v1,v3,v3                // v1 <- v3
        mtspr   vrsave,rv               // restore bitmap of live vr's

// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                         // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors. When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any

// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                        // loop over 128-byte chunks
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Long, reverse moves.
//     cr5 = beq if relatively 16-byte aligned

        add     rd,rd,rc                // point to end of operands
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop

// Prepare for reverse vector loop. When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any

// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over 1-3 quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Very Big Copy Path. Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r5 = length (at least several pages)
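// How the mode test below works (descriptive note): r2 is loaded with 0x40000000, and
// "add. r2,r2,r2" doubles it to 0x80000000 while recording the sign of the result.
// In 32-bit mode that sum is negative, so cr0_lt is set and "blta" takes the 32-bit
// path; in 64-bit mode the 64-bit result is positive, cr0_lt stays clear, and execution
// falls through to the 64-bit save and branch.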
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)