// apple/xnu (xnu-344.21.74): osfmk/ppc/commpage/bcopy_970.s
/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0     = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3     = not used, as memcpy and memmove return 1st parameter as a value
 *   r4     = source ptr ("rs")
 *   r5     = count of bytes to move ("rc")
 *   r6     = "w1", "c16", or "cm17"
 *   r7     = "w2", "c32", or "cm33"
 *   r8     = "w3", "c48", or "cm49"
 *   r9     = "w4", or "cm1"
 *   r10    = vrsave ("rv")
 *   r11    = unused
 *   r12    = destination ptr ("rd")
 *   v0     = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bcopy_970)


#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

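// Why a single unsigned compare decides the direction: if the destination
// starts inside the source (ie, dst > src and dst-src < len), a forward copy
// would overwrite source bytes before they are read, so we copy high-to-low
// instead.  A minimal C sketch of the same dispatch (the name "toy_memmove"
// is illustrative only, not something defined elsewhere in xnu):
//
//      #include <stddef.h>
//
//      void *toy_memmove(void *dstv, const void *srcv, size_t len)
//      {
//          unsigned char *dst = dstv;
//          const unsigned char *src = srcv;
//          if ((size_t)(dst - src) < len) {        // dst overlaps src from above
//              while (len--)
//                  dst[len] = src[len];            // reverse, high-to-low
//          } else {
//              for (size_t i = 0; i < len; i++)
//                  dst[i] = src[i];                // forward, low-to-high
//          }
//          return dstv;
//      }
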
LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

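// The mtcrf/bf sequence below is simply a decoded binary count: CR bit 26 of
// the length selects a 32-byte group, bit 27 a 16-byte group, and so on down
// to bit 31 for the last odd byte.  A C sketch of the forward case (the name
// "short_copy" is illustrative; memcpy stands in for the inline ld/std groups):
//
//      #include <stddef.h>
//      #include <string.h>
//
//      static void short_copy(unsigned char *dst, const unsigned char *src, size_t len)
//      {
//          for (size_t chunk = 32; chunk >= 1; chunk >>= 1) {  // len < 64 here
//              if (len & chunk) {                              // this power of two is present
//                  memcpy(dst, src, chunk);
//                  dst += chunk;
//                  src += chunk;
//              }
//          }
//      }
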
        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8


// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//      cr5 = beq if source is also 16-byte aligned

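// The same case selection in C, as a sketch (the enum and "pick_case" are
// illustrative names only; the constants follow the comments above, and the
// destination is assumed to be 16-byte aligned already, so the relative
// alignment tested via cr5/w2 equals the source's own alignment):
//
//      enum fwd_case { BIGCOPY, LVX_STVX_128, LVX_VPERM_128, LD_STD_32, LVX_VPERM_32 };
//
//      static enum fwd_case pick_case(unsigned long len, unsigned long src_addr)
//      {
//          if (len >= 128ul * 1024ul)                          // kVeryLong
//              return BIGCOPY;                                 // case 1
//          if (len >= 128)
//              return (src_addr & 0xF) ? LVX_VPERM_128         // case 3
//                                      : LVX_STVX_128;         // case 2
//          return (src_addr & 0x7) ? LVX_VPERM_32              // case 5
//                                  : LD_STD_32;                // case 4
//      }
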
LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constants used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

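// The same 32-byte inner loop in AltiVec C intrinsics, as a sketch (the name
// "copy32_unaligned" is illustrative; like the assembly, it assumes the
// destination is 16-byte aligned, n32 > 0, and that reading the quadword just
// past the last source byte is harmless):
//
//      #include <altivec.h>
//
//      static void copy32_unaligned(unsigned char *dst, const unsigned char *src, int n32)
//      {
//          vector unsigned char perm = vec_lvsl(0, src);   // left-shift permute for src offset
//          vector unsigned char v1 = vec_ld(0, src);       // quadword containing src (aligned down)
//          while (n32--) {
//              vector unsigned char v2 = vec_ld(16, src);  // next two aligned quadwords
//              vector unsigned char v3 = vec_ld(32, src);
//              vec_st(vec_perm(v1, v2, perm), 0, dst);     // splice adjacent quadwords into
//              vec_st(vec_perm(v2, v3, perm), 16, dst);    //   the 32 bytes starting at src
//              v1 = v3;                                    // carry the lookahead quadword
//              src += 32;
//              dst += 32;
//          }
//      }
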
1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32


// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded

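// In C terms, the length still to move is split three ways here, as a sketch
// of the arithmetic the srwi/rlwinm instructions perform ("split_length" is an
// illustrative name):
//
//      static void split_length(unsigned long len, unsigned long *chunks128,
//                               unsigned long *leftover_qw, unsigned long *leftover_bytes)
//      {
//          *chunks128      = len >> 7;         // whole 128-byte chunks, loaded into CTR
//          *leftover_qw    = (len >> 4) & 7;   // 0-7 leftover quadwords (r0, tested via cr0)
//          *leftover_bytes = len & 15;         // 0-15 trailing bytes, finished by LShort16
//      }
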
LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bgea--  cr1,_COMM_PAGE_BIGCOPY  // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      cm1 = -1
//      rv = original value of vrsave

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0,kCommPageMTCRF)