/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0     = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3     = not used, as memcpy and memmove return 1st parameter as a value
 *   r4     = source ptr ("rs")
 *   r5     = count of bytes to move ("rc")
 *   r6     = "w1", "c16", or "cm17"
 *   r7     = "w2", "c32", or "cm33"
 *   r8     = "w3", "c48", or "cm49"
 *   r12    = destination ptr ("rd")
 *   v0     = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

#define kVeryLong       (128*1024)
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonic spot
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands
// Handle short operands.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        blt--   cr1,LShortReverse
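// In rough C terms, the forward/reverse decision above is the classic unsigned
// wraparound test (a sketch only; dst, src, and len stand for rd, rs, and rc,
// and copy_forward/copy_reverse are hypothetical helpers):
//
//      if ((unsigned long)(dst - src) < len)
//          copy_reverse(dst, src, len);    // dst overlaps not-yet-read source bytes
//      else
//          copy_forward(dst, src, len);    // safe to copy in ascending order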
// Forward short operands.  This is the most frequent case, so it is inline.

        bf      26,0f                   // 32-byte chunk to move?
        bf      27,1f                   // quadword to move?
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // skip if no odd byte
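// The branch chain above tests individual bits of the length (placed in cr6/cr7
// by the mtcrf instructions) and moves one chunk per set bit.  A rough C
// equivalent, assuming a hypothetical move(&dst,&src,n) that copies n bytes and
// advances both pointers:
//
//      if (len & 32) move(&dst, &src, 32);     // 32-byte chunk
//      if (len & 16) move(&dst, &src, 16);     // quadword
//      if (len & 8)  move(&dst, &src, 8);      // doubleword
//      if (len & 4)  move(&dst, &src, 4);      // word
//      if (len & 2)  move(&dst, &src, 2);      // halfword
//      if (len & 1)  move(&dst, &src, 1);      // odd byte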
// Handle short reverse operands.
//      cr = length in bits 26-31

        add     rs,rs,rc                // adjust ptrs for reverse move
        bf      26,0f                   // 32 bytes to move?
        bf      27,1f                   // quadword to move?
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned
// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,LFwdAligned          // doubleword?
// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any operands this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs     = alignment unknown
//      rd     = 16-byte aligned
//      rc     = bytes remaining
//      w2     = low 4 bits of (rd-rs), used to check alignment
//      cr5    = beq if source is also 16-byte aligned
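// Since rd is already 16-byte aligned here, "source is 16-byte aligned" is the
// same as "rd and rs are relatively aligned", which is what w2 and cr5 test.
// A C sketch of the five-way dispatch (the *_copy names are illustrative
// stand-ins for the paths described above, not labels in this file):
//
//      if (len >= kVeryLong)                       // case 1: several pages
//          bigcopy(dst, src, len);
//      else if (len >= 128) {
//          if (((dst - src) & 0xF) == 0)           // case 2: lvx/stvx
//              vector_copy_aligned(dst, src, len);
//          else                                    // case 3: lvx/vperm/stvx
//              vector_copy_unaligned(dst, src, len);
//      } else {
//          if (((dst - src) & 0x7) == 0)           // case 4: ld/std
//              scalar_copy_doublewords(dst, src, len);
//          else                                    // case 5: lvx/vperm/stvx, 32-byte chunks
//              vector_copy_unaligned_short(dst, src, len);
//      }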
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        vor     v1,v3,v3                // v1 <- v3
        mtspr   vrsave,rv               // restore bitmap of live vr's
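// The lvx/vperm/stvx idiom used by these loops has a direct expression in
// AltiVec C intrinsics.  A minimal sketch, assuming a source that is NOT
// 16-byte aligned, a 16-byte-aligned destination, and illustrative names
// (src, dst16, nquads); the same technique as above, not the code above:
//
//      #include <altivec.h>
//      vector unsigned char vp   = vec_lvsl(0, src);   // permute vector, as lvsl builds
//      vector unsigned char prev = vec_ld(0, src);     // aligned quadword holding src[0]
//      for (long i = 0; i < nquads; i++) {
//          vector unsigned char next = vec_ld(15, src);    // aligned qw holding src[15]
//          vec_st(vec_perm(prev, next, vp), 0, dst16);     // store src[0..15], aligned
//          prev = next;                                    // carry the look-ahead quadword
//          src += 16;  dst16 += 16;
//      }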
// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                         // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded
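// The shift-and-mask instructions below split the length three ways; in C:
//
//      unsigned long chunks    = len >> 7;         // 128-byte chunks         -> ctr
//      unsigned long qw_left   = (len >> 4) & 7;   // leftover quadwords, 0-7 -> r0/cr0
//      unsigned long byte_left = len & 0xF;        // leftover bytes, 0-15    -> w3/cr6, cr7
//      // chunks*128 + qw_left*16 + byte_left == len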
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bgea--  cr1,_COMM_PAGE_BIGCOPY  // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                        // loop over 128-byte chunks
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Long, reverse moves.
//      cr5 = beq if relatively 16-byte aligned

        add     rd,rd,rc                // point to end of operands
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
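// The reverse path runs from the high addresses down, so an overlapping move
// with dst above src never reads a byte it has already overwritten.  A minimal
// C sketch of the overall shape (names are illustrative, not labels below):
//
//      unsigned char *d = dst + len;                   // point just past the operands
//      const unsigned char *s = src + len;
//      unsigned long a = (unsigned long)d & 0xF;       // bytes to 16-byte align dest end
//      len -= a;
//      while (a--) *--d = *--s;                        // byte-align the destination end
//      while (len >= 64) {                             // 64-byte chunks, highest first
//          for (int i = 0; i < 64; i++) *--d = *--s;   //  (lvx/stvx in the loops below)
//          len -= 64;
//      }
//      while (len--) *--d = *--s;                      // leftover QWs and last 0-15 bytes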
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & 0xF), cr6 set on cr7
3:                                      // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & 0xF), cr6 set on cr7
3:                                      // loop over 1-3 quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any

        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0,kCommPageMTCRF)