/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant;
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
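// As a minimal illustration of the porting model (a hypothetical fragment, not code
// taken from this routine), the kernel would mechanically rewrite a 32-bit sequence
// such as
//      cmplw   cr1,w1,rc           // word compare
//      srwi    r0,rc,7             // word shift right
// into the 64-bit form
//      cmpld   cr1,w1,rc           // doubleword compare
//      srdi    r0,rc,7             // doubleword shift right
// and change nothing else, which is why the rules above must be observed.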
#define kVeryLong       (128*1024)
// Main entry points.

bcopy_970:                          // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort           // short or long?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        mr      rd,r4               // move registers to canonical spots
        mr      rs,r3
        blt     LShort              // handle short operands
        dcbt    0,rs                // touch in the first line of source
        dcbtst  0,rd                // touch in destination
        b       LLong1              // join long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_970:                        // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                       // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort           // short or long?
        sub     w1,r3,r4            // must move in reverse if (rd-rs)<rc
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0              // handle long operands
// Handle short operands.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc
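// (A worked example of the test, with hypothetical values: if rd = rs+8 and rc = 32,
// then w1 = rd-rs = 8 < 32 = rc, so the start of the destination overlaps the tail of
// the source and we must copy backward; if rd < rs, the unsigned difference wraps to a
// huge value >= rc and the forward path is taken.)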
LShort:
        cmplw   cr1,w1,rc           // set cr1 blt if we must move reverse
        mtcrf   0x02,rc             // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse
// Forward short operands.  This is the most frequent case, so it is inline.

        bf      26,0f               // 32-byte chunk to move?
        bf      27,1f               // quadword to move?
LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // skip if no odd byte
// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc            // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f               // 32 bytes to move?
        bf      27,1f               // quadword to move?
LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
        stb     w1,-1(rd)
        blr
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                             // entry from memmove()
        dcbt    0,rs                // touch in source
        dcbtst  0,rd                // touch in destination
LLong1:                             // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        neg     w3,rd               // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF         // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF           // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse    // handle reverse moves
        sub     rc,rc,w4            // adjust length for aligning destination
        srwi    r0,rc,7             // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0            // set cr1 on #chunks
        beq     LFwdAligned         // dest is already aligned
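// (Example of the alignment arithmetic above, with a hypothetical pointer: if rd ends
// in 0x9, then w3 = -rd ends in 0x7, so w4 = 7 bytes must be moved before the
// destination becomes 16-byte aligned.)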
// 16-byte align destination.

        mtcrf   0x01,w4             // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,LFwdAligned      // doubleword?
// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any operand this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned
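// In outline, the code below dispatches the five cases above (a restatement for
// readability, not new logic):
//      if (chunk count != 0)   branch to LFwdLongVectors, which branches to LBigCopy
//                              if rc>=kVeryLong, to LFwdLongAligned if the source is
//                              also 16-byte aligned, and otherwise falls into the
//                              LFwdLongUnaligned loop
//      else if (source is 8-byte aligned)  use the ld/std loop at LFwdMedAligned
//      else                                use the 32-byte lvx/vperm/stvx loop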
LFwdAligned:                        // dest is 16-byte aligned
        andi.   w3,w2,7             // is source at least 8-byte aligned?
        mtcrf   0x01,rc             // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5             // get 32-byte chunk count
        mtcrf   0x02,rc             // move bit 27 of length to cr6 for LShort32
        mtctr   w1                  // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned      // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave           // get bitmap of live vector registers
        oris    w4,rv,0xFFF8        // we use v0-v12
        li      c16,16              // get constant used in lvx
        mtspr   vrsave,w4           // update mask
        lvx     v1,0,rs             // prefetch 1st source quadword
        lvsl    vp,0,rs             // get permute vector to shift left
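// (Illustrative use of the permute vector, a hypothetical fragment rather than the loop
// below: with vp obtained from lvsl, two adjacent source quadwords are merged into one
// destination-aligned quadword:
//      lvx     v2,c16,rs           // next source quadword
//      vperm   vw,v1,v2,vp         // shift v1||v2 left so data lines up with rd
//      stvx    vw,0,rd             // store one aligned quadword
// The real loops unroll this pattern and advance rs and rd as they go.)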
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                  // loop over 32-byte chunks
        vor     v1,v3,v3            // v1 <- v3
        mtspr   vrsave,rv           // restore bitmap of live vr's
// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                     // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded

LFwdLongVectors:
        mfspr   rv,vrsave           // get bitmap of live vector registers
        lis     w3,kVeryLong>>16    // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3           // very long operand?
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy        // handle big copies separately
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8        // we use v0-v12
        rlwinm. r0,rc,28,29,31      // get number of quadword leftovers (0-7) and set cr0
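// (The two rlwinm forms above are plain field extracts: "rlwinm w3,rc,0,28,31" keeps
// bits 28-31, ie w3 = rc & 0xF, the leftover 0-15 bytes; "rlwinm. r0,rc,28,29,31"
// rotates left 28 (= right 4) and keeps the low 3 bits, ie r0 = (rc>>4) & 7, the
// leftover quadword count, with cr0 set on the result.)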
        li      c16,16              // get constants used in lvx/stvx
        mtspr   vrsave,w4           // update mask
        beq     cr5,LFwdLongAligned // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,0,rs             // prefetch 1st source quadword
// Forward, long, unaligned vector loop.

        .align  5                   // align inner loops
LFwdLongUnaligned:                  // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned
        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                    // loop over 128-byte chunks
        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Long, reverse moves.
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc            // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF           // #bytes to 16-byte align destination
        beq     2f                  // already aligned

// 16-byte align destination.

        mtctr   r0                  // set up for loop
// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave           // get bitmap of live vector registers
        srwi    r0,rc,6             // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8        // we use v0-v12
        mtcrf   0x01,rc             // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0            // set cr1 on chunk count
        mtspr   vrsave,w1           // update mask
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1              // get constants used in lvx/stvx
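// (cm1 = -1 biases the indexed lvx/stvx: those instructions ignore the low 4 bits of
// the effective address, so "lvx v1,cm1,rs" fetches the quadword containing the byte
// at rs-1, ie the one just below rs, which is what a reverse copy wants.)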
        bne     cr5,LReverseVecUnal // handle unaligned operands
        beq     cr1,2f              // no chunks (if no chunks, must be leftover QWs)

// Long, reverse 16-byte-aligned vector loop.

        .align  5                   // align inner loops
1:                                  // loop over 64-byte chunks
        beq     4f                  // no leftover quadwords
2:                                  // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
3:                                  // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

LReverseVecUnal:
        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,cm1,rs           // v1 always looks ahead
        beq     cr1,2f              // no chunks (if no chunks, must be leftover QWs)

        .align  5                   // align the inner loops
1:                                  // loop over 64-byte chunks
        beq     4f                  // no leftover quadwords
2:                                  // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
3:                                  // loop over 1-3 quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Very Big Copy Path.  Save our return address on the stack to help decode backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r5 = length (at least several pages)
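// (A note on the mode test below: 0x40000000 + 0x40000000 = 0x80000000, which the
// record form "add." sees as negative in 32-bit mode but as a positive 64-bit value
// in 64-bit mode, so cr0_lt is set only when running in 32-bit mode.)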
LBigCopy:
        lis     r2,0x4000           // r2 <- 0x40000000
        mflr    r0                  // get our return address
        add.    r2,r2,r2            // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)            // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY  // 32-bit mode, join big operand copy
        std     r0,16(r1)           // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY  // then join big operand code
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)