/*
 * Copyright (c) 1992-2001 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
#include <architecture/ppc/asm_help.h>

// =================================================================================================
// *** The easiest way to assemble things on Mac OS X is via "cc", so this uses #defines and such.
// =================================================================================================

// Keep track of whether we have Altivec
// This gets set in pthread_init()

        .data
        .align  2
        .globl  __cpu_has_altivec
__cpu_has_altivec:
        .long   0

        .text
        .align  2
        .globl  _bcopy
        .globl  _memcpy
        .globl  _memmove

_bcopy:
        mr      r2,r4                   // Since bcopy uses (src,dest,count), swap r3,r4
        mr      r4,r3
        mr      r3,r2
_memcpy:
_memmove:
        mr      r2,r3                   // Store dest ptr in r2 to preserve r3 on return

// ------------------
// Standard registers

#define rs r4
#define rd r2
#define rc r5

// Should we bother using Altivec?

        cmpwi   r5, 128
        blt+    LScalar

// Determine whether we have Altivec enabled

        mflr    r0
        bcl     20,31,1f
1:
        mflr    r6
        mtlr    r0
        addis   r6, r6, ha16(__cpu_has_altivec - 1b)
        lwz     r6, lo16(__cpu_has_altivec - 1b)(r6)
        cmpwi   r6, 0
        bne+    LAltivec

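// Note on the sequence above: "bcl 20,31,1f" is the usual PIC idiom for reading the
// current instruction address; it branches to the very next instruction while leaving
// that instruction's address in LR, which "mflr r6" captures so __cpu_has_altivec can
// be addressed PC-relative via ha16/lo16.  In rough C terms (an illustrative sketch,
// not part of this file), the dispatch performed so far is:
//
//      void *memmove(void *dst, const void *src, size_t len)
//      {
//          if (len < 128 || !__cpu_has_altivec)
//              scalar_copy(dst, src, len);     /* the LScalar path below      */
//          else
//              vector_copy(dst, src, len);     /* the LAltivec path below     */
//          return dst;                         /* r3 (dest) is preserved      */
//      }
//
// where scalar_copy/vector_copy are just names for the two code paths that follow,
// and bcopy simply enters with its (src,dest) arguments swapped into place first.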
// =================================================================================================

// *****************************************
// * S c a l a r   B l o c k M o o f D a t a *
// *****************************************
//
// This is the scalar (non-AltiVec) version of BlockMoofData.
//
// void ScalarBlockMoofData (ptr sou, ptr dest, long len)
// void ScalarBlockMoofDataUncached (ptr sou, ptr dest, long len)
//
//
// Calling Sequence: r3 = source pointer
//                   r4 = destination pointer
//                   r5 = length in bytes
//
// Uses: all volatile registers.

LScalar:
        cmplwi  cr7,rc,32               // length <= 32 bytes?
        cmplw   cr6,rd,rs               // up or down?
        mr.     r0,rc                   // copy to r0 for MoveShort, and test for negative
        bgt     cr7,Lbm1                // skip if count > 32

// Handle short moves (<=32 bytes.)

        beq     cr7,LMove32             // special case 32-byte blocks
        blt     cr6,LMoveDownShort      // move down in memory and return
        add     rs,rs,rc                // moving up (right-to-left), so adjust pointers
        add     rd,rd,rc
        b       LMoveUpShort            // move up in memory and return

// Handle long moves (>32 bytes.)

Lbm1:
        beqlr   cr6                     // rs==rd, so nothing to move
        bltlr   cr0                     // length<0, so ignore call and return
        mflr    r12                     // save return address
        bge     cr6,Lbm2                // rd>=rs, so move up

// Long moves down (left-to-right.)

        neg     r6,rd                   // start to 32-byte-align destination
        andi.   r0,r6,0x1F              // r0 <- bytes to move to align destination
        bnel    LMoveDownShort          // align destination if necessary
        bl      LMoveDownLong           // move 32-byte chunks down
        andi.   r0,rc,0x1F              // done?
        mtlr    r12                     // restore caller's return address
        bne     LMoveDownShort          // move trailing leftover bytes and done
        blr                             // no leftovers, so done

// Long moves up (right-to-left.)

Lbm2:
        add     rs,rs,rc                // moving up (right-to-left), so adjust pointers
        add     rd,rd,rc
        andi.   r0,rd,0x1F              // r0 <- bytes to move to align destination
        bnel    LMoveUpShort            // align destination if necessary
        bl      LMoveUpLong             // move 32-byte chunks up
        andi.   r0,rc,0x1F              // done?
        mtlr    r12                     // restore caller's return address
        bne     LMoveUpShort            // move trailing leftover bytes and done
        blr                             // no leftovers, so done

// ***************
// * M O V E 3 2 *
// ***************
//
// Special case subroutine to move a 32-byte block. MoveDownShort and
// MoveUpShort only handle 0..31 bytes, and we believe 32 bytes is too
// common a case to send it through the general purpose long-block code.
// Since it moves both up and down, we must load all 32 bytes before
// storing any.
//
// Calling Sequence: rs = source ptr
//                   rd = destination ptr
//
// Uses: r0,r5-r11.
//

LMove32:
        lwz     r0,0(rs)
        lwz     r5,4(rs)
        lwz     r6,8(rs)
        lwz     r7,12(rs)
        lwz     r8,16(rs)
        lwz     r9,20(rs)
        lwz     r10,24(rs)
        lwz     r11,28(rs)
        stw     r0,0(rd)
        stw     r5,4(rd)
        stw     r6,8(rd)
        stw     r7,12(rd)
        stw     r8,16(rd)
        stw     r9,20(rd)
        stw     r10,24(rd)
        stw     r11,28(rd)
        blr

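// The load-all-then-store-all pattern in LMove32 is what makes it safe for
// overlapping operands: all 32 source bytes are read into registers before any
// destination byte is written, so the direction of overlap does not matter.
// An illustrative C sketch of the same idea (not part of this file):
//
//      void move32(unsigned int *d, const unsigned int *s)
//      {
//          unsigned int t0 = s[0], t1 = s[1], t2 = s[2], t3 = s[3];
//          unsigned int t4 = s[4], t5 = s[5], t6 = s[6], t7 = s[7];
//          d[0] = t0; d[1] = t1; d[2] = t2; d[3] = t3;     /* stores only begin  */
//          d[4] = t4; d[5] = t5; d[6] = t6; d[7] = t7;     /* after all loads    */
//      }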

// *************************
// * M o v e U p S h o r t *
// *************************
//
// Subroutine called to move <32 bytes up in memory (ie, right-to-left).
//
// Entry conditions: rs = last byte moved from source (right-to-left)
//                   rd = last byte moved into destination
//                   r0 = #bytes to move (0..31)
//
// Exit conditions:  rs = updated source ptr
//                   rd = updated destination ptr
//                   rc = decremented by #bytes moved
//
// Uses: r0,r6,r7,r8,cr7.
//

LMoveUpShort:
        andi.   r6,r0,0x10              // test 0x10 bit in length
        mtcrf   0x1,r0                  // move count to cr7 so we can test bits
        sub     rc,rc,r0                // decrement count of bytes remaining to be moved
        beq     Lmus1                   // skip if 0x10 bit in length is 0
        lwzu    r0,-16(rs)              // set, so copy up 16 bytes
        lwz     r6,4(rs)
        lwz     r7,8(rs)
        lwz     r8,12(rs)
        stwu    r0,-16(rd)
        stw     r6,4(rd)
        stw     r7,8(rd)
        stw     r8,12(rd)

Lmus1:
        bf      28,Lmus2                // test 0x08 bit
        lwzu    r0,-8(rs)
        lwz     r6,4(rs)
        stwu    r0,-8(rd)
        stw     r6,4(rd)

Lmus2:
        bf      29,Lmus3                // test 0x4 bit
        lwzu    r0,-4(rs)
        stwu    r0,-4(rd)

Lmus3:
        bf      30,Lmus4                // test 0x2 bit
        lhzu    r0,-2(rs)
        sthu    r0,-2(rd)

Lmus4:
        bflr    31                      // test 0x1 bit, return if 0
        lbzu    r0,-1(rs)
        stbu    r0,-1(rd)
        blr

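// How the bit tests above work: "mtcrf 0x1,r0" copies the low 4 bits of the byte
// count into cr7, so CR bits 28..31 hold the 8, 4, 2 and 1 bits of the count and
// each "bf n,..." skips the corresponding partial copy (the 16 bit is tested
// separately with andi.).  A rough C sketch of the same binary decomposition,
// shown for the forward direction (illustrative only, not part of this file):
//
//      void copy_short(unsigned char *d, const unsigned char *s, unsigned n)  /* n < 32 */
//      {
//          if (n & 16) { memcpy(d, s, 16); d += 16; s += 16; }
//          if (n & 8)  { memcpy(d, s, 8);  d += 8;  s += 8;  }
//          if (n & 4)  { memcpy(d, s, 4);  d += 4;  s += 4;  }
//          if (n & 2)  { memcpy(d, s, 2);  d += 2;  s += 2;  }
//          if (n & 1)  { *d = *s; }
//      }
//
// LMoveUpShort does the same thing right-to-left, using the update forms
// (lwzu/stwu etc.) to walk the pointers backwards.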

// *****************************
// * M o v e D o w n S h o r t *
// *****************************
//
// Subroutine called to move <32 bytes down in memory (ie, left-to-right).
//
// Entry conditions: rs = source pointer
//                   rd = destination pointer
//                   r0 = #bytes to move (0..31)
//
// Exit conditions:  rs = ptr to 1st byte not moved
//                   rd = ptr to 1st byte not moved
//                   rc = decremented by #bytes moved
//
// Uses: r0,r6,r7,r8,cr7.
//

LMoveDownShort:
        andi.   r6,r0,0x10              // test 0x10 bit in length
        mtcrf   0x1,r0                  // move count to cr7 so we can test bits
        sub     rc,rc,r0                // decrement count of bytes remaining to be moved
        beq     Lmds1                   // skip if 0x10 bit in length is 0
        lwz     r0,0(rs)                // set, so copy up 16 bytes
        lwz     r6,4(rs)
        lwz     r7,8(rs)
        lwz     r8,12(rs)
        addi    rs,rs,16
        stw     r0,0(rd)
        stw     r6,4(rd)
        stw     r7,8(rd)
        stw     r8,12(rd)
        addi    rd,rd,16

Lmds1:
        bf      28,Lmds2                // test 0x08 bit
        lwz     r0,0(rs)
        lwz     r6,4(rs)
        addi    rs,rs,8
        stw     r0,0(rd)
        stw     r6,4(rd)
        addi    rd,rd,8

Lmds2:
        bf      29,Lmds3                // test 0x4 bit
        lwz     r0,0(rs)
        addi    rs,rs,4
        stw     r0,0(rd)
        addi    rd,rd,4

Lmds3:
        bf      30,Lmds4                // test 0x2 bit
        lhz     r0,0(rs)
        addi    rs,rs,2
        sth     r0,0(rd)
        addi    rd,rd,2

Lmds4:
        bflr    31                      // test 0x1 bit, return if 0
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        blr

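// LMoveDownShort is the mirror image of LMoveUpShort: the same 16/8/4/2/1 bit
// decomposition of the count (see the C sketch above), but walking the pointers
// forward with plain loads/stores plus explicit addi updates instead of the
// update-form instructions.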

// ***********************
// * M o v e U p L o n g *
// ***********************
//
// Subroutine to move 32-byte chunks of memory up (ie, right-to-left.)
// The destination is known to be 32-byte aligned, but the source is
// *not* necessarily aligned.
//
// Entry conditions: rs = last byte moved from source (right-to-left)
//                   rd = last byte moved into destination
//                   rc = count of bytes to move
//                   cr = crCached set iff destination is cacheable
//
// Exit conditions:  rs = updated source ptr
//                   rd = updated destination ptr
//                   rc = low order 8 bits of count of bytes to move
//
// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
//

LMoveUpLong:
        srwi.   r11,rc,5                // r11 <- #32 byte chunks to move
        mtctr   r11                     // prepare loop count
        beqlr                           // return if no chunks to move
        andi.   r0,rs,7                 // is source at least doubleword aligned?
        beq     Lmup3                   // yes, can optimize this case
        mtcrf   0x1,rc                  // save low bits of count
        mtcrf   0x2,rc                  // (one cr at a time, as 604 prefers)

Lmup1:                                  // loop over each 32-byte-chunk
        lwzu    r0,-32(rs)
        subi    rd,rd,32                // prepare destination address for 'dcbz'
        lwz     r5,4(rs)
        lwz     r6,8(rs)
        lwz     r7,12(rs)
        lwz     r8,16(rs)
        lwz     r9,20(rs)
        lwz     r10,24(rs)
        lwz     r11,28(rs)
        stw     r0,0(rd)
        stw     r5,4(rd)
        stw     r6,8(rd)
        stw     r7,12(rd)
        stw     r8,16(rd)
        stw     r9,20(rd)
        stw     r10,24(rd)
        stw     r11,28(rd)
        bdnz    Lmup1
        mfcr    rc                      // restore low bits of count
        blr                             // return to caller

// Aligned operands, so use d.p. floating point registers to move data.

Lmup3:
        lfdu    f0,-32(rs)
        subi    rd,rd,32                // prepare destination address for 'dcbz'
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        bdnz    Lmup3
        blr                             // return to caller

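// Two details of LMoveUpLong worth spelling out:
//
//  * When the source is 8-byte aligned (the "andi. r0,rs,7" test), each chunk is
//    moved with four lfd/stfd pairs instead of eight lwz/stw pairs, halving the
//    number of memory operations; the FP registers are used purely as 64-bit
//    data buckets.
//  * In the unaligned path, the low 8 bits of rc are parked in cr6/cr7 with the
//    two mtcrf instructions and recovered afterwards with "mfcr rc", because the
//    loop body needs most of the GPRs; the caller only looks at rc mod 32 on return.
//
// A rough C sketch of the chunk loop (illustrative only; copy32 is just a
// placeholder name for the 32-byte body above):
//
//      /* move n/32 chunks of 32 bytes each, from high addresses down */
//      for (unsigned long i = n / 32; i > 0; i--) {
//          s -= 32;  d -= 32;
//          copy32(d, s);               /* eight words, or four doubles if aligned */
//      }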

// ***************************
// * M o v e D o w n L o n g *
// ***************************
//
// Subroutine to move 32-byte chunks of memory down (ie, left-to-right.)
// The destination is known to be 32-byte aligned, but the source is
// *not* necessarily aligned.
//
// Entry conditions: rs = source ptr (next byte to move)
//                   rd = dest ptr (next byte to move into)
//                   rc = count of bytes to move
//                   cr = crCached set iff destination is cacheable
//
// Exit conditions:  rs = updated source ptr
//                   rd = updated destination ptr
//                   rc = low order 8 bits of count of bytes to move
//
// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
//

LMoveDownLong:
        srwi.   r11,rc,5                // r11 <- #32 byte chunks to move
        mtctr   r11                     // prepare loop count
        beqlr                           // return if no chunks to move
        andi.   r0,rs,7                 // is source at least doubleword aligned?
        beq     Lmdown3                 // yes, can optimize this case
        mtcrf   0x1,rc                  // save low 8 bits of count
        mtcrf   0x2,rc                  // (one cr at a time, as 604 prefers)

Lmdown1:                                // loop over each 32-byte-chunk
        lwz     r0,0(rs)
        lwz     r5,4(rs)
        lwz     r6,8(rs)
        lwz     r7,12(rs)
        lwz     r8,16(rs)
        lwz     r9,20(rs)
        lwz     r10,24(rs)
        lwz     r11,28(rs)
        stw     r0,0(rd)
        stw     r5,4(rd)
        stw     r6,8(rd)
        stw     r7,12(rd)
        stw     r8,16(rd)
        stw     r9,20(rd)
        addi    rs,rs,32
        stw     r10,24(rd)
        stw     r11,28(rd)
        addi    rd,rd,32
        bdnz    Lmdown1
        mfcr    rc                      // restore low bits of count
        blr                             // return to caller

// Aligned operands, so use d.p. floating point registers to move data.

Lmdown3:
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        addi    rs,rs,32
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        addi    rd,rd,32
        bdnz    Lmdown3
        blr                             // return to caller

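// LMoveDownLong mirrors LMoveUpLong in the forward direction, with the same
// 8-byte source alignment check and the same lfd/stfd fast path.  Note that
// although a few comments above mention preparing the destination address for
// 'dcbz', no dcbz instruction is actually issued in this version of the code.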

//
// Register use conventions are as follows:
//
// r0  - temp
// r6  - copy of VMX SPR at entry
// r7  - temp
// r8  - constant -1 (also temp and a string op buffer)
// r9  - constant 16 or -17 (also temp and a string op buffer)
// r10 - constant 32 or -33 (also temp and a string op buffer)
// r11 - constant 48 or -49 (also temp and a string op buffer)
// r12 - chunk count ("c") in long moves
//
// v0 - vp - permute vector
// v1 - va - 1st quadword of source
// v2 - vb - 2nd quadword of source
// v3 - vc - 3rd quadword of source
// v4 - vd - 4th quadword of source
// v5 - vx - temp
// v6 - vy - temp
// v7 - vz - temp

#define vp v0
#define va v1
#define vb v2
#define vc v3
#define vd v4
#define vx v5
#define vy v6
#define vz v7

#define VRSave 256

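// VRSave is special purpose register 256; by convention it holds a bitmask of the
// vector registers that are live, with bit 0 (the most significant bit) standing
// for v0.  The AltiVec code below ORs 0xFF00 into the upper halfword to mark v0-v7
// as in use, and restores the caller's mask before returning.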
// kShort should be the crossover point where the long algorithm is faster than the short.
// WARNING: kShort must be >= 64

// Yes, I know, we just checked rc >= 128 to get here...

#define kShort 128
LAltivec:
        cmpwi   cr1,rc,kShort           //(1) too short to bother using vector regs?
        sub.    r0,rd,rs                //(1) must move reverse if (rd-rs)<rc
        dcbt    0,rs                    //(2) prefetch first source block
        cmplw   cr6,r0,rc               //(2) set cr6 blt iff we must move reverse
        beqlr-                          //(2) done if src==dest
        srawi.  r9,rc,4                 //(3) r9 <- quadwords to move, test for zero
        or      r8,rs,rd                //(3) start to check for word alignment
        dcbtst  0,rd                    //(4) prefetch first destination block
        rlwinm  r8,r8,0,30,31           //(4) r8 is zero if word aligned
        bgt-    cr1,LMoveLong           //(4) handle long operands
        cmpwi   cr1,r8,0                //(5) word aligned?
        rlwinm  r7,rc,0,28,31           //(5) r7 <- leftover bytes to move after quadwords
        bltlr-                          //(5) done if negative count
        blt-    cr6,LShortReverse       //(5) handle reverse moves
        cmpwi   cr7,r7,0                //(6) leftover bytes?
        beq-    Leftovers               //(6) r9==0, so no quadwords to move
        mtctr   r9                      //(7) set up for quadword loop
        bne-    cr1,LUnalignedLoop      //(7) not word aligned (less common than word aligned)

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                  S H O R T   O P E R A N D S                              <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

LAlignedLoop:                           // word aligned operands (the common case)
        lfd     f0,0(rs)                //(1)
        lfd     f1,8(rs)                //(2)
        addi    rs,rs,16                //(2)
        stfd    f0,0(rd)                //(3)
        stfd    f1,8(rd)                //(4)
        addi    rd,rd,16                //(4)
        bdnz    LAlignedLoop            //(4)

Leftovers:
        beqlr-  cr7                     //(8) done if r7==0, ie no leftover bytes
        mtxer   r7                      //(9) count of bytes to move (1-15)
        lswx    r8,0,rs
        stswx   r8,0,rd
        blr                             //(17)
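// The lswx/stswx pair above is the PowerPC "load/store string word indexed"
// idiom: with the byte count (1-15 here) loaded into the low bits of XER, lswx
// reads that many bytes into consecutive registers starting at r8, and stswx
// writes them back out, regardless of alignment.  Roughly (illustrative C, not
// part of this file):
//
//      /* copy the trailing n bytes, 0 < n < 16 */
//      for (unsigned i = 0; i < n; i++)
//          rd[i] = rs[i];
//
// The string instructions are compact but can be slow (microcoded) on some
// implementations, which is why they are reserved for sub-quadword leftovers.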

LUnalignedLoop:                         // not word aligned, cannot use lfd/stfd
        lwz     r8,0(rs)                //(1)
        lwz     r9,4(rs)                //(2)
        lwz     r10,8(rs)               //(3)
        lwz     r11,12(rs)              //(4)
        addi    rs,rs,16                //(4)
        stw     r8,0(rd)                //(5)
        stw     r9,4(rd)                //(6)
        stw     r10,8(rd)               //(7)
        stw     r11,12(rd)              //(8)
        addi    rd,rd,16                //(8)
        bdnz    LUnalignedLoop          //(8)

        b       Leftovers

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>           S H O R T   R E V E R S E   M O V E S                           <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// cr0 & r9 <- #quadwords to move (>=0)
// cr1 <- beq if word aligned
// r7 <- #leftover bytes to move (0-15)

LShortReverse:
        cmpwi   cr7,r7,0                // leftover bytes?
        add     rs,rs,rc                // point 1 past end of string for reverse moves
        add     rd,rd,rc
        beq-    LeftoversReverse        // r9==0, ie no words to move
        mtctr   r9                      // set up for quadword loop
        bne-    cr1,LUnalignedLoopReverse

LAlignedLoopReverse:                    // word aligned, so use lfd/stfd
        lfd     f0,-8(rs)
        lfdu    f1,-16(rs)
        stfd    f0,-8(rd)
        stfdu   f1,-16(rd)
        bdnz    LAlignedLoopReverse

LeftoversReverse:
        beqlr-  cr7                     // done if r7==0, ie no leftover bytes
        mtxer   r7                      // count of bytes to move (1-15)
        neg     r7,r7                   // index back by #bytes
        lswx    r8,r7,rs
        stswx   r8,r7,rd
        blr

LUnalignedLoopReverse:                  // not word aligned, cannot use lfd/stfd
        lwz     r8,-4(rs)
        lwz     r9,-8(rs)
        lwz     r10,-12(rs)
        lwzu    r11,-16(rs)
        stw     r8,-4(rd)
        stw     r9,-8(rd)
        stw     r10,-12(rd)
        stwu    r11,-16(rd)
        bdnz    LUnalignedLoopReverse

        b       LeftoversReverse

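// In the reverse short path, rs and rd are kept pointing one byte past the end of
// the untransferred data, so the quadword loops index backwards with negative
// offsets, and the leftover bytes are copied with lswx/stswx using a negative
// index register (r7 = -count), which lands exactly on the first byte of the
// remaining low-address fragment.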
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                   L O N G   O P E R A N D S                               <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// cr6 set (blt) if must move reverse
// r0 <- (rd - rs)

LMoveLong:
        mfspr   r6,VRSave               //(5) save caller's VMX mask register
        stw     r6,-4(r1)               // use CR save area so we can use r6 later
        neg     r8,rd                   //(5) start to compute #bytes to fill in 1st dest quadword
        rlwinm  r0,r0,0,28,31           //(6) start to determine relative alignment
        andi.   r7,r8,0xF               //(6) r7 <- #bytes to fill in 1st dest quadword
        cmpwi   cr7,r0,0                //(7) relatively aligned? (ie, 16 bytes apart?)
        oris    r9,r6,0xFF00            //(7) light bits for regs we use (v0-v7)
        mtspr   VRSave,r9               //(8) update live register bitmask
        blt-    cr6,LongReverse         //(8) must move reverse direction
        sub     rc,rc,r7                //(9) adjust length while we wait
        beq-    LDest16Aligned          //(9) r7==0, ie destination already quadword aligned

// Align destination on a quadword.

        mtxer   r7                      //(10) set up byte count (1-15)
        lswx    r8,0,rs                 // load into r8-r11
        stswx   r8,0,rd                 // store r8-r11 (measured latency on arthur is 7.2 cycles)
        add     rd,rd,r7                //(18) adjust ptrs
        add     rs,rs,r7                //(18)

// Begin preparation for inner loop and "dst" stream.

LDest16Aligned:
        andi.   r0,rd,0x10              //(19) is destination cache-block aligned?
        li      r9,16                   //(19) r9 <- constant used to access 2nd quadword
        li      r10,32                  //(20) r10<- constant used to access 3rd quadword
        beq-    cr7,LAligned            //(20) handle relatively aligned operands
        lvx     va,0,rs                 //(20) prefetch 1st source quadword
        li      r11,48                  //(21) r11<- constant used to access 4th quadword
        lvsl    vp,0,rs                 //(21) get permute vector to left shift
        beq     LDest32Aligned          //(22) destination already cache-block aligned

// Copy 16 bytes to align destination on 32-byte (cache block) boundary
// to maximize store gathering.

        lvx     vb,r9,rs                //(23) get 2nd source qw
        subi    rc,rc,16                //(23) adjust count
        addi    rs,rs,16                //(24) adjust source ptr
        vperm   vx,va,vb,vp             //(25) vx <- 1st destination qw
        vor     va,vb,vb                //(25) va <- vb
        stvx    vx,0,rd                 //(26) assuming store Q deep enough to avoid latency
        addi    rd,rd,16                //(26) adjust dest ptr

// Destination 32-byte aligned, source alignment unknown.

LDest32Aligned:
        srwi.   r12,rc,6                //(27) r12<- count of 64-byte chunks to move
        rlwinm  r7,rc,28,30,31          //(27) r7 <- count of 16-byte chunks to move
        cmpwi   cr1,r7,0                //(28) remember if any 16-byte chunks
        rlwinm  r8,r12,0,26,31          //(29) mask chunk count down to 0-63
        subi    r0,r8,1                 //(30) r8==0?
        beq-    LNoChunks               //(30) r12==0, ie no chunks to move
        rlwimi  r8,r0,0,25,25           //(31) if r8==0, then r8 <- 64
        li      r0,64                   //(31) r0 <- used to get 1st quadword of next chunk
        sub.    r12,r12,r8              //(32) adjust chunk count, set cr0
        mtctr   r8                      //(32) set up loop count
        li      r8,96                   //SKP
        li      r6,128                  //SKP

// Inner loop for unaligned sources. We copy 64 bytes per iteration.
// We loop at most 64 times, then reprime the "dst" and loop again for
// the next 4KB. This loop is tuned to keep the CPU flat out, which
// means we need to execute a lvx or stvx every cycle.

LoopBy64:
        dcbt    rs,r8                   //SKP
        dcbt    rs,r6                   //SKP
        lvx     vb,r9,rs                //(1) 2nd source quadword (1st already in va)
        lvx     vc,r10,rs               //(2) 3rd
        lvx     vd,r11,rs               //(3) 4th
        vperm   vx,va,vb,vp             //(3) vx <- 1st destination quadword
        lvx     va,rs,r0                //(4) get 1st qw of next 64-byte chunk (r0 must be RB!)
        vperm   vy,vb,vc,vp             //(4) vy <- 2nd dest qw
        stvx    vx,0,rd                 //(5)
        vperm   vz,vc,vd,vp             //(5) vz <- 3rd dest qw
        stvx    vy,r9,rd                //(6)
        vperm   vx,vd,va,vp             //(6) vx <- 4th
        stvx    vz,r10,rd               //(7)
        addi    rs,rs,64                //(7)
        stvx    vx,r11,rd               //(8)
        addi    rd,rd,64                //(8)
        bdnz    LoopBy64                //(8)

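// This is the classic AltiVec software-alignment technique for a misaligned
// source: lvx always loads the 16-byte-aligned quadword containing the effective
// address, and vperm combines two adjacent source quadwords through the permute
// vector that lvsl built from the source's misalignment, yielding one properly
// aligned destination quadword.  In rough pseudo-C (illustrative only, not part
// of this file), with k = (uintptr_t)src & 15 and Q(i) the i-th aligned 16-byte
// block covering the source, viewed as a big-endian 128-bit value:
//
//      dest_qw[i] = (Q(i) << 8*k) | (Q(i+1) >> 8*(16-k));   /* == vperm(Q(i),Q(i+1),vp) */
//
// The two dcbt instructions (the //SKP lines) touch the cache blocks 96 and 128
// bytes ahead of the current source position, so the loads a few iterations out
// are already on their way.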
// End of inner loop. Should we reprime dst stream and restart loop?
// This block is only executed when we're moving more than 4KB.
// It is usually folded out because cr0 is set in the loop prologue.

        beq+    LNoChunks               // r12==0, ie no more chunks to move
        sub.    r12,r12,r0              // set cr0 if more than 4KB remain to xfer
        mtctr   r0                      // initialize loop count to 64
        b       LoopBy64                // restart inner loop, xfer another 4KB

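// Chunk-count bookkeeping for the loop above: r8 was set to (chunks mod 64), or to
// 64 if that remainder is zero, so the first pass through LoopBy64 disposes of the
// odd chunks; r12 then holds the number of full 64-chunk (4KB) batches still to go,
// and each time the ctr expires another 64 is subtracted from r12 and the loop is
// rerun until r12 reaches zero.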
// Fewer than 64 bytes remain to be moved.

LNoChunks:                              // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF               //(33) rc <- leftover bytes
        beq-    cr1,LCleanup            //(33) r7==0, ie fewer than 16 bytes remaining
        mtctr   r7                      //(34) we will loop over 1-3 QWs

LoopBy16:
        lvx     vb,r9,rs                //(1) vb <- 2nd source quadword
        addi    rs,rs,16                //(1)
        vperm   vx,va,vb,vp             //(3) vx <- next destination quadword
        vor     va,vb,vb                //(3) va <- vb
        stvx    vx,0,rd                 //(4) assuming store Q is deep enough to mask latency
        addi    rd,rd,16                //(4)
        bdnz    LoopBy16                //(4)

// Move remaining bytes in last quadword. rc and cr0 have the count.

LCleanup:
        lwz     r6,-4(r1)               // load VRSave from CR save area
        mtspr   VRSave,r6               //(35) restore caller's live-register bitmask
        beqlr                           //(36) rc==0, ie no leftovers, so done
        mtxer   rc                      //(37) load byte count (1-15)
        lswx    r8,0,rs
        stswx   r8,0,rd
        blr                             //(45)


// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>            L O N G   A L I G N E D   M O V E S                            <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// rs, rd <- both quadword aligned
// cr0 <- beq if dest is cache block (32-byte) aligned
// r9  <- 16
// r10 <- 32

LAligned:
        lvx     va,0,rs                 // prefetch 1st source quadword
        li      r11,48                  // r11<- constant used to access 4th quadword
        beq     LAligned32              // destination already cache-block aligned

// Copy 16 bytes to align destination on 32-byte (cache block) boundary
// to maximize store gathering.

        subi    rc,rc,16                // adjust count
        addi    rs,rs,16                // adjust source ptr
        stvx    va,0,rd                 // assuming store Q deep enough to avoid latency
        addi    rd,rd,16                // adjust dest ptr

// Destination 32-byte aligned, source 16-byte aligned. Set up for inner loop.

LAligned32:
        srwi.   r12,rc,6                // r12<- count of 64-byte chunks to move
        rlwinm  r7,rc,28,30,31          // r7 <- count of 16-byte chunks to move
        cmpwi   cr1,r7,0                // remember if any 16-byte chunks
        rlwinm  r8,r12,0,26,31          // mask chunk count down to 0-63
        subi    r0,r8,1                 // r8==0?
        beq-    LAlignedNoChunks        // r12==0, ie no chunks to move
        rlwimi  r8,r0,0,25,25           // if r8==0, then r8 <- 64
        li      r0,64                   // r0 <- used at end of loop
        sub.    r12,r12,r8              // adjust chunk count, set cr0
        mtctr   r8                      // set up loop count
        li      r8,96                   //SKP
        li      r6,128                  //SKP

// Inner loop for aligned sources. We copy 64 bytes per iteration.

LAlignedLoopBy64:
        dcbt    rs,r8                   //SKP
        dcbt    rs,r6                   //SKP
        lvx     va,0,rs                 //(1)
        lvx     vb,r9,rs                //(2)
        lvx     vc,r10,rs               //(3)
        lvx     vd,r11,rs               //(4)
        addi    rs,rs,64                //(4)
        stvx    va,0,rd                 //(5)
        stvx    vb,r9,rd                //(6)
        stvx    vc,r10,rd               //(7)
        stvx    vd,r11,rd               //(8)
        addi    rd,rd,64                //(8)
        bdnz    LAlignedLoopBy64        //(8)

// End of inner loop. Loop again for next 4KB iff any.

        beq+    LAlignedNoChunks        // r12==0, ie no more chunks to move
        sub.    r12,r12,r0              // set cr0 if more than 4KB remain to xfer
        mtctr   r0                      // reinitialize loop count to 64
        b       LAlignedLoopBy64        // restart inner loop, xfer another 4KB

// Fewer than 64 bytes remain to be moved.

LAlignedNoChunks:                       // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF               // rc <- leftover bytes
        beq-    cr1,LCleanup            // r7==0, ie fewer than 16 bytes remaining
        mtctr   r7                      // we will loop over 1-3 QWs

LAlignedLoopBy16:
        lvx     va,0,rs                 // get next quadword
        addi    rs,rs,16
        stvx    va,0,rd
        addi    rd,rd,16
        bdnz    LAlignedLoopBy16

        b       LCleanup                // handle last 0-15 bytes, if any

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>            L O N G   R E V E R S E   M O V E S                            <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// Reverse moves. These involve overlapping operands, with the source
// lower in memory (lower addresses) than the destination. They must be
// done right-to-left, ie from high addresses down to low addresses.
// Throughout this code, we maintain rs and rd as pointers one byte past
// the end of the untransferred operands.
//
// The byte count is >=kShort and the following registers are already loaded:
//
// r6  - VMX mask at entry
// cr7 - beq if relatively aligned
//

LongReverse:
        add     rd,rd,rc                // update source/dest ptrs to be 1 byte past end
        add     rs,rs,rc
        andi.   r7,rd,0xF               // r7 <- #bytes needed to move to align destination
        sub     rc,rc,r7                // adjust length while we wait
        sub     rs,rs,r7                // adjust ptrs by #bytes to xfer, also while we wait
        sub     rd,rd,r7
        beq-    LDest16AlignedReverse

// Align destination on a quadword. Note that we do NOT align on a cache
// block boundary for store gathering etc.; since all these operands overlap,
// many dest cache blocks will already be in the L1, so it's not clear that
// this would be a win.

        mtxer   r7                      // load byte count
        lswx    r8,0,rs
        stswx   r8,0,rd

// Prepare for inner loop and start "dstst" stream. Frankly, it's not
// clear whether "dst" or "dstst" would be better; somebody should
// measure. We use "dstst" because, being overlapped, at least some
// source cache blocks will also be stored into.

LDest16AlignedReverse:
        srwi.   r12,rc,6                // r12 <- count of 64-byte chunks to move
        rlwinm  r0,rc,11,9,15           // position count of 32-byte blocks for dst
        rlwinm  r11,r12,0,26,31         // mask chunk count down to 0-63
        li      r9,-17                  // r9 <- constant used to access 2nd quadword
        oris    r0,r0,0x0100            // set dst block size to 1 qw
        li      r10,-33                 // r10<- constant used to access 3rd quadword
        ori     r0,r0,0xFFE0            // set dst stride to -32 bytes (one cache block)
        li      r8,-1                   // r8 <- constant used to access 1st quadword
        dstst   rs,r0,3                 // start stream 0
        subi    r0,r11,1                // r11==0 ?
        lvx     va,r8,rs                // prefetch 1st source quadword
        rlwinm  r7,rc,28,30,31          // r7 <- count of 16-byte chunks to move
        lvsl    vp,0,rs                 // get permute vector to right shift
        cmpwi   cr1,r7,0                // remember if any 16-byte chunks
        beq-    LNoChunksReverse        // r12==0, so skip inner loop
        rlwimi  r11,r0,0,25,25          // if r11==0, then r11 <- 64
        sub.    r12,r12,r11             // adjust chunk count, set cr0
        mtctr   r11                     // set up loop count
        li      r11,-49                 // r11<- constant used to access 4th quadword
        li      r0,-64                  // r0 <- used for several purposes
        beq-    cr7,LAlignedLoopBy64Reverse

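// The dstst control word assembled in r0 above uses the standard AltiVec data
// stream encoding: bits 3-7 are the block size in quadwords, bits 8-15 the block
// count, and bits 16-31 the signed stride in bytes.  Here that means "touch one
// quadword every -32 bytes, once per 32-byte cache block of the operand"; the
// reprime code further down uses 0x0440FFC0, i.e. 64 blocks of 4 quadwords with
// a -64 byte stride, to cover the next 4KB.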
// Inner loop for unaligned sources. We copy 64 bytes per iteration.

LoopBy64Reverse:
        lvx     vb,r9,rs                //(1) 2nd source quadword (1st already in va)
        lvx     vc,r10,rs               //(2) 3rd quadword
        lvx     vd,r11,rs               //(3) 4th
        vperm   vx,vb,va,vp             //(3) vx <- 1st destination quadword
        lvx     va,rs,r0                //(4) get 1st qw of next 64-byte chunk (note r0 must be RB)
        vperm   vy,vc,vb,vp             //(4) vy <- 2nd dest qw
        stvx    vx,r8,rd                //(5)
        vperm   vz,vd,vc,vp             //(5) vz <- 3rd destination quadword
        stvx    vy,r9,rd                //(6)
        vperm   vx,va,vd,vp             //(6) vx <- 4th qw
        stvx    vz,r10,rd               //(7)
        subi    rs,rs,64                //(7)
        stvx    vx,r11,rd               //(8)
        subi    rd,rd,64                //(8)
        bdnz    LoopBy64Reverse         //(8)

// End of inner loop. Should we reprime dst stream and restart loop?
// This block is only executed when we're moving more than 4KB.
// It is usually folded out because cr0 is set in the loop prologue.

        beq+    LNoChunksReverse        // r12==0, ie no more chunks to move
        lis     r8,0x0440               // dst control: 64 4-qw blocks
        add.    r12,r12,r0              // set cr0 if more than 4KB remain to xfer
        ori     r8,r8,0xFFC0            // stride is -64 bytes
        dstst   rs,r8,3                 // restart the prefetch stream
        li      r8,64                   // inner loop count
        mtctr   r8                      // initialize loop count to 64
        li      r8,-1                   // restore qw1 offset for inner loop
        b       LoopBy64Reverse         // restart inner loop, xfer another 4KB

// Fewer than 64 bytes remain to be moved.

LNoChunksReverse:                       // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF               // rc <- leftover bytes
        beq-    cr1,LCleanupReverse     // r7==0, ie fewer than 16 bytes left
        mtctr   r7
        beq-    cr7,LAlignedLoopBy16Reverse

LoopBy16Reverse:
        lvx     vb,r9,rs                // vb <- 2nd source quadword
        subi    rs,rs,16
        vperm   vx,vb,va,vp             // vx <- next destination quadword
        vor     va,vb,vb                // va <- vb
        stvx    vx,r8,rd
        subi    rd,rd,16
        bdnz    LoopBy16Reverse

// Fewer than 16 bytes remain to be moved.

LCleanupReverse:                        // rc and cr0 set with remaining byte count
        lwz     r6,-4(r1)               // load VRSave from CR save area
        mtspr   VRSave,r6               // restore caller's live-register bitmask
        beqlr                           // rc==0, ie no leftovers so done
        neg     r7,rc                   // get -(#bytes)
        mtxer   rc                      // byte count
        lswx    r8,r7,rs
        stswx   r8,r7,rd
        blr

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>      A L I G N E D   L O N G   R E V E R S E   M O V E S                  <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// Inner loop. We copy 64 bytes per iteration.

LAlignedLoopBy64Reverse:
        lvx     va,r8,rs                //(1)
        lvx     vb,r9,rs                //(2)
        lvx     vc,r10,rs               //(3)
        lvx     vd,r11,rs               //(4)
        subi    rs,rs,64                //(4)
        stvx    va,r8,rd                //(5)
        stvx    vb,r9,rd                //(6)
        stvx    vc,r10,rd               //(7)
        stvx    vd,r11,rd               //(8)
        subi    rd,rd,64                //(8)
        bdnz    LAlignedLoopBy64Reverse //(8)

// End of inner loop. Loop for next 4KB iff any.

        beq+    LNoChunksReverse        // r12==0, ie no more chunks to move
        lis     r8,0x0440               // dst control: 64 4-qw blocks
        add.    r12,r12,r0              // r12 <- r12 - 64, set cr0
        ori     r8,r8,0xFFC0            // stride is -64 bytes
        dstst   rs,r8,3                 // restart the prefetch stream
        li      r8,64                   // inner loop count
        mtctr   r8                      // initialize loop count to 64
        li      r8,-1                   // restore qw1 offset for inner loop
        b       LAlignedLoopBy64Reverse

// Loop to copy leftover quadwords (1-3).

LAlignedLoopBy16Reverse:
        lvx     va,r8,rs                // get next qw
        subi    rs,rs,16
        stvx    va,r8,rd
        subi    rd,rd,16
        bdnz    LAlignedLoopBy16Reverse

        b       LCleanupReverse         // handle up to 15 bytes in last qw