1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25
26 #define __APPLE_API_PRIVATE
27 #include <machine/cpu_capabilities.h>
28 #undef __APPLE_API_PRIVATE
29
30 // These functions have migrated to the comm page.
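// The comm page is a kernel-maintained region mapped at a fixed address in every process;
// _COMM_PAGE_BCOPY and _COMM_PAGE_MEMCPY (defined via cpu_capabilities.h) are absolute
// addresses within it, where the kernel has installed the variant best suited to the
// running CPU.  That is why a "ba" (branch absolute) suffices here.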
31
32 .text
33 .globl _bcopy
34 .globl _memcpy
35 .globl _memmove
36
37 .align 5
38 _bcopy: // void bcopy(const void *src, void *dst, size_t len)
39 ba _COMM_PAGE_BCOPY
40
41 .align 5
42 _memcpy: // void* memcpy(void *dst, const void *src, size_t len)
43 _memmove: // void* memmove(void *dst, const void *src, size_t len)
44 ba _COMM_PAGE_MEMCPY
45
46
47 #if 0
48 /* =======================================
49 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
50 * =======================================
51 *
52 * Version of 6/17/2002, for G3, G4, and G4+.
53 *
54 * There are many paths through this code, depending on length, reverse/forward,
55 * processor type, and alignment. We use reverse paths only when the operands
56 * overlap and the destination is higher than the source. They are not quite as
57 * fast as the forward paths.
58 *
59 * Judicious use of DCBTs, just far enough ahead to minimize waiting, is critical in
60 * the inner loops for long operands. DST is less effective than DCBT, because it
61 * can get out of sync with the inner loop. DCBTST is usually not a win, so we
62 * don't use it except during initialization when we're not using the LSU.
63 * We don't DCBT on G3, which only handles one load miss at a time.
64 *
65 * We don't use DCBZ, because it takes an alignment exception on uncached memory
66 * like frame buffers. Bcopy to frame buffers must work. This hurts G3 in the
67 * cold-cache case, but G4 can use DCBA (which does not take alignment exceptions.)
68 *
69 * Using DCBA on G4 is a tradeoff. For the cold-cache case it can be a big win,
70 * since it avoids the read of destination cache lines. But for the hot-cache case
71 * it is always slower, because of the cycles spent needlessly zeroing data. Some
72 * machines store-gather and can cancel the read if all bytes of a line are stored,
73 * others cannot. Unless explicitly told which is better, we time loops with and
74 * without DCBA and use the fastest. Note that we never DCBA in reverse loops,
75 * since by definition they are overlapped so dest lines will be in the cache.
76 *
77 * For longer operands we use an 8-element branch table, based on the CPU type,
78 * to select the appropriate inner loop. The branch table is indexed as follows:
79 *
80 * bit 10000 set if a Reverse move is required
81 * bits 01100 set on the relative operand alignment: 0=unaligned, 1=word,
82 * 2=doubleword, and 3=quadword.
83 *
84 * By "relatively" n-byte aligned, we mean the source and destination are a multiple
85 * of n bytes apart (they need not be absolutely aligned.)
86 *
87 * The branch table for the running CPU type is pointed to by LBranchTablePtr.
88 * Initially, LBranchTablePtr points to G3's table, since that is the lowest
89 * common denominator that will run on any CPU. Later, pthread initialization
90 * sets up the _cpu_capabilities vector and calls _bcopy_initialize, which sets
91 * up the correct pointer for the running CPU.
92 *
93 * We distinguish between "short", "medium", and "long" operands:
94 * short (<= 32 bytes) most common case, minimum path length is important
95 * medium (> 32, < kLong) too short for Altivec or use of cache ops like DCBA
96 * long (>= kLong) long enough for cache ops and to amortize use of Altivec
97 *
98 * WARNING: kLong must be >=96, due to implicit assumptions about operand length.
99 */
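// For example, with forward operands that are 8 (but not 16) bytes apart the alignment
// code is 2, selecting entry 010 of the table, while a reverse move of unaligned operands
// selects entry 100.  Each entry is a 4-byte pointer, so the byte offset into a table is
// (reverse << 4) | (alignment << 2); LLong below forms exactly that offset.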
100 #define kLong 96
101
102 /* Register usage. Note we use R2, so this code will not run in a PEF/CFM
103 * environment. Note also the rather delicate way we assign multiple uses
104 * to the same register. Beware.
105 *
106 * r0 = "w7" or "r0" (NB: cannot use r0 for any constant such as "c16")
107 * r2 = "w8" or VRSave ("rv")
108 * r3 = not used, as memcpy and memmove return 1st parameter as a value
109 * r4 = source ptr ("rs")
110 * r5 = count of bytes to move ("rc")
111 * r6 = "w1", "c16", or "cm17"
112 * r7 = "w2", "c32", or "cm33"
113 * r8 = "w3", "c48", or "cm49"
114 * r9 = "w4", "c64", or "cm1"
115 * r10 = "w5", "c96", or "cm97"
116 * r11 = "w6", "c128", "cm129", or return address ("ra")
117 * r12 = destination ptr ("rd")
118 * f0-f8 = used for moving 8-byte aligned data
119 * v0 = permute vector ("vp")
120 * v1-v4 = qw's loaded from source ("v1", "v2", "v3", and "v4")
121 * v5-v7 = permuted qw's ("vx", "vy", and "vz")
122 */
123 #define rs r4
124 #define rd r12
125 #define rc r5
126 #define ra r11
127 #define rv r2
128
129 #define w1 r6
130 #define w2 r7
131 #define w3 r8
132 #define w4 r9
133 #define w5 r10
134 #define w6 r11
135 #define w7 r0
136 #define w8 r2
137
138 #define c16 r6
139 #define cm17 r6
140 #define c32 r7
141 #define cm33 r7
142 #define c48 r8
143 #define cm49 r8
144 #define c64 r9
145 #define cm1 r9
146 #define c96 r10
147 #define cm97 r10
148 #define c128 r11
149 #define cm129 r11
150
151 #define vp v0
152 #define vx v5
153 #define vy v6
154 #define vz v7
155
156 #define VRSave 256
157
158 #include <architecture/ppc/asm_help.h>
159
160 // The branch tables, 8 entries per CPU type.
161 // NB: we depend on 5 low-order 0s in the address of branch tables.
162
163 .data
164 .align 5 // must be 32-byte aligned
165
166 // G3 (the default CPU type)
167
168 LG3:
169 .long LForwardWord // 000: forward, unaligned
170 .long LForwardFloat // 001: forward, 4-byte aligned
171 .long LForwardFloat // 010: forward, 8-byte aligned
172 .long LForwardFloat // 011: forward, 16-byte aligned
173 .long LReverseWord // 100: reverse, unaligned
174 .long LReverseFloat // 101: reverse, 4-byte aligned
175 .long LReverseFloat // 110: reverse, 8-byte aligned
176 .long LReverseFloat // 111: reverse, 16-byte aligned
177
178 // G4s that benefit from DCBA.
179
180 LG4UseDcba:
181 .long LForwardVecUnal32Dcba // 000: forward, unaligned
182 .long LForwardVecUnal32Dcba // 001: forward, 4-byte aligned
183 .long LForwardVecUnal32Dcba // 010: forward, 8-byte aligned
184 .long LForwardVecAlig32Dcba // 011: forward, 16-byte aligned
185 .long LReverseVectorUnal32 // 100: reverse, unaligned
186 .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
187 .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
188 .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
189
190 // G4s that should not use DCBA.
191
192 LG4NoDcba:
193 .long LForwardVecUnal32NoDcba // 000: forward, unaligned
194 .long LForwardVecUnal32NoDcba // 001: forward, 4-byte aligned
195 .long LForwardVecUnal32NoDcba // 010: forward, 8-byte aligned
196 .long LForwardVecAlig32NoDcba // 011: forward, 16-byte aligned
197 .long LReverseVectorUnal32 // 100: reverse, unaligned
198 .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
199 .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
200 .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
201
202
203 // Pointer to the 8-element branch table for running CPU type:
204
205 LBranchTablePtr:
206 .long LG3 // default to G3 until "bcopy_initialize" called
207
208
209 // The CPU capability vector, initialized in pthread_init().
210 // "_bcopy_initialize" uses this to set up LBranchTablePtr:
211
212 .globl __cpu_capabilities
213 __cpu_capabilities:
214 .long 0
215
216 // Bit definitions for _cpu_capabilities:
217
218 #define kHasAltivec 0x01
219 #define k64Bit 0x02
220 #define kCache32 0x04
221 #define kCache64 0x08
222 #define kCache128 0x10
223 #define kUseDcba 0x20
224 #define kNoDcba 0x40
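// _bcopy_initialize (at the end of this file) classifies the CPU by masking combinations of
// these bits: kHasAltivec+kCache32 with neither DCBA bit is an "untyped" G4 whose DCBA use
// is decided by timing; adding kUseDcba or kNoDcba selects one of the two G4 tables;
// kCache32 alone selects the G3 table; anything unrecognized falls back to G3.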
225
226
227 .text
228 .globl _bcopy
229 .globl _memcpy
230 .globl _memmove
231 .globl __bcopy_initialize
232
233
234 // Main entry points.
235
236 .align 5
237 _bcopy: // void bcopy(const void *src, void *dst, size_t len)
238 mr r10,r3 // swap source and dest ptrs, to be like memcpy
239 mr r3,r4
240 mr r4,r10
241 _memcpy: // void* memcpy(void *dst, const void *src, size_t len)
242 _memmove: // void* memmove(void *dst, const void *src, size_t len)
243 cmplwi cr7,rc,32 // length <= 32 bytes?
244 sub. w1,r3,rs // must move in reverse if (rd-rs)<rc, set cr0 on src==dst
245 dcbt 0,rs // touch in the first line of source
246 cmplw cr6,w1,rc // set cr6 blt iff we must move reverse
247 cmplwi cr1,rc,kLong-1 // set cr1 bgt if long
248 mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
249 bgt- cr7,LMedium // longer than 32 bytes
250 dcbtst 0,rd // touch in destination
251 beq- cr7,LMove32 // special case moves of 32 bytes
252 blt- cr6,LShortReverse0
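// Note that the "(rd-rs)<rc" test above is unsigned: if rd is below rs the difference wraps
// to a huge value, so the single compare in cr6 is true exactly when the destination starts
// within the source region, ie when the operands overlap and the destination is at or above
// the source.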
253
254 // Forward short operands. This is the most frequent case, so it is inline.
255 // We also end up here to xfer the last 0-31 bytes of longer operands.
256
257 LShort: // WARNING: can fall into this routine
258 andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
259 mtcrf 0x01,rc // move rest of length to cr7
260 beq 1f // quadword to move?
261 lwz w1,0(rs)
262 lwz w2,4(rs)
263 lwz w3,8(rs)
264 lwz w4,12(rs)
265 addi rs,rs,16
266 stw w1,0(rd)
267 stw w2,4(rd)
268 stw w3,8(rd)
269 stw w4,12(rd)
270 addi rd,rd,16
271 1:
272 LShort16: // join here to xfer 0-15 bytes
273 bf 28,2f // doubleword?
274 lwz w1,0(rs)
275 lwz w2,4(rs)
276 addi rs,rs,8
277 stw w1,0(rd)
278 stw w2,4(rd)
279 addi rd,rd,8
280 2:
281 bf 29,3f // word?
282 lwz w1,0(rs)
283 addi rs,rs,4
284 stw w1,0(rd)
285 addi rd,rd,4
286 3:
287 bf 30,4f // halfword to move?
288 lhz w1,0(rs)
289 addi rs,rs,2
290 sth w1,0(rd)
291 addi rd,rd,2
292 4:
293 bflr 31 // skip if no odd byte
294 lbz w1,0(rs)
295 stb w1,0(rd)
296 blr
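// In the short routines above and below, "mtcrf 0x01,rc" copies bits 28-31 of the length
// into cr7, so the "bf 28/29/30/31" tests select the 8-, 4-, 2-, and 1-byte residues; the
// 16-byte residue (bit 27) is tested separately with andi., which is sometimes faster than
// covering it with mtcrf.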
297
298
299 // Handle short reverse operands, fewer than 32 bytes in length.
300 // This is also used to transfer the last 0-31 bytes of longer operands.
301
302 LShortReverse0:
303 add rs,rs,rc // adjust ptrs for reverse move
304 add rd,rd,rc
305 LShortReverse:
306 andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
307 mtcrf 0x01,rc // move rest of length to cr7
308 beq 1f // quadword to move?
309 lwz w1,-4(rs)
310 lwz w2,-8(rs)
311 lwz w3,-12(rs)
312 lwzu w4,-16(rs)
313 stw w1,-4(rd)
314 stw w2,-8(rd)
315 stw w3,-12(rd)
316 stwu w4,-16(rd)
317 1:
318 LShortReverse16: // join here to xfer 0-15 bytes and return
319 bf 28,2f // doubleword?
320 lwz w1,-4(rs)
321 lwzu w2,-8(rs)
322 stw w1,-4(rd)
323 stwu w2,-8(rd)
324 2:
325 bf 29,3f // word?
326 lwzu w1,-4(rs)
327 stwu w1,-4(rd)
328 3:
329 bf 30,4f // halfword to move?
330 lhzu w1,-2(rs)
331 sthu w1,-2(rd)
332 4:
333 bflr 31 // done if no odd byte
334 lbz w1,-1(rs) // no update
335 stb w1,-1(rd)
336 blr
337
338
339 // Special case for 32-byte moves. Too long for LShort, too common for LMedium.
340
341 LMove32:
342 lwz w1,0(rs)
343 lwz w2,4(rs)
344 lwz w3,8(rs)
345 lwz w4,12(rs)
346 lwz w5,16(rs)
347 lwz w6,20(rs)
348 lwz w7,24(rs)
349 lwz w8,28(rs)
350 stw w1,0(rd)
351 stw w2,4(rd)
352 stw w3,8(rd)
353 stw w4,12(rd)
354 stw w5,16(rd)
355 stw w6,20(rd)
356 stw w7,24(rd)
357 stw w8,28(rd)
358 LExit:
359 blr
360
361
362 // Medium length operands (32 < rc < kLong.) These loops run on all CPUs, as the
363 // operands are not long enough to bother with the branch table, using cache ops, or
364 // Altivec. We word align the source rather than the dest (which is what we align for
365 // long operands): doing so is faster on G4+ and probably beyond, we never DCBA on
366 // medium-length operands anyway, and the opportunity to cancel reads of dest cache lines is limited.
367 // w1 = (rd-rs), used to check for alignment
368 // cr0 = set on (rd-rs)
369 // cr1 = bgt if long operand
370 // cr6 = blt if reverse move
371
372 LMedium:
373 dcbtst 0,rd // touch in 1st line of destination
374 rlwinm r0,w1,0,29,31 // r0 <- ((rd-rs) & 7), ie 0 if doubleword aligned
375 beq- LExit // early exit if (rs==rd), avoiding use of "beqlr"
376 neg w2,rs // we align source, not dest, and assume forward
377 cmpwi cr5,r0,0 // set cr5 beq if doubleword aligned
378 bgt- cr1,LLong // handle long operands
379 andi. w3,w2,3 // W3 <- #bytes to word-align source
380 blt- cr6,LMediumReverse // handle reverse move
381 lwz w1,0(rs) // pre-fetch first 4 bytes of source
382 beq- cr5,LMediumAligned // operands are doubleword aligned
383 sub rc,rc,w3 // adjust count for alignment
384 mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShort16
385 srwi w4,rc,4 // w4 <- number of 16-byte chunks to xfer (>=1)
386 mtctr w4 // prepare loop count
387 beq+ 2f // source already aligned
388
389 lwzx w2,w3,rs // get 1st aligned word (which we might partially overwrite)
390 add rs,rs,w3 // word-align source ptr
391 stw w1,0(rd) // store all (w3) bytes at once to avoid a loop
392 add rd,rd,w3
393 mr w1,w2 // first aligned word to w1
394 b 2f
395
396 .align 4 // align inner loops
397 1: // loop over 16-byte chunks
398 lwz w1,0(rs)
399 2:
400 lwz w2,4(rs)
401 lwz w3,8(rs)
402 lwz w4,12(rs)
403 addi rs,rs,16
404 stw w1,0(rd)
405 stw w2,4(rd)
406 stw w3,8(rd)
407 stw w4,12(rd)
408 addi rd,rd,16
409 bdnz 1b
410
411 b LShort16
412
413
414 // Medium, doubleword aligned. We use floating point. Note that G4+ has bigger latencies
415 // and reduced throughput for floating pt loads and stores; future processors will probably
416 // have even worse lfd/stfd performance. We use it here because it is so important for G3,
417 // and not slower for G4+. But we only do so for doubleword aligned operands, whereas the
418 // G3-only long operand loops use floating pt even for word-aligned operands.
419 // w2 = neg(rs)
420 // w1 = first 4 bytes of source
421
422 LMediumAligned:
423 andi. w3,w2,7 // already aligned?
424 sub rc,rc,w3 // adjust count by 0-7 bytes
425 lfdx f0,rs,w3 // pre-fetch first aligned source doubleword
426 srwi w4,rc,5 // get count of 32-byte chunks (might be 0 if unaligned)
427 mtctr w4
428 beq- LForwardFloatLoop1 // already aligned
429
430 cmpwi w4,0 // are there any 32-byte chunks to xfer?
431 lwz w2,4(rs) // get 2nd (unaligned) source word
432 add rs,rs,w3 // doubleword align source pointer
433 stw w1,0(rd) // store first 8 bytes of source to align...
434 stw w2,4(rd) // ...which could overwrite source
435 add rd,rd,w3 // doubleword align destination
436 bne+ LForwardFloatLoop1 // at least 1 chunk, so enter loop
437
438 subi rc,rc,8 // unfortunate degenerate case: no chunks to xfer
439 stfd f0,0(rd) // must store f0 since source might have been overwritten
440 addi rs,rs,8
441 addi rd,rd,8
442 b LShort
443
444
445 // Medium reverse moves. This loop runs on all processors.
446
447 LMediumReverse:
448 add rs,rs,rc // point to other end of operands when in reverse
449 add rd,rd,rc
450 andi. w3,rs,3 // w3 <- #bytes to word align source
451 lwz w1,-4(rs) // pre-fetch 1st 4 bytes of source
452 sub rc,rc,w3 // adjust count
453 srwi w4,rc,4 // get count of 16-byte chunks (>=1)
454 mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShortReverse16
455 mtctr w4 // prepare loop count
456 beq+ 2f // source already aligned
457
458 sub rs,rs,w3 // word-align source ptr
459 lwz w2,-4(rs) // get 1st aligned word which we may overwrite
460 stw w1,-4(rd) // store all 4 bytes to align without a loop
461 sub rd,rd,w3
462 mr w1,w2 // shift 1st aligned source word to w1
463 b 2f
464
465 1:
466 lwz w1,-4(rs)
467 2:
468 lwz w2,-8(rs)
469 lwz w3,-12(rs)
470 lwzu w4,-16(rs)
471 stw w1,-4(rd)
472 stw w2,-8(rd)
473 stw w3,-12(rd)
474 stwu w4,-16(rd)
475 bdnz 1b
476
477 b LShortReverse16
478
479
480 // Long operands. Use branch table to decide which loop to use.
481 // w1 = (rd-rs), used to determine alignment
482
483 LLong:
484 xor w4,w1,rc // we must move reverse if (rd-rs)<rc
485 mflr ra // save return address
486 rlwinm w5,w1,1,27,30 // w5 <- ((w1 & 0xF) << 1)
487 bcl 20,31,1f // use reserved form to get our location
488 1:
489 mflr w3 // w3 == addr(1b)
490 lis w8,0x0408 // load a 16 element, 2-bit array into w8...
491 cntlzw w4,w4 // find first difference between (rd-rs) and rc
492 addis w2,w3,ha16(LBranchTablePtr-1b)
493 ori w8,w8,0x040C // ...used to map w5 to alignment encoding (ie, to 0-3)
494 lwz w2,lo16(LBranchTablePtr-1b)(w2) // w2 <- branch table address
495 slw w4,rc,w4 // bit 0 of w4 set iff (rd-rs)<rc
496 rlwnm w5,w8,w5,28,29 // put alignment encoding in bits 01100 of w5
497 rlwimi w2,w4,5,27,27 // put reverse bit in bit 10000 of branch table address
498 lwzx w3,w2,w5 // w3 <- load loop address from branch table
499 neg w1,rd // start to compute destination alignment
500 mtctr w3
501 andi. r0,w1,0x1F // r0 <- bytes req'd to 32-byte align dest (if forward move)
502 bctr // NB: r0/cr0 and w1 are passed as parameters
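// In C terms, the dispatch above does roughly this (sketch; "encode" is the packed 2-bit
// table loaded into w8 as 0x0408040C, indexed by the low nibble of the operand difference):
//      unsigned diff    = rd - rs;                      // w1, from the entry point
//      unsigned reverse = (diff < rc);                  // computed via cntlzw/slw on (diff ^ rc)
//      unsigned align   = encode[diff & 0xF];           // 0=unaligned, 1=word, 2=doubleword, 3=quadword
//      void *loop = *(void **)((char *)table + (reverse << 4) + (align << 2));
// The reverse bit can be inserted directly into the table address because the tables are
// 32-byte aligned (the "5 low-order 0s" noted above their definitions).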
503
504
505 // G3, forward, long, unaligned.
506 // w1 = neg(rd)
507
508 LForwardWord:
509 andi. w3,w1,3 // W3 <- #bytes to word-align destination
510 mtlr ra // restore return address
511 sub rc,rc,w3 // adjust count for alignment
512 srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
513 mtctr r0 // prepare loop count
514 beq+ 1f // dest already aligned
515
516 lwz w2,0(rs) // get first 4 bytes of source
517 lwzx w1,w3,rs // get source bytes we might overwrite
518 add rs,rs,w3 // adjust source ptr
519 stw w2,0(rd) // store all 4 bytes to avoid a loop
520 add rd,rd,w3 // word-align destination
521 b 2f
522 1:
523 lwz w1,0(rs)
524 2:
525 lwz w2,4(rs)
526 lwz w3,8(rs)
527 lwz w4,12(rs)
528 lwz w5,16(rs)
529 lwz w6,20(rs)
530 lwz w7,24(rs)
531 lwz w8,28(rs)
532 addi rs,rs,32
533 stw w1,0(rd)
534 stw w2,4(rd)
535 stw w3,8(rd)
536 stw w4,12(rd)
537 stw w5,16(rd)
538 stw w6,20(rd)
539 stw w7,24(rd)
540 stw w8,28(rd)
541 addi rd,rd,32
542 bdnz 1b
543
544 b LShort
545
546
547 // G3, forward, long, word aligned. We use floating pt even when only word aligned.
548 // w1 = neg(rd)
549
550 LForwardFloat:
551 andi. w3,w1,7 // W3 <- #bytes to doubleword-align destination
552 mtlr ra // restore return address
553 sub rc,rc,w3 // adjust count for alignment
554 srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
555 mtctr r0 // prepare loop count
556 beq LForwardFloatLoop // dest already aligned
557
558 lwz w1,0(rs) // get first 8 bytes of source
559 lwz w2,4(rs)
560 lfdx f0,w3,rs // get source bytes we might overwrite
561 add rs,rs,w3 // word-align source ptr
562 stw w1,0(rd) // store all 8 bytes to avoid a loop
563 stw w2,4(rd)
564 add rd,rd,w3
565 b LForwardFloatLoop1
566
567 .align 4 // align since this loop is executed by G4s too
568 LForwardFloatLoop:
569 lfd f0,0(rs)
570 LForwardFloatLoop1: // enter here from LMediumAligned and above
571 lfd f1,8(rs)
572 lfd f2,16(rs)
573 lfd f3,24(rs)
574 addi rs,rs,32
575 stfd f0,0(rd)
576 stfd f1,8(rd)
577 stfd f2,16(rd)
578 stfd f3,24(rd)
579 addi rd,rd,32
580 bdnz LForwardFloatLoop
581
582 b LShort
583
584
585 // G4 Forward, long, 16-byte aligned, 32-byte cache ops, use DCBA and DCBT.
586 // r0/cr0 = #bytes to 32-byte align
587
588 LForwardVecAlig32Dcba:
589 bnel+ LAlign32 // align destination iff necessary
590 bl LPrepareForwardVectors
591 mtlr ra // restore return address before loading c128
592 li c128,128
593 b 1f // enter aligned loop
594
595 .align 5 // long loop heads should be at least 16-byte aligned
596 1: // loop over aligned 64-byte chunks
597 dcbt c96,rs // pre-fetch three cache lines ahead
598 dcbt c128,rs // and four
599 lvx v1,0,rs
600 lvx v2,c16,rs
601 lvx v3,c32,rs
602 lvx v4,c48,rs
603 addi rs,rs,64
604 dcba 0,rd // avoid read of destination cache lines
605 stvx v1,0,rd
606 stvx v2,c16,rd
607 dcba c32,rd
608 stvx v3,c32,rd
609 stvx v4,c48,rd
610 addi rd,rd,64
611 bdnz 1b
612
613 LForwardVectorAlignedEnd: // r0/cr0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
614 beq- 3f // no leftover quadwords
615 mtctr r0
616 2: // loop over remaining quadwords (1-3)
617 lvx v1,0,rs
618 addi rs,rs,16
619 stvx v1,0,rd
620 addi rd,rd,16
621 bdnz 2b
622 3:
623 mtspr VRSave,rv // restore bitmap of live vr's
624 bne cr6,LShort16 // handle last 0-15 bytes if any
625 blr
626
627
628 // G4 Forward, long, 16-byte aligned, 32-byte cache, use DCBT but not DCBA.
629 // r0/cr0 = #bytes to 32-byte align
630
631 LForwardVecAlig32NoDcba:
632 bnel+ LAlign32 // align destination iff necessary
633 bl LPrepareForwardVectors
634 mtlr ra // restore return address before loading c128
635 li c128,128
636 b 1f // enter aligned loop
637
638 .align 4 // balance 13-word loop between QWs...
639 nop // ...which improves performance 5% +/-
640 nop
641 1: // loop over aligned 64-byte chunks
642 dcbt c96,rs // pre-fetch three cache lines ahead
643 dcbt c128,rs // and four
644 lvx v1,0,rs
645 lvx v2,c16,rs
646 lvx v3,c32,rs
647 lvx v4,c48,rs
648 addi rs,rs,64
649 stvx v1,0,rd
650 stvx v2,c16,rd
651 stvx v3,c32,rd
652 stvx v4,c48,rd
653 addi rd,rd,64
654 bdnz 1b
655
656 b LForwardVectorAlignedEnd
657
658
659 // G4 Forward, long, unaligned, 32-byte cache ops, use DCBT and DCBA. At least on
660 // some CPUs, this routine is no slower than the simpler aligned version that does
661 // not use permutes. But it cannot be used with aligned operands, because of the
662 // way it prefetches source QWs.
663 // r0/cr0 = #bytes to 32-byte align
664
665 LForwardVecUnal32Dcba:
666 bnel+ LAlign32 // align destination iff necessary
667 bl LPrepareForwardVectors
668 lvx v1,0,rs // prime loop
669 mtlr ra // restore return address before loading c128
670 lvsl vp,0,rs // get permute vector to shift left
671 li c128,128
672 b 1f // enter aligned loop
673
674 .align 4 // long loop heads should be at least 16-byte aligned
675 1: // loop over aligned 64-byte destination chunks
676 lvx v2,c16,rs
677 dcbt c96,rs // touch 3rd cache line ahead
678 lvx v3,c32,rs
679 dcbt c128,rs // touch 4th cache line ahead
680 lvx v4,c48,rs
681 addi rs,rs,64
682 vperm vx,v1,v2,vp
683 lvx v1,0,rs
684 vperm vy,v2,v3,vp
685 dcba 0,rd // avoid read of destination lines
686 stvx vx,0,rd
687 vperm vz,v3,v4,vp
688 stvx vy,c16,rd
689 dcba c32,rd
690 vperm vx,v4,v1,vp
691 stvx vz,c32,rd
692 stvx vx,c48,rd
693 addi rd,rd,64
694 bdnz 1b
695
696 LForwardVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=next QW, cr7=(rc & F), cr6 set on cr7
697 beq- 3f // no leftover quadwords
698 mtctr r0
699 2: // loop over remaining quadwords
700 lvx v2,c16,rs
701 addi rs,rs,16
702 vperm vx,v1,v2,vp
703 vor v1,v2,v2 // v1 <- v2
704 stvx vx,0,rd
705 addi rd,rd,16
706 bdnz 2b
707 3:
708 mtspr VRSave,rv // restore bitmap of live vr's
709 bne cr6,LShort16 // handle last 0-15 bytes if any
710 blr
711
712
713 // G4 Forward, long, unaligned, 32-byte cache ops, use DCBT but not DCBA.
714 // r0/cr0 = #bytes to 32-byte align
715
716 LForwardVecUnal32NoDcba:
717 bnel+ LAlign32 // align destination iff necessary
718 bl LPrepareForwardVectors
719 lvx v1,0,rs // prime loop
720 mtlr ra // restore return address before loading c128
721 lvsl vp,0,rs // get permute vector to shift left
722 li c128,128
723 b 1f // enter aligned loop
724
725 .align 4
726 nop // balance 17-word loop between QWs
727 nop
728 1: // loop over aligned 64-byte destination chunks
729 lvx v2,c16,rs
730 dcbt c96,rs // touch 3rd cache line ahead
731 lvx v3,c32,rs
732 dcbt c128,rs // touch 4th cache line ahead
733 lvx v4,c48,rs
734 addi rs,rs,64
735 vperm vx,v1,v2,vp
736 lvx v1,0,rs
737 vperm vy,v2,v3,vp
738 stvx vx,0,rd
739 vperm vz,v3,v4,vp
740 stvx vy,c16,rd
741 vperm vx,v4,v1,vp
742 stvx vz,c32,rd
743 stvx vx,c48,rd
744 addi rd,rd,64
745 bdnz 1b
746
747 b LForwardVectorUnalignedEnd
748
749
750 // G3 Reverse, long, unaligned.
751
752 LReverseWord:
753 bl LAlign8Reverse // 8-byte align destination
754 mtlr ra // restore return address
755 srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
756 mtctr r0
757 1:
758 lwz w1,-4(rs)
759 lwz w2,-8(rs)
760 lwz w3,-12(rs)
761 lwz w4,-16(rs)
762 stw w1,-4(rd)
763 lwz w5,-20(rs)
764 stw w2,-8(rd)
765 lwz w6,-24(rs)
766 stw w3,-12(rd)
767 lwz w7,-28(rs)
768 stw w4,-16(rd)
769 lwzu w8,-32(rs)
770 stw w5,-20(rd)
771 stw w6,-24(rd)
772 stw w7,-28(rd)
773 stwu w8,-32(rd)
774 bdnz 1b
775
776 b LShortReverse
777
778
779 // G3 Reverse, long, word aligned.
780
781 LReverseFloat:
782 bl LAlign8Reverse // 8-byte align
783 mtlr ra // restore return address
784 srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
785 mtctr r0
786 1:
787 lfd f0,-8(rs)
788 lfd f1,-16(rs)
789 lfd f2,-24(rs)
790 lfdu f3,-32(rs)
791 stfd f0,-8(rd)
792 stfd f1,-16(rd)
793 stfd f2,-24(rd)
794 stfdu f3,-32(rd)
795 bdnz 1b
796
797 b LShortReverse
798
799
800 // G4 Reverse, long, 16-byte aligned, 32-byte DCBT but no DCBA.
801
802 LReverseVectorAligned32:
803 bl LAlign32Reverse // 32-byte align destination iff necessary
804 bl LPrepareReverseVectors
805 mtlr ra // restore return address before loading cm129
806 li cm129,-129
807 b 1f // enter aligned loop
808
809 .align 4
810 nop // must start in 3rd word of QW...
811 nop // ...to keep balanced
812 1: // loop over aligned 64-byte chunks
813 dcbt cm97,rs // pre-fetch three cache lines ahead
814 dcbt cm129,rs // and four
815 lvx v1,cm1,rs
816 lvx v2,cm17,rs
817 lvx v3,cm33,rs
818 lvx v4,cm49,rs
819 subi rs,rs,64
820 stvx v1,cm1,rd
821 stvx v2,cm17,rd
822 stvx v3,cm33,rd
823 stvx v4,cm49,rd
824 subi rd,rd,64
825 bdnz 1b
826
827 LReverseVectorAlignedEnd: // cr0/r0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
828 beq 3f // no leftover quadwords
829 mtctr r0
830 2: // loop over 1-3 quadwords
831 lvx v1,cm1,rs
832 subi rs,rs,16
833 stvx v1,cm1,rd
834 subi rd,rd,16
835 bdnz 2b
836 3:
837 mtspr VRSave,rv // restore bitmap of live vr's
838 bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
839 blr
840
841
842 // G4 Reverse, long, unaligned, 32-byte DCBT.
843
844 LReverseVectorUnal32:
845 bl LAlign32Reverse // align destination iff necessary
846 bl LPrepareReverseVectors
847 lvx v1,cm1,rs // prime loop
848 mtlr ra // restore return address before loading cm129
849 lvsl vp,0,rs // get permute vector to shift left
850 li cm129,-129
851 b 1f // enter aligned loop
852
853 .align 4
854 nop // start loop in 3rd word on QW to balance
855 nop
856 1: // loop over aligned 64-byte destination chunks
857 lvx v2,cm17,rs
858 dcbt cm97,rs // touch in 3rd source block
859 lvx v3,cm33,rs
860 dcbt cm129,rs // touch in 4th
861 lvx v4,cm49,rs
862 subi rs,rs,64
863 vperm vx,v2,v1,vp
864 lvx v1,cm1,rs
865 vperm vy,v3,v2,vp
866 stvx vx,cm1,rd
867 vperm vz,v4,v3,vp
868 stvx vy,cm17,rd
869 vperm vx,v1,v4,vp
870 stvx vz,cm33,rd
871 stvx vx,cm49,rd
872 subi rd,rd,64
873 bdnz 1b
874
875 LReverseVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=source QW, cr7=low 4 bits of rc, cr6 set on cr7
876 beq 3f // no leftover quadwords
877 mtctr r0
878 2: // loop over 1-3 quadwords
879 lvx v2,cm17,rs
880 subi rs,rs,16
881 vperm vx,v2,v1,vp
882 vor v1,v2,v2 // v1 <- v2
883 stvx vx,cm1,rd
884 subi rd,rd,16
885 bdnz 2b
886 3:
887 mtspr VRSave,rv // restore bitmap of live vr's
888 bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
889 blr
890
891
892 // Subroutine to prepare for 64-byte forward vector loops.
893 // Returns many things:
894 // ctr = number of 64-byte chunks to move
895 // r0/cr0 = leftover QWs to move
896 // cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
897 // cr6 = beq if leftover byte count is 0
898 // c16..c96 loaded
899 // rv = original value of VRSave
900 // NB: c128 is not set (even if needed), since its register (r11) still holds "ra"
901
902 LPrepareForwardVectors:
903 mfspr rv,VRSave // get bitmap of live vector registers
904 srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
905 oris w1,rv,0xFF00 // we use v0-v7
906 mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShort16
907 rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
908 mtspr VRSave,w1 // update mask
909 li c16,16 // get constants used in lvx/stvx
910 li c32,32
911 mtctr r0 // set up loop count
912 cmpwi cr6,w3,0 // set cr6 on leftover byte count
913 li c48,48
914 li c96,96
915 rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
916 blr
917
918
919 // Subroutine to prepare for 64-byte reverse vector loops.
920 // Returns many things:
921 // ctr = number of 64-byte chunks to move
922 // r0/cr0 = leftover QWs to move
923 // cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
924 // cr6 = beq if leftover byte count is 0
925 // cm1..cm97 loaded
926 // rv = original value of VRSave
927 // NB: cm129 is not set (even if needed), since its register (r11) still holds "ra"
928
929 LPrepareReverseVectors:
930 mfspr rv,VRSave // get bitmap of live vector registers
931 srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
932 oris w1,rv,0xFF00 // we use v0-v7
933 mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShortReverse16
934 rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
935 mtspr VRSave,w1 // update mask
936 li cm1,-1 // get constants used in lvx/stvx
937 li cm17,-17
938 mtctr r0 // set up loop count
939 cmpwi cr6,w3,0 // set cr6 on leftover byte count
940 li cm33,-33
941 li cm49,-49
942 rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
943 li cm97,-97
944 blr
945
946
947 // Subroutine to align destination on a 32-byte boundary.
948 // r0 = number of bytes to xfer (0-31)
949
950 LAlign32:
951 mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
952 mtcrf 0x02,r0
953 sub rc,rc,r0 // adjust length
954 bf 31,1f // skip if no odd bit
955 lbz w1,0(rs)
956 addi rs,rs,1
957 stb w1,0(rd)
958 addi rd,rd,1
959 1:
960 bf 30,2f // halfword to move?
961 lhz w1,0(rs)
962 addi rs,rs,2
963 sth w1,0(rd)
964 addi rd,rd,2
965 2:
966 bf 29,3f // word?
967 lwz w1,0(rs)
968 addi rs,rs,4
969 stw w1,0(rd)
970 addi rd,rd,4
971 3:
972 bf 28,4f // doubleword?
973 lwz w1,0(rs)
974 lwz w2,4(rs)
975 addi rs,rs,8
976 stw w1,0(rd)
977 stw w2,4(rd)
978 addi rd,rd,8
979 4:
980 bflr 27 // done if no quadword to move
981 lwz w1,0(rs)
982 lwz w2,4(rs)
983 lwz w3,8(rs)
984 lwz w4,12(rs)
985 addi rs,rs,16
986 stw w1,0(rd)
987 stw w2,4(rd)
988 stw w3,8(rd)
989 stw w4,12(rd)
990 addi rd,rd,16
991 blr
992
993 // Subroutine to align destination if necessary on a 32-byte boundary for reverse moves.
994 // rs and rd still point to low end of operands
995 // we adjust rs and rd to point to last byte moved
996
997 LAlign32Reverse:
998 add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
999 add rs,rs,rc
1000 andi. r0,rd,0x1F // r0 <- #bytes that must be moved to align destination
1001 mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
1002 mtcrf 0x02,r0
1003 sub rc,rc,r0 // update length
1004 beqlr- // destination already 32-byte aligned
1005
1006 bf 31,1f // odd byte?
1007 lbzu w1,-1(rs)
1008 stbu w1,-1(rd)
1009 1:
1010 bf 30,2f // halfword to move?
1011 lhzu w1,-2(rs)
1012 sthu w1,-2(rd)
1013 2:
1014 bf 29,3f // word?
1015 lwzu w1,-4(rs)
1016 stwu w1,-4(rd)
1017 3:
1018 bf 28,4f // doubleword?
1019 lwz w1,-4(rs)
1020 lwzu w2,-8(rs)
1021 stw w1,-4(rd)
1022 stwu w2,-8(rd)
1023 4:
1024 bflr 27 // done if no quadwords
1025 lwz w1,-4(rs)
1026 lwz w2,-8(rs)
1027 lwz w3,-12(rs)
1028 lwzu w4,-16(rs)
1029 stw w1,-4(rd)
1030 stw w2,-8(rd)
1031 stw w3,-12(rd)
1032 stwu w4,-16(rd)
1033 blr
1034
1035
1036 // Subroutine to align destination on an 8-byte boundary for reverse moves.
1037 // rs and rd still point to low end of operands
1038 // we adjust rs and rd to point to last byte moved
1039
1040 LAlign8Reverse:
1041 add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
1042 add rs,rs,rc
1043 andi. r0,rd,0x7 // r0 <- #bytes that must be moved to align destination
1044 beqlr- // destination already 8-byte aligned
1045 mtctr r0 // set up for loop
1046 sub rc,rc,r0 // update length
1047 1:
1048 lbzu w1,-1(rs)
1049 stbu w1,-1(rd)
1050 bdnz 1b
1051
1052 blr
1053
1054
1055 // Called by pthread initialization to set up the branch table pointer based on
1056 // the CPU capability vector. This routine may be called more than once (for
1057 // example, during testing.)
1058
1059 // Size of the buffer we use to do DCBA timing on G4:
1060 #define kBufSiz 1024
1061
1062 // Stack frame size, which contains the 128-byte-aligned buffer:
1063 #define kSFSize (kBufSiz+128+16)
1064
1065 // Iterations of the timing loop:
1066 #define kLoopCnt 5
1067
1068 // Bit in cr5 used as a flag in timing loop:
1069 #define kDCBA 22
1070
1071 __bcopy_initialize: // int _bcopy_initialize(void)
1072 mflr ra // get return
1073 stw ra,8(r1) // save
1074 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack
1075 addi w6,r1,127+16 // get base address...
1076 rlwinm w6,w6,0,0,24 // ...of our buffer, 128-byte aligned
1077 bcl 20,31,1f // get our PIC base
1078 1:
1079 mflr w1
1080 addis w2,w1,ha16(__cpu_capabilities - 1b)
1081 lwz w3,lo16(__cpu_capabilities - 1b)(w2)
1082 andi. r0,w3,kUseDcba+kNoDcba+kCache32+k64Bit+kHasAltivec
1083 cmpwi r0,kCache32+kHasAltivec // untyped G4?
1084 li w8,0 // assume no need to test
1085 bne 2f // not an untyped G4, so do not test
1086
1087 // G4, but neither kUseDcba nor kNoDcba is set. Time and select the fastest.
1088
1089 crset kDCBA // first, use DCBA
1090 bl LTest32 // time it
1091 mr w8,w4 // w8 <- best time using DCBA
1092 srwi r0,w8,3 // bias 12 pct in favor of not using DCBA...
1093 add w8,w8,r0 // ...because DCBA is always slower with warm cache
1094 crclr kDCBA
1095 bl LTest32 // w4 <- best time without DCBA
1096 cmplw w8,w4 // which is better?
1097 li w8,kUseDcba // assume using DCBA is faster
1098 blt 2f
1099 li w8,kNoDcba // no DCBA is faster
1100
1101 // What branch table to use?
1102
1103 2: // here with w8 = 0, kUseDcba, or kNoDcba
1104 bcl 20,31,4f // get our PIC base again
1105 4:
1106 mflr w1
1107 addis w2,w1,ha16(__cpu_capabilities - 4b)
1108 lwz w3,lo16(__cpu_capabilities - 4b)(w2)
1109 or w3,w3,w8 // add in kUseDcba or kNoDcba if untyped G4
1110 mr r3,w8 // return dynamic selection, if any (used in testing)
1111
1112 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
1113 cmpwi r0,kHasAltivec+kCache32+kUseDcba // G4 with DCBA?
1114 addis w4,w1,ha16(LG4UseDcba - 4b)
1115 addi w4,w4,lo16(LG4UseDcba - 4b)
1116 beq 5f
1117
1118 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
1119 cmpwi r0,kHasAltivec+kCache32+kNoDcba // G4 without DCBA?
1120 addis w4,w1,ha16(LG4NoDcba - 4b)
1121 addi w4,w4,lo16(LG4NoDcba - 4b)
1122 beq 5f
1123
1124 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32
1125 cmpwi r0,kCache32 // G3?
1126 addis w4,w1,ha16(LG3 - 4b)
1127 addi w4,w4,lo16(LG3 - 4b)
1128 beq 5f
1129
1130 // Map unrecognized CPU types to G3 (lowest common denominator)
1131
1132 5: // w4 <- branch table pointer
1133 addis w5,w1,ha16(LBranchTablePtr - 4b)
1134 stw w4,lo16(LBranchTablePtr - 4b)(w5)
1135 lwz ra,kSFSize+8(r1) // recover return address
1136 mtlr ra // restore it
1137 lwz r1,0(r1) // pop off our stack frame
1138 blr // return dynamic selection (or 0) in r3
1139
1140
1141 // Subroutine to time a 32-byte cache.
1142 // kDCBA = set if we should use DCBA
1143 // w6 = base of buffer to use for test (kBufSiz bytes)
1144 // w4 = we return time of fastest loop in w4
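// The routine makes kLoopCnt passes over the buffer.  Each pass first flushes the buffer
// from the cache with DCBF and a SYNC, then stores every 32-byte block (preceded by a DCBA
// iff kDCBA is set) while reading the timebase before and after.  A pass is retried if the
// upper timebase word carried, and only the fastest pass is kept.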
1145
1146 LTest32:
1147 li w1,kLoopCnt // number of times to loop
1148 li w4,-1 // initialize fastest time
1149 1:
1150 mr rd,w6 // initialize buffer ptr
1151 li r0,kBufSiz/32 // r0 <- cache blocks to test
1152 mtctr r0
1153 2:
1154 dcbf 0,rd // first, force the blocks out of the cache
1155 addi rd,rd,32
1156 bdnz 2b
1157 sync // make sure all the flushes take
1158 mr rd,w6 // re-initialize buffer ptr
1159 mtctr r0 // reset cache-block count
1160 mftbu w5 // remember upper half so we can check for carry
1161 mftb w2 // start the timer
1162 3: // loop over cache blocks
1163 bf kDCBA,4f // should we DCBA?
1164 dcba 0,rd
1165 4:
1166 stfd f1,0(rd) // store the entire cache block
1167 stfd f1,8(rd)
1168 stfd f1,16(rd)
1169 stfd f1,24(rd)
1170 addi rd,rd,32
1171 bdnz 3b
1172 mftb w3
1173 mftbu r0
1174 cmpw r0,w5 // did timebase carry?
1175 bne 1b // yes, retest rather than fuss
1176 sub w3,w3,w2 // w3 <- time for this loop
1177 cmplw w3,w4 // faster than current best?
1178 bge 5f // no
1179 mr w4,w3 // remember fastest time through loop
1180 5:
1181 subi w1,w1,1 // decrement outer loop count
1182 cmpwi w1,0 // more to go?
1183 bne 1b // loop if so
1184 blr
1185
1186 #endif /* 0 */