1 /*
2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* =======================================
23 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
24 * =======================================
25 *
26 * Version of 6/17/2002, for G3, G4, and G4+.
27 *
28 * There are many paths through this code, depending on length, reverse/forward,
29 * processor type, and alignment. We use reverse paths only when the operands
30 * overlap and the destination is higher than the source. They are not quite as
31 * fast as the forward paths.
32 *
33 * Judicious use of DCBTs, just far enough ahead to minimize waiting, is critical in
34 * the inner loops for long operands. DST is less effective than DCBT, because it
35 * can get out of sync with the inner loop. DCBTST is usually not a win, so we
36 * don't use it except during initialization when we're not using the LSU.
37 * We don't DCBT on G3, which only handles one load miss at a time.
38 *
39 * We don't use DCBZ, because it takes an alignment exception on uncached memory
40 * like frame buffers. Bcopy to frame buffers must work. This hurts G3 in the
41 * cold-cache case, but G4 can use DCBA (which does not take alignment exceptions.)
42 *
43 * Using DCBA on G4 is a tradeoff. For the cold-cache case it can be a big win,
44 * since it avoids the read of destination cache lines. But for the hot-cache case
45 * it is always slower, because of the cycles spent needlessly zeroing data. Some
46 * machines store-gather and can cancel the read if all bytes of a line are stored,
47 * others cannot. Unless explicitly told which is better, we time loops with and
48 * without DCBA and use the fastest. Note that we never DCBA in reverse loops,
49 * since by definition they are overlapped so dest lines will be in the cache.
50 *
51 * For longer operands we use an 8-element branch table, based on the CPU type,
52 * to select the appropriate inner loop.  The branch table is indexed as follows
53 * (a C sketch of this selection appears just after this comment):
54 *    bit  10000 set if a Reverse move is required
55 *    bits 01100 encode the relative operand alignment: 0=unaligned, 1=word,
56 *               2=doubleword, and 3=quadword.
57 *
58 * By "relatively" n-byte aligned, we mean the source and destination are a multiple
59 * of n bytes apart (they need not be absolutely aligned.)
60 *
61 * The branch table for the running CPU type is pointed to by LBranchTablePtr.
62 * Initially, LBranchTablePtr points to G3's table, since that is the lowest
63 * common denominator that will run on any CPU. Later, pthread initialization
64 * sets up the _cpu_capabilities vector and calls _bcopy_initialize, which sets
65 * up the correct pointer for the running CPU.
66 *
67 * We distinguish between "short", "medium", and "long" operands:
68 * short (<= 32 bytes) most common case, minimum path length is important
69 * medium (> 32, < kLong) too short for Altivec or use of cache ops like DCBA
70 * long (>= kLong) long enough for cache ops and to amortize use of Altivec
71 *
72 * WARNING: kLong must be >=96: the long-operand paths assume at least one 64-byte
73 * chunk remains after 32-byte aligning the destination (up to 31 bytes). */
74 #define kLong 96
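
// A minimal C sketch of the branch-table index selection described above: entries 0-3
// are the forward loops and 4-7 the reverse loops. Names here are hypothetical and the
// code is illustrative only; the real computation is done branchlessly in LLong below
// with cntlzw/slw/rlwnm/rlwimi.
//
//	#include <stddef.h>
//	#include <stdint.h>
//
//	static int branch_index(const void *src, void *dst, size_t len) {
//	    uint32_t diff  = (uint32_t)((uintptr_t)dst - (uintptr_t)src);
//	    int      rev   = diff < (uint32_t)len;    // overlap with dst above src: go reverse
//	    uint32_t rel   = diff & 0xF;               // relative (not absolute) alignment
//	    int      align = (rel == 0)       ? 3      // 16-byte (quadword) apart
//	                   : ((rel & 7) == 0) ? 2      // 8-byte (doubleword)
//	                   : ((rel & 3) == 0) ? 1      // 4-byte (word)
//	                   :                    0;     // unaligned
//	    return (rev << 2) | align;                 // index 0-7 into the tables below
//	}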
75
76 /* Register usage. Note we use R2, so this code will not run in a PEF/CFM
77 * environment. Note also the rather delicate way we assign multiple uses
78 * to the same register. Beware.
79 *
80 * r0 = "w7" or "r0" (NB: cannot use r0 for any constant such as "c16")
81 * r2 = "w8" or VRSave ("rv")
82 * r3 = not used, as memcpy and memmove return 1st parameter as a value
83 * r4 = source ptr ("rs")
84 * r5 = count of bytes to move ("rc")
85 * r6 = "w1", "c16", or "cm17"
86 * r7 = "w2", "c32", or "cm33"
87 * r8 = "w3", "c48", or "cm49"
88 * r9 = "w4", "c64", or "cm1"
89 * r10 = "w5", "c96", or "cm97"
90 * r11 = "w6", "c128", "cm129", or return address ("ra")
91 * r12 = destination ptr ("rd")
92 * f0-f8 = used for moving 8-byte aligned data
93 * v0 = permute vector ("vp")
94 * v1-v4 = qw's loaded from source ("v1", "v2", "v3", and "v4")
95 * v5-v7 = permuted qw's ("vx", "vy", and "vz")
96 */
97 #define rs r4
98 #define rd r12
99 #define rc r5
100 #define ra r11
101 #define rv r2
102
103 #define w1 r6
104 #define w2 r7
105 #define w3 r8
106 #define w4 r9
107 #define w5 r10
108 #define w6 r11
109 #define w7 r0
110 #define w8 r2
111
112 #define c16 r6
113 #define cm17 r6
114 #define c32 r7
115 #define cm33 r7
116 #define c48 r8
117 #define cm49 r8
118 #define c64 r9
119 #define cm1 r9
120 #define c96 r10
121 #define cm97 r10
122 #define c128 r11
123 #define cm129 r11
124
125 #define vp v0
126 #define vx v5
127 #define vy v6
128 #define vz v7
129
130 #define VRSave 256
131
132 #include <architecture/ppc/asm_help.h>
133
134 // The branch tables, 8 entries per CPU type.
135 // NB: we depend on 5 low-order 0s in each table's address, since LLong inserts the reverse bit directly into it.
136
137 .data
138 .align 5 // must be 32-byte aligned
139
140 // G3 (the default CPU type)
141
142 LG3:
143 .long LForwardWord // 000: forward, unaligned
144 .long LForwardFloat // 001: forward, 4-byte aligned
145 .long LForwardFloat // 010: forward, 8-byte aligned
146 .long LForwardFloat // 011: forward, 16-byte aligned
147 .long LReverseWord // 100: reverse, unaligned
148 .long LReverseFloat // 101: reverse, 4-byte aligned
149 .long LReverseFloat // 110: reverse, 8-byte aligned
150 .long LReverseFloat // 111: reverse, 16-byte aligned
151
152 // G4s that benefit from DCBA.
153
154 LG4UseDcba:
155 .long LForwardVecUnal32Dcba // 000: forward, unaligned
156 .long LForwardVecUnal32Dcba // 001: forward, 4-byte aligned
157 .long LForwardVecUnal32Dcba // 010: forward, 8-byte aligned
158 .long LForwardVecAlig32Dcba // 011: forward, 16-byte aligned
159 .long LReverseVectorUnal32 // 100: reverse, unaligned
160 .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
161 .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
162 .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
163
164 // G4s that should not use DCBA.
165
166 LG4NoDcba:
167 .long LForwardVecUnal32NoDcba // 000: forward, unaligned
168 .long LForwardVecUnal32NoDcba // 001: forward, 4-byte aligned
169 .long LForwardVecUnal32NoDcba // 010: forward, 8-byte aligned
170 .long LForwardVecAlig32NoDcba // 011: forward, 16-byte aligned
171 .long LReverseVectorUnal32 // 100: reverse, unaligned
172 .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
173 .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
174 .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
175
176
177 // Pointer to the 8-element branch table for running CPU type:
178
179 LBranchTablePtr:
180 .long LG3 // default to G3 until "bcopy_initialize" called
181
182
183 // The CPU capability vector, initialized in pthread_init().
184 // "_bcopy_initialize" uses this to set up LBranchTablePtr:
185
186 .globl __cpu_capabilities
187 __cpu_capabilities:
188 .long 0
189
190 // Bit definitions for _cpu_capabilities:
191
192 #define kHasAltivec 0x01
193 #define k64Bit 0x02
194 #define kCache32 0x04
195 #define kCache64 0x08
196 #define kCache128 0x10
197 #define kUseDcba 0x20
198 #define kNoDcba 0x40
199
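// How __bcopy_initialize (below) consumes these bits, paraphrased in C. Illustrative
// only: the helper names are hypothetical and the constants are assumed to mirror the
// #defines above.
//
//	static int is_untyped_g4(unsigned caps) {      // G4 with no DCBA preference recorded
//	    return (caps & (kUseDcba | kNoDcba | kCache32 | k64Bit | kHasAltivec))
//	                == (kCache32 | kHasAltivec);
//	}
//	static int is_g4_with_dcba(unsigned caps) {
//	    return (caps & (kHasAltivec | k64Bit | kCache128 | kCache64 | kCache32 | kUseDcba | kNoDcba))
//	                == (kHasAltivec | kCache32 | kUseDcba);
//	}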
200
201 .text
202 .globl _bcopy
203 .globl _memcpy
204 .globl _memmove
205 .globl __bcopy_initialize
206
207
208 // Main entry points.
209
210 .align 5
211 _bcopy: // void bcopy(const void *src, void *dst, size_t len)
212 mr r10,r3 // reverse source and dest ptrs, to be like memcpy
213 mr r3,r4
214 mr r4,r10
215 _memcpy: // void* memcpy(void *dst, const void *src, size_t len)
216 _memmove: // void* memmove(void *dst, const void *src, size_t len)
217 cmplwi cr7,rc,32 // length <= 32 bytes?
218 sub. w1,r3,rs // must move in reverse if (rd-rs)<rc, set cr0 on src==dst
219 dcbt 0,rs // touch in the first line of source
220 cmplw cr6,w1,rc // set cr6 blt iff we must move reverse
221 cmplwi cr1,rc,kLong-1 // set cr1 bgt if long
222 mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
223 bgt- cr7,LMedium // longer than 32 bytes
224 dcbtst 0,rd // touch in destination
225 beq- cr7,LMove32 // special case moves of 32 bytes
226 blt- cr6,LShortReverse0
227
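// Illustrative only: the argument-order difference that the register swap in _bcopy
// handles. bcopy() takes (src, dst, len) while memcpy()/memmove() take (dst, src, len);
// all three funnel into the common code above.
//
//	#include <string.h>
//	#include <strings.h>
//
//	int main(void) {
//	    char a[8] = "example", b[8];
//	    bcopy(a, b, sizeof b);       // bcopy(src, dst, len)
//	    memcpy(b, a, sizeof b);      // memcpy(dst, src, len)
//	    memmove(b, a, sizeof b);     // memmove(dst, src, len)
//	    return 0;
//	}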
228 // Forward short operands. This is the most frequent case, so it is inline.
229 // We also end up here to xfer the last 0-31 bytes of longer operands.
230
231 LShort: // WARNING: can fall into this routine
232 andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
233 mtcrf 0x01,rc // move rest of length to cr7
234 beq 1f // quadword to move?
235 lwz w1,0(rs)
236 lwz w2,4(rs)
237 lwz w3,8(rs)
238 lwz w4,12(rs)
239 addi rs,rs,16
240 stw w1,0(rd)
241 stw w2,4(rd)
242 stw w3,8(rd)
243 stw w4,12(rd)
244 addi rd,rd,16
245 1:
246 LShort16: // join here to xfer 0-15 bytes
247 bf 28,2f // doubleword?
248 lwz w1,0(rs)
249 lwz w2,4(rs)
250 addi rs,rs,8
251 stw w1,0(rd)
252 stw w2,4(rd)
253 addi rd,rd,8
254 2:
255 bf 29,3f // word?
256 lwz w1,0(rs)
257 addi rs,rs,4
258 stw w1,0(rd)
259 addi rd,rd,4
260 3:
261 bf 30,4f // halfword to move?
262 lhz w1,0(rs)
263 addi rs,rs,2
264 sth w1,0(rd)
265 addi rd,rd,2
266 4:
267 bflr 31 // skip if no odd byte
268 lbz w1,0(rs)
269 stb w1,0(rd)
270 blr
271
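// The short path above peels the remaining 0-31 bytes off by testing individual bits
// of the length: bit 27 selects a 16-byte piece, and cr7 bits 28-31 select 8/4/2/1-byte
// pieces. A rough C equivalent (the helper name is hypothetical):
//
//	#include <stddef.h>
//	#include <string.h>
//
//	static void copy_tail(unsigned char *dst, const unsigned char *src, size_t len) {
//	    // len < 32
//	    if (len & 16) { memcpy(dst, src, 16); src += 16; dst += 16; }
//	    if (len & 8)  { memcpy(dst, src, 8);  src += 8;  dst += 8;  }
//	    if (len & 4)  { memcpy(dst, src, 4);  src += 4;  dst += 4;  }
//	    if (len & 2)  { memcpy(dst, src, 2);  src += 2;  dst += 2;  }
//	    if (len & 1)  { *dst = *src; }
//	}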
272
273 // Handle short reverse operands, up to 31 bytes in length.
274 // This is also used to transfer the last 0-31 bytes of longer operands.
275
276 LShortReverse0:
277 add rs,rs,rc // adjust ptrs for reverse move
278 add rd,rd,rc
279 LShortReverse:
280 andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
281 mtcrf 0x01,rc // move rest of length to cr7
282 beq 1f // quadword to move?
283 lwz w1,-4(rs)
284 lwz w2,-8(rs)
285 lwz w3,-12(rs)
286 lwzu w4,-16(rs)
287 stw w1,-4(rd)
288 stw w2,-8(rd)
289 stw w3,-12(rd)
290 stwu w4,-16(rd)
291 1:
292 LShortReverse16: // join here to xfer 0-15 bytes and return
293 bf 28,2f // doubleword?
294 lwz w1,-4(rs)
295 lwzu w2,-8(rs)
296 stw w1,-4(rd)
297 stwu w2,-8(rd)
298 2:
299 bf 29,3f // word?
300 lwzu w1,-4(rs)
301 stwu w1,-4(rd)
302 3:
303 bf 30,4f // halfword to move?
304 lhzu w1,-2(rs)
305 sthu w1,-2(rd)
306 4:
307 bflr 31 // done if no odd byte
308 lbz w1,-1(rs) // no update
309 stb w1,-1(rd)
310 blr
311
312
313 // Special case for 32-byte moves. Too long for LShort, too common for LMedium.
314
315 LMove32:
316 lwz w1,0(rs)
317 lwz w2,4(rs)
318 lwz w3,8(rs)
319 lwz w4,12(rs)
320 lwz w5,16(rs)
321 lwz w6,20(rs)
322 lwz w7,24(rs)
323 lwz w8,28(rs)
324 stw w1,0(rd)
325 stw w2,4(rd)
326 stw w3,8(rd)
327 stw w4,12(rd)
328 stw w5,16(rd)
329 stw w6,20(rd)
330 stw w7,24(rd)
331 stw w8,28(rd)
332 LExit:
333 blr
334
335
336 // Medium length operands (32 < rc < kLong.) These loops run on all CPUs, as the
337 // operands are not long enough to bother with the branch table, using cache ops, or
338 // Altivec. We word align the source rather than the dest (which is what we do for long
339 // operands): it is faster on G4+ and probably beyond, we never DCBA on medium-length
340 // operands anyway, and the chance to cancel reads of dest cache lines is limited.
341 // w1 = (rd-rs), used to check for alignment
342 // cr0 = set on (rd-rs)
343 // cr1 = bgt if long operand
344 // cr6 = blt if reverse move
345
346 LMedium:
347 dcbtst 0,rd // touch in 1st line of destination
348 rlwinm r0,w1,0,29,31 // r0 <- ((rd-rs) & 7), ie 0 if doubleword aligned
349 beq- LExit // early exit if (rs==rd), avoiding use of "beqlr"
350 neg w2,rs // we align source, not dest, and assume forward
351 cmpwi cr5,r0,0 // set cr5 beq if doubleword aligned
352 bgt- cr1,LLong // handle long operands
353 andi. w3,w2,3 // W3 <- #bytes to word-align source
354 blt- cr6,LMediumReverse // handle reverse move
355 lwz w1,0(rs) // pre-fetch first 4 bytes of source
356 beq- cr5,LMediumAligned // operands are doubleword aligned
357 sub rc,rc,w3 // adjust count for alignment
358 mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShort16
359 srwi w4,rc,4 // w4 <- number of 16-byte chunks to xfer (>=1)
360 mtctr w4 // prepare loop count
361 beq+ 2f // source already aligned
362
363 lwzx w2,w3,rs // get 1st aligned word (which we might partially overwrite)
364 add rs,rs,w3 // word-align source ptr
365 stw w1,0(rd) // store all (w3) bytes at once to avoid a loop
366 add rd,rd,w3
367 mr w1,w2 // first aligned word to w1
368 b 2f
369
370 .align 4 // align inner loops
371 1: // loop over 16-byte chunks
372 lwz w1,0(rs)
373 2:
374 lwz w2,4(rs)
375 lwz w3,8(rs)
376 lwz w4,12(rs)
377 addi rs,rs,16
378 stw w1,0(rd)
379 stw w2,4(rd)
380 stw w3,8(rd)
381 stw w4,12(rd)
382 addi rd,rd,16
383 bdnz 1b
384
385 b LShort16
386
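// The source-alignment trick used above, sketched in C. Hypothetical helpers: load32 and
// store32 stand for plain 4-byte loads and stores. The first aligned source word is
// fetched *before* the 4-byte store because, for a forward overlapping move, that store
// can land on top of it.
//
//	#include <stdint.h>
//	#include <string.h>
//
//	static uint32_t load32(const void *p)         { uint32_t v; memcpy(&v, p, 4); return v; }
//	static void     store32(void *p, uint32_t v)  { memcpy(p, &v, 4); }
//
//	static void align_source_word(unsigned char **dstp, const unsigned char **srcp,
//	                              uint32_t *firstp) {    // *firstp = first 4 source bytes
//	    const unsigned char *src = *srcp;
//	    unsigned char       *dst = *dstp;
//	    size_t k = (0 - (uintptr_t)src) & 3;      // 0-3 bytes needed to word-align src
//	    uint32_t aligned = load32(src + k);       // read before the store below
//	    store32(dst, *firstp);                    // covers the k alignment bytes at once
//	    *srcp   = src + k;                        // both pointers advance by k
//	    *dstp   = dst + k;
//	    *firstp = aligned;                        // plays the role of w1 in the main loop
//	}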
387
388 // Medium, doubleword aligned. We use floating point. Note that G4+ has bigger latencies
389 // and reduced throughput for floating pt loads and stores; future processors will probably
390 // have even worse lfd/stfd performance. We use it here because it is so important for G3,
391 // and not slower for G4+. But we only do so for doubleword aligned operands, whereas the
392 // G3-only long operand loops use floating pt even for word-aligned operands.
393 // w2 = neg(rs)
394 // w1 = first 4 bytes of source
395
396 LMediumAligned:
397 andi. w3,w2,7 // already aligned?
398 sub rc,rc,w3 // adjust count by 0-7 bytes
399 lfdx f0,rs,w3 // pre-fetch first aligned source doubleword
400 srwi w4,rc,5 // get count of 32-byte chunks (might be 0 if unaligned)
401 mtctr w4
402 beq- LForwardFloatLoop1 // already aligned
403
404 cmpwi w4,0 // are there any 32-byte chunks to xfer?
405 lwz w2,4(rs) // get 2nd (unaligned) source word
406 add rs,rs,w3 // doubleword align source pointer
407 stw w1,0(rd) // store first 8 bytes of source to align...
408 stw w2,4(rd) // ...which could overwrite source
409 add rd,rd,w3 // doubleword align destination
410 bne+ LForwardFloatLoop1 // at least 1 chunk, so enter loop
411
412 subi rc,rc,8 // unfortunate degenerate case: no chunks to xfer
413 stfd f0,0(rd) // must store f0 since source might have been overwritten
414 addi rs,rs,8
415 addi rd,rd,8
416 b LShort
417
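// Why floating point: on 32-bit PowerPC the FPRs are the only 8-byte registers short of
// Altivec, so lfd/stfd move twice as much per instruction as lwz/stw. A hedged C analogue
// of the 32-bytes-per-iteration lfd/stfd loop this path feeds into (hypothetical helper;
// assumes both pointers are 8-byte aligned):
//
//	#include <stddef.h>
//	#include <stdint.h>
//
//	static void copy_32byte_chunks(uint64_t *dst, const uint64_t *src, size_t nchunks) {
//	    while (nchunks--) {
//	        uint64_t a = src[0], b = src[1], c = src[2], d = src[3];
//	        dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d;
//	        src += 4; dst += 4;
//	    }
//	}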
418
419 // Medium reverse moves. This loop runs on all processors.
420
421 LMediumReverse:
422 add rs,rs,rc // point to other end of operands when in reverse
423 add rd,rd,rc
424 andi. w3,rs,3 // w3 <- #bytes to word align source
425 lwz w1,-4(rs) // pre-fetch 1st 4 bytes of source
426 sub rc,rc,w3 // adjust count
427 srwi w4,rc,4 // get count of 16-byte chunks (>=1)
428 mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShortReverse16
429 mtctr w4 // prepare loop count
430 beq+ 2f // source already aligned
431
432 sub rs,rs,w3 // word-align source ptr
433 lwz w2,-4(rs) // get 1st aligned word which we may overwrite
434 stw w1,-4(rd) // store all 4 bytes to align without a loop
435 sub rd,rd,w3
436 mr w1,w2 // shift 1st aligned source word to w1
437 b 2f
438
439 1:
440 lwz w1,-4(rs)
441 2:
442 lwz w2,-8(rs)
443 lwz w3,-12(rs)
444 lwzu w4,-16(rs)
445 stw w1,-4(rd)
446 stw w2,-8(rd)
447 stw w3,-12(rd)
448 stwu w4,-16(rd)
449 bdnz 1b
450
451 b LShortReverse16
452
453
454 // Long operands. Use branch table to decide which loop to use.
455 // w1 = (rd-rs), used to determine alignment
456
457 LLong:
458 xor w4,w1,rc // we must move reverse if (rd-rs)<rc
459 mflr ra // save return address
460 rlwinm w5,w1,1,27,30 // w5 <- ((w1 & 0xF) << 1)
461 bcl 20,31,1f // use reserved form to get our location
462 1:
463 mflr w3 // w3 == addr(1b)
464 lis w8,0x0408 // load a 16 element, 2-bit array into w8...
465 cntlzw w4,w4 // find first difference between (rd-rs) and rc
466 addis w2,w3,ha16(LBranchTablePtr-1b)
467 ori w8,w8,0x040C // ...used to map w5 to alignment encoding (ie, to 0-3)
468 lwz w2,lo16(LBranchTablePtr-1b)(w2) // w2 <- branch table address
469 slw w4,rc,w4 // bit 0 of w4 set iff (rd-rs)<rc
470 rlwnm w5,w8,w5,28,29 // put alignment encoding in bits 01100 of w5
471 rlwimi w2,w4,5,27,27 // put reverse bit in bit 10000 of branch table address
472 lwzx w3,w2,w5 // w3 <- load loop address from branch table
473 neg w1,rd // start to compute destination alignment
474 mtctr w3
475 andi. r0,w1,0x1F // r0 <- bytes req'd to 32-byte align dest (if forward move)
476 bctr // NB: r0/cr0 and w1 are passed as parameters
477
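// The reverse-move test above is branchless: cntlzw finds the most significant bit where
// (rd-rs) and rc differ, and shifting rc left by that count moves rc's bit at that
// position into the MSB, which is 1 exactly when (rd-rs) < rc as unsigned integers.
// A hedged C sketch (hypothetical name; __builtin_clz stands in for cntlzw):
//
//	#include <stdint.h>
//
//	static uint32_t reverse_needed(uint32_t delta /* rd-rs */, uint32_t len /* rc */) {
//	    uint32_t diff = delta ^ len;
//	    if (diff == 0)                    // equal: cntlzw returns 32 and slw yields 0
//	        return 0;
//	    unsigned n = (unsigned)__builtin_clz(diff);
//	    return (len << n) >> 31;          // MSB of (rc << n): 1 iff delta < len
//	}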
478
479 // G3, forward, long, unaligned.
480 // w1 = neg(rd)
481
482 LForwardWord:
483 andi. w3,w1,3 // W3 <- #bytes to word-align destination
484 mtlr ra // restore return address
485 sub rc,rc,w3 // adjust count for alignment
486 srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
487 mtctr r0 // prepare loop count
488 beq+ 1f // dest already aligned
489
490 lwz w2,0(rs) // get first 4 bytes of source
491 lwzx w1,w3,rs // get source bytes we might overwrite
492 add rs,rs,w3 // adjust source ptr
493 stw w2,0(rd) // store all 4 bytes to avoid a loop
494 add rd,rd,w3 // word-align destination
495 b 2f
496 1:
497 lwz w1,0(rs)
498 2:
499 lwz w2,4(rs)
500 lwz w3,8(rs)
501 lwz w4,12(rs)
502 lwz w5,16(rs)
503 lwz w6,20(rs)
504 lwz w7,24(rs)
505 lwz w8,28(rs)
506 addi rs,rs,32
507 stw w1,0(rd)
508 stw w2,4(rd)
509 stw w3,8(rd)
510 stw w4,12(rd)
511 stw w5,16(rd)
512 stw w6,20(rd)
513 stw w7,24(rd)
514 stw w8,28(rd)
515 addi rd,rd,32
516 bdnz 1b
517
518 b LShort
519
520
521 // G3, forward, long, word aligned. We use floating pt even when only word aligned.
522 // w1 = neg(rd)
523
524 LForwardFloat:
525 andi. w3,w1,7 // W3 <- #bytes to doubleword-align destination
526 mtlr ra // restore return address
527 sub rc,rc,w3 // adjust count for alignment
528 srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
529 mtctr r0 // prepare loop count
530 beq LForwardFloatLoop // dest already aligned
531
532 lwz w1,0(rs) // get first 8 bytes of source
533 lwz w2,4(rs)
534 lfdx f0,w3,rs // get source bytes we might overwrite
535 add rs,rs,w3 // word-align source ptr
536 stw w1,0(rd) // store all 8 bytes to avoid a loop
537 stw w2,4(rd)
538 add rd,rd,w3
539 b LForwardFloatLoop1
540
541 .align 4 // align since this loop is executed by G4s too
542 LForwardFloatLoop:
543 lfd f0,0(rs)
544 LForwardFloatLoop1: // enter here from LMediumAligned and above
545 lfd f1,8(rs)
546 lfd f2,16(rs)
547 lfd f3,24(rs)
548 addi rs,rs,32
549 stfd f0,0(rd)
550 stfd f1,8(rd)
551 stfd f2,16(rd)
552 stfd f3,24(rd)
553 addi rd,rd,32
554 bdnz LForwardFloatLoop
555
556 b LShort
557
558
559 // G4 Forward, long, 16-byte aligned, 32-byte cache ops, use DCBA and DCBT.
560 // r0/cr0 = #bytes to 32-byte align
561
562 LForwardVecAlig32Dcba:
563 bnel+ LAlign32 // align destination iff necessary
564 bl LPrepareForwardVectors
565 mtlr ra // restore return address before loading c128
566 li c128,128
567 b 1f // enter aligned loop
568
569 .align 5 // long loop heads should be at least 16-byte aligned
570 1: // loop over aligned 64-byte chunks
571 dcbt c96,rs // pre-fetch three cache lines ahead
572 dcbt c128,rs // and four
573 lvx v1,0,rs
574 lvx v2,c16,rs
575 lvx v3,c32,rs
576 lvx v4,c48,rs
577 addi rs,rs,64
578 dcba 0,rd // avoid read of destination cache lines
579 stvx v1,0,rd
580 stvx v2,c16,rd
581 dcba c32,rd
582 stvx v3,c32,rd
583 stvx v4,c48,rd
584 addi rd,rd,64
585 bdnz 1b
586
587 LForwardVectorAlignedEnd: // r0/cr0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
588 beq- 3f // no leftover quadwords
589 mtctr r0
590 2: // loop over remaining quadwords (1-3)
591 lvx v1,0,rs
592 addi rs,rs,16
593 stvx v1,0,rd
594 addi rd,rd,16
595 bdnz 2b
596 3:
597 mtspr VRSave,rv // restore bitmap of live vr's
598 bne cr6,LShort16 // handle last 0-15 bytes if any
599 blr
600
601
602 // G4 Forward, long, 16-byte aligned, 32-byte cache, use DCBT but not DCBA.
603 // r0/cr0 = #bytes to 32-byte align
604
605 LForwardVecAlig32NoDcba:
606 bnel+ LAlign32 // align destination iff necessary
607 bl LPrepareForwardVectors
608 mtlr ra // restore return address before loading c128
609 li c128,128
610 b 1f // enter aligned loop
611
612 .align 4 // balance 13-word loop between QWs...
613 nop // ...which improves performance 5% +/-
614 nop
615 1: // loop over aligned 64-byte chunks
616 dcbt c96,rs // pre-fetch three cache lines ahead
617 dcbt c128,rs // and four
618 lvx v1,0,rs
619 lvx v2,c16,rs
620 lvx v3,c32,rs
621 lvx v4,c48,rs
622 addi rs,rs,64
623 stvx v1,0,rd
624 stvx v2,c16,rd
625 stvx v3,c32,rd
626 stvx v4,c48,rd
627 addi rd,rd,64
628 bdnz 1b
629
630 b LForwardVectorAlignedEnd
631
632
633 // G4 Forward, long, unaligned, 32-byte cache ops, use DCBT and DCBA. At least on
634 // some CPUs, this routine is no slower than the simpler aligned version that does
635 // not use permutes. But it cannot be used with aligned operands, because of the
636 // way it prefetches source QWs.
637 // r0/cr0 = #bytes to 32-byte align
638
639 LForwardVecUnal32Dcba:
640 bnel+ LAlign32 // align destination iff necessary
641 bl LPrepareForwardVectors
642 lvx v1,0,rs // prime loop
643 mtlr ra // restore return address before loading c128
644 lvsl vp,0,rs // get permute vector to shift left
645 li c128,128
646 b 1f // enter aligned loop
647
648 .align 4 // long loop heads should be at least 16-byte aligned
649 1: // loop over aligned 64-byte destination chunks
650 lvx v2,c16,rs
651 dcbt c96,rs // touch 3rd cache line ahead
652 lvx v3,c32,rs
653 dcbt c128,rs // touch 4th cache line ahead
654 lvx v4,c48,rs
655 addi rs,rs,64
656 vperm vx,v1,v2,vp
657 lvx v1,0,rs
658 vperm vy,v2,v3,vp
659 dcba 0,rd // avoid read of destination lines
660 stvx vx,0,rd
661 vperm vz,v3,v4,vp
662 stvx vy,c16,rd
663 dcba c32,rd
664 vperm vx,v4,v1,vp
665 stvx vz,c32,rd
666 stvx vx,c48,rd
667 addi rd,rd,64
668 bdnz 1b
669
670 LForwardVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=next QW, cr7=(rc & F), cr6 set on cr7
671 beq- 3f // no leftover quadwords
672 mtctr r0
673 2: // loop over remaining quadwords
674 lvx v2,c16,rs
675 addi rs,rs,16
676 vperm vx,v1,v2,vp
677 vor v1,v2,v2 // v1 <- v2
678 stvx vx,0,rd
679 addi rd,rd,16
680 bdnz 2b
681 3:
682 mtspr VRSave,rv // restore bitmap of live vr's
683 bne cr6,LShort16 // handle last 0-15 bytes if any
684 blr
685
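// How the lvsl/vperm technique above handles a misaligned source, sketched in C with a
// hypothetical scalar stand-in for one vperm: each aligned 16-byte store is assembled
// from two adjacent aligned source quadwords, shifted by the source misalignment.
//
//	static void perm_quadword(unsigned char dst[16],
//	                          const unsigned char prev[16],   // earlier aligned source QW
//	                          const unsigned char next[16],   // following aligned source QW
//	                          unsigned shift) {               // src & 0xF
//	    for (int i = 0; i < 16; i++)
//	        dst[i] = (i + shift < 16) ? prev[i + shift] : next[i + shift - 16];
//	}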
686
687 // G4 Forward, long, unaligned, 32-byte cache ops, use DCBT but not DCBA.
688 // r0/cr0 = #bytes to 32-byte align
689
690 LForwardVecUnal32NoDcba:
691 bnel+ LAlign32 // align destination iff necessary
692 bl LPrepareForwardVectors
693 lvx v1,0,rs // prime loop
694 mtlr ra // restore return address before loading c128
695 lvsl vp,0,rs // get permute vector to shift left
696 li c128,128
697 b 1f // enter aligned loop
698
699 .align 4
700 nop // balance 17-word loop between QWs
701 nop
702 1: // loop over aligned 64-byte destination chunks
703 lvx v2,c16,rs
704 dcbt c96,rs // touch 3rd cache line ahead
705 lvx v3,c32,rs
706 dcbt c128,rs // touch 4th cache line ahead
707 lvx v4,c48,rs
708 addi rs,rs,64
709 vperm vx,v1,v2,vp
710 lvx v1,0,rs
711 vperm vy,v2,v3,vp
712 stvx vx,0,rd
713 vperm vz,v3,v4,vp
714 stvx vy,c16,rd
715 vperm vx,v4,v1,vp
716 stvx vz,c32,rd
717 stvx vx,c48,rd
718 addi rd,rd,64
719 bdnz 1b
720
721 b LForwardVectorUnalignedEnd
722
723
724 // G3 Reverse, long, unaligned.
725
726 LReverseWord:
727 bl LAlign8Reverse // 8-byte align destination
728 mtlr ra // restore return address
729 srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
730 mtctr r0
731 1:
732 lwz w1,-4(rs)
733 lwz w2,-8(rs)
734 lwz w3,-12(rs)
735 lwz w4,-16(rs)
736 stw w1,-4(rd)
737 lwz w5,-20(rs)
738 stw w2,-8(rd)
739 lwz w6,-24(rs)
740 stw w3,-12(rd)
741 lwz w7,-28(rs)
742 stw w4,-16(rd)
743 lwzu w8,-32(rs)
744 stw w5,-20(rd)
745 stw w6,-24(rd)
746 stw w7,-28(rd)
747 stwu w8,-32(rd)
748 bdnz 1b
749
750 b LShortReverse
751
752
753 // G3 Reverse, long, word aligned.
754
755 LReverseFloat:
756 bl LAlign8Reverse // 8-byte align
757 mtlr ra // restore return address
758 srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
759 mtctr r0
760 1:
761 lfd f0,-8(rs)
762 lfd f1,-16(rs)
763 lfd f2,-24(rs)
764 lfdu f3,-32(rs)
765 stfd f0,-8(rd)
766 stfd f1,-16(rd)
767 stfd f2,-24(rd)
768 stfdu f3,-32(rd)
769 bdnz 1b
770
771 b LShortReverse
772
773
774 // G4 Reverse, long, 16-byte aligned, 32-byte DCBT but no DCBA.
775
776 LReverseVectorAligned32:
777 bl LAlign32Reverse // 32-byte align destination iff necessary
778 bl LPrepareReverseVectors
779 mtlr ra // restore return address before loading cm129
780 li cm129,-129
781 b 1f // enter aligned loop
782
783 .align 4
784 nop // must start in 3rd word of QW...
785 nop // ...to keep balanced
786 1: // loop over aligned 64-byte chunks
787 dcbt cm97,rs // pre-fetch three cache lines ahead
788 dcbt cm129,rs // and four
789 lvx v1,cm1,rs
790 lvx v2,cm17,rs
791 lvx v3,cm33,rs
792 lvx v4,cm49,rs
793 subi rs,rs,64
794 stvx v1,cm1,rd
795 stvx v2,cm17,rd
796 stvx v3,cm33,rd
797 stvx v4,cm49,rd
798 subi rd,rd,64
799 bdnz 1b
800
801 LReverseVectorAlignedEnd: // cr0/r0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
802 beq 3f // no leftover quadwords
803 mtctr r0
804 2: // loop over 1-3 quadwords
805 lvx v1,cm1,rs
806 subi rs,rs,16
807 stvx v1,cm1,rd
808 subi rd,rd,16
809 bdnz 2b
810 3:
811 mtspr VRSave,rv // restore bitmap of live vr's
812 bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
813 blr
814
815
816 // G4 Reverse, long, unaligned, 32-byte DCBT.
817
818 LReverseVectorUnal32:
819 bl LAlign32Reverse // align destination iff necessary
820 bl LPrepareReverseVectors
821 lvx v1,cm1,rs // prime loop
822 mtlr ra // restore return address before loading cm129
823 lvsl vp,0,rs // get permute vector to shift left
824 li cm129,-129
825 b 1f // enter aligned loop
826
827 .align 4
828 nop // start loop in 3rd word on QW to balance
829 nop
830 1: // loop over aligned 64-byte destination chunks
831 lvx v2,cm17,rs
832 dcbt cm97,rs // touch in 3rd source block
833 lvx v3,cm33,rs
834 dcbt cm129,rs // touch in 4th
835 lvx v4,cm49,rs
836 subi rs,rs,64
837 vperm vx,v2,v1,vp
838 lvx v1,cm1,rs
839 vperm vy,v3,v2,vp
840 stvx vx,cm1,rd
841 vperm vz,v4,v3,vp
842 stvx vy,cm17,rd
843 vperm vx,v1,v4,vp
844 stvx vz,cm33,rd
845 stvx vx,cm49,rd
846 subi rd,rd,64
847 bdnz 1b
848
849 LReverseVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=source QW, cr7=low 4 bits of rc, cr6 set on cr7
850 beq 3f // no leftover quadwords
851 mtctr r0
852 2: // loop over 1-3 quadwords
853 lvx v2,cm17,rs
854 subi rs,rs,16
855 vperm vx,v2,v1,vp
856 vor v1,v2,v2 // v1 <- v2
857 stvx vx,cm1,rd
858 subi rd,rd,16
859 bdnz 2b
860 3:
861 mtspr VRSave,rv // restore bitmap of live vr's
862 bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
863 blr
864
865
866 // Subroutine to prepare for 64-byte forward vector loops.
867 // Returns many things:
868 // ctr = number of 64-byte chunks to move
869 // r0/cr0 = leftover QWs to move
870 // cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
871 // cr6 = beq if leftover byte count is 0
872 // c16..c96 loaded
873 // rv = original value of VRSave
874 // NB: c128 not set (if needed), since it is still "ra"
875
876 LPrepareForwardVectors:
877 mfspr rv,VRSave // get bitmap of live vector registers
878 srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
879 oris w1,rv,0xFF00 // we use v0-v7
880 mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShort16
881 rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
882 mtspr VRSave,w1 // update mask
883 li c16,16 // get constants used in lvx/stvx
884 li c32,32
885 mtctr r0 // set up loop count
886 cmpwi cr6,w3,0 // set cr6 on leftover byte count
887 li c48,48
888 li c96,96
889 rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
890 blr
891
892
893 // Subroutine to prepare for 64-byte reverse vector loops.
894 // Returns many things:
895 // ctr = number of 64-byte chunks to move
896 // r0/cr0 = leftover QWs to move
897 // cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
898 // cr6 = beq if leftover byte count is 0
899 // cm1..cm97 loaded
900 // rv = original value of VRSave
901 // NB: cm129 not set (if needed), since it is still "ra"
902
903 LPrepareReverseVectors:
904 mfspr rv,VRSave // get bitmap of live vector registers
905 srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
906 oris w1,rv,0xFF00 // we use v0-v7
907 mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShortReverse16
908 rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
909 mtspr VRSave,w1 // update mask
910 li cm1,-1 // get constants used in lvx/stvx
911 li cm17,-17
912 mtctr r0 // set up loop count
913 cmpwi cr6,w3,0 // set cr6 on leftover byte count
914 li cm33,-33
915 li cm49,-49
916 rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
917 li cm97,-97
918 blr
919
920
921 // Subroutine to align destination on a 32-byte boundary.
922 // r0 = number of bytes to xfer (0-31)
923
924 LAlign32:
925 mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
926 mtcrf 0x02,r0
927 sub rc,rc,r0 // adjust length
928 bf 31,1f // skip if no odd bit
929 lbz w1,0(rs)
930 addi rs,rs,1
931 stb w1,0(rd)
932 addi rd,rd,1
933 1:
934 bf 30,2f // halfword to move?
935 lhz w1,0(rs)
936 addi rs,rs,2
937 sth w1,0(rd)
938 addi rd,rd,2
939 2:
940 bf 29,3f // word?
941 lwz w1,0(rs)
942 addi rs,rs,4
943 stw w1,0(rd)
944 addi rd,rd,4
945 3:
946 bf 28,4f // doubleword?
947 lwz w1,0(rs)
948 lwz w2,4(rs)
949 addi rs,rs,8
950 stw w1,0(rd)
951 stw w2,4(rd)
952 addi rd,rd,8
953 4:
954 bflr 27 // done if no quadword to move
955 lwz w1,0(rs)
956 lwz w2,4(rs)
957 lwz w3,8(rs)
958 lwz w4,12(rs)
959 addi rs,rs,16
960 stw w1,0(rd)
961 stw w2,4(rd)
962 stw w3,8(rd)
963 stw w4,12(rd)
964 addi rd,rd,16
965 blr
966
967 // Subroutine to align destination if necessary on a 32-byte boundary for reverse moves.
968 // rs and rd still point to low end of operands
969 // we adjust rs and rd to point to last byte moved
970
971 LAlign32Reverse:
972 add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
973 add rs,rs,rc
974 andi. r0,rd,0x1F // r0 <- #bytes that must be moved to align destination
975 mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
976 mtcrf 0x02,r0
977 sub rc,rc,r0 // update length
978 beqlr- // destination already 32-byte aligned
979
980 bf 31,1f // odd byte?
981 lbzu w1,-1(rs)
982 stbu w1,-1(rd)
983 1:
984 bf 30,2f // halfword to move?
985 lhzu w1,-2(rs)
986 sthu w1,-2(rd)
987 2:
988 bf 29,3f // word?
989 lwzu w1,-4(rs)
990 stwu w1,-4(rd)
991 3:
992 bf 28,4f // doubleword?
993 lwz w1,-4(rs)
994 lwzu w2,-8(rs)
995 stw w1,-4(rd)
996 stwu w2,-8(rd)
997 4:
998 bflr 27 // done if no quadwords
999 lwz w1,-4(rs)
1000 lwz w2,-8(rs)
1001 lwz w3,-12(rs)
1002 lwzu w4,-16(rs)
1003 stw w1,-4(rd)
1004 stw w2,-8(rd)
1005 stw w3,-12(rd)
1006 stwu w4,-16(rd)
1007 blr
1008
1009
1010 // Subroutine to align destination on an 8-byte boundary for reverse moves.
1011 // rs and rd still point to low end of operands
1012 // we adjust rs and rd to point to last byte moved
1013
1014 LAlign8Reverse:
1015 add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
1016 add rs,rs,rc
1017 andi. r0,rd,0x7 // r0 <- #bytes that must be moved to align destination
1018 beqlr- // destination already 8-byte aligned
1019 mtctr r0 // set up for loop
1020 sub rc,rc,r0 // update length
1021 1:
1022 lbzu w1,-1(rs)
1023 stbu w1,-1(rd)
1024 bdnz 1b
1025
1026 blr
1027
1028
1029 // Called by pthread initialization to set up the branch table pointer based on
1030 // the CPU capability vector. This routine may be called more than once (for
1031 // example, during testing.)
1032
1033 // Size of the buffer we use to do DCBA timing on G4:
1034 #define kBufSiz 1024
1035
1036 // Stack frame size, which contains the 128-byte-aligned buffer:
1037 #define kSFSize (kBufSiz+128+16)
1038
1039 // Iterations of the timing loop:
1040 #define kLoopCnt 5
1041
1042 // Bit in cr5 used as a flag in timing loop:
1043 #define kDCBA 22
1044
1045 __bcopy_initialize: // int _bcopy_initialize(void)
1046 mflr ra // get return address
1047 stw ra,8(r1) // save
1048 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack
1049 addi w6,r1,127+16 // get base address...
1050 rlwinm w6,w6,0,0,24 // ...of our buffer, 128-byte aligned
1051 bcl 20,31,1f // get our PIC base
1052 1:
1053 mflr w1
1054 addis w2,w1,ha16(__cpu_capabilities - 1b)
1055 lwz w3,lo16(__cpu_capabilities - 1b)(w2)
1056 andi. r0,w3,kUseDcba+kNoDcba+kCache32+k64Bit+kHasAltivec
1057 cmpwi r0,kCache32+kHasAltivec // untyped G4?
1058 li w8,0 // assume no need to test
1059 bne 2f // not an untyped G4, so do not test
1060
1061 // G4, but neither kUseDcba or kNoDcba are set. Time and select fastest.
1062
1063 crset kDCBA // first, use DCBA
1064 bl LTest32 // time it
1065 mr w8,w4 // w8 <- best time using DCBA
1066 srwi r0,w8,3 // bias 12 pct in favor of not using DCBA...
1067 add w8,w8,r0 // ...because DCBA is always slower with warm cache
1068 crclr kDCBA
1069 bl LTest32 // w4 <- best time without DCBA
1070 cmplw w8,w4 // which is better?
1071 li w8,kUseDcba // assume using DCBA is faster
1072 blt 2f
1073 li w8,kNoDcba // no DCBA is faster
1074
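// The dynamic selection above, sketched in C (hypothetical names; time_store_loop stands
// in for LTest32): time the store loop both ways, penalize the DCBA result by 1/8 because
// DCBA always loses with a warm cache, and keep whichever wins.
//
//	extern unsigned time_store_loop(int use_dcba);
//
//	static int pick_dcba(void) {
//	    unsigned with    = time_store_loop(1);
//	    unsigned without = time_store_loop(0);
//	    return (with + with / 8) < without;       // nonzero => use DCBA
//	}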
1075 // What branch table to use?
1076
1077 2: // here with w8 = 0, kUseDcba, or kNoDcba
1078 bcl 20,31,4f // get our PIC base again
1079 4:
1080 mflr w1
1081 addis w2,w1,ha16(__cpu_capabilities - 4b)
1082 lwz w3,lo16(__cpu_capabilities - 4b)(w2)
1083 or w3,w3,w8 // add in kUseDcba or kNoDcba if untyped G4
1084 mr r3,w8 // return dynamic selection, if any (used in testing)
1085
1086 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
1087 cmpwi r0,kHasAltivec+kCache32+kUseDcba // G4 with DCBA?
1088 addis w4,w1,ha16(LG4UseDcba - 4b)
1089 addi w4,w4,lo16(LG4UseDcba - 4b)
1090 beq 5f
1091
1092 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
1093 cmpwi r0,kHasAltivec+kCache32+kNoDcba // G4 without DCBA?
1094 addis w4,w1,ha16(LG4NoDcba - 4b)
1095 addi w4,w4,lo16(LG4NoDcba - 4b)
1096 beq 5f
1097
1098 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32
1099 cmpwi r0,kCache32 // G3?
1100 addis w4,w1,ha16(LG3 - 4b)
1101 addi w4,w4,lo16(LG3 - 4b)
1102 beq 5f
1103
1104 // Map unrecognized CPU types to G3 (lowest common denominator)
1105
1106 5: // w4 <- branch table pointer
1107 addis w5,w1,ha16(LBranchTablePtr - 4b)
1108 stw w4,lo16(LBranchTablePtr - 4b)(w5)
1109 lwz ra,kSFSize+8(r1) // recover return address
1110 mtlr ra // restore it
1111 lwz r1,0(r1) // pop off our stack frame
1112 blr // return dynamic selection (or 0) in r3
1113
1114
1115 // Subroutine to time a store loop on a 32-byte-line cache.
1116 // kDCBA = set if we should use DCBA
1117 // w6 = base of buffer to use for test (kBufSiz bytes)
1118 // w4 = time of the fastest pass (the return value)
1119
1120 LTest32:
1121 li w1,kLoopCnt // number of times to loop
1122 li w4,-1 // initialize fastest time
1123 1:
1124 mr rd,w6 // initialize buffer ptr
1125 li r0,kBufSiz/32 // r0 <- cache blocks to test
1126 mtctr r0
1127 2:
1128 dcbf 0,rd // first, force the blocks out of the cache
1129 addi rd,rd,32
1130 bdnz 2b
1131 sync // make sure all the flushes take
1132 mr rd,w6 // re-initialize buffer ptr
1133 mtctr r0 // reset cache-block count
1134 mftbu w5 // remember upper half so we can check for carry
1135 mftb w2 // start the timer
1136 3: // loop over cache blocks
1137 bf kDCBA,4f // should we DCBA?
1138 dcba 0,rd
1139 4:
1140 stfd f1,0(rd) // store the entire cache block
1141 stfd f1,8(rd)
1142 stfd f1,16(rd)
1143 stfd f1,24(rd)
1144 addi rd,rd,32
1145 bdnz 3b
1146 mftb w3
1147 mftbu r0
1148 cmpw r0,w5 // did timebase carry?
1149 bne 1b // yes, retest rather than fuss
1150 sub w3,w3,w2 // w3 <- time for this loop
1151 cmplw w3,w4 // faster than current best?
1152 bge 5f // no
1153 mr w4,w3 // remember fastest time through loop
1154 5:
1155 subi w1,w1,1 // decrement outer loop count
1156 cmpwi w1,0 // more to go?
1157 bne 1b // loop if so
1158 blr
1159