/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
#define __APPLE_API_PRIVATE
#include <machine/cpu_capabilities.h>
#undef  __APPLE_API_PRIVATE

// These functions have migrated to the comm page.

_bcopy:                             // void bcopy(const void *src, void *dst, size_t len)

_memcpy:                            // void* memcpy(void *dst, void *src, size_t len)
_memmove:                           // void* memmove(void *dst, const void *src, size_t len)
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/17/2002, for G3, G4, and G4+.
 *
 * There are many paths through this code, depending on length, reverse/forward,
 * processor type, and alignment. We use reverse paths only when the operands
 * overlap and the destination is higher than the source. They are not quite as
 * fast as the forward paths.
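 *
 * (Illustrative sketch, not from the original source: the forward/reverse
 * decision in C terms. With unsigned 32-bit arithmetic, "overlapping with the
 * destination above the source" collapses to a single compare, which is what
 * the entry-point code below computes into cr6.)
 *
 *      #include <stdint.h>
 *
 *      static int must_move_reverse(const void *src, const void *dst, uint32_t len) {
 *          // A reverse copy is needed when dst lands inside [src, src+len);
 *          // with wraparound that is (dst - src) < len, unsigned.
 *          return (uint32_t)((uintptr_t)dst - (uintptr_t)src) < len;
 *      }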
 *
 * Judicious use of DCBTs, just far enough ahead to minimize waiting, is critical in
 * the inner loops for long operands. DST is less effective than DCBT, because it
 * can get out of sync with the inner loop. DCBTST is usually not a win, so we
 * don't use it except during initialization when we're not using the LSU.
 * We don't DCBT on G3, which only handles one load miss at a time.
 *
 * We don't use DCBZ, because it takes an alignment exception on uncached memory
 * like frame buffers. Bcopy to frame buffers must work. This hurts G3 in the
 * cold-cache case, but G4 can use DCBA (which does not take alignment exceptions.)
 *
 * Using DCBA on G4 is a tradeoff. For the cold-cache case it can be a big win,
 * since it avoids the read of destination cache lines. But for the hot-cache case
 * it is always slower, because of the cycles spent needlessly zeroing data. Some
 * machines store-gather and can cancel the read if all bytes of a line are stored,
 * others cannot. Unless explicitly told which is better, we time loops with and
 * without DCBA and use the faster. Note that we never DCBA in reverse loops,
 * since by definition they are overlapped so dest lines will be in the cache.
 *
 * For longer operands we use an 8-element branch table, based on the CPU type,
 * to select the appropriate inner loop. The branch table is indexed as follows:
 *
 *      bit  10000 set if a Reverse move is required
 *      bits 01100 set based on the relative operand alignment: 0=unaligned,
 *                 1=word, 2=doubleword, and 3=quadword
 *
 * By "relatively" n-byte aligned, we mean the source and destination are a multiple
 * of n bytes apart (they need not be absolutely aligned.)
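 *
 * (A sketch of the table lookup in C, for illustration only; the function and
 * parameter names here are hypothetical, not part of this file.)
 *
 *      typedef void (*copy_loop_t)(void);
 *
 *      static copy_loop_t pick_loop(copy_loop_t table[8],
 *                                   int reverse,          // nonzero if a reverse move is required
 *                                   unsigned alignment) { // 0=unaligned, 1=word, 2=dw, 3=qw
 *          // bit 10000 (byte offset 16) selects the reverse half of the table;
 *          // bits 01100 (alignment * 4 bytes) select the entry within that half.
 *          return table[(reverse ? 4 : 0) + (alignment & 3)];
 *      }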
 *
 * The branch table for the running CPU type is pointed to by LBranchTablePtr.
 * Initially, LBranchTablePtr points to G3's table, since that is the lowest
 * common denominator that will run on any CPU. Later, pthread initialization
 * sets up the _cpu_capabilities vector and calls _bcopy_initialize, which sets
 * up the correct pointer for the running CPU.
 *
 * We distinguish between "short", "medium", and "long" operands:
 *      short   (<= 32 bytes)    most common case, minimum path length is important
 *      medium  (> 32, < kLong)  too short for Altivec or use of cache ops like DCBA
 *      long    (>= kLong)       long enough for cache ops and to amortize use of Altivec
 *
 * WARNING: kLong must be >= 96, due to implicit assumptions about operand length.
 */
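
// (Illustrative only, not from the original source: the length classification
// above written out in C. kLong is the threshold defined elsewhere in this
// file; it must be >= 96.)
//
//      typedef enum { kPathShort, kPathMedium, kPathLong } path_t;
//
//      static path_t classify(unsigned len, unsigned kLongBytes) {
//          if (len <= 32)         return kPathShort;   // minimum path length matters most
//          if (len <  kLongBytes) return kPathMedium;  // no Altivec, no cache ops
//          return kPathLong;                           // cache ops and Altivec pay off
//      }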

/* Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0    = "w7" or "r0" (NB: cannot use r0 for any constant such as "c16")
 *   r2    = "w8" or VRSave ("rv")
 *   r3    = not used, as memcpy and memmove return 1st parameter as a value
 *   r4    = source ptr ("rs")
 *   r5    = count of bytes to move ("rc")
 *   r6    = "w1", "c16", or "cm17"
 *   r7    = "w2", "c32", or "cm33"
 *   r8    = "w3", "c48", or "cm49"
 *   r9    = "w4", "c64", or "cm1"
 *   r10   = "w5", "c96", or "cm97"
 *   r11   = "w6", "c128", "cm129", or return address ("ra")
 *   r12   = destination ptr ("rd")
 *   f0-f8 = used for moving 8-byte aligned data
 *   v0    = permute vector ("vp")
 *   v1-v4 = qw's loaded from source ("v1", "v2", "v3", and "v4")
 *   v5-v7 = permuted qw's ("vx", "vy", and "vz")
 */

#include <architecture/ppc/asm_help.h>

// The branch tables, 8 entries per CPU type.
// NB: we depend on 5 low-order 0s in the address of branch tables.

        .align  5                   // must be 32-byte aligned

// G3 (the default CPU type)

        .long   LForwardWord            // 000: forward, unaligned
        .long   LForwardFloat           // 001: forward, 4-byte aligned
        .long   LForwardFloat           // 010: forward, 8-byte aligned
        .long   LForwardFloat           // 011: forward, 16-byte aligned
        .long   LReverseWord            // 100: reverse, unaligned
        .long   LReverseFloat           // 101: reverse, 4-byte aligned
        .long   LReverseFloat           // 110: reverse, 8-byte aligned
        .long   LReverseFloat           // 111: reverse, 16-byte aligned

// G4s that benefit from DCBA.

        .long   LForwardVecUnal32Dcba   // 000: forward, unaligned
        .long   LForwardVecUnal32Dcba   // 001: forward, 4-byte aligned
        .long   LForwardVecUnal32Dcba   // 010: forward, 8-byte aligned
        .long   LForwardVecAlig32Dcba   // 011: forward, 16-byte aligned
        .long   LReverseVectorUnal32    // 100: reverse, unaligned
        .long   LReverseVectorUnal32    // 101: reverse, 4-byte aligned
        .long   LReverseVectorUnal32    // 110: reverse, 8-byte aligned
        .long   LReverseVectorAligned32 // 111: reverse, 16-byte aligned

// G4s that should not use DCBA.

        .long   LForwardVecUnal32NoDcba // 000: forward, unaligned
        .long   LForwardVecUnal32NoDcba // 001: forward, 4-byte aligned
        .long   LForwardVecUnal32NoDcba // 010: forward, 8-byte aligned
        .long   LForwardVecAlig32NoDcba // 011: forward, 16-byte aligned
        .long   LReverseVectorUnal32    // 100: reverse, unaligned
        .long   LReverseVectorUnal32    // 101: reverse, 4-byte aligned
        .long   LReverseVectorUnal32    // 110: reverse, 8-byte aligned
        .long   LReverseVectorAligned32 // 111: reverse, 16-byte aligned

// Pointer to the 8-element branch table for running CPU type:

        .long   LG3                 // default to G3 until "bcopy_initialize" called

// The CPU capability vector, initialized in pthread_init().
// "_bcopy_initialize" uses this to set up LBranchTablePtr:

        .globl  __cpu_capabilities

// Bit definitions for _cpu_capabilities:

#define kHasAltivec     0x01
#define kCache32        0x04
#define kCache64        0x08
#define kCache128       0x10
#define kUseDcba        0x20

        .globl  __bcopy_initialize

// Main entry points.

_bcopy:                             // void bcopy(const void *src, void *dst, size_t len)
        mr      r10,r3              // reverse source and dest ptrs, to be like memcpy

_memcpy:                            // void* memcpy(void *dst, void *src, size_t len)
_memmove:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  cr7,rc,32           // length <= 32 bytes?
        sub.    w1,r3,rs            // must move in reverse if (rd-rs)<rc, set cr0 on src==dst
        dcbt    0,rs                // touch in the first line of source
        cmplw   cr6,w1,rc           // set cr6 blt iff we must move reverse
        cmplwi  cr1,rc,kLong-1      // set cr1 bgt if long
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bgt-    cr7,LMedium         // longer than 32 bytes
        dcbtst  0,rd                // touch in destination
        beq-    cr7,LMove32         // special case moves of 32 bytes
        blt-    cr6,LShortReverse0

// Forward short operands. This is the most frequent case, so it is inline.
// We also end up here to xfer the last 0-31 bytes of longer operands.

LShort:                             // WARNING: can fall into this routine
        andi.   r0,rc,0x10          // test bit 27 separately (sometimes faster than a mtcrf)
        mtcrf   0x01,rc             // move rest of length to cr7
        beq     1f                  // quadword to move?

LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // skip if no odd byte

// Handle short reverse operands, up to kShort in length.
// This is also used to transfer the last 0-31 bytes of longer operands.

        add     rs,rs,rc            // adjust ptrs for reverse move
        andi.   r0,rc,0x10          // test bit 27 separately (sometimes faster than a mtcrf)
        mtcrf   0x01,rc             // move rest of length to cr7
        beq     1f                  // quadword to move?

LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update

// Special case for 32-byte moves. Too long for LShort, too common for LMedium.

// Medium length operands (32 < rc < kLong). These loops run on all CPUs, as the
// operands are not long enough to bother with the branch table, using cache ops,
// or Altivec. We word align the source rather than the dest (as we do for long
// operands), since doing so is faster on G4+ and probably beyond, we never DCBA
// on medium-length operands, and the opportunity to cancel reads of dest cache
// lines is limited.
//      w1  = (rd-rs), used to check for alignment
//      cr0 = set on (rd-rs)
//      cr1 = bgt if long operand
//      cr6 = blt if reverse move
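//
// (Sketch, for illustration: the number of leading bytes the medium path moves
// in order to word-align the source pointer, as computed by the "neg"/"andi."
// pair below. The function name is hypothetical.)
//
//      #include <stdint.h>
//
//      static unsigned bytes_to_word_align(const void *src) {
//          // neg w2,rs ; andi. w3,w2,3   ==   (-(uintptr_t)src) & 3
//          return (unsigned)(-(uintptr_t)src & 3);
//      }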

        dcbtst  0,rd                // touch in 1st line of destination
        rlwinm  r0,w1,0,29,31       // r0 <- ((rd-rs) & 7), ie 0 if doubleword aligned
        beq-    LExit               // early exit if (rs==rd), avoiding use of "beqlr"
        neg     w2,rs               // we align source, not dest, and assume forward
        cmpwi   cr5,r0,0            // set cr5 beq if doubleword aligned
        bgt-    cr1,LLong           // handle long operands
        andi.   w3,w2,3             // w3 <- #bytes to word-align source
        blt-    cr6,LMediumReverse  // handle reverse move
        lwz     w1,0(rs)            // pre-fetch first 4 bytes of source
        beq-    cr5,LMediumAligned  // operands are doubleword aligned
        sub     rc,rc,w3            // adjust count for alignment
        mtcrf   0x01,rc             // remaining byte count (0-15) to cr7 for LShort16
        srwi    w4,rc,4             // w4 <- number of 16-byte chunks to xfer (>=1)
        mtctr   w4                  // prepare loop count
        beq+    2f                  // source already aligned

        lwzx    w2,w3,rs            // get 1st aligned word (which we might partially overwrite)
        add     rs,rs,w3            // word-align source ptr
        stw     w1,0(rd)            // store all (w3) bytes at once to avoid a loop
        mr      w1,w2               // first aligned word to w1

        .align  4                   // align inner loops
1:                                  // loop over 16-byte chunks

// Medium, doubleword aligned. We use floating point. Note that G4+ has bigger latencies
// and reduced throughput for floating pt loads and stores; future processors will probably
// have even worse lfd/stfd performance. We use it here because it is so important for G3,
// and not slower for G4+. But we only do so for doubleword aligned operands, whereas the
// G3-only long operand loops use floating pt even for word-aligned operands.
//      w1 = first 4 bytes of source
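//
// (Sketch, not from the original source: the access pattern of the doubleword-
// aligned medium loop. The asm moves data through the FPRs; plain 64-bit integer
// copies express the same 32-bytes-per-iteration pattern in C. Leftover bytes
// are handled separately, as in the asm.)
//
//      #include <stdint.h>
//      #include <stddef.h>
//
//      static void copy_doublewords(uint64_t *dst, const uint64_t *src, size_t bytes) {
//          for (size_t chunks = bytes >> 5; chunks; --chunks) {   // 32 bytes per pass
//              dst[0] = src[0]; dst[1] = src[1];
//              dst[2] = src[2]; dst[3] = src[3];
//              dst += 4; src += 4;
//          }
//      }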
        andi.   w3,w2,7             // already aligned?
        sub     rc,rc,w3            // adjust count by 0-7 bytes
        lfdx    f0,rs,w3            // pre-fetch first aligned source doubleword
        srwi    w4,rc,5             // get count of 32-byte chunks (might be 0 if unaligned)
        beq-    LForwardFloatLoop1  // already aligned

        cmpwi   w4,0                // are there any 32-byte chunks to xfer?
        lwz     w2,4(rs)            // get 2nd (unaligned) source word
        add     rs,rs,w3            // doubleword align source pointer
        stw     w1,0(rd)            // store first 8 bytes of source to align...
        stw     w2,4(rd)            // ...which could overwrite source
        add     rd,rd,w3            // doubleword align destination
        bne+    LForwardFloatLoop1  // at least 1 chunk, so enter loop

        subi    rc,rc,8             // unfortunate degenerate case: no chunks to xfer
        stfd    f0,0(rd)            // must store f0 since source might have been overwritten

// Medium reverse moves. This loop runs on all processors.

        add     rs,rs,rc            // point to other end of operands when in reverse
        andi.   w3,rs,3             // w3 <- #bytes to word align source
        lwz     w1,-4(rs)           // pre-fetch 1st 4 bytes of source
        sub     rc,rc,w3            // adjust count
        srwi    w4,rc,4             // get count of 16-byte chunks (>=1)
        mtcrf   0x01,rc             // remaining byte count (0-15) to cr7 for LShortReverse16
        mtctr   w4                  // prepare loop count
        beq+    2f                  // source already aligned

        sub     rs,rs,w3            // word-align source ptr
        lwz     w2,-4(rs)           // get 1st aligned word which we may overwrite
        stw     w1,-4(rd)           // store all 4 bytes to align without a loop
        mr      w1,w2               // shift 1st aligned source word to w1

// Long operands. Use branch table to decide which loop to use.
//      w1 = (rd-rs), used to determine alignment

        xor     w4,w1,rc            // we must move reverse if (rd-rs)<rc
        mflr    ra                  // save return address
        rlwinm  w5,w1,1,27,30       // w5 <- ((w1 & 0xF) << 1)
        bcl     20,31,1f            // use reserved form to get our location
        mflr    w3                  // w3 == addr(1b)
        lis     w8,0x0408           // load a 16 element, 2-bit array into w8...
        cntlzw  w4,w4               // find first difference between (rd-rs) and rc
        addis   w2,w3,ha16(LBranchTablePtr-1b)
        ori     w8,w8,0x040C        // ...used to map w5 to alignment encoding (ie, to 0-3)
        lwz     w2,lo16(LBranchTablePtr-1b)(w2) // w2 <- branch table address
        slw     w4,rc,w4            // bit 0 of w4 set iff (rd-rs)<rc
        rlwnm   w5,w8,w5,28,29      // put alignment encoding in bits 01100 of w5
        rlwimi  w2,w4,5,27,27       // put reverse bit in bit 10000 of branch table address
        lwzx    w3,w2,w5            // w3 <- load loop address from branch table
        neg     w1,rd               // start to compute destination alignment
        andi.   r0,w1,0x1F          // r0 <- bytes req'd to 32-byte align dest (if forward move)
        bctr                        // NB: r0/cr0 and w1 are passed as parameters
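
// (Sketch, for illustration only, not part of the original source: what the
// dispatch above computes. The constant 0x0408040C packs sixteen 2-bit codes
// that map the low four bits of (rd - rs) to the 0-3 relative-alignment
// encoding, and the cntlzw/slw pair extracts the reverse-move bit.)
//
//      #include <stdint.h>
//
//      // 0 = unaligned, 1 = word, 2 = doubleword, 3 = quadword (relative alignment)
//      static unsigned alignment_code(uint32_t delta) {
//          uint32_t packed = 0x0408040C;               // sixteen 2-bit codes, MSB first
//          unsigned rot    = 2 * (delta & 0xF);        // rlwinm: w5 = (delta & 0xF) << 1
//          uint32_t r      = rot ? (packed << rot) | (packed >> (32 - rot)) : packed;
//          // The asm leaves the code at mask 0xC (code*4), ready to use as a
//          // byte offset into the table; here we return the 0-3 code itself.
//          return (r >> 2) & 3;
//      }
//
//      static int must_reverse(uint32_t delta, uint32_t len) {
//          // cntlzw/slw trick: at the first bit where delta and len differ, len
//          // has a 1 exactly when delta < len unsigned, i.e. a reverse move is
//          // required. In C this is simply:
//          return delta < len;
//      }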

// G3, forward, long, unaligned.

        andi.   w3,w1,3             // w3 <- #bytes to word-align destination
        mtlr    ra                  // restore return address
        sub     rc,rc,w3            // adjust count for alignment
        srwi    r0,rc,5             // number of 32-byte chunks to xfer (>=1)
        mtctr   r0                  // prepare loop count
        beq+    1f                  // dest already aligned

        lwz     w2,0(rs)            // get first 4 bytes of source
        lwzx    w1,w3,rs            // get source bytes we might overwrite
        add     rs,rs,w3            // adjust source ptr
        stw     w2,0(rd)            // store all 4 bytes to avoid a loop
        add     rd,rd,w3            // word-align destination

// G3, forward, long, word aligned. We use floating pt even when only word aligned.

        andi.   w3,w1,7             // w3 <- #bytes to doubleword-align destination
        mtlr    ra                  // restore return address
        sub     rc,rc,w3            // adjust count for alignment
        srwi    r0,rc,5             // number of 32-byte chunks to xfer (>=1)
        mtctr   r0                  // prepare loop count
        beq     LForwardFloatLoop   // dest already aligned

        lwz     w1,0(rs)            // get first 8 bytes of source
        lfdx    f0,w3,rs            // get source bytes we might overwrite
        add     rs,rs,w3            // word-align source ptr
        stw     w1,0(rd)            // store all 8 bytes to avoid a loop

        .align  4                   // align since this loop is executed by G4s too
LForwardFloatLoop1:                 // enter here from LMediumAligned and above
        bdnz    LForwardFloatLoop

// G4 Forward, long, 16-byte aligned, 32-byte cache ops, use DCBA and DCBT.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecAlig32Dcba:
        bnel+   LAlign32            // align destination iff necessary
        bl      LPrepareForwardVectors
        mtlr    ra                  // restore return address before loading c128
        b       1f                  // enter aligned loop

        .align  5                   // long loop heads should be at least 16-byte aligned
1:                                  // loop over aligned 64-byte chunks
        dcbt    c96,rs              // pre-fetch three cache lines ahead
        dcbt    c128,rs             // and four
        dcba    0,rd                // avoid read of destination cache lines

LForwardVectorAlignedEnd:           // r0/cr0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
        beq-    3f                  // no leftover quadwords
2:                                  // loop over remaining quadwords (1-7)
        mtspr   VRSave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any

// G4 Forward, long, 16-byte aligned, 32-byte cache, use DCBT but not DCBA.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecAlig32NoDcba:
        bnel+   LAlign32            // align destination iff necessary
        bl      LPrepareForwardVectors
        mtlr    ra                  // restore return address before loading c128
        b       1f                  // enter aligned loop

        .align  4                   // balance 13-word loop between QWs...
        nop                         // ...which improves performance 5% +/-
1:                                  // loop over aligned 64-byte chunks
        dcbt    c96,rs              // pre-fetch three cache lines ahead
        dcbt    c128,rs             // and four
        b       LForwardVectorAlignedEnd

// G4 Forward, long, unaligned, 32-byte cache ops, use DCBT and DCBA. At least on
// some CPUs, this routine is no slower than the simpler aligned version that does
// not use permutes. But it cannot be used with aligned operands, because of the
// way it prefetches source QWs.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecUnal32Dcba:
        bnel+   LAlign32            // align destination iff necessary
        bl      LPrepareForwardVectors
        lvx     v1,0,rs             // prime loop
        mtlr    ra                  // restore return address before loading c128
        lvsl    vp,0,rs             // get permute vector to shift left
        b       1f                  // enter aligned loop

        .align  4                   // long loop heads should be at least 16-byte aligned
1:                                  // loop over aligned 64-byte destination chunks
        dcbt    c96,rs              // touch 3rd cache line ahead
        dcbt    c128,rs             // touch 4th cache line ahead
        dcba    0,rd                // avoid read of destination lines

LForwardVectorUnalignedEnd:         // r0/cr0=#QWs, rv=VRSave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        beq-    3f                  // no leftover quadwords
2:                                  // loop over remaining quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   VRSave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any

// G4 Forward, long, unaligned, 32-byte cache ops, use DCBT but not DCBA.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecUnal32NoDcba:
        bnel+   LAlign32            // align destination iff necessary
        bl      LPrepareForwardVectors
        lvx     v1,0,rs             // prime loop
        mtlr    ra                  // restore return address before loading c128
        lvsl    vp,0,rs             // get permute vector to shift left
        b       1f                  // enter aligned loop

        nop                         // balance 17-word loop between QWs
1:                                  // loop over aligned 64-byte destination chunks
        dcbt    c96,rs              // touch 3rd cache line ahead
        dcbt    c128,rs             // touch 4th cache line ahead
        b       LForwardVectorUnalignedEnd

// G3 Reverse, long, unaligned.

        bl      LAlign8Reverse      // 8-byte align destination
        mtlr    ra                  // restore return address
        srwi    r0,rc,5             // get count of 32-byte chunks to xfer (> 1)

// G3 Reverse, long, word aligned.

        bl      LAlign8Reverse      // 8-byte align
        mtlr    ra                  // restore return address
        srwi    r0,rc,5             // get count of 32-byte chunks to xfer (> 1)

// G4 Reverse, long, 16-byte aligned, 32-byte DCBT but no DCBA.

LReverseVectorAligned32:
        bl      LAlign32Reverse     // 32-byte align destination iff necessary
        bl      LPrepareReverseVectors
        mtlr    ra                  // restore return address before loading cm129
        b       1f                  // enter aligned loop

        nop                         // must start in 3rd word of QW...
        nop                         // ...to keep balanced
1:                                  // loop over aligned 64-byte chunks
        dcbt    cm97,rs             // pre-fetch three cache lines ahead
        dcbt    cm129,rs            // and four

LReverseVectorAlignedEnd:           // cr0/r0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
        beq     3f                  // no leftover quadwords
2:                                  // loop over 1-3 quadwords
        mtspr   VRSave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes iff any

// G4 Reverse, long, unaligned, 32-byte DCBT.

LReverseVectorUnal32:
        bl      LAlign32Reverse     // align destination iff necessary
        bl      LPrepareReverseVectors
        lvx     v1,cm1,rs           // prime loop
        mtlr    ra                  // restore return address before loading cm129
        lvsl    vp,0,rs             // get permute vector to shift left
        b       1f                  // enter aligned loop

        nop                         // start loop in 3rd word of QW to balance
1:                                  // loop over aligned 64-byte destination chunks
        dcbt    cm97,rs             // touch in 3rd source block
        dcbt    cm129,rs            // touch in 4th

LReverseVectorUnalignedEnd:         // r0/cr0=#QWs, rv=VRSave, v1=source QW, cr7=low 4 bits of rc, cr6 set on cr7
        beq     3f                  // no leftover quadwords
2:                                  // loop over 1-3 quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   VRSave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes iff any

// Subroutine to prepare for 64-byte forward vector loops.
// Returns many things:
//      ctr    = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7    = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6    = beq if leftover byte count is 0
//      rv     = original value of VRSave
// NB: c128 not set (if needed), since it is still "ra"

LPrepareForwardVectors:
        mfspr   rv,VRSave           // get bitmap of live vector registers
        srwi    r0,rc,6             // get count of 64-byte chunks to move (>=1)
        oris    w1,rv,0xFF00        // we use v0-v7
        mtcrf   0x01,rc             // prepare for moving last 0-15 bytes in LShort16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3 too
        mtspr   VRSave,w1           // update mask
        li      c16,16              // get constants used in lvx/stvx
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
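
// (Sketch, for illustration: how the setup above decomposes the byte count rc
// for a 64-byte-per-iteration vector loop. The struct and function names are
// hypothetical.)
//
//      struct vec_setup { unsigned chunks64, leftover_qw, leftover_bytes; };
//
//      static struct vec_setup decompose(unsigned rc) {
//          struct vec_setup s;
//          s.chunks64       = rc >> 6;         // srwi r0,rc,6           -> ctr
//          s.leftover_qw    = (rc >> 4) & 3;   // rlwinm. r0,rc,28,30,31 -> r0/cr0
//          s.leftover_bytes = rc & 15;         // mtcrf 0x01,rc          -> cr7, cr6
//          return s;
//      }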

// Subroutine to prepare for 64-byte reverse vector loops.
// Returns many things:
//      ctr    = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7    = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6    = beq if leftover byte count is 0
//      rv     = original value of VRSave
// NB: cm129 not set (if needed), since it is still "ra"

LPrepareReverseVectors:
        mfspr   rv,VRSave           // get bitmap of live vector registers
        srwi    r0,rc,6             // get count of 64-byte chunks to move (>=1)
        oris    w1,rv,0xFF00        // we use v0-v7
        mtcrf   0x01,rc             // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3 too
        mtspr   VRSave,w1           // update mask
        li      cm1,-1              // get constants used in lvx/stvx
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0

// Subroutine to align destination on a 32-byte boundary.
//      r0 = number of bytes to xfer (0-31)

        mtcrf   0x01,r0             // length to cr (faster to change 1 CR at a time)
        sub     rc,rc,r0            // adjust length
        bf      31,1f               // skip if no odd bit
        bf      30,2f               // halfword to move?
        bf      28,4f               // doubleword?
        bflr    27                  // done if no quadword to move

// Subroutine to align destination if necessary on a 32-byte boundary for reverse moves.
//      rs and rd still point to low end of operands
//      we adjust rs and rd to point to last byte moved

        add     rd,rd,rc            // point to last byte moved (ie, 1 past end of operands)
        andi.   r0,rd,0x1F          // r0 <- #bytes that must be moved to align destination
        mtcrf   0x01,r0             // length to cr (faster to change 1 CR at a time)
        sub     rc,rc,r0            // update length
        beqlr-                      // destination already 32-byte aligned
        bf      31,1f               // odd byte?
        bf      30,2f               // halfword to move?
        bf      28,4f               // doubleword?
        bflr    27                  // done if no quadwords

// Subroutine to align destination on an 8-byte boundary for reverse moves.
//      rs and rd still point to low end of operands
//      we adjust rs and rd to point to last byte moved

        add     rd,rd,rc            // point to last byte moved (ie, 1 past end of operands)
        andi.   r0,rd,0x7           // r0 <- #bytes that must be moved to align destination
        beqlr-                      // destination already 8-byte aligned
        mtctr   r0                  // set up for loop
        sub     rc,rc,r0            // update length

// Called by pthread initialization to set up the branch table pointer based on
// the CPU capability vector. This routine may be called more than once (for
// example, during testing.)

// Size of the buffer we use to do DCBA timing on G4:
#define kBufSiz 1024

// Stack frame size, which contains the 128-byte-aligned buffer:
#define kSFSize (kBufSiz+128+16)

// Iterations of the timing loop:

// Bit in cr5 used as a flag in timing loop:

__bcopy_initialize:                 // int _bcopy_initialize(void)
        mflr    ra                  // get return address
        stw     ra,8(r1)            // save it
        stwu    r1,-kSFSize(r1)     // carve our temp buffer from the stack
        addi    w6,r1,127+16        // get base address...
        rlwinm  w6,w6,0,0,24        // ...of our buffer, 128-byte aligned
        bcl     20,31,1f            // get our PIC base
        addis   w2,w1,ha16(__cpu_capabilities - 1b)
        lwz     w3,lo16(__cpu_capabilities - 1b)(w2)
        andi.   r0,w3,kUseDcba+kNoDcba+kCache32+k64Bit+kHasAltivec
        cmpwi   r0,kCache32+kHasAltivec // untyped G4?
        li      w8,0                // assume no need to test
        bne     2f                  // not an untyped G4, so do not test

// G4, but neither kUseDcba nor kNoDcba is set. Time and select the faster.

        crset   kDCBA               // first, use DCBA
        bl      LTest32             // time it
        mr      w8,w4               // w8 <- best time using DCBA
        srwi    r0,w8,3             // bias 12 pct in favor of not using DCBA...
        add     w8,w8,r0            // ...because DCBA is always slower with warm cache
        bl      LTest32             // w4 <- best time without DCBA
        cmplw   w8,w4               // which is better?
        li      w8,kUseDcba         // assume using DCBA is faster
        li      w8,kNoDcba          // no DCBA is faster
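
// (Sketch, not part of the original: the selection rule implemented above. The
// DCBA timing is penalized by one eighth, about 12 percent, because DCBA always
// loses with a warm cache, so it must win by a clear margin to be chosen.
// kUseDcba and kNoDcba are the capability bits used elsewhere in this file.)
//
//      static int pick_dcba_flag(unsigned best_with_dcba, unsigned best_without_dcba) {
//          unsigned biased = best_with_dcba + (best_with_dcba >> 3);
//          return (biased < best_without_dcba) ? kUseDcba : kNoDcba;
//      }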

// What branch table to use?

2:                                  // here with w8 = 0, kUseDcba, or kNoDcba
        bcl     20,31,4f            // get our PIC base again
        addis   w2,w1,ha16(__cpu_capabilities - 4b)
        lwz     w3,lo16(__cpu_capabilities - 4b)(w2)
        or      w3,w3,w8            // add in kUseDcba or kNoDcba if untyped G4
        mr      r3,w8               // return dynamic selection, if any (used in testing)

        andi.   r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
        cmpwi   r0,kHasAltivec+kCache32+kUseDcba // G4 with DCBA?
        addis   w4,w1,ha16(LG4UseDcba - 4b)
        addi    w4,w4,lo16(LG4UseDcba - 4b)

        andi.   r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
        cmpwi   r0,kHasAltivec+kCache32+kNoDcba // G4 without DCBA?
        addis   w4,w1,ha16(LG4NoDcba - 4b)
        addi    w4,w4,lo16(LG4NoDcba - 4b)

        andi.   r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32
        cmpwi   r0,kCache32         // G3?
        addis   w4,w1,ha16(LG3 - 4b)
        addi    w4,w4,lo16(LG3 - 4b)

// Map unrecognized CPU types to G3 (lowest common denominator)

5:                                  // w4 <- branch table pointer
        addis   w5,w1,ha16(LBranchTablePtr - 4b)
        stw     w4,lo16(LBranchTablePtr - 4b)(w5)
        lwz     ra,kSFSize+8(r1)    // recover return address
        mtlr    ra                  // restore it
        lwz     r1,0(r1)            // pop off our stack frame
        blr                         // return dynamic selection (or 0) in r3
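
// (Sketch, for illustration only: the table selection above expressed in C.
// The capability bits are those used in this file; the enum below is a
// hypothetical stand-in for the LG4UseDcba, LG4NoDcba, and LG3 table labels.)
//
//      typedef enum { kTableG4UseDcba, kTableG4NoDcba, kTableG3 } table_t;
//
//      #define kCpuTypeMask (kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba)
//
//      static table_t pick_branch_table(unsigned caps) {  // caps = _cpu_capabilities | dcba flag
//          if ((caps & kCpuTypeMask) == kHasAltivec+kCache32+kUseDcba)
//              return kTableG4UseDcba;                    // G4, 32-byte cache, DCBA wins
//          if ((caps & kCpuTypeMask) == kHasAltivec+kCache32+kNoDcba)
//              return kTableG4NoDcba;                     // G4, 32-byte cache, DCBA loses
//          if ((caps & (kHasAltivec+k64Bit+kCache128+kCache64+kCache32)) == kCache32)
//              return kTableG3;                           // G3
//          return kTableG3;                               // unrecognized: lowest common denominator
//      }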

// Subroutine to time a 32-byte cache.
//      kDCBA = set if we should use DCBA
//      w6    = base of buffer to use for test (kBufSiz bytes)
//      w4    = time of fastest loop (return value)

        li      w1,kLoopCnt         // number of times to loop
        li      w4,-1               // initialize fastest time
        mr      rd,w6               // initialize buffer ptr
        li      r0,kBufSiz/32       // r0 <- cache blocks to test
        dcbf    0,rd                // first, force the blocks out of the cache
        sync                        // make sure all the flushes take
        mr      rd,w6               // re-initialize buffer ptr
        mtctr   r0                  // reset cache-block count
        mftbu   w5                  // remember upper half so we can check for carry
        mftb    w2                  // start the timer
3:                                  // loop over cache blocks
        bf      kDCBA,4f            // should we DCBA?
        stfd    f1,0(rd)            // store the entire cache block
        cmpw    r0,w5               // did timebase carry?
        bne     1b                  // yes, retest rather than fuss
        sub     w3,w3,w2            // w3 <- time for this loop
        cmplw   w3,w4               // faster than current best?
        mr      w4,w3               // remember fastest time through loop
        subi    w1,w1,1             // decrement outer loop count
        cmpwi   w1,0                // more to go?
        bne     1b                  // loop if so