/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/17/2002, for G3, G4, and G4+.
 *
 * There are many paths through this code, depending on length, reverse/forward,
 * processor type, and alignment. We use reverse paths only when the operands
 * overlap and the destination is higher than the source. They are not quite as
 * fast as the forward paths.
 *
 * Judicious use of DCBTs, just far enough ahead to minimize waiting, is critical in
 * the inner loops for long operands. DST is less effective than DCBT, because it
 * can get out of sync with the inner loop. DCBTST is usually not a win, so we
 * don't use it except during initialization when we're not using the LSU.
 * We don't DCBT on G3, which only handles one load miss at a time.
 *
 * We don't use DCBZ, because it takes an alignment exception on uncached memory
 * like frame buffers. Bcopy to frame buffers must work. This hurts G3 in the
 * cold-cache case, but G4 can use DCBA (which does not take alignment exceptions.)
 *
 * Using DCBA on G4 is a tradeoff. For the cold-cache case it can be a big win,
 * since it avoids the read of destination cache lines. But for the hot-cache case
 * it is always slower, because of the cycles spent needlessly zeroing data. Some
 * machines store-gather and can cancel the read if all bytes of a line are stored,
 * others cannot. Unless explicitly told which is better, we time loops with and
 * without DCBA and use the fastest. Note that we never DCBA in reverse loops,
 * since by definition they are overlapped so dest lines will be in the cache.
 *
 * For longer operands we use an 8-element branch table, based on the CPU type,
 * to select the appropriate inner loop. The branch table is indexed as follows:
 *
 *      bit 10000 set if a Reverse move is required
 *      bits 01100 set on the relative operand alignment: 0=unaligned, 1=word,
 *                 2=doubleword, and 3=quadword.
 *
 * By "relatively" n-byte aligned, we mean the source and destination are a multiple
 * of n bytes apart (they need not be absolutely aligned.)
 * (An illustrative C sketch of this dispatch follows this comment block.)
 *
 * The branch table for the running CPU type is pointed to by LBranchTablePtr.
 * Initially, LBranchTablePtr points to G3's table, since that is the lowest
 * common denominator that will run on any CPU. Later, pthread initialization
 * sets up the _cpu_capabilities vector and calls _bcopy_initialize, which sets
 * up the correct pointer for the running CPU.
 *
 * We distinguish between "short", "medium", and "long" operands:
 *      short   (<= 32 bytes)       most common case, minimum path length is important
 *      medium  (> 32, < kLong)     too short for Altivec or use of cache ops like DCBA
 *      long    (>= kLong)          long enough for cache ops and to amortize use of Altivec
 *
 * WARNING: kLong must be >= 96, due to implicit assumptions about operand length.
 */
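
/* Illustrative sketch (not part of the original source): in C, the branch-table
 * dispatch described above amounts to classifying the relative alignment of the
 * operands and folding in the reverse flag. The names copy_loop_t and
 * branch_table below are hypothetical, standing in for the LG3/LG4* tables.
 *
 *      #include <stdint.h>
 *
 *      typedef void (*copy_loop_t)(void);      // one of the inner-loop routines
 *      extern copy_loop_t branch_table[8];     // hypothetical: the table for this CPU
 *
 *      static copy_loop_t select_loop(void *dst, const void *src, int reverse)
 *      {
 *          uint32_t delta = (uint32_t)((uintptr_t)dst - (uintptr_t)src);
 *          unsigned cls;                        // relative alignment class
 *          if      ((delta & 15) == 0) cls = 3; // relatively quadword aligned
 *          else if ((delta &  7) == 0) cls = 2; // relatively doubleword aligned
 *          else if ((delta &  3) == 0) cls = 1; // relatively word aligned
 *          else                        cls = 0; // unaligned
 *          return branch_table[(reverse ? 4 : 0) + cls];
 *      }
 */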

/* Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0    = "w7" or "r0" (NB: cannot use r0 for any constant such as "c16")
 *   r2    = "w8" or VRSave ("rv")
 *   r3    = not used, as memcpy and memmove return 1st parameter as a value
 *   r4    = source ptr ("rs")
 *   r5    = count of bytes to move ("rc")
 *   r6    = "w1", "c16", or "cm17"
 *   r7    = "w2", "c32", or "cm33"
 *   r8    = "w3", "c48", or "cm49"
 *   r9    = "w4", "c64", or "cm1"
 *   r10   = "w5", "c96", or "cm97"
 *   r11   = "w6", "c128", "cm129", or return address ("ra")
 *   r12   = destination ptr ("rd")
 *   f0-f8 = used for moving 8-byte aligned data
 *   v0    = permute vector ("vp")
 *   v1-v4 = qw's loaded from source ("v1", "v2", "v3", and "v4")
 *   v5-v7 = permuted qw's ("vx", "vy", and "vz")
 */

#include <architecture/ppc/asm_help.h>

// The branch tables, 8 entries per CPU type.
// NB: we depend on 5 low-order 0s in the address of branch tables.

        .align 5 // must be 32-byte aligned

// G3 (the default CPU type)

        .long LForwardWord // 000: forward, unaligned
        .long LForwardFloat // 001: forward, 4-byte aligned
        .long LForwardFloat // 010: forward, 8-byte aligned
        .long LForwardFloat // 011: forward, 16-byte aligned
        .long LReverseWord // 100: reverse, unaligned
        .long LReverseFloat // 101: reverse, 4-byte aligned
        .long LReverseFloat // 110: reverse, 8-byte aligned
        .long LReverseFloat // 111: reverse, 16-byte aligned

// G4s that benefit from DCBA.

        .long LForwardVecUnal32Dcba // 000: forward, unaligned
        .long LForwardVecUnal32Dcba // 001: forward, 4-byte aligned
        .long LForwardVecUnal32Dcba // 010: forward, 8-byte aligned
        .long LForwardVecAlig32Dcba // 011: forward, 16-byte aligned
        .long LReverseVectorUnal32 // 100: reverse, unaligned
        .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
        .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
        .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned

// G4s that should not use DCBA.

        .long LForwardVecUnal32NoDcba // 000: forward, unaligned
        .long LForwardVecUnal32NoDcba // 001: forward, 4-byte aligned
        .long LForwardVecUnal32NoDcba // 010: forward, 8-byte aligned
        .long LForwardVecAlig32NoDcba // 011: forward, 16-byte aligned
        .long LReverseVectorUnal32 // 100: reverse, unaligned
        .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
        .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
        .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned

// Pointer to the 8-element branch table for running CPU type:

        .long LG3 // default to G3 until "bcopy_initialize" called

// The CPU capability vector, initialized in pthread_init().
// "_bcopy_initialize" uses this to set up LBranchTablePtr:

        .globl __cpu_capabilities

// Bit definitions for _cpu_capabilities:

#define kHasAltivec 0x01
#define kCache32 0x04
#define kCache64 0x08
#define kCache128 0x10
#define kUseDcba 0x20

        .globl __bcopy_initialize

// Main entry points.

_bcopy: // void bcopy(const void *src, void *dst, size_t len)
        mr rd,r4 // reverse source and dest ptrs, to be like memcpy
        mr r10,r3
_memcpy: // void* memcpy(void *dst, const void *src, size_t len)
_memmove: // void* memmove(void *dst, const void *src, size_t len)
        cmplwi cr7,rc,32 // length <= 32 bytes?
        sub. w1,r3,rs // must move in reverse if (rd-rs)<rc, set cr0 on src==dst
        dcbt 0,rs // touch in the first line of source
        cmplw cr6,w1,rc // set cr6 blt iff we must move reverse
        cmplwi cr1,rc,kLong-1 // set cr1 bgt if long
        mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
        bgt- cr7,LMedium // longer than 32 bytes
        dcbtst 0,rd // touch in destination
        beq- cr7,LMove32 // special case moves of 32 bytes
        blt- cr6,LShortReverse0
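
/* Illustrative sketch (not part of the original source): the forward/reverse
 * decision made by "sub. w1,r3,rs" and "cmplw cr6,w1,rc" above is a single
 * unsigned compare in C. (The rs==rd case, caught separately via cr0, is a
 * no-op either way.)
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      static int must_copy_reverse(const void *dst, const void *src, size_t len)
 *      {
 *          // Reverse only when the destination starts inside the source
 *          // operand above its start, i.e. src < dst < src + len.
 *          return (uintptr_t)dst - (uintptr_t)src < (uintptr_t)len;
 *      }
 */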

// Forward short operands. This is the most frequent case, so it is inline.
// We also end up here to xfer the last 0-31 bytes of longer operands.

LShort: // WARNING: can fall into this routine
        andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
        mtcrf 0x01,rc // move rest of length to cr7
        beq 1f // quadword to move?
LShort16: // join here to xfer 0-15 bytes
        bf 28,2f // doubleword?
        bf 30,4f // halfword to move?
        bflr 31 // skip if no odd byte
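
/* Illustrative sketch (not part of the original source): LShort/LShort16 move a
 * 0-31 byte tail without a loop by testing single bits of the count (CR bits
 * 27..31 = 16, 8, 4, 2, 1 bytes). Roughly, in C, with memcpy standing in for
 * the individual load/store pairs:
 *
 *      #include <stddef.h>
 *      #include <string.h>
 *
 *      static void copy_tail(unsigned char *d, const unsigned char *s, size_t n)
 *      {
 *          if (n & 16) { memcpy(d, s, 16); d += 16; s += 16; }
 *          if (n &  8) { memcpy(d, s,  8); d +=  8; s +=  8; }
 *          if (n &  4) { memcpy(d, s,  4); d +=  4; s +=  4; }
 *          if (n &  2) { memcpy(d, s,  2); d +=  2; s +=  2; }
 *          if (n &  1) { *d = *s; }
 *      }
 */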

// Handle short reverse operands, up to kShort in length.
// This is also used to transfer the last 0-31 bytes of longer operands.

        add rs,rs,rc // adjust ptrs for reverse move
        andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
        mtcrf 0x01,rc // move rest of length to cr7
        beq 1f // quadword to move?
LShortReverse16: // join here to xfer 0-15 bytes and return
        bf 28,2f // doubleword?
        bf 30,4f // halfword to move?
        bflr 31 // done if no odd byte
        lbz w1,-1(rs) // no update

// Special case for 32-byte moves. Too long for LShort, too common for LMedium.

// Medium length operands (32 < rc < kLong.) These loops run on all CPUs, as the
// operands are not long enough to bother with the branch table, using cache ops, or
// Altivec. We word align the source, not the dest as we do for long operands, since
// doing so is faster on G4+ and probably beyond; we never DCBA on medium-length
// operands, so there is little opportunity to cancel reads of dest cache lines.
//      w1 = (rd-rs), used to check for alignment
//      cr0 = set on (rd-rs)
//      cr1 = bgt if long operand
//      cr6 = blt if reverse move
        dcbtst 0,rd // touch in 1st line of destination
        rlwinm r0,w1,0,29,31 // r0 <- ((rd-rs) & 7), ie 0 if doubleword aligned
        beq- LExit // early exit if (rs==rd), avoiding use of "beqlr"
        neg w2,rs // we align source, not dest, and assume forward
        cmpwi cr5,r0,0 // set cr5 beq if doubleword aligned
        bgt- cr1,LLong // handle long operands
        andi. w3,w2,3 // w3 <- #bytes to word-align source
        blt- cr6,LMediumReverse // handle reverse move
        lwz w1,0(rs) // pre-fetch first 4 bytes of source
        beq- cr5,LMediumAligned // operands are doubleword aligned
        sub rc,rc,w3 // adjust count for alignment
        mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShort16
        srwi w4,rc,4 // w4 <- number of 16-byte chunks to xfer (>=1)
        mtctr w4 // prepare loop count
        beq+ 2f // source already aligned

        lwzx w2,w3,rs // get 1st aligned word (which we might partially overwrite)
        add rs,rs,w3 // word-align source ptr
        stw w1,0(rd) // store all (w3) bytes at once to avoid a loop
        mr w1,w2 // first aligned word to w1

        .align 4 // align inner loops
1: // loop over 16-byte chunks

// Medium, doubleword aligned. We use floating point. Note that G4+ has bigger latencies
// and reduced throughput for floating pt loads and stores; future processors will probably
// have even worse lfd/stfd performance. We use it here because it is so important for G3,
// and not slower for G4+. But we only do so for doubleword aligned operands, whereas the
// G3-only long operand loops use floating pt even for word-aligned operands.
//      w1 = first 4 bytes of source

        andi. w3,w2,7 // already aligned?
        sub rc,rc,w3 // adjust count by 0-7 bytes
        lfdx f0,rs,w3 // pre-fetch first aligned source doubleword
        srwi w4,rc,5 // get count of 32-byte chunks (might be 0 if unaligned)
        beq- LForwardFloatLoop1 // already aligned

        cmpwi w4,0 // are there any 32-byte chunks to xfer?
        lwz w2,4(rs) // get 2nd (unaligned) source word
        add rs,rs,w3 // doubleword align source pointer
        stw w1,0(rd) // store first 8 bytes of source to align...
        stw w2,4(rd) // ...which could overwrite source
        add rd,rd,w3 // doubleword align destination
        bne+ LForwardFloatLoop1 // at least 1 chunk, so enter loop

        subi rc,rc,8 // unfortunate degenerate case: no chunks to xfer
        stfd f0,0(rd) // must store f0 since source might have been overwritten
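
/* Illustrative sketch (not part of the original source): the doubleword-aligned
 * medium path above stores the unaligned leading bytes, bumps both pointers to
 * an 8-byte boundary, then moves 32 bytes per iteration with lfd/stfd pairs.
 * The inner loop is roughly equivalent to this C (uint64_t standing in for the
 * floating-point registers; lead-in and tail handling omitted):
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      // dst, src assumed 8-byte aligned here; n a multiple of 32
 *      static void copy_fwd_dword_aligned(uint64_t *dst, const uint64_t *src, size_t n)
 *      {
 *          for (size_t chunks = n / 32; chunks != 0; chunks--) {
 *              dst[0] = src[0];
 *              dst[1] = src[1];
 *              dst[2] = src[2];
 *              dst[3] = src[3];
 *              dst += 4;
 *              src += 4;
 *          }
 *      }
 */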

// Medium reverse moves. This loop runs on all processors.

        add rs,rs,rc // point to other end of operands when in reverse
        andi. w3,rs,3 // w3 <- #bytes to word align source
        lwz w1,-4(rs) // pre-fetch 1st 4 bytes of source
        sub rc,rc,w3 // adjust count
        srwi w4,rc,4 // get count of 16-byte chunks (>=1)
        mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShortReverse16
        mtctr w4 // prepare loop count
        beq+ 2f // source already aligned

        sub rs,rs,w3 // word-align source ptr
        lwz w2,-4(rs) // get 1st aligned word which we may overwrite
        stw w1,-4(rd) // store all 4 bytes to align without a loop
        mr w1,w2 // shift 1st aligned source word to w1

// Long operands. Use branch table to decide which loop to use.
//      w1 = (rd-rs), used to determine alignment

        xor w4,w1,rc // we must move reverse if (rd-rs)<rc
        mflr ra // save return address
        rlwinm w5,w1,1,27,30 // w5 <- ((w1 & 0xF) << 1)
        bcl 20,31,1f // use reserved form to get our location
1:
        mflr w3 // w3 == addr(1b)
        lis w8,0x0408 // load a 16 element, 2-bit array into w8...
        cntlzw w4,w4 // find first difference between (rd-rs) and rc
        addis w2,w3,ha16(LBranchTablePtr-1b)
        ori w8,w8,0x040C // ...used to map w5 to alignment encoding (ie, to 0-3)
        lwz w2,lo16(LBranchTablePtr-1b)(w2) // w2 <- branch table address
        slw w4,rc,w4 // bit 0 of w4 set iff (rd-rs)<rc
        rlwnm w5,w8,w5,28,29 // put alignment encoding in bits 01100 of w5
        rlwimi w2,w4,5,27,27 // put reverse bit in bit 10000 of branch table address
        lwzx w3,w2,w5 // w3 <- load loop address from branch table
        neg w1,rd // start to compute destination alignment
        andi. r0,w1,0x1F // r0 <- bytes req'd to 32-byte align dest (if forward move)
        bctr // NB: r0/cr0 and w1 are passed as parameters
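
/* Illustrative sketch (not part of the original source): the dispatch above is
 * branch-free. cntlzw/slw derive the "reverse" bit from (rd-rs) vs rc, and the
 * constant 0x0408040C is a packed 16-entry, 2-bit table that rlwnm indexes with
 * ((rd-rs) & 0xF) to get the alignment encoding. Unpacked into C:
 *
 *      #include <stdint.h>
 *
 *      // alignment class for each value of ((dst-src) & 0xF); same contents
 *      // as the 2-bit fields packed into 0x0408040C
 *      static const unsigned char align_class[16] =
 *          { 3, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0 };
 *
 *      static unsigned table_byte_offset(uint32_t delta, uint32_t len)
 *      {
 *          unsigned reverse = (delta < len);       // what cntlzw/slw compute
 *          return (reverse << 4) | (align_class[delta & 0xF] << 2);
 *      }
 */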

// G3, forward, long, unaligned.

        andi. w3,w1,3 // w3 <- #bytes to word-align destination
        mtlr ra // restore return address
        sub rc,rc,w3 // adjust count for alignment
        srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
        mtctr r0 // prepare loop count
        beq+ 1f // dest already aligned

        lwz w2,0(rs) // get first 4 bytes of source
        lwzx w1,w3,rs // get source bytes we might overwrite
        add rs,rs,w3 // adjust source ptr
        stw w2,0(rd) // store all 4 bytes to avoid a loop
        add rd,rd,w3 // word-align destination

// G3, forward, long, word aligned. We use floating pt even when only word aligned.

        andi. w3,w1,7 // w3 <- #bytes to doubleword-align destination
        mtlr ra // restore return address
        sub rc,rc,w3 // adjust count for alignment
        srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
        mtctr r0 // prepare loop count
        beq LForwardFloatLoop // dest already aligned

        lwz w1,0(rs) // get first 8 bytes of source
        lfdx f0,w3,rs // get source bytes we might overwrite
        add rs,rs,w3 // word-align source ptr
        stw w1,0(rd) // store all 8 bytes to avoid a loop

        .align 4 // align since this loop is executed by G4s too
LForwardFloatLoop1: // enter here from LMediumAligned and above
        bdnz LForwardFloatLoop

// G4 Forward, long, 16-byte aligned, 32-byte cache ops, use DCBA and DCBT.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecAlig32Dcba:
        bnel+ LAlign32 // align destination iff necessary
        bl LPrepareForwardVectors
        mtlr ra // restore return address before loading c128
        b 1f // enter aligned loop

        .align 5 // long loop heads should be at least 16-byte aligned
1: // loop over aligned 64-byte chunks
        dcbt c96,rs // pre-fetch three cache lines ahead
        dcbt c128,rs // and four
        dcba 0,rd // avoid read of destination cache lines

LForwardVectorAlignedEnd: // r0/cr0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
        beq- 3f // no leftover quadwords
2: // loop over remaining quadwords (1-7)
        mtspr VRSave,rv // restore bitmap of live vr's
        bne cr6,LShort16 // handle last 0-15 bytes if any
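
/* Illustrative sketch (not part of the original source): each pass through the
 * aligned loop above moves 64 bytes with four lvx/stvx pairs while touching the
 * source three and four cache lines ahead. With Altivec intrinsics the data
 * movement looks roughly like this (__builtin_prefetch stands in for dcbt;
 * dcba has no portable equivalent and is omitted):
 *
 *      #include <altivec.h>
 *      #include <stddef.h>
 *
 *      // dst, src assumed 16-byte aligned; n a multiple of 64
 *      static void copy_fwd_vec_aligned(unsigned char *dst, const unsigned char *src, size_t n)
 *      {
 *          for (size_t chunks = n / 64; chunks != 0; chunks--) {
 *              __builtin_prefetch(src + 96);       // ~ dcbt c96,rs
 *              __builtin_prefetch(src + 128);      // ~ dcbt c128,rs
 *              vector unsigned char v1 = vec_ld( 0, src);
 *              vector unsigned char v2 = vec_ld(16, src);
 *              vector unsigned char v3 = vec_ld(32, src);
 *              vector unsigned char v4 = vec_ld(48, src);
 *              vec_st(v1,  0, dst);
 *              vec_st(v2, 16, dst);
 *              vec_st(v3, 32, dst);
 *              vec_st(v4, 48, dst);
 *              src += 64;
 *              dst += 64;
 *          }
 *      }
 */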

// G4 Forward, long, 16-byte aligned, 32-byte cache, use DCBT but not DCBA.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecAlig32NoDcba:
        bnel+ LAlign32 // align destination iff necessary
        bl LPrepareForwardVectors
        mtlr ra // restore return address before loading c128
        b 1f // enter aligned loop

        .align 4 // balance 13-word loop between QWs...
        nop // ...which improves performance 5% +/-
1: // loop over aligned 64-byte chunks
        dcbt c96,rs // pre-fetch three cache lines ahead
        dcbt c128,rs // and four
        b LForwardVectorAlignedEnd

// G4 Forward, long, unaligned, 32-byte cache ops, use DCBT and DCBA. At least on
// some CPUs, this routine is no slower than the simpler aligned version that does
// not use permutes. But it cannot be used with aligned operands, because of the
// way it prefetches source QWs.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecUnal32Dcba:
        bnel+ LAlign32 // align destination iff necessary
        bl LPrepareForwardVectors
        lvx v1,0,rs // prime loop
        mtlr ra // restore return address before loading c128
        lvsl vp,0,rs // get permute vector to shift left
        b 1f // enter aligned loop

        .align 4 // long loop heads should be at least 16-byte aligned
1: // loop over aligned 64-byte destination chunks
        dcbt c96,rs // touch 3rd cache line ahead
        dcbt c128,rs // touch 4th cache line ahead
        dcba 0,rd // avoid read of destination lines

LForwardVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        beq- 3f // no leftover quadwords
2: // loop over remaining quadwords
        vor v1,v2,v2 // v1 <- v2
        mtspr VRSave,rv // restore bitmap of live vr's
        bne cr6,LShort16 // handle last 0-15 bytes if any
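
/* Illustrative sketch (not part of the original source): the unaligned loops
 * above load aligned quadwords from the source and realign them into the
 * (32-byte aligned) destination with lvsl/vperm. A minimal C version of that
 * permute technique, one quadword per iteration instead of four:
 *
 *      #include <altivec.h>
 *      #include <stddef.h>
 *
 *      // dst 16-byte aligned, src arbitrary, n a multiple of 16; note the loop
 *      // reads one aligned quadword beyond the last source byte, which is why
 *      // the real code reserves this path for truly unaligned operands
 *      static void copy_fwd_vec_unaligned(unsigned char *dst, const unsigned char *src, size_t n)
 *      {
 *          vector unsigned char perm = vec_lvsl(0, src);  // shift-left permute vector
 *          vector unsigned char v1 = vec_ld(0, src);      // prime the loop
 *          for (size_t qws = n / 16; qws != 0; qws--) {
 *              vector unsigned char v2 = vec_ld(16, src); // next aligned quadword
 *              vec_st(vec_perm(v1, v2, perm), 0, dst);    // realign and store
 *              v1 = v2;
 *              src += 16;
 *              dst += 16;
 *          }
 *      }
 */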

// G4 Forward, long, unaligned, 32-byte cache ops, use DCBT but not DCBA.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecUnal32NoDcba:
        bnel+ LAlign32 // align destination iff necessary
        bl LPrepareForwardVectors
        lvx v1,0,rs // prime loop
        mtlr ra // restore return address before loading c128
        lvsl vp,0,rs // get permute vector to shift left
        b 1f // enter aligned loop

        nop // balance 17-word loop between QWs
1: // loop over aligned 64-byte destination chunks
        dcbt c96,rs // touch 3rd cache line ahead
        dcbt c128,rs // touch 4th cache line ahead
        b LForwardVectorUnalignedEnd

// G3 Reverse, long, unaligned.

        bl LAlign8Reverse // 8-byte align destination
        mtlr ra // restore return address
        srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)

// G3 Reverse, long, word aligned.

        bl LAlign8Reverse // 8-byte align
        mtlr ra // restore return address
        srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)

// G4 Reverse, long, 16-byte aligned, 32-byte DCBT but no DCBA.

LReverseVectorAligned32:
        bl LAlign32Reverse // 32-byte align destination iff necessary
        bl LPrepareReverseVectors
        mtlr ra // restore return address before loading cm129
        b 1f // enter aligned loop

        nop // must start in 3rd word of QW...
        nop // ...to keep balanced
1: // loop over aligned 64-byte chunks
        dcbt cm97,rs // pre-fetch three cache lines ahead
        dcbt cm129,rs // and four

LReverseVectorAlignedEnd: // cr0/r0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
        beq 3f // no leftover quadwords
2: // loop over 1-3 quadwords
        mtspr VRSave,rv // restore bitmap of live vr's
        bne cr6,LShortReverse16 // handle last 0-15 bytes iff any

// G4 Reverse, long, unaligned, 32-byte DCBT.

LReverseVectorUnal32:
        bl LAlign32Reverse // align destination iff necessary
        bl LPrepareReverseVectors
        lvx v1,cm1,rs // prime loop
        mtlr ra // restore return address before loading cm129
        lvsl vp,0,rs // get permute vector to shift left
        b 1f // enter aligned loop

        nop // start loop in 3rd word on QW to balance
1: // loop over aligned 64-byte destination chunks
        dcbt cm97,rs // touch in 3rd source block
        dcbt cm129,rs // touch in 4th

LReverseVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=source QW, cr7=low 4 bits of rc, cr6 set on cr7
        beq 3f // no leftover quadwords
2: // loop over 1-3 quadwords
        vor v1,v2,v2 // v1 <- v2
        mtspr VRSave,rv // restore bitmap of live vr's
        bne cr6,LShortReverse16 // handle last 0-15 bytes iff any

// Subroutine to prepare for 64-byte forward vector loops.
// Returns many things:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
// NB: c128 not set (if needed), since it is still "ra"

LPrepareForwardVectors:
        mfspr rv,VRSave // get bitmap of live vector registers
        srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
        oris w1,rv,0xFF00 // we use v0-v7
        mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShort16
        rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
        mtspr VRSave,w1 // update mask
        li c16,16 // get constants used in lvx/stvx
        mtctr r0 // set up loop count
        cmpwi cr6,w3,0 // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
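
/* Illustrative sketch (not part of the original source): LPrepareForwardVectors
 * just splits the remaining length into 64-byte chunks (ctr), 0-3 leftover
 * quadwords (r0/cr0), and a 0-15 byte tail (cr7/cr6):
 *
 *      #include <stddef.h>
 *
 *      struct vec_counts { size_t chunks, quads, tail; };
 *
 *      static struct vec_counts split_length(size_t len)
 *      {
 *          struct vec_counts c;
 *          c.chunks = len >> 6;         // srwi r0,rc,6
 *          c.quads  = (len >> 4) & 3;   // rlwinm. r0,rc,28,30,31
 *          c.tail   = len & 15;         // rlwinm w3,rc,0,28,31
 *          return c;
 *      }
 */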

// Subroutine to prepare for 64-byte reverse vector loops.
// Returns many things:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
// NB: cm129 not set (if needed), since it is still "ra"

LPrepareReverseVectors:
        mfspr rv,VRSave // get bitmap of live vector registers
        srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
        oris w1,rv,0xFF00 // we use v0-v7
        mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
        mtspr VRSave,w1 // update mask
        li cm1,-1 // get constants used in lvx/stvx
        mtctr r0 // set up loop count
        cmpwi cr6,w3,0 // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0

// Subroutine to align destination on a 32-byte boundary.
//      r0 = number of bytes to xfer (0-31)

        mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
        sub rc,rc,r0 // adjust length
        bf 31,1f // skip if no odd bit
        bf 30,2f // halfword to move?
        bf 28,4f // doubleword?
        bflr 27 // done if no quadword to move

// Subroutine to align destination if necessary on a 32-byte boundary for reverse moves.
//      rs and rd still point to low end of operands
//      we adjust rs and rd to point to last byte moved

        add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
        andi. r0,rd,0x1F // r0 <- #bytes that must be moved to align destination
        mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
        sub rc,rc,r0 // update length
        beqlr- // destination already 32-byte aligned

        bf 31,1f // odd byte?
        bf 30,2f // halfword to move?
        bf 28,4f // doubleword?
        bflr 27 // done if no quadwords

// Subroutine to align destination on an 8-byte boundary for reverse moves.
//      rs and rd still point to low end of operands
//      we adjust rs and rd to point to last byte moved

        add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
        andi. r0,rd,0x7 // r0 <- #bytes that must be moved to align destination
        beqlr- // destination already 8-byte aligned
        mtctr r0 // set up for loop
        sub rc,rc,r0 // update length
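
/* Illustrative sketch (not part of the original source): for reverse moves the
 * alignment subroutines above first point rs and rd one byte past the high end
 * of each operand, then peel off (rd & (align-1)) bytes so the main loops see
 * an aligned destination. Roughly, in C (byte-by-byte peel shown; LAlign32Reverse
 * actually peels in power-of-two pieces like LShortReverse16):
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      static void reverse_align_dest(unsigned char **pd, const unsigned char **ps,
 *                                     size_t *pn, size_t align)
 *      {
 *          unsigned char *d = *pd + *pn;           // one past the last byte to move
 *          const unsigned char *s = *ps + *pn;
 *          size_t peel = (uintptr_t)d & (align - 1);
 *          *pn -= peel;                            // caller guarantees *pn >= peel
 *          while (peel--) *--d = *--s;
 *          *pd = d;                                // both now point just past the
 *          *ps = s;                                // aligned remainder
 *      }
 */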

// Called by pthread initialization to set up the branch table pointer based on
// the CPU capability vector. This routine may be called more than once (for
// example, during testing.)

// Size of the buffer we use to do DCBA timing on G4:
#define kBufSiz 1024

// Stack frame size, which contains the 128-byte-aligned buffer:
#define kSFSize (kBufSiz+128+16)

// Iterations of the timing loop:

// Bit in cr5 used as a flag in timing loop:

__bcopy_initialize: // int _bcopy_initialize(void)
        mflr ra // get return
        stw ra,8(r1) // save
        stwu r1,-kSFSize(r1) // carve our temp buffer from the stack
        addi w6,r1,127+16 // get base address...
        rlwinm w6,w6,0,0,24 // ...of our buffer, 128-byte aligned
        bcl 20,31,1f // get our PIC base
1:
        addis w2,w1,ha16(__cpu_capabilities - 1b)
        lwz w3,lo16(__cpu_capabilities - 1b)(w2)
        andi. r0,w3,kUseDcba+kNoDcba+kCache32+k64Bit+kHasAltivec
        cmpwi r0,kCache32+kHasAltivec // untyped G4?
        li w8,0 // assume no need to test
        bne 2f // not an untyped G4, so do not test

// G4, but neither kUseDcba nor kNoDcba is set. Time and select fastest.

        crset kDCBA // first, use DCBA
        bl LTest32 // time it
        mr w8,w4 // w8 <- best time using DCBA
        srwi r0,w8,3 // bias 12 pct in favor of not using DCBA...
        add w8,w8,r0 // ...because DCBA is always slower with warm cache
        bl LTest32 // w4 <- best time without DCBA
        cmplw w8,w4 // which is better?
        li w8,kUseDcba // assume using DCBA is faster
        li w8,kNoDcba // no DCBA is faster

// What branch table to use?

2: // here with w8 = 0, kUseDcba, or kNoDcba
        bcl 20,31,4f // get our PIC base again
4:
        addis w2,w1,ha16(__cpu_capabilities - 4b)
        lwz w3,lo16(__cpu_capabilities - 4b)(w2)
        or w3,w3,w8 // add in kUseDcba or kNoDcba if untyped G4
        mr r3,w8 // return dynamic selection, if any (used in testing)

        andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
        cmpwi r0,kHasAltivec+kCache32+kUseDcba // G4 with DCBA?
        addis w4,w1,ha16(LG4UseDcba - 4b)
        addi w4,w4,lo16(LG4UseDcba - 4b)

        andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
        cmpwi r0,kHasAltivec+kCache32+kNoDcba // G4 without DCBA?
        addis w4,w1,ha16(LG4NoDcba - 4b)
        addi w4,w4,lo16(LG4NoDcba - 4b)

        andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32
        cmpwi r0,kCache32 // G3?
        addis w4,w1,ha16(LG3 - 4b)
        addi w4,w4,lo16(LG3 - 4b)

// Map unrecognized CPU types to G3 (lowest common denominator)

5: // w4 <- branch table pointer
        addis w5,w1,ha16(LBranchTablePtr - 4b)
        stw w4,lo16(LBranchTablePtr - 4b)(w5)
        lwz ra,kSFSize+8(r1) // recover return address
        mtlr ra // restore it
        lwz r1,0(r1) // pop off our stack frame
        blr // return dynamic selection (or 0) in r3
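
/* Illustrative sketch (not part of the original source): the selection above,
 * simplified to the bits #defined earlier in this file (the real code also
 * checks k64Bit, kNoDcba and the 64/128-byte cache-line bits). time_copy_loop()
 * is a hypothetical stand-in for the two LTest32 runs.
 *
 *      #include <stdint.h>
 *
 *      #define kHasAltivec 0x01
 *      #define kCache32    0x04
 *      #define kUseDcba    0x20
 *
 *      extern uint32_t _cpu_capabilities;
 *      extern void *LG3[8], *LG4UseDcba[8], *LG4NoDcba[8];   // the branch tables
 *      void **LBranchTablePtr = LG3;                         // G3 until initialized
 *
 *      extern uint32_t time_copy_loop(int use_dcba);         // best-of-N loop time
 *
 *      uint32_t bcopy_initialize_sketch(void)
 *      {
 *          uint32_t caps = _cpu_capabilities;
 *          int use_dcba  = (caps & kUseDcba) != 0;
 *
 *          if ((caps & (kHasAltivec | kCache32 | kUseDcba)) == (kHasAltivec | kCache32)) {
 *              // Untyped G4: time both variants, biasing ~12% (1/8) against
 *              // DCBA because it always loses when the cache is warm.
 *              uint32_t with = time_copy_loop(1), without = time_copy_loop(0);
 *              use_dcba = (with + with / 8 < without);
 *          }
 *
 *          if (caps & kHasAltivec)
 *              LBranchTablePtr = use_dcba ? LG4UseDcba : LG4NoDcba;
 *          else
 *              LBranchTablePtr = LG3;          // lowest common denominator
 *          return (uint32_t)use_dcba;          // dynamic selection, for testing
 *      }
 */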

// Subroutine to time a 32-byte cache.
//      kDCBA = set if we should use DCBA
//      w6 = base of buffer to use for test (kBufSiz bytes)
//      w4 = we return time of fastest loop in w4

        li w1,kLoopCnt // number of times to loop
        li w4,-1 // initialize fastest time
1:
        mr rd,w6 // initialize buffer ptr
        li r0,kBufSiz/32 // r0 <- cache blocks to test
2:
        dcbf 0,rd // first, force the blocks out of the cache
        sync // make sure all the flushes take
        mr rd,w6 // re-initialize buffer ptr
        mtctr r0 // reset cache-block count
        mftbu w5 // remember upper half so we can check for carry
        mftb w2 // start the timer
3: // loop over cache blocks
        bf kDCBA,4f // should we DCBA?
4:
        stfd f1,0(rd) // store the entire cache block
        cmpw r0,w5 // did timebase carry?
        bne 1b // yes, retest rather than fuss
        sub w3,w3,w2 // w3 <- time for this loop
        cmplw w3,w4 // faster than current best?
        mr w4,w3 // remember fastest time through loop
        subi w1,w1,1 // decrement outer loop count
        cmpwi w1,0 // more to go?
        bne 1b // loop if so
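
/* Illustrative sketch (not part of the original source): LTest32 stores over the
 * whole buffer kLoopCnt times, flushing it from the cache before each pass and
 * keeping the fastest timebase delta; a pass is thrown away (and rerun) if the
 * upper timebase word carries mid-measurement. read_timebase_*(), flush_block()
 * and dcba_block() are hypothetical stand-ins for mftb/mftbu, dcbf and dcba.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      extern uint32_t read_timebase_upper(void);   // mftbu
 *      extern uint32_t read_timebase_lower(void);   // mftb
 *      extern void flush_block(void *p);            // dcbf
 *      extern void dcba_block(void *p);             // dcba
 *
 *      static uint32_t time_store_loop(unsigned char *buf, size_t bufsiz,
 *                                      int use_dcba, int passes)
 *      {
 *          uint32_t best = UINT32_MAX;
 *          while (passes > 0) {
 *              for (size_t i = 0; i < bufsiz; i += 32)     // force buffer out of cache
 *                  flush_block(buf + i);
 *              uint32_t hi = read_timebase_upper();
 *              uint32_t t0 = read_timebase_lower();
 *              for (size_t i = 0; i < bufsiz; i += 32) {   // store each cache block
 *                  if (use_dcba)
 *                      dcba_block(buf + i);
 *                  for (int j = 0; j < 32; j += 8)
 *                      *(volatile uint64_t *)(buf + i + j) = 0;  // ~ stfd
 *              }
 *              uint32_t t1 = read_timebase_lower();
 *              if (read_timebase_upper() != hi)            // timebase carried: retest
 *                  continue;
 *              if (t1 - t0 < best)
 *                  best = t1 - t0;
 *              passes--;
 *          }
 *          return best;
 *      }
 */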