2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
23 * @APPLE_LICENSE_HEADER_END@
26 #include <sys/appleapiopts.h>
28 #include <ppc/proc_reg.h>
29 #include <machine/cpu_capabilities.h>
30 #include <machine/commpage.h>
33 // commpage_time_dcba() uses a stack frame as follows:
//   kSFSize bytes are carved below the caller's r1 with a single stwu.  The
//   kBufSiz-byte timing buffer lives inside that frame, starting at the first
//   128-byte boundary at or above r1+16 (see the addi/rlwinm pair at the top
//   of commpage_time_dcba).
35 #define kBufSiz 1024 // Size of the buffer we use to do DCBA timing on G4 (walked in 32-byte cache blocks)
36 #define kSFSize (kBufSiz+128+16) // Stack frame size: the buffer, plus 128 bytes of slack to 128-byte-align it, plus the 16 bytes skipped at the bottom of the frame
37 #define kLoopCnt 5 // Iterations of the timing loop; the fastest iteration is the one reported
38 #define kDCBA 22 // Bit in cr5 used as a flag in timing loop (set = issue DCBA before storing each block)
// Floating-point constants.
// NOTE(review): no section directive or label is visible ahead of this data,
// and only two of the "three doubleword fields" appear below; nothing in the
// visible code references them -- confirm against the full file.
41 .align 3 // three doubleword fields, so align to 2**3 = 8 bytes
45 .double 1.0e0 // kkDouble1
46 .long 0x43300000 // kkTicksPerSec (plus 2**52): high word of the IEEE-754 double 2**52
47 .long 0 // this is where we store ticks_per_sec, to float: low word of the same double (presumably 2**52 is subtracted afterwards to get the value as a double -- TODO confirm, that code is not visible here)
// Code for commpage_time_dcba().
        .text
        .align  2                           // instructions are word (4-byte) aligned
        .globl  EXT(commpage_time_dcba)

/*  ***************************************
 *  * C O M M P A G E _ T I M E _ D C B A *
 *  ***************************************
 *
 *  Not all processors that support the DCBA opcode actually benefit from it.
 *  Some store-gather and read-cancel well enough that there is no need to use
 *  DCBA to avoid fetching cache lines that will be completely overwritten, while
 *  others have this feature disabled (to work around errata etc), and so benefit
 *  from DCBA.  Since it is hard to tell the one group from the other, we just
 *  time loops with and without DCBA, and pick the fastest.  Thus we avoid
 *  delicate dependence on processor and/or platform revisions.
 *
 *  The DCBA time is inflated by one eighth (12.5 percent) before the
 *  comparison, so DCBA is recommended only when it wins by a clear margin.
 *
 *  We return either kDcbaRecommended or zero.
 *
 *      int commpage_time_dcba( void );
 *
 *  Register usage:
 *      r12     = caller's return address, held across both calls to LTest
 *      r11     = base of the 128-byte-aligned timing buffer in our stack frame
 *      r10     = biased time of the DCBA run
 *      r3      = time returned by LTest, then our return value
 *      kDCBA   = cr5 flag telling LTest whether to issue DCBA
 *  LTest itself uses r0, r3-r9, cr0 and CTR.
 */

LEXT(commpage_time_dcba)
        mflr    r12                         // get return
        stw     r12,8(r1)                   // save it in the caller's linkage area
        stwu    r1,-kSFSize(r1)             // carve our temp buffer from the stack
        addi    r11,r1,127+16               // get base address...
        rlwinm  r11,r11,0,0,24              // ...of our buffer, 128-byte aligned

        crset   kDCBA                       // first, use DCBA
        bl      LTest                       // time it with DCBA
        srwi    r0,r3,3                     // bias 12.5 pct in favor of not using DCBA...
        add     r10,r3,r0                   // ...because DCBA is always slower with warm cache

        crclr   kDCBA                       // now turn DCBA off, or both runs would be identical
        bl      LTest                       // time without DCBA

        cmplw   r10,r3                      // which is better? (unsigned: times are tick counts)
        mtlr    r12                         // restore return
        lwz     r1,0(r1)                    // pop off our stack frame via the back chain
        li      r3,kDcbaRecommended         // assume using DCBA is faster
        bltlr                               // biased DCBA time < plain time: return that answer
        li      r3,0                        // no DCBA is faster
        blr
// Subroutine to time a loop with or without DCBA.
//      kDCBA = set if we should use DCBA
//      r11 = base of buffer to use for test (kBufSiz bytes)
//
//      We return TBR ticks in r3: the fastest of kLoopCnt timed passes.
//      We use r0, r3-r9, cr0 and CTR.
//
// Each pass flushes the buffer out of the cache, then times how long it
// takes to overwrite every 32-byte cache block with eight word stores,
// optionally preceded by a DCBA of that block.  A pass during which the
// upper half of the timebase changes is discarded and repeated.

LTest:
        li      r4,kLoopCnt                 // number of times to loop
        li      r3,-1                       // initialize fastest time (largest unsigned value)
1:                                          // start (or restart) one timed pass
        mr      r6,r11                      // initialize buffer ptr
        li      r0,kBufSiz/32               // r0 <- cache blocks to test
        mtctr   r0                          // count the blocks to flush
2:
        dcbf    0,r6                        // first, force the blocks out of the cache
        addi    r6,r6,32                    // advance to next cache block
        bdnz    2b                          // flush them all
        sync                                // make sure all the flushes take

        mr      r6,r11                      // re-initialize buffer ptr
        mtctr   r0                          // reset cache-block count
        mftbu   r7                          // remember upper half so we can check for carry
        mftb    r8                          // start the timer
3:                                          // loop over cache blocks
        bf      kDCBA,4f                    // should we DCBA?
        dcba    0,r6                        // yes, allocate the block without fetching it
4:
        stw     r0,0(r6)                    // store the entire cache block
        stw     r0,4(r6)
        stw     r0,8(r6)
        stw     r0,12(r6)
        stw     r0,16(r6)
        stw     r0,20(r6)
        stw     r0,24(r6)
        stw     r0,28(r6)
        addi    r6,r6,32                    // advance to next cache block
        bdnz    3b                          // store them all

        mftb    r9                          // stop the timer
        mftbu   r0                          // re-read upper half
        cmpw    r0,r7                       // did timebase carry?
        bne     1b                          // yes, retest rather than fuss
        sub     r9,r9,r8                    // r9 <- time for this loop
        cmplw   r9,r3                       // faster than current best?
        bge     0f                          // no, keep the old best
        mr      r3,r9                       // remember fastest time through loop
0:
        subi    r4,r4,1                     // decrement outer loop count
        cmpwi   r4,0                        // more to go?
        bne     1b                          // loop if so
        blr                                 // return fastest time in r3