]>
Commit | Line | Data |
---|---|---|
43866e37 A |
1 | /* |
2 | * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. | |
7 | * | |
8 | * This file contains Original Code and/or Modifications of Original Code | |
9 | * as defined in and that are subject to the Apple Public Source License | |
10 | * Version 2.0 (the 'License'). You may not use this file except in | |
11 | * compliance with the License. Please obtain a copy of the License at | |
12 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
13 | * file. | |
14 | * | |
15 | * The Original Code and all software distributed under the License are | |
16 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
17 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
18 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
19 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
20 | * Please see the License for the specific language governing rights and | |
21 | * limitations under the License. | |
22 | * | |
23 | * @APPLE_LICENSE_HEADER_END@ | |
24 | */ | |
25 | ||
26 | #include <sys/appleapiopts.h> | |
27 | #include <ppc/asm.h> | |
28 | #include <ppc/proc_reg.h> | |
29 | #include <machine/cpu_capabilities.h> | |
30 | #include <machine/commpage.h> | |
31 | ||
32 | ||
33 | // commpage_time_dcba() uses a stack frame sized and laid out by the constants below: | |
34 | ||
35 | #define kBufSiz 1024 // Size in bytes of the buffer we use to do DCBA timing on G4 (LTest walks it in 32-byte blocks) | |
36 | #define kSFSize (kBufSiz+128+16) // Stack frame size: the buffer, 128 bytes of slack so its base can be rounded up to a 128-byte boundary, and the 16 bytes at the bottom of the frame that the buffer skips | |
37 | #define kLoopCnt 5 // Iterations of the timing loop (LTest keeps the fastest of these passes) | |
38 | #define kDCBA 22 // CR bit number (a bit in cr5) used as the "use DCBA" flag in the timing loop; tested with bf, set/cleared with crset/crclr | |
39 | ||
40 | .data | |
41 | .align 3 // doubleword-align the three doubleword fields that follow | |
42 | Ldata: | |
43 | .long 0 // kkBinary0: first doubleword (this word and the next), all zero bits | |
44 | .long 0 | |
45 | .double 1.0e0 // kkDouble1: second doubleword, the floating-point constant 1.0 | |
46 | .long 0x43300000 // kkTicksPerSec (plus 2**52): upper word of a double whose value is 2**52 when the lower word is zero | |
47 | .long 0 // lower word: this is where we store ticks_per_sec, to float. NOTE(review): no code visible in this file references Ldata - confirm whether it is still used | |
48 | ||
49 | .text | |
50 | .align 2 | |
51 | .globl EXT(commpage_time_dcba) | |
52 | ||
53 | /* *************************************** | |
54 | * * C O M M P A G E _ T I M E _ D C B A * | |
55 | * *************************************** | |
56 | * | |
57 | * Not all processors that support the DCBA opcode actually benefit from it. | |
58 | * Some store-gather and read-cancel well enough that there is no need to use | |
59 | * DCBA to avoid fetching cache lines that will be completely overwritten, while | |
60 | * others have this feature disabled (to work around errata etc), and so benefit | |
61 | * from DCBA. Since it is hard to tell the one group from the other, we just | |
62 | * time loops with and without DCBA, and pick the faster. Thus we avoid | |
63 | * delicate dependence on processor and/or platform revisions. | |
64 | * | |
65 | * We return kDcbaRecommended if the DCBA loop, handicapped by 1/8 of its own time, is still strictly faster; otherwise zero. | |
66 | * Clobbers r0, r3, r4, r6-r12, CTR, cr0 and CR bit kDCBA; temporarily pushes a kSFSize-byte stack frame. | |
67 | * int commpage_time_dcba( void ); | |
68 | */ | |
69 | ||
70 | LEXT(commpage_time_dcba) | |
71 | mflr r12 // r12 <- return address; kept in r12 because both "bl LTest" calls overwrite LR | |
72 | stw r12,8(r1) // also store it at 8 off the incoming stack pointer (presumably the ABI LR save slot - confirm); it is restored from r12, not from memory | |
73 | stwu r1,-kSFSize(r1) // carve our temp buffer from the stack: push a kSFSize-byte frame, old r1 saved at 0 of the new frame | |
74 | addi r11,r1,127+16 // get base address: skip the first 16 bytes of the frame, add 127 so the mask below rounds up... | |
75 | rlwinm r11,r11,0,0,24 // ...keep bits 0-24 (clear the low 7 bits): r11 = base of our buffer, 128-byte aligned | |
76 | crset kDCBA // first, use DCBA: set the flag bit that LTest tests | |
77 | bl LTest // time it with DCBA: r3 <- fastest tick count | |
78 | srwi r0,r3,3 // r0 <- r3/8: bias 12.5 pct in favor of not using DCBA... | |
79 | add r10,r3,r0 // ...r10 <- handicapped DCBA time, because DCBA is always slower with warm cache | |
80 | crclr kDCBA | |
81 | bl LTest // time without DCBA: r3 <- fastest tick count (LTest leaves r10-r12 alone) | |
82 | cmplw r10,r3 // which is better? unsigned compare of handicapped DCBA time against plain time, result in cr0 | |
83 | mtlr r12 // restore return address into LR | |
84 | lwz r1,0(r1) // pop off our stack frame by reloading the r1 that stwu saved | |
85 | li r3,kDcbaRecommended // assume using DCBA is faster; the bltlr below returns this when r10 < r3 | |
86 | bltlr | |
87 | li r3,0 // no DCBA is faster (or the two tied after the handicap) | |
88 | blr | |
89 | ||
90 | ||
91 | // Subroutine to time a loop with or without DCBA: flush the buffer, time overwriting all of it, keep the best of kLoopCnt passes. | |
92 | // kDCBA = CR flag bit, set if we should use DCBA | |
93 | // r11 = base of buffer to use for test (kBufSiz bytes); not modified | |
94 | // | |
95 | // We return TBR ticks in r3: the smallest elapsed lower-timebase (mftb) count seen over the passes. | |
96 | // We use r0, r3, r4, r6-r9, CTR and cr0; LR is not touched. | |
97 | ||
98 | LTest: | |
99 | li r4,kLoopCnt // r4 <- number of times to loop (passes remaining) | |
100 | li r3,-1 // initialize fastest time to the largest unsigned value so any smaller measurement replaces it | |
101 | 1: | |
102 | mr r6,r11 // initialize buffer ptr | |
103 | li r0,kBufSiz/32 // r0 <- cache blocks to test (32 bytes each); r0 is also the word value stored in the timed loop | |
104 | mtctr r0 | |
105 | 2: | |
106 | dcbf 0,r6 // first, force the blocks out of the cache so each timed pass starts with the buffer flushed | |
107 | addi r6,r6,32 | |
108 | bdnz 2b | |
109 | sync // make sure all the flushes take before the timer starts | |
110 | mr r6,r11 // re-initialize buffer ptr | |
111 | mtctr r0 // reset cache-block count | |
112 | mftbu r7 // remember upper half of the timebase so we can check for carry | |
113 | mftb r8 // start the timer: r8 <- lower half of the timebase | |
114 | 3: // loop over cache blocks (timed region) | |
115 | bf kDCBA,4f // should we DCBA? skip the dcba when the flag bit is clear | |
116 | dcba 0,r6 | |
117 | 4: | |
118 | stw r0,0(r6) // store the entire cache block: eight word stores cover all 32 bytes | |
119 | stw r0,4(r6) | |
120 | stw r0,8(r6) | |
121 | stw r0,12(r6) | |
122 | stw r0,16(r6) | |
123 | stw r0,20(r6) | |
124 | stw r0,24(r6) | |
125 | stw r0,28(r6) | |
126 | addi r6,r6,32 | |
127 | bdnz 3b | |
128 | mftb r9 | |
129 | mftbu r0 | |
130 | cmpw r0,r7 // did timebase carry? (upper half changed during the pass) | |
131 | bne 1b // yes, retest rather than fuss: redo this pass without decrementing r4 | |
132 | sub r9,r9,r8 // r9 <- time for this loop (end minus start, lower timebase) | |
133 | cmplw r9,r3 // faster than current best? (unsigned compare) | |
134 | bge 5f // no, keep the current best | |
135 | mr r3,r9 // remember fastest time through loop | |
136 | 5: | |
137 | subi r4,r4,1 // decrement outer loop count | |
138 | cmpwi r4,0 // more to go? | |
139 | bne 1b // loop if so, re-flushing the buffer first | |
140 | blr // return fastest time in r3 |