]>
Commit | Line | Data |
---|---|---|
43866e37 | 1 | /* |
0c530ab8 | 2 | * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved. |
43866e37 | 3 | * |
2d21ac55 | 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
43866e37 | 5 | * |
2d21ac55 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
8f6c56a5 | 14 | * |
2d21ac55 A |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
8f6c56a5 A |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
2d21ac55 A |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
8f6c56a5 | 25 | * |
2d21ac55 | 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
43866e37 A |
27 | */ |
28 | ||
29 | #include <sys/appleapiopts.h> | |
30 | #include <ppc/asm.h> | |
31 | #include <ppc/proc_reg.h> | |
32 | #include <machine/cpu_capabilities.h> | |
33 | #include <machine/commpage.h> | |
34 | ||
35 | ||
36 | // commpage_time_dcba() uses a stack frame as follows: | |
37 | ||
38 | #define kBufSiz 1024 // Size in bytes of the buffer we use to do DCBA timing on G4 | |
39 | #define kSFSize (kBufSiz+128+16) // Stack frame size: the buffer, plus slack to 128-byte-align it, plus the 16 bytes skipped at the frame base | |
40 | #define kLoopCnt 5 // Iterations of the timing loop; the fastest pass is the one reported | |
41 | #define kDCBA 22 // CR bit (in cr5) used as the "use DCBA" flag in the timing loop | |
42 | ||
55e303ae A |
43 | |
44 | // commpage_set_timestamp() uses the red zone for temporary storage: | |
45 | ||
46 | #define rzSaveF1 -8 // save slot for caller's FPR1 (offset from r1) | |
47 | #define rzSaveF2 -16 // save slot for caller's FPR2 | |
48 | #define rzSaveF3 -24 // save slot for caller's FPR3 | |
49 | #define rzSaveF4 -32 // save slot for caller's FPR4 | |
50 | #define rzSaveF5 -40 // save slot for caller's FPR5 | |
51 | #define rzNewTimeBase -48 // 8-byte scratch: the new timebase is stored here as two words, then loaded into a FPR with lfd | |
52 | ||
53 | ||
54 | // commpage_set_timestamp() uses the following data. kkTicksPerSec remembers | |
55 | // the number used to compute _COMM_PAGE_SEC_PER_TICK. Since this constant | |
56 | // rarely changes, we use it to avoid needless recomputation. It is a double | |
57 | // value, pre-initialized with an exponent of 2**52. | |
58 | ||
59 | #define kkBinary0 0 // offset in data to long long 0 (a constant) | |
60 | #define kkDouble1 8 // offset in data to double 1.0 (a constant) | |
61 | #define kkTicksPerSec 16 // offset in data to double(2**52 + ticks_per_sec); the word at +4 holds the last ticks_per_sec as an integer | |
62 | ||
43866e37 A |
63 | .data |
64 | .align 3 // 8-byte alignment for the three doubleword fields | |
65 | Ldata: | |
66 | .long 0 // kkBinary0: first word of the 64-bit zero | |
67 | .long 0 | |
68 | .double 1.0e0 // kkDouble1: the constant 1.0 | |
69 | .long 0x43300000 // kkTicksPerSec, first word: sign/exponent bits of the double 2**52 | |
70 | .long 0 // second word: ticks_per_sec is stored here as an integer, so the doubleword reads as double(2**52 + ticks_per_sec) | |
71 | ||
72 | .text | |
73 | .align 2 | |
74 | .globl EXT(commpage_time_dcba) | |
55e303ae A |
75 | .globl EXT(commpage_set_timestamp) |
76 | ||
77 | ||
78 | /* *********************************************** | |
79 | * * C O M M P A G E _ S E T _ T I M E S T A M P * | |
80 | * *********************************************** | |
81 | * | |
91447636 | 82 | * Update the gettimeofday() shared data on the commpages, as follows: |
0c530ab8 | 83 | * _COMM_PAGE_TIMESTAMP = the clock offset at timebase (seconds) |
55e303ae A |
84 | * _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid |
85 | * _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double) | |
86 | * The convention is that if the timebase is 0, the data is invalid. Because other | |
87 | * CPUs are reading the three values asynchronously and must get a consistent set, | |
88 | * it is critical that we update them with the following protocol: | |
89 | * 1. set timebase to 0 (atomically), to invalidate all three values | |
90 | * 2. eieio (to create a barrier in stores to cacheable memory) | |
91 | * 3. change timestamp and "secs per tick" | |
92 | * 4. eieio | |
93 | * 5. set timebase nonzero (atomically) | |
94 | * This works because readers read the timebase, then the timestamp and divisor, sync | |
95 | * if MP, then read the timebase a second time and check to be sure it is equal to the first. | |
96 | * | |
97 | * We could save a few cycles on 64-bit machines by special casing them, but it probably | |
98 | * isn't necessary because this routine shouldn't be called very often. | |
99 | * | |
100 | * When called: | |
101 | * r3 = upper half of timebase (timebase is disabled if 0) | |
102 | * r4 = lower half of timebase | |
0c530ab8 A |
103 | * r5 = upper half of timestamp |
104 | * r6 = lower half of timestamp | |
55e303ae A |
105 | * r7 = divisor (ie, timebase ticks per sec) |
106 | * We set up: | |
107 | * r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec) | |
91447636 A |
108 | * r9 = ptr to 32-bit commpage in kernel map |
109 | * r10 = ptr to 64-bit commpage in kernel map | |
55e303ae A |
110 | * |
111 | * --> Interrupts must be disabled and rtclock locked when called. <-- | |
112 | */ | |
113 | ||
114 | .align 5 | |
0c530ab8 | 115 | LEXT(commpage_set_timestamp) // void commpage_set_timestamp(tbr,secs,divisor) |
55e303ae A |
116 | mfmsr r11 // get MSR; r11 keeps the original value for the restore at exit |
117 | ori r2,r11,MASK(MSR_FP) // turn FP on (FPRs are used for the atomic 8-byte stores) | |
118 | mtmsr r2 | |
119 | isync // wait until MSR changes take effect | |
120 | ||
121 | or. r0,r3,r4 // is timebase 0? (thus disabled); cr0 is tested by the "beq 3f" after the first eieio | |
122 | lis r8,hi16(Ldata) // point to our data | |
91447636 A |
123 | lis r9,ha16(EXT(commPagePtr32)) // get ptrs to address of commpages in kernel map |
124 | lis r10,ha16(EXT(commPagePtr64)) | |
55e303ae A |
125 | stfd f1,rzSaveF1(r1) // save a FPR in the red zone |
126 | ori r8,r8,lo16(Ldata) | |
91447636 A |
127 | lwz r9,lo16(EXT(commPagePtr32))(r9) // r9 <- 32-bit commpage ptr |
128 | lwz r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr | |
55e303ae A |
129 | lfd f1,kkBinary0(r8) // f1 <- 64-bit zero, used to invalidate the timebase |
130 | li r0,_COMM_PAGE_BASE_ADDRESS // get va in user space of commpage | |
91447636 A |
131 | cmpwi cr1,r9,0 // is 32-bit commpage allocated yet? |
132 | cmpwi cr6,r10,0 // is 64-bit commpage allocated yet? | |
133 | sub r9,r9,r0 // r9 <- 32-bit commpage address, biased by user va | |
134 | sub r10,r10,r0 // r10<- 64-bit commpage address, biased the same way | |
135 | beq-- cr1,3f // skip if 32-bit commpage not allocated (64-bit won't be either) | |
136 | bne++ cr6,1f // skip if 64-bit commpage is allocated | |
137 | mr r10,r9 // if no 64-bit commpage, point to 32-bit version with r10 too | |
138 | 1: | |
139 | stfd f1,_COMM_PAGE_TIMEBASE(r9) // turn off the 32-bit-commpage timestamp (atomically) | |
140 | stfd f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too | |
55e303ae A |
141 | eieio // make sure all CPUs see it is off |
142 | beq 3f // timebase arg was 0 (cr0 from the or. above): all we had to do is turn off timestamp | |
143 | ||
144 | lwz r0,kkTicksPerSec+4(r8) // get last ticks_per_sec (or 0 if first) | |
145 | stw r3,rzNewTimeBase(r1) // store new timebase so we can lfd | |
146 | stw r4,rzNewTimeBase+4(r1) | |
147 | cmpw r0,r7 // do we need to recompute _COMM_PAGE_SEC_PER_TICK? (cr0 is tested by the "beq++ 2f" below) | |
91447636 | 148 | stw r5,_COMM_PAGE_TIMESTAMP(r9) // store the new timestamp in the 32-bit page |
55e303ae | 149 | stw r6,_COMM_PAGE_TIMESTAMP+4(r9) |
91447636 A |
150 | stw r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage |
151 | stw r6,_COMM_PAGE_TIMESTAMP+4(r10) | |
55e303ae A |
152 | lfd f1,rzNewTimeBase(r1) // get timebase in a FPR so we can store atomically |
153 | beq++ 2f // same ticks_per_sec, no need to recompute | |
154 | ||
155 | stw r7,kkTicksPerSec+4(r8) // must recompute SEC_PER_TICK; remember the new ticks_per_sec | |
156 | stfd f2,rzSaveF2(r1) // we'll need a few more temp FPRs (f2-f5 saved in the red zone) | |
157 | stfd f3,rzSaveF3(r1) | |
158 | stfd f4,rzSaveF4(r1) | |
159 | stfd f5,rzSaveF5(r1) | |
160 | lfd f2,_COMM_PAGE_2_TO_52(r9) // f2 <- double(2**52) | |
161 | lfd f3,kkTicksPerSec(r8) // f3 <- double(2**52 + new ticks_per_sec) | |
162 | lfd f4,kkDouble1(r8) // f4 <- double(1.0) | |
163 | mffs f5 // save caller's FPSCR | |
0c530ab8 | 164 | mtfsfi 7,1 // clear Inexact Exception enable bit, set round-to-zero |
55e303ae A |
165 | fsub f3,f3,f2 // f3 <- double(ticks_per_sec) |
166 | fdiv f3,f4,f3 // divide 1 by ticks_per_sec to get SEC_PER_TICK | |
167 | stfd f3,_COMM_PAGE_SEC_PER_TICK(r9) | |
91447636 | 168 | stfd f3,_COMM_PAGE_SEC_PER_TICK(r10) |
55e303ae A |
169 | mtfsf 0xFF,f5 // restore FPSCR |
170 | lfd f2,rzSaveF2(r1) // restore FPRs | |
171 | lfd f3,rzSaveF3(r1) | |
172 | lfd f4,rzSaveF4(r1) | |
173 | lfd f5,rzSaveF5(r1) | |
174 | 2: // f1 == new timebase | |
175 | eieio // wait until the stores take (barrier before the timebase goes nonzero) | |
176 | stfd f1,_COMM_PAGE_TIMEBASE(r9) // then turn the timestamp back on (atomically) | |
91447636 | 177 | stfd f1,_COMM_PAGE_TIMEBASE(r10) // both |
55e303ae A |
178 | 3: // here once all fields updated, or when there was nothing to update |
179 | lfd f1,rzSaveF1(r1) // restore last FPR | |
180 | mtmsr r11 // restore the MSR saved at entry (turn FP back off) | |
181 | isync | |
182 | blr | |
183 | ||
43866e37 A |
184 | |
185 | /* *************************************** | |
186 | * * C O M M P A G E _ T I M E _ D C B A * | |
187 | * *************************************** | |
188 | * | |
189 | * Not all processors that support the DCBA opcode actually benefit from it. | |
190 | * Some store-gather and read-cancel well enough that there is no need to use | |
191 | * DCBA to avoid fetching cache lines that will be completely overwritten, while | |
192 | * others have this feature disabled (to work around errata etc), and so benefit | |
193 | * from DCBA. Since it is hard to tell the one group from the other, we just | |
194 | * time loops with and without DCBA, and pick the fastest. Thus we avoid | |
195 | * delicate dependence on processor and/or platform revisions. | |
196 | * | |
197 | * We return either kDcbaRecommended or zero. | |
198 | * | |
199 | * int commpage_time_dcba( void ); | |
200 | */ | |
201 | ||
202 | LEXT(commpage_time_dcba) | |
203 | mflr r12 // get return address; r12 holds it across both LTest calls | |
204 | stw r12,8(r1) // save it at 8(r1), before the new frame is pushed | |
205 | stwu r1,-kSFSize(r1) // carve our temp buffer from the stack | |
206 | addi r11,r1,127+16 // skip 16 bytes at the frame base and round up to get base address... | |
207 | rlwinm r11,r11,0,0,24 // ...of our buffer, 128-byte aligned | |
208 | crset kDCBA // first, use DCBA | |
209 | bl LTest // time it with DCBA; r3 <- fastest time | |
210 | srwi r0,r3,3 // r0 <- time/8: bias 12.5 pct in favor of not using DCBA... | |
211 | add r10,r3,r0 // ...because DCBA is always slower with warm cache (r10 <- biased DCBA time) | |
212 | crclr kDCBA | |
213 | bl LTest // time without DCBA; r3 <- fastest time | |
214 | cmplw r10,r3 // which is better? biased DCBA time (r10) vs no-DCBA time (r3), unsigned | |
215 | mtlr r12 // restore return | |
216 | lwz r1,0(r1) // pop off our stack frame | |
217 | li r3,kDcbaRecommended // assume using DCBA is faster; the bltlr below returns this if r10 < r3 | |
218 | bltlr | |
219 | li r3,0 // no DCBA is faster | |
220 | blr | |
221 | ||
222 | ||
223 | // Subroutine to time a loop with or without DCBA. | |
224 | // kDCBA = set if we should use DCBA | |
225 | // r11 = base of buffer to use for test (kBufSiz bytes) | |
226 | // | |
227 | // We return TBR ticks in r3. | |
228 | // We use r0,r3-r9. | |
229 | ||
230 | LTest: | |
231 | li r4,kLoopCnt // number of times to loop | |
232 | li r3,-1 // initialize fastest time to the largest unsigned value | |
233 | 1: | |
234 | mr r6,r11 // initialize buffer ptr | |
235 | li r0,kBufSiz/32 // r0 <- number of 32-byte cache blocks to test | |
236 | mtctr r0 | |
237 | 2: | |
238 | dcbf 0,r6 // first, force the blocks out of the cache | |
239 | addi r6,r6,32 | |
240 | bdnz 2b | |
241 | sync // make sure all the flushes take | |
242 | mr r6,r11 // re-initialize buffer ptr | |
243 | mtctr r0 // reset cache-block count | |
244 | mftbu r7 // remember upper half so we can check for carry | |
245 | mftb r8 // start the timer (lower half of timebase) | |
246 | 3: // loop over cache blocks | |
247 | bf kDCBA,4f // should we DCBA? (skip it if the flag bit is clear) | |
248 | dcba 0,r6 | |
249 | 4: | |
250 | stw r0,0(r6) // store the entire 32-byte cache block, 8 words (r0 is just the block count; the buffer is never read) | |
251 | stw r0,4(r6) | |
252 | stw r0,8(r6) | |
253 | stw r0,12(r6) | |
254 | stw r0,16(r6) | |
255 | stw r0,20(r6) | |
256 | stw r0,24(r6) | |
257 | stw r0,28(r6) | |
258 | addi r6,r6,32 | |
259 | bdnz 3b | |
260 | mftb r9 | |
261 | mftbu r0 | |
262 | cmpw r0,r7 // did timebase carry into the upper half while timing? | |
263 | bne 1b // yes, retest rather than fuss (this pass is not counted) | |
264 | sub r9,r9,r8 // r9 <- time for this loop | |
265 | cmplw r9,r3 // faster than current best? (unsigned compare) | |
266 | bge 5f // no | |
267 | mr r3,r9 // remember fastest time through loop | |
268 | 5: | |
269 | subi r4,r4,1 // decrement outer loop count | |
270 | cmpwi r4,0 // more to go? | |
271 | bne 1b // loop if so | |
272 | blr // return fastest time in r3 |