2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
23 * @APPLE_LICENSE_HEADER_END@
26 #include <sys/appleapiopts.h>
28 #include <ppc/proc_reg.h>
29 #include <machine/cpu_capabilities.h>
30 #include <machine/commpage.h>
33 // commpage_time_dcba() uses a stack frame as follows:
//   - the frame is kSFSize bytes, pushed with a single stwu
//   - the timing buffer base is (r1 + 16) rounded up to a 128-byte boundary,
//     so the 128 extra bytes are alignment slack and the low 16 bytes stay free
35 #define kBufSiz 1024 // Size in bytes of the buffer we use to do DCBA timing on G4
36 #define kSFSize (kBufSiz+128+16) // Stack frame size: buffer + 128 bytes alignment slack + 16 bytes below the buffer
37 #define kLoopCnt 5 // Iterations of the timing loop (the fastest pass is the one reported)
38 #define kDCBA 22 // CR bit number (it lies in cr5) used as the use-DCBA flag in the timing loop
41 // commpage_set_timestamp() uses the red zone for temporary storage.
// All offsets are negative displacements from r1; each slot is 8 bytes.
43 #define rzSaveF1 -8 // caller's FPR1
44 #define rzSaveF2 -16 // caller's FPR2
45 #define rzSaveF3 -24 // caller's FPR3
46 #define rzSaveF4 -32 // caller's FPR4
47 #define rzSaveF5 -40 // caller's FPR5
48 #define rzNewTimeBase -48 // the two 32-bit timebase halves are stored here, then reloaded as one 64-bit FPR
51 // commpage_set_timestamp() uses the following data. kkTicksPerSec remembers
52 // the number used to compute _COMM_PAGE_SEC_PER_TICK. Since this constant
53 // rarely changes, we use it to avoid needless recomputation. It is a double
54 // value, pre-initialized with the exponent of 2**52.
// With that exponent in the upper word, storing a 32-bit integer n into the
// lower word makes the doubleword read as double(2**52 + n); subtracting
// double(2**52) then yields double(n) without an integer-to-float instruction.
56 #define kkBinary0 0 // offset in data to long long 0 (a constant)
57 #define kkDouble1 8 // offset in data to double 1.0 (a constant)
58 #define kkTicksPerSec 16 // offset in data to double(ticks_per_sec)
// NOTE(review): the Ldata label and the eight zero bytes for kkBinary0 are not
// visible in this listing; the offsets above assume they sit between the
// .align and the .double below -- confirm.
61 .align 3 // align to a doubleword (2**3 bytes); three doubleword fields follow
65 .double 1.0e0 // kkDouble1
66 .long 0x43300000 // kkTicksPerSec upper word: the exponent bits of double(2**52)
67 .long 0 // kkTicksPerSec lower word: ticks_per_sec is stored here as an integer (0 until first use)
// Entry points exported from this file.
71 .globl EXT(commpage_time_dcba)
72 .globl EXT(commpage_set_timestamp)
75 /* ***********************************************
76 * * C O M M P A G E _ S E T _ T I M E S T A M P *
77 * ***********************************************
79 * Update the gettimeofday() shared data on the commpage, as follows:
80 * _COMM_PAGE_TIMESTAMP = a BSD-style pair of uint_32's for secs and usecs
81 * _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
82 * _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
83 * The convention is that if the timebase is 0, the data is invalid. Because other
84 * CPUs are reading the three values asynchronously and must get a consistent set,
85 * it is critical that we update them with the following protocol:
86 * 1. set timebase to 0 (atomically), to invalidate all three values
87 * 2. eieio (to create a barrier in stores to cacheable memory)
88 * 3. change timestamp and "secs per tick"
89 * 4. eieio (a second barrier, so those stores are visible before the timebase is)
90 * 5. set timebase nonzero (atomically)
91 * This works because readers read the timebase, then the timestamp and divisor, sync
92 * if MP, then read the timebase a second time and check to be sure it is equal to the first.
94 * We could save a few cycles on 64-bit machines by special casing them, but it probably
95 * isn't necessary because this routine shouldn't be called very often.
98 * r3 = upper half of timebase (timebase is disabled if 0)
99 * r4 = lower half of timebase
100 * r5 = seconds part of timestamp
101 * r6 = useconds part of timestamp
102 * r7 = divisor (ie, timebase ticks per sec)
104 * r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
105 * r9 = ptr to comm page in kernel map
107 * --> Interrupts must be disabled and rtclock locked when called. <--
111 LEXT(commpage_set_timestamp) // void commpage_set_timestamp(tbr,secs,usecs,divisor)
// Publishes a new (timebase, timestamp, seconds-per-tick) set on the commpage,
// or only invalidates the current set when the timebase in r3:r4 is zero.
//   In:    r3:r4 = timebase, r5 = seconds, r6 = microseconds, r7 = ticks per second
//   Temps: r0, r2, r8 (-> Ldata), r9 (-> biased commpage), f1-f5, cr0, cr1
// NOTE(review): r11 is read below but never written in the visible code, r2 is
// computed but never moved to the MSR, and no return is visible after the final
// mtmsr. The embedded line numbers skip at each of those points -- confirm the
// mfmsr / mtmsr r2 / isync / blr instructions are present in the real file.
113 ori r2,r11,MASK(MSR_FP) // r2 <- r11 with the FP-enable bit set (turn FP on)
115 isync // wait until MSR changes take effect
117 or. r0,r3,r4 // cr0 <- is timebase 0? (thus disabled); tested by the second beq below
118 lis r8,hi16(Ldata) // point to our data (upper half)
119 lis r9,ha16(EXT(commPagePtr)) // get ptr to address of commpage in kernel map (upper half, adjusted for the lwz displacement)
120 stfd f1,rzSaveF1(r1) // save caller's f1 in the red zone
121 ori r8,r8,lo16(Ldata) // r8 <- address of our data
122 lwz r9,lo16(EXT(commPagePtr))(r9) // r9 <- commPagePtr
123 lfd f1,kkBinary0(r8) // f1 <- 64 bits of 0
124 li r0,_COMM_PAGE_BASE_ADDRESS // get va in user space of commpage
125 cmpwi cr1,r9,0 // is commpage allocated yet? (uses cr1 so cr0 from the or. survives)
126 sub r9,r9,r0 // r9 <- commpage address, biased by user va (so _COMM_PAGE_xxx displacements, presumably user addresses, reach the kernel mapping)
127 beq-- cr1,3f // skip if not allocated
128 stfd f1,_COMM_PAGE_TIMEBASE(r9) // turn off the timestamp (atomically, one 8-byte store of 0)
129 eieio // make sure all CPUs see it is off before any later store
130 beq 3f // timebase argument was 0 (cr0 from the or.): all we had to do is turn off timestamp

132 lwz r0,kkTicksPerSec+4(r8) // get last ticks_per_sec (or 0 if first)
133 stw r3,rzNewTimeBase(r1) // store new timebase so we can lfd
134 stw r4,rzNewTimeBase+4(r1) // ...lower half
135 cmpw r0,r7 // do we need to recompute _COMM_PAGE_SEC_PER_TICK?
136 stw r5,_COMM_PAGE_TIMESTAMP(r9) // store the new timestamp: seconds...
137 stw r6,_COMM_PAGE_TIMESTAMP+4(r9) // ...and microseconds
138 lfd f1,rzNewTimeBase(r1) // get timebase in a FPR so we can store atomically
139 beq++ 2f // same ticks_per_sec, no need to recompute

// Recompute path: ticks_per_sec (r7) goes into the low word of kkTicksPerSec,
// whose high word holds the exponent of 2**52, so the doubleword reads as
// double(2**52 + ticks_per_sec). Subtracting double(2**52) leaves
// double(ticks_per_sec), and 1.0 divided by that is SEC_PER_TICK.
// NOTE(review): f3, f4 and f5 are overwritten below, but only the save and
// restore of f2 are visible -- confirm the rzSaveF3..rzSaveF5 saves/restores exist.
141 stw r7,kkTicksPerSec+4(r8) // must recompute SEC_PER_TICK; remember the new ticks_per_sec
142 stfd f2,rzSaveF2(r1) // we'll need a few more temp FPRs
146 lfd f2,_COMM_PAGE_2_TO_52(r9) // f2 <- double(2**52)
147 lfd f3,kkTicksPerSec(r8) // f3 <- double(new ticks_per_sec + 2**52)
148 lfd f4,kkDouble1(r8) // f4 <- double(1.0)
149 mffs f5 // save caller's FPSCR
150 mtfsfi 7,0 // zero FPSCR field 7: Inexact Exception enable off, round-to-nearest
151 fsub f3,f3,f2 // f3 <- double(ticks_per_sec)
152 fdiv f3,f4,f3 // divide 1 by ticks_per_sec to get SEC_PER_TICK
153 stfd f3,_COMM_PAGE_SEC_PER_TICK(r9) // publish the new SEC_PER_TICK
154 mtfsf 0xFF,f5 // restore FPSCR
155 lfd f2,rzSaveF2(r1) // restore FPRs
159 2: // f1 == new timebase
160 eieio // wait until the stores take
161 stfd f1,_COMM_PAGE_TIMEBASE(r9) // then turn the timestamp back on (atomically)
162 3: // here once all fields updated
163 lfd f1,rzSaveF1(r1) // restore last FPR
164 mtmsr r11 // put back the MSR value held in r11 (turn FP back off)
169 /* ***************************************
170 * * C O M M P A G E _ T I M E _ D C B A *
171 * ***************************************
173 * Not all processors that support the DCBA opcode actually benefit from it.
174 * Some store-gather and read-cancel well enough that there is no need to use
175 * DCBA to avoid fetching cache lines that will be completely overwritten, while
176 * others have this feature disabled (to work around errata etc), and so benefit
177 * from DCBA. Since it is hard to tell the one group from the other, we just
178 * time loops with and without DCBA, and pick the fastest. Thus we avoid
179 * delicate dependence on processor and/or platform revisions.
181 * We return either kDcbaRecommended or zero.
183 * int commpage_time_dcba( void );
186 LEXT(commpage_time_dcba)
// Times LTest with the kDCBA flag set, then times it again, and returns
// kDcbaRecommended or 0 in r3 depending on which run was faster.
//   Registers held across the LTest calls: r10 (biased DCBA time),
//   r11 (buffer base), r12 (our return address).
// NOTE(review): between the two LTest calls no instruction clearing kDCBA is
// visible, and no conditional return separates the two final li r3 lines; the
// embedded line numbers skip at both places -- confirm against the real file.
187 mflr r12 // get return address (bl LTest will overwrite LR)
188 stw r12,8(r1) // save it at 8(r1) of the caller's frame
189 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack
190 addi r11,r1,127+16 // get base address: skip 16 bytes, add 127 to round up...
191 rlwinm r11,r11,0,0,24 // ...then clear the low 7 bits: buffer base, 128-byte aligned
192 crset kDCBA // first, use DCBA
193 bl LTest // time it with DCBA; r3 <- ticks
194 srwi r0,r3,3 // r0 <- 1/8 of the DCBA time: bias 12 pct in favor of not using DCBA...
195 add r10,r3,r0 // ...because DCBA is always slower with warm cache; r10 <- biased DCBA time
197 bl LTest // time without DCBA; r3 <- ticks
198 cmplw r10,r3 // which is better? (unsigned: biased DCBA time vs no-DCBA time)
199 mtlr r12 // restore return
200 lwz r1,0(r1) // pop off our stack frame via the back chain stored by stwu
201 li r3,kDcbaRecommended // assume using DCBA is faster
203 li r3,0 // no DCBA is faster
207 // Subroutine to time a loop with or without DCBA.
208 // kDCBA = set if we should use DCBA
209 // r11 = base of buffer to use for test (kBufSiz bytes)
211 // We return TBR ticks in r3.
// The test is repeated kLoopCnt times and the smallest elapsed time is kept.
// Registers used in the visible code: r0, r3, r4, r6-r9, ctr, cr0.
// r10, r11 and r12 are not written here; the caller keeps live values in them.
// NOTE(review): the LTest label, the local labels 1, 4 and 5, the loop-closing
// branches, and the second timebase read (into r9 and r0) are referenced but
// not visible in this listing; the embedded line numbers skip where they
// would be -- confirm against the real file.
215 li r4,kLoopCnt // number of times to loop
216 li r3,-1 // initialize fastest time to the largest unsigned value
218 mr r6,r11 // initialize buffer ptr
219 li r0,kBufSiz/32 // r0 <- cache blocks to test (32 bytes per block)
222 dcbf 0,r6 // first, force the blocks out of the cache
225 sync // make sure all the flushes take
226 mr r6,r11 // re-initialize buffer ptr
227 mtctr r0 // reset cache-block count
228 mftbu r7 // remember upper half so we can check for carry
229 mftb r8 // start the timer (lower half of timebase)
230 3: // loop over cache blocks
231 bf kDCBA,4f // should we DCBA? (skip it when the flag is clear)
234 stw r0,0(r6) // store to the cache block (only this first word store is visible here)
246 cmpw r0,r7 // did timebase carry? (upper half changed during the run)
247 bne 1b // yes, retest rather than fuss
248 sub r9,r9,r8 // r9 <- time for this loop
249 cmplw r9,r3 // faster than current best? (unsigned compare)
251 mr r3,r9 // remember fastest time through loop
253 subi r4,r4,1 // decrement outer loop count
254 cmpwi r4,0 // more to go?
256 blr // return fastest time in r3