]> git.saurik.com Git - apple/xnu.git/blob - osfmk/ppc/commpage/commpage_asm.s
37b5fc58346bd97aa8de9d88f718cfea59576c03
[apple/xnu.git] / osfmk / ppc / commpage / commpage_asm.s
1 /*
2 * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30
31 #include <sys/appleapiopts.h>
32 #include <ppc/asm.h>
33 #include <ppc/proc_reg.h>
34 #include <machine/cpu_capabilities.h>
35 #include <machine/commpage.h>
36
37
38 // commpage_time_dcba() uses a stack frame as follows:
39
// kSFSize = buffer + 128 bytes of slack (so a 128-byte-aligned buffer always fits)
// + 16 bytes at the bottom of the frame that the buffer never overlaps
// (commpage_time_dcba starts its search for the buffer at r1+16).
40 #define kBufSiz 1024 // Size of the buffer we use to do DCBA timing on G4
41 #define kSFSize (kBufSiz+128+16) // Stack frame size, which contains the 128-byte-aligned buffer
42 #define kLoopCnt 5 // Iterations of the timing loop (LTest keeps the fastest one)
43 #define kDCBA 22 // Bit in cr5 used as a flag in timing loop (CR bits 20-23 form cr5)
44
45
46 // commpage_set_timestamp() uses the red zone for temporary storage:
47
// All offsets are negative displacements from r1, i.e. below the stack pointer.
// Each slot is 8 bytes wide (one double / one 64-bit timebase value).
48 #define rzSaveF1 -8 // caller's FPR1
49 #define rzSaveF2 -16 // caller's FPR2
50 #define rzSaveF3 -24 // caller's FPR3
51 #define rzSaveF4 -32 // caller's FPR4
52 #define rzSaveF5 -40 // caller's FPR5
53 #define rzNewTimeBase -48 // used to load 64-bit TBR into a FPR (two stw, then one lfd)
54
55
56 // commpage_set_timestamp() uses the following data. kkTicksPerSec remembers
57 // the number used to compute _COMM_PAGE_SEC_PER_TICK. Since this constant
58 // rarely changes, we use it to avoid needless recomputation. It is a double
59 // value, pre-initialized with an exponent of 2**52.
60
// Byte offsets of the three 8-byte fields within Ldata (addressed off r8 in
// commpage_set_timestamp).
61 #define kkBinary0 0 // offset in data to long long 0 (a constant)
62 #define kkDouble1 8 // offset in data to double 1.0 (a constant)
63 #define kkTicksPerSec 16 // offset in data to double(ticks_per_sec)
64
// Layout of Ldata: 8 bytes of zero, the double 1.0, then a double whose high word
// is fixed at 0x43300000 (the high word of 2**52) and whose low word is overwritten
// with the integer ticks_per_sec. Read as a double, that field is 2**52 + ticks_per_sec,
// so subtracting 2**52 converts the integer to floating point.
65 .data
66 .align 3 // three doubleword fields
67 Ldata:
68 .long 0 // kkBinary0 (high word)
69 .long 0 // kkBinary0 (low word)
70 .double 1.0e0 // kkDouble1
71 .long 0x43300000 // kkTicksPerSec (plus 2**52): constant high word
72 .long 0 // this is where we store ticks_per_sec, to float (0 until the first update)
73
// Code section; the two routines below are exported for use by the rest of the kernel.
74 .text
75 .align 2
76 .globl EXT(commpage_time_dcba) // int commpage_time_dcba( void )
77 .globl EXT(commpage_set_timestamp) // void commpage_set_timestamp(tbr,secs,divisor)
78
79
80 /* ***********************************************
81 * * C O M M P A G E _ S E T _ T I M E S T A M P *
82 * ***********************************************
83 *
84 * Update the gettimeofday() shared data on the commpages, as follows:
85 * _COMM_PAGE_TIMESTAMP = the clock offset at timebase (seconds)
86 * _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
87 * _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
88 * The convention is that if the timebase is 0, the data is invalid. Because other
89 * CPUs are reading the three values asynchronously and must get a consistent set,
90 * it is critical that we update them with the following protocol:
91 * 1. set timebase to 0 (atomically), to invalidate all three values
92 * 2. eieio (to create a barrier in stores to cacheable memory)
93 * 3. change timestamp and "secs per tick"
94 * 4. eieio
95 * 5. set timebase nonzero (atomically)
96 * This works because readers read the timebase, then the timestamp and divisor, sync
97 * if MP, then read the timebase a second time and check to be sure it is equal to the first.
98 *
99 * We could save a few cycles on 64-bit machines by special casing them, but it probably
100 * isn't necessary because this routine shouldn't be called very often.
101 *
102 * When called:
103 * r3 = upper half of timebase (timebase is disabled if 0)
104 * r4 = lower half of timebase
105 * r5 = upper half of timestamp
106 * r6 = lower half of timestamp
107 * r7 = divisor (ie, timebase ticks per sec)
108 * We set up:
109 * r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
110 * r9 = ptr to 32-bit commpage in kernel map
111 * r10 = ptr to 64-bit commpage in kernel map
112 *
113 * --> Interrupts must be disabled and rtclock locked when called. <--
114 */
115
// commpage_set_timestamp: publish a new (timebase, timestamp, sec-per-tick) set to
// the 32-bit and 64-bit commpages using the invalidate / eieio / update / eieio /
// revalidate protocol described in the header comment above.
//
// Flow: enable FP -> zero the timebase in both commpages -> if the new timebase is
// nonzero, store the timestamp, recompute SEC_PER_TICK only when r7 differs from the
// cached ticks_per_sec -> eieio -> store the new timebase -> restore f1 and the MSR.
//
// Registers written: r0, r2, r8-r11, cr0, cr1, cr6. f1-f5 and the FPSCR are used but
// restored before returning; the MSR is restored from r11.
// Memory written besides the commpages: the red-zone slots below r1 and the low word
// of kkTicksPerSec in Ldata.
116 .align 5
117 LEXT(commpage_set_timestamp) // void commpage_set_timestamp(tbr,secs,divisor)
118 mfmsr r11 // get MSR (kept in r11 so it can be restored on exit)
119 ori r2,r11,MASK(MSR_FP) // turn FP on
120 mtmsr r2 // install the MSR with the FP bit set
121 isync // wait until MSR changes take effect
122
// The cr0 result of the or. below stays live until the "beq 3f" that follows the
// first eieio; the intervening compares deliberately target cr1 and cr6.
123 or. r0,r3,r4 // is timebase 0? (thus disabled)
124 lis r8,hi16(Ldata) // point to our data
125 lis r9,ha16(EXT(commPagePtr32)) // get ptrs to address of commpages in kernel map
126 lis r10,ha16(EXT(commPagePtr64))
127 stfd f1,rzSaveF1(r1) // save a FPR in the red zone
128 ori r8,r8,lo16(Ldata) // r8 <- full address of Ldata
129 lwz r9,lo16(EXT(commPagePtr32))(r9) // r9 <- 32-bit commpage ptr
130 lwz r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr
131 lfd f1,kkBinary0(r8) // get fixed 0s
132 li r0,_COMM_PAGE_BASE_ADDRESS // get va in user space of commpage
// The null checks are made on the raw pointers, before they are biased.
133 cmpwi cr1,r9,0 // is 32-bit commpage allocated yet?
134 cmpwi cr6,r10,0 // is 64-bit commpage allocated yet?
// NOTE(review): biasing by the user va presumably lets the _COMM_PAGE_xxx symbols
// (user-space addresses) serve directly as displacements off r9/r10 -- confirm
// against cpu_capabilities.h.
135 sub r9,r9,r0 // r9 <- 32-bit commpage address, biased by user va
136 sub r10,r10,r0 // r10<- 64-bit commpage address
137 beq-- cr1,3f // skip if 32-bit commpage not allocated (64-bit won't be either)
138 bne++ cr6,1f // skip if 64-bit commpage is allocated
// With r10 == r9 every paired store below simply hits the 32-bit page twice.
139 mr r10,r9 // if no 64-bit commpage, point to 32-bit version with r10 too
140 1:
// Protocol step 1: invalidate by storing 64 bits of zero with a single stfd.
141 stfd f1,_COMM_PAGE_TIMEBASE(r9) // turn off the 32-bit-commpage timestamp (atomically)
142 stfd f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too
143 eieio // make sure all CPUs see it is off
144 beq 3f // all we had to do is turn off timestamp (cr0 from the or. above)
145
146 lwz r0,kkTicksPerSec+4(r8) // get last ticks_per_sec (or 0 if first)
147 stw r3,rzNewTimeBase(r1) // store new timebase so we can lfd
148 stw r4,rzNewTimeBase+4(r1) // (lower half)
// The cr0 result of this compare is consumed by the "beq++ 2f" below.
149 cmpw r0,r7 // do we need to recompute _COMM_PAGE_SEC_PER_TICK?
150 stw r5,_COMM_PAGE_TIMESTAMP(r9) // store the new timestamp in the 32-bit page
151 stw r6,_COMM_PAGE_TIMESTAMP+4(r9)
152 stw r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage
153 stw r6,_COMM_PAGE_TIMESTAMP+4(r10)
154 lfd f1,rzNewTimeBase(r1) // get timebase in a FPR so we can store atomically
155 beq++ 2f // same ticks_per_sec, no need to recompute
156
// Slow path: ticks_per_sec changed, so compute 1.0/ticks_per_sec. Storing r7 into the
// low word of kkTicksPerSec yields the double (2**52 + ticks_per_sec); subtracting
// 2**52 leaves double(ticks_per_sec).
157 stw r7,kkTicksPerSec+4(r8) // must recompute SEC_PER_TICK
158 stfd f2,rzSaveF2(r1) // we'll need a few more temp FPRs
159 stfd f3,rzSaveF3(r1)
160 stfd f4,rzSaveF4(r1)
161 stfd f5,rzSaveF5(r1)
162 lfd f2,_COMM_PAGE_2_TO_52(r9) // f2 <- double(2**52), read from the commpage
163 lfd f3,kkTicksPerSec(r8) // float new ticks_per_sec + 2**52
164 lfd f4,kkDouble1(r8) // f4 <- double(1.0)
165 mffs f5 // save caller's FPSCR
166 mtfsfi 7,1 // clear Inexact Exception bit, set round-to-zero
167 fsub f3,f3,f2 // get ticks_per_sec
168 fdiv f3,f4,f3 // divide 1 by ticks_per_sec to get SEC_PER_TICK
169 stfd f3,_COMM_PAGE_SEC_PER_TICK(r9) // publish to the 32-bit commpage
170 stfd f3,_COMM_PAGE_SEC_PER_TICK(r10) // and to the 64-bit commpage
171 mtfsf 0xFF,f5 // restore FPSCR
172 lfd f2,rzSaveF2(r1) // restore FPRs
173 lfd f3,rzSaveF3(r1)
174 lfd f4,rzSaveF4(r1)
175 lfd f5,rzSaveF5(r1)
176 2: // f1 == new timebase (loaded from rzNewTimeBase above)
177 eieio // wait until the stores take
178 stfd f1,_COMM_PAGE_TIMEBASE(r9) // then turn the timestamp back on (atomically)
179 stfd f1,_COMM_PAGE_TIMEBASE(r10) // both
180 3: // here once all fields updated
181 lfd f1,rzSaveF1(r1) // restore last FPR
182 mtmsr r11 // turn FP back off (restores the MSR read on entry)
183 isync
184 blr
185
186
187 /* ***************************************
188 * * C O M M P A G E _ T I M E _ D C B A *
189 * ***************************************
190 *
191 * Not all processors that support the DCBA opcode actually benefit from it.
192 * Some store-gather and read-cancel well enough that there is no need to use
193 * DCBA to avoid fetching cache lines that will be completely overwritten, while
194 * others have this feature disabled (to work around errata etc), and so benefit
195 * from DCBA. Since it is hard to tell the one group from the other, we just
196 * time loops with and without DCBA, and pick the fastest. Thus we avoid
197 * delicate dependence on processor and/or platform revisions.
198 *
199 * We return either kDcbaRecommended or zero.
200 *
201 * int commpage_time_dcba( void );
202 */
203
// Times the LTest store loop twice, first with DCBA and then without, and returns
// kDcbaRecommended in r3 only if the DCBA time plus a 1/8 penalty is still lower
// (unsigned compare) than the non-DCBA time; otherwise returns 0.
// r10, r11 and r12 are kept live across both calls to LTest, which does not touch them.
204 LEXT(commpage_time_dcba)
205 mflr r12 // get return
206 stw r12,8(r1) // save (at offset 8 from the incoming stack pointer)
207 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack
208 addi r11,r1,127+16 // get base address...
209 rlwinm r11,r11,0,0,24 // ...of our buffer, 128-byte aligned (low 7 bits cleared)
210 crset kDCBA // first, use DCBA
211 bl LTest // time it with DCBA
212 srwi r0,r3,3 // bias 12.5 pct (time/8) in favor of not using DCBA...
213 add r10,r3,r0 // ...because DCBA is always slower with warm cache
214 crclr kDCBA // second pass: no DCBA
215 bl LTest // time without DCBA
216 cmplw r10,r3 // which is better? (biased DCBA time vs. plain time)
// LR is restored from r12 rather than reloaded from the slot written on entry.
217 mtlr r12 // restore return
218 lwz r1,0(r1) // pop off our stack frame
219 li r3,kDcbaRecommended // assume using DCBA is faster
220 bltlr // return if the biased DCBA time was lower
221 li r3,0 // no DCBA is faster
222 blr
223
224
225 // Subroutine to time a loop with or without DCBA.
226 // kDCBA = set if we should use DCBA
227 // r11 = base of buffer to use for test (kBufSiz bytes)
228 //
229 // We return TBR ticks in r3.
230 // We use r0, r3-r9, CTR, and cr0.
231
// Runs kLoopCnt timed passes over the buffer at r11 and returns the smallest elapsed
// lower-timebase delta in r3. Each pass first flushes the buffer from the cache, then
// times a loop that stores 32 bytes per iteration, optionally preceded by dcba.
// A pass during which the upper timebase changed is discarded and repeated without
// consuming an iteration.
232 LTest:
233 li r4,kLoopCnt // number of times to loop
234 li r3,-1 // initialize fastest time (largest value under the unsigned compare below)
235 1:
236 mr r6,r11 // initialize buffer ptr
237 li r0,kBufSiz/32 // r0 <- cache blocks to test (the loops step 32 bytes per block)
238 mtctr r0
239 2:
240 dcbf 0,r6 // first, force the blocks out of the cache
241 addi r6,r6,32
242 bdnz 2b
243 sync // make sure all the flushes take
244 mr r6,r11 // re-initialize buffer ptr
245 mtctr r0 // reset cache-block count
246 mftbu r7 // remember upper half so we can check for carry
247 mftb r8 // start the timer
248 3: // loop over cache blocks
249 bf kDCBA,4f // should we DCBA?
250 dcba 0,r6
251 4:
// The value stored is irrelevant (r0 happens to hold the block count); only the
// store traffic is being timed.
252 stw r0,0(r6) // store the entire cache block
253 stw r0,4(r6)
254 stw r0,8(r6)
255 stw r0,12(r6)
256 stw r0,16(r6)
257 stw r0,20(r6)
258 stw r0,24(r6)
259 stw r0,28(r6)
260 addi r6,r6,32
261 bdnz 3b
262 mftb r9 // stop the timer
263 mftbu r0 // re-read upper half (r0 is reloaded at 1: before its next use)
264 cmpw r0,r7 // did timebase carry?
265 bne 1b // yes, retest rather than fuss (r3 and r4 are left unchanged)
266 sub r9,r9,r8 // r9 <- time for this loop
267 cmplw r9,r3 // faster than current best? (unsigned)
268 bge 5f // no
269 mr r3,r9 // remember fastest time through loop
270 5:
271 subi r4,r4,1 // decrement outer loop count
272 cmpwi r4,0 // more to go?
273 bne 1b // loop if so
274 blr // return fastest time in r3