]> git.saurik.com Git - apple/xnu.git/blob - osfmk/ppc/commpage/commpage_asm.s
f5e3571364abc0cec1a0b0222e11d2b6e9cf8e55
[apple/xnu.git] / osfmk / ppc / commpage / commpage_asm.s
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 #include <sys/appleapiopts.h>
25 #include <ppc/asm.h>
26 #include <ppc/proc_reg.h>
27 #include <machine/cpu_capabilities.h>
28 #include <machine/commpage.h>
29
30
31 // commpage_time_dcba() uses a stack frame laid out by the following constants:
32
33 #define kBufSiz 1024 // Size in bytes of the buffer we use to do DCBA timing on G4
34 #define kSFSize (kBufSiz+128+16) // Stack frame size: the buffer, 128 bytes of slack so it can be 128-byte aligned, and 16 bytes below it
35 #define kLoopCnt 5 // Iterations of the timing loop (the fastest pass is the one reported)
36 #define kDCBA 22 // CR bit number (a bit in cr5) used as the "use DCBA" flag in the timing loop
37
38
39 // commpage_set_timestamp() uses the red zone (negative displacements from r1) for temporary storage:
40
41 #define rzSaveF1 -8 // caller's FPR1
42 #define rzSaveF2 -16 // caller's FPR2
43 #define rzSaveF3 -24 // caller's FPR3
44 #define rzSaveF4 -32 // caller's FPR4
45 #define rzSaveF5 -40 // caller's FPR5
46 #define rzNewTimeBase -48 // 8-byte slot used to move the 64-bit TBR (r3:r4) into a FPR
47
48
49 // commpage_set_timestamp() uses the following data. kkTicksPerSec remembers
50 // the number used to compute _COMM_PAGE_SEC_PER_TICK. Since this constant
51 // rarely changes, we use it to avoid needless recomputation. It is a double
52 // value, pre-initialized with an exponent of 2**52, so that storing the 32-bit
// integer ticks_per_sec into its low word yields the double (2**52 + ticks_per_sec);
// subtracting 2**52 from that then gives ticks_per_sec as a double.
53
54 #define kkBinary0 0 // offset in data to long long 0 (a constant)
55 #define kkDouble1 8 // offset in data to double 1.0 (a constant)
56 #define kkTicksPerSec 16 // offset in data to double(ticks_per_sec), biased by 2**52
57
58 .data
59 .align 3 // doubleword-align the three doubleword fields below
60 Ldata:
61 .long 0 // kkBinary0: upper word of the 64-bit zero
62 .long 0 // kkBinary0: lower word
63 .double 1.0e0 // kkDouble1
64 .long 0x43300000 // kkTicksPerSec upper word (plus 2**52): the exponent bits of the double
65 .long 0 // kkTicksPerSec lower word: this is where we store ticks_per_sec, to float
66
67 .text
68 .align 2 // word-align the code that follows
69 .globl EXT(commpage_time_dcba) // make both routines in this file visible to other modules
70 .globl EXT(commpage_set_timestamp)
71
72
73 /* ***********************************************
74 * * C O M M P A G E _ S E T _ T I M E S T A M P *
75 * ***********************************************
76 *
77 * Update the gettimeofday() shared data on the commpages, as follows:
78 * _COMM_PAGE_TIMESTAMP = a BSD-style pair of uint_32's for secs and usecs
79 * _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
80 * _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
81 * The convention is that if the timebase is 0, the data is invalid. Because other
82 * CPUs are reading the three values asynchronously and must get a consistent set,
83 * it is critical that we update them with the following protocol:
84 * 1. set timebase to 0 (atomically), to invalidate all three values
85 * 2. eieio (to create a barrier in stores to cacheable memory)
86 * 3. change timestamp and "secs per tick"
87 * 4. eieio
88 * 5. set timebase nonzero (atomically)
89 * This works because readers read the timebase, then the timestamp and divisor, sync
90 * if MP, then read the timebase a second time and check to be sure it is equal to the first.
91 *
92 * We could save a few cycles on 64-bit machines by special casing them, but it probably
93 * isn't necessary because this routine shouldn't be called very often.
94 *
95 * When called:
96 * r3 = upper half of timebase (timestamp is disabled if r3 and r4 are both 0)
97 * r4 = lower half of timebase
98 * r5 = seconds part of timestamp
99 * r6 = useconds part of timestamp
100 * r7 = divisor (ie, timebase ticks per sec)
101 * We set up:
102 * r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
103 * r9 = ptr to 32-bit commpage in kernel map
104 * r10 = ptr to 64-bit commpage in kernel map
105 *
106 * --> Interrupts must be disabled and rtclock locked when called. <--
107 */
108
109 .align 5 // start the routine on a 32-byte boundary
110 LEXT(commpage_set_timestamp) // void commpage_set_timestamp(tbr,secs,usecs,divisor)
// Scratch use in this routine: r0 and r2 are temporaries, r11 holds the MSR as it
// was on entry (restored before returning), and f1-f5 are used only after their
// previous contents have been saved in the red zone.
111 mfmsr r11 // get MSR; r11 keeps the entry value until the exit path
112 ori r2,r11,MASK(MSR_FP) // turn FP on
113 mtmsr r2 // install the FP-enabled MSR so the FPR loads/stores below are legal
114 isync // wait until MSR changes take effect
115
116 or. r0,r3,r4 // is 64-bit timebase 0? (thus disabled); cr0 is tested by the "beq 3f" after label 1
117 lis r8,hi16(Ldata) // point to our data (upper half; lower half is or'ed in below)
118 lis r9,ha16(EXT(commPagePtr32)) // get ptrs to address of commpages in kernel map
119 lis r10,ha16(EXT(commPagePtr64))
120 stfd f1,rzSaveF1(r1) // save a FPR in the red zone
121 ori r8,r8,lo16(Ldata) // r8 <- address of Ldata
122 lwz r9,lo16(EXT(commPagePtr32))(r9) // r9 <- 32-bit commpage ptr
123 lwz r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr
124 lfd f1,kkBinary0(r8) // get fixed 0s (64-bit zero used to invalidate the timebase)
125 li r0,_COMM_PAGE_BASE_ADDRESS // get va in user space of commpage (overwrites r0 only; cr0 is untouched)
126 cmpwi cr1,r9,0 // is 32-bit commpage allocated yet?
127 cmpwi cr6,r10,0 // is 64-bit commpage allocated yet?
128 sub r9,r9,r0 // r9 <- 32-bit commpage address, biased by user va so _COMM_PAGE_xxx can be used as displacements
129 sub r10,r10,r0 // r10<- 64-bit commpage address, biased the same way
130 beq-- cr1,3f // skip if 32-bit commpage not allocated (64-bit won't be either)
131 bne++ cr6,1f // skip if 64-bit commpage is allocated
132 mr r10,r9 // if no 64-bit commpage, point to 32-bit version with r10 too
133 1:
134 stfd f1,_COMM_PAGE_TIMEBASE(r9) // turn off the 32-bit-commpage timestamp (atomically)
135 stfd f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too
136 eieio // make sure all CPUs see it is off
137 beq 3f // cr0 from the "or." above: timebase is 0, so all we had to do is turn off timestamp
138
139 lwz r0,kkTicksPerSec+4(r8) // get last ticks_per_sec (or 0 if first)
140 stw r3,rzNewTimeBase(r1) // store new timebase so we can lfd
141 stw r4,rzNewTimeBase+4(r1)
142 cmpw r0,r7 // do we need to recompute _COMM_PAGE_SEC_PER_TICK? (cr0 tested by "beq++ 2f" below)
143 stw r5,_COMM_PAGE_TIMESTAMP(r9) // store the new timestamp in the 32-bit page
144 stw r6,_COMM_PAGE_TIMESTAMP+4(r9)
145 stw r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage
146 stw r6,_COMM_PAGE_TIMESTAMP+4(r10)
147 lfd f1,rzNewTimeBase(r1) // get timebase in a FPR so we can store atomically
148 beq++ 2f // same ticks_per_sec, no need to recompute
149
150 stw r7,kkTicksPerSec+4(r8) // must recompute SEC_PER_TICK; remember the new divisor in the low word
151 stfd f2,rzSaveF2(r1) // we'll need a few more temp FPRs
152 stfd f3,rzSaveF3(r1)
153 stfd f4,rzSaveF4(r1)
154 stfd f5,rzSaveF5(r1)
155 lfd f2,_COMM_PAGE_2_TO_52(r9) // f2 <- double(2**52)
156 lfd f3,kkTicksPerSec(r8) // float new ticks_per_sec + 2**52
157 lfd f4,kkDouble1(r8) // f4 <- double(1.0)
158 mffs f5 // save caller's FPSCR
159 mtfsfi 7,0 // clear Inexact Exception bit, set round-to-nearest (zeroes FPSCR field 7)
160 fsub f3,f3,f2 // get ticks_per_sec as a double by removing the 2**52 bias
161 fdiv f3,f4,f3 // divide 1 by ticks_per_sec to get SEC_PER_TICK
162 stfd f3,_COMM_PAGE_SEC_PER_TICK(r9) // publish it in the 32-bit commpage
163 stfd f3,_COMM_PAGE_SEC_PER_TICK(r10) // and in the 64-bit commpage
164 mtfsf 0xFF,f5 // restore FPSCR (all eight fields)
165 lfd f2,rzSaveF2(r1) // restore FPRs
166 lfd f3,rzSaveF3(r1)
167 lfd f4,rzSaveF4(r1)
168 lfd f5,rzSaveF5(r1)
169 2: // f1 == new timebase (loaded from rzNewTimeBase above)
170 eieio // wait until the stores take
171 stfd f1,_COMM_PAGE_TIMEBASE(r9) // then turn the timestamp back on (atomically)
172 stfd f1,_COMM_PAGE_TIMEBASE(r10) // both
173 3: // here once all fields updated
174 lfd f1,rzSaveF1(r1) // restore last FPR
175 mtmsr r11 // restore the entry MSR (turns FP back off if it was off on entry)
176 isync
177 blr
178
179
180 /* ***************************************
181 * * C O M M P A G E _ T I M E _ D C B A *
182 * ***************************************
183 *
184 * Not all processors that support the DCBA opcode actually benefit from it.
185 * Some store-gather and read-cancel well enough that there is no need to use
186 * DCBA to avoid fetching cache lines that will be completely overwritten, while
187 * others have this feature disabled (to work around errata etc), and so benefit
188 * from DCBA. Since it is hard to tell the one group from the other, we just
189 * time loops with and without DCBA, and pick the fastest. Thus we avoid
190 * delicate dependence on processor and/or platform revisions.
191 *
192 * We return either kDcbaRecommended or zero.
193 *
194 * int commpage_time_dcba( void );
195 */
196
197 LEXT(commpage_time_dcba)
// Register use: r12 = return address (kept live across both LTest calls),
// r11 = 128-byte-aligned buffer base passed to LTest, r10 = biased time of the
// DCBA run. LTest does not modify r10-r12.
198 mflr r12 // get return
199 stw r12,8(r1) // save it at 8(r1), i.e. in the caller's frame (r1 not yet adjusted)
200 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack
201 addi r11,r1,127+16 // get base address...
202 rlwinm r11,r11,0,0,24 // ...of our buffer, 128-byte aligned: r11 = (r1+16+127) with low 7 bits cleared
203 crset kDCBA // first, use DCBA
204 bl LTest // time it with DCBA
205 srwi r0,r3,3 // bias 12 pct in favor of not using DCBA... (r0 = DCBA time / 8)
206 add r10,r3,r0 // ...because DCBA is always slower with warm cache (r10 = DCBA time * 9/8)
207 crclr kDCBA // now clear the flag
208 bl LTest // time without DCBA
209 cmplw r10,r3 // which is better? (unsigned: biased DCBA time vs. no-DCBA time)
210 mtlr r12 // restore return
211 lwz r1,0(r1) // pop off our stack frame
212 li r3,kDcbaRecommended // assume using DCBA is faster
213 bltlr // return kDcbaRecommended if the biased DCBA time was lower
214 li r3,0 // no DCBA is faster
215 blr
216
217
218 // Subroutine to time a loop with or without DCBA.
219 // kDCBA = set if we should use DCBA
220 // r11 = base of buffer to use for test (kBufSiz bytes)
221 //
222 // We return TBR ticks in r3 (the fastest of kLoopCnt timed passes).
223 // We use r0,r3-r9, and also CTR and cr0.
224
225 LTest:
226 li r4,kLoopCnt // number of times to loop
227 li r3,-1 // initialize fastest time (largest unsigned value, so any pass beats it)
228 1:
229 mr r6,r11 // initialize buffer ptr
230 li r0,kBufSiz/32 // r0 <- cache blocks to test (the code steps 32 bytes per block)
231 mtctr r0
232 2:
233 dcbf 0,r6 // first, force the blocks out of the cache
234 addi r6,r6,32
235 bdnz 2b
236 sync // make sure all the flushes take
237 mr r6,r11 // re-initialize buffer ptr
238 mtctr r0 // reset cache-block count
239 mftbu r7 // remember upper half so we can check for carry
240 mftb r8 // start the timer
241 3: // loop over cache blocks
242 bf kDCBA,4f // should we DCBA?
243 dcba 0,r6
244 4:
245 stw r0,0(r6) // store the entire cache block (r0 still holds the block count; it is just filler)
246 stw r0,4(r6)
247 stw r0,8(r6)
248 stw r0,12(r6)
249 stw r0,16(r6)
250 stw r0,20(r6)
251 stw r0,24(r6)
252 stw r0,28(r6)
253 addi r6,r6,32
254 bdnz 3b
255 mftb r9 // stop the timer
256 mftbu r0 // re-read upper half of timebase
257 cmpw r0,r7 // did timebase carry?
258 bne 1b // yes, retest rather than fuss (this pass is not counted: r4 is unchanged)
259 sub r9,r9,r8 // r9 <- time for this loop
260 cmplw r9,r3 // faster than current best? (unsigned compare)
261 bge 5f // no
262 mr r3,r9 // remember fastest time through loop
263 5:
264 subi r4,r4,1 // decrement outer loop count
265 cmpwi r4,0 // more to go?
266 bne 1b // loop if so
267 blr // return fastest time in r3