]> git.saurik.com Git - apple/xnu.git/blob - osfmk/ppc/commpage/commpage_asm.s
xnu-792.tar.gz
[apple/xnu.git] / osfmk / ppc / commpage / commpage_asm.s
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23 #include <sys/appleapiopts.h>
24 #include <ppc/asm.h>
25 #include <ppc/proc_reg.h>
26 #include <machine/cpu_capabilities.h>
27 #include <machine/commpage.h>
28
29
30 // commpage_time_dcba() uses a stack frame as follows:
// The whole frame is allocated with one stwu of -kSFSize.  The timing buffer
// is then located by rounding (r1 + 127 + 16) down to a 128-byte boundary, so
// it starts somewhere in [r1+16, r1+143] and always ends inside the frame.
31
32 #define kBufSiz 1024 // Size in bytes of the buffer we use to do DCBA timing on G4 (LTest walks it in 32-byte blocks)
33 #define kSFSize (kBufSiz+128+16) // Stack frame size: the buffer, 128 bytes of slack for its 128-byte alignment, and 16 bytes kept below it
34 #define kLoopCnt 5 // Iterations of the timing loop (the fastest pass is the one reported)
35 #define kDCBA 22 // CR bit number (bits 20-23 form cr5) used as the "issue dcba" flag in the timing loop
36
37
38 // commpage_set_timestamp() uses the red zone for temporary storage:
// All offsets are relative to r1 and negative, i.e. the slots live just below
// the current stack pointer; no frame is pushed.  Each slot is one 8-byte
// doubleword so it can be accessed with a single stfd/lfd.
39
40 #define rzSaveF1 -8 // caller's FPR1 (saved on every call)
41 #define rzSaveF2 -16 // caller's FPR2 (saved only when SEC_PER_TICK is recomputed)
42 #define rzSaveF3 -24 // caller's FPR3 (saved only when SEC_PER_TICK is recomputed)
43 #define rzSaveF4 -32 // caller's FPR4 (saved only when SEC_PER_TICK is recomputed)
44 #define rzSaveF5 -40 // caller's FPR5 (holds the caller's FPSCR image while we change rounding)
45 #define rzNewTimeBase -48 // used to load 64-bit TBR into a FPR: two stw's (upper, lower) followed by one lfd
46
47
48 // commpage_set_timestamp() uses the following data. kkTicksPerSec remembers
49 // the ticks_per_sec value last used to compute _COMM_PAGE_SEC_PER_TICK. Since
50 // that value rarely changes, we compare against it to avoid needless recomputation.
51 // It is a double whose upper word is pre-initialized to the exponent of 2**52, so that storing the integer ticks_per_sec into its lower word yields double(2**52 + ticks_per_sec).
52
// Byte offsets of the three doubleword fields of Ldata.  They must stay in
// step with the order of the .long/.double directives below.
53 #define kkBinary0 0 // offset in data to long long 0 (a constant)
54 #define kkDouble1 8 // offset in data to double 1.0 (a constant)
55 #define kkTicksPerSec 16 // offset in data to double(ticks_per_sec); the code reads/writes the integer at kkTicksPerSec+4
56
57 .data
58 .align 3 // three doubleword fields, so align to 2**3 bytes
59 Ldata:
60 .long 0 // kkBinary0: upper word of the 64-bit zero
61 .long 0 // kkBinary0: lower word
62 .double 1.0e0 // kkDouble1
63 .long 0x43300000 // kkTicksPerSec upper word: with a zero lower word this is the double 2**52
64 .long 0 // kkTicksPerSec lower word: this is where we store ticks_per_sec, to float (0 until first use)
65
66 .text
67 .align 2 // 2**2 = 4-byte (instruction) alignment
68 .globl EXT(commpage_time_dcba)
69 .globl EXT(commpage_set_timestamp)
70
71
72 /* ***********************************************
73 * * C O M M P A G E _ S E T _ T I M E S T A M P *
74 * ***********************************************
75 *
76 * Update the gettimeofday() shared data on the commpages, as follows:
77 * _COMM_PAGE_TIMESTAMP = a BSD-style pair of uint_32's for secs and usecs
78 * _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
79 * _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
80 * The convention is that if the timebase is 0, the data is invalid. Because other
81 * CPUs are reading the three values asynchronously and must get a consistent set,
82 * it is critical that we update them with the following protocol:
83 * 1. set timebase to 0 (atomically), to invalidate all three values
84 * 2. eieio (to create a barrier in stores to cacheable memory)
85 * 3. change timestamp and "secs per tick"
86 * 4. eieio
87 * 5. set timebase nonzero (atomically)
88 * This works because readers read the timebase, then the timestamp and divisor, sync
89 * if MP, then read the timebase a second time and check to be sure it is equal to the first.
90 *
91 * We could save a few cycles on 64-bit machines by special casing them, but it probably
92 * isn't necessary because this routine shouldn't be called very often.
93 *
94 * When called:
95 * r3 = upper half of timebase (timebase is disabled if 0)
96 * r4 = lower half of timebase
97 * r5 = seconds part of timestamp
98 * r6 = useconds part of timestamp
99 * r7 = divisor (ie, timebase ticks per sec)
100 * We set up:
101 * r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
102 * r9 = ptr to 32-bit commpage in kernel map
103 * r10 = ptr to 64-bit commpage in kernel map
104 *
105 * --> Interrupts must be disabled and rtclock locked when called. <--
106 */
107
108 .align 5 // start the routine on a 2**5 = 32-byte boundary
// Register roles inside this routine:
//   r11     = MSR as found on entry; written back unchanged on exit
//   r2, r0  = scratch
//   r8      = &Ldata
//   r9, r10 = kernel addresses of the 32-/64-bit commpages minus _COMM_PAGE_BASE_ADDRESS,
//             so the _COMM_PAGE_* symbols can be used directly as displacements
//   f1      = 64-bit zero at first, later the new 64-bit timebase image
//   f2-f5   = temporaries, touched only on the "recompute SEC_PER_TICK" path
//   cr0     = result of the "timebase == 0" test until the beq after label 1,
//             then reused for the ticks_per_sec comparison
//   cr1/cr6 = "32-bit / 64-bit commpage pointer is null"
// The instruction order below implements the invalidate / eieio / update /
// eieio / publish protocol described in the header comment; do not reorder.
// NOTE(review): the --/++ branch suffixes are taken to be the assembler's
// static-prediction hints (unlikely/likely) -- confirm against the assembler manual.
109 LEXT(commpage_set_timestamp) // void commpage_set_timestamp(tbr,secs,usecs,divisor)
110 mfmsr r11 // r11 <- current MSR (kept so the exit path can restore it)
111 ori r2,r11,MASK(MSR_FP) // r2 <- same MSR with the FP bit set
112 mtmsr r2 // turn FP on
113 isync // wait until MSR changes take effect
114
115 or. r0,r3,r4 // is timebase 0? (thus disabled)  cr0 is consumed by the beq after label 1
116 lis r8,hi16(Ldata) // point to our data (upper half; completed by the ori below)
117 lis r9,ha16(EXT(commPagePtr32)) // get ptrs to address of commpages in kernel map
118 lis r10,ha16(EXT(commPagePtr64))
119 stfd f1,rzSaveF1(r1) // save a FPR in the red zone
120 ori r8,r8,lo16(Ldata) // r8 <- &Ldata
121 lwz r9,lo16(EXT(commPagePtr32))(r9) // r9 <- 32-bit commpage ptr
122 lwz r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr
123 lfd f1,kkBinary0(r8) // f1 <- 64 bits of zero, used to invalidate the timebase
124 li r0,_COMM_PAGE_BASE_ADDRESS // get va in user space of commpage (overwrites r0 but not cr0)
125 cmpwi cr1,r9,0 // is 32-bit commpage allocated yet?
126 cmpwi cr6,r10,0 // is 64-bit commpage allocated yet?
127 sub r9,r9,r0 // r9 <- 32-bit commpage address, biased by user va
128 sub r10,r10,r0 // r10<- 64-bit commpage address, biased the same way
129 beq-- cr1,3f // skip if 32-bit commpage not allocated (64-bit won't be either)
130 bne++ cr6,1f // skip if 64-bit commpage is allocated
131 mr r10,r9 // if no 64-bit commpage, point to 32-bit version with r10 too (its stores are then simply done twice)
132 1:
133 stfd f1,_COMM_PAGE_TIMEBASE(r9) // turn off the 32-bit-commpage timestamp (atomically: one 8-byte store)
134 stfd f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too
135 eieio // make sure all CPUs see it is off before any of the stores below
136 beq 3f // all we had to do is turn off timestamp (cr0 still from the or. above)
137
138 lwz r0,kkTicksPerSec+4(r8) // get last ticks_per_sec (or 0 if first)
139 stw r3,rzNewTimeBase(r1) // store new timebase so we can lfd (upper half)
140 stw r4,rzNewTimeBase+4(r1) // (lower half)
141 cmpw r0,r7 // do we need to recompute _COMM_PAGE_SEC_PER_TICK?  (cr0 reused; tested by beq++ below)
142 stw r5,_COMM_PAGE_TIMESTAMP(r9) // store the new timestamp in the 32-bit page (seconds)
143 stw r6,_COMM_PAGE_TIMESTAMP+4(r9) // (useconds)
144 stw r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage
145 stw r6,_COMM_PAGE_TIMESTAMP+4(r10)
146 lfd f1,rzNewTimeBase(r1) // get timebase in a FPR so we can store atomically
147 beq++ 2f // same ticks_per_sec, no need to recompute
148
149 stw r7,kkTicksPerSec+4(r8) // must recompute SEC_PER_TICK; remember the new divisor in the low word of the double
150 stfd f2,rzSaveF2(r1) // we'll need a few more temp FPRs
151 stfd f3,rzSaveF3(r1)
152 stfd f4,rzSaveF4(r1)
153 stfd f5,rzSaveF5(r1)
154 lfd f2,_COMM_PAGE_2_TO_52(r9) // f2 <- double(2**52), read from the 32-bit commpage
155 lfd f3,kkTicksPerSec(r8) // float new ticks_per_sec + 2**52
156 lfd f4,kkDouble1(r8) // f4 <- double(1.0)
157 mffs f5 // save caller's FPSCR
158 mtfsfi 7,0 // zero FPSCR field 7: clear Inexact Exception enable bit, set round-to-nearest
159 fsub f3,f3,f2 // get ticks_per_sec as a double: (2**52 + n) - 2**52
160 fdiv f3,f4,f3 // divide 1 by ticks_per_sec to get SEC_PER_TICK
161 stfd f3,_COMM_PAGE_SEC_PER_TICK(r9) // publish in the 32-bit commpage
162 stfd f3,_COMM_PAGE_SEC_PER_TICK(r10) // and in the 64-bit commpage
163 mtfsf 0xFF,f5 // restore FPSCR (all eight fields)
164 lfd f2,rzSaveF2(r1) // restore FPRs
165 lfd f3,rzSaveF3(r1)
166 lfd f4,rzSaveF4(r1)
167 lfd f5,rzSaveF5(r1)
168 2: // f1 == new timebase (the 64-bit image loaded from rzNewTimeBase)
169 eieio // order the timestamp / SEC_PER_TICK stores ahead of the timebase stores below
170 stfd f1,_COMM_PAGE_TIMEBASE(r9) // then turn the timestamp back on (atomically)
171 stfd f1,_COMM_PAGE_TIMEBASE(r10) // both
172 3: // here once all fields updated (or nothing to update)
173 lfd f1,rzSaveF1(r1) // restore last FPR
174 mtmsr r11 // put back the MSR found on entry (FP returns to its previous state)
175 isync
176 blr
177
178
179 /* ***************************************
180 * * C O M M P A G E _ T I M E _ D C B A *
181 * ***************************************
182 *
183 * Not all processors that support the DCBA opcode actually benefit from it.
184 * Some store-gather and read-cancel well enough that there is no need to use
185 * DCBA to avoid fetching cache lines that will be completely overwritten, while
186 * others have this feature disabled (to work around errata etc), and so benefit
187 * from DCBA. Since it is hard to tell the one group from the other, we just
188 * time loops with and without DCBA, and pick the fastest. Thus we avoid
189 * delicate dependence on processor and/or platform revisions.
190 *
191 * We return either kDcbaRecommended or zero.
192 *
193 * int commpage_time_dcba( void );
194 */
195
196 LEXT(commpage_time_dcba) // int commpage_time_dcba(void): r3 <- kDcbaRecommended or 0
// Times LTest twice, first with the kDCBA CR bit set and then with it clear,
// and recommends DCBA only if the DCBA time plus 1/8 is still lower.
// Values that must survive the two calls to LTest (which uses r0,r3-r9 only):
//   r10 = biased DCBA time, r11 = buffer base, r12 = return address
197 mflr r12 // get return
198 stw r12,8(r1) // save it at 8(r1), before r1 is moved
199 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack (also stores the back chain at 0(r1))
200 addi r11,r1,127+16 // get base address...
201 rlwinm r11,r11,0,0,24 // ...of our buffer, 128-byte aligned (keep bits 0-24, clear the low 7)
202 crset kDCBA // first, use DCBA
203 bl LTest // time it with DCBA; r3 <- ticks
204 srwi r0,r3,3 // bias 12 pct in favor of not using DCBA... (r0 <- time/8)
205 add r10,r3,r0 // ...because DCBA is always slower with warm cache (r10 <- time + time/8)
206 crclr kDCBA
207 bl LTest // time without DCBA; r3 <- ticks
208 cmplw r10,r3 // which is better? (unsigned; cr0 is tested by bltlr below)
209 mtlr r12 // restore return from r12; the copy saved at 8(r1) is not reloaded
210 lwz r1,0(r1) // pop off our stack frame via the back chain
211 li r3,kDcbaRecommended // assume using DCBA is faster
212 bltlr // return if biased DCBA time < non-DCBA time (mtlr/lwz/li leave cr0 untouched)
213 li r3,0 // no DCBA is faster
214 blr
215
216
217 // Subroutine to time a loop with or without DCBA.
218 // kDCBA = set if we should use DCBA
219 // r11 = base of buffer to use for test (kBufSiz bytes)
220 //
221 // We return TBR ticks in r3: the smallest lower-timebase delta seen over kLoopCnt passes.
222 // We use r0,r3-r9 (r5 is not actually referenced), plus CTR and cr0.  r10-r12 and LR are not modified.
// Each pass flushes the buffer from the cache, then times one sweep that
// stores every word of every 32-byte block.  A pass during which the upper
// timebase changed is repeated without being counted.
223
224 LTest:
225 li r4,kLoopCnt // number of times to loop
226 li r3,-1 // initialize fastest time to all ones, the largest value for the unsigned compare below
227 1: // top of a pass; also the retry target after a timebase carry
228 mr r6,r11 // initialize buffer ptr
229 li r0,kBufSiz/32 // r0 <- cache blocks to test (32 bytes per block)
230 mtctr r0
231 2:
232 dcbf 0,r6 // first, force the blocks out of the cache
233 addi r6,r6,32
234 bdnz 2b
235 sync // make sure all the flushes take
236 mr r6,r11 // re-initialize buffer ptr
237 mtctr r0 // reset cache-block count
238 mftbu r7 // remember upper half so we can check for carry
239 mftb r8 // start the timer (lower half of timebase)
240 3: // loop over cache blocks
241 bf kDCBA,4f // should we DCBA?  skip it if the flag bit is clear
242 dcba 0,r6
243 4:
244 stw r0,0(r6) // store the entire cache block (eight words; the value in r0 is just filler)
245 stw r0,4(r6)
246 stw r0,8(r6)
247 stw r0,12(r6)
248 stw r0,16(r6)
249 stw r0,20(r6)
250 stw r0,24(r6)
251 stw r0,28(r6)
252 addi r6,r6,32
253 bdnz 3b
254 mftb r9 // stop the timer
255 mftbu r0 // re-read upper half (r0 is reloaded at label 1, so it is free here)
256 cmpw r0,r7 // did timebase carry?
257 bne 1b // yes, retest rather than fuss (r4 is not decremented, so this pass does not count)
258 sub r9,r9,r8 // r9 <- time for this loop
259 cmplw r9,r3 // faster than current best? (unsigned)
260 bge 5f // no
261 mr r3,r9 // remember fastest time through loop
262 5:
263 subi r4,r4,1 // decrement outer loop count
264 cmpwi r4,0 // more to go?
265 bne 1b // loop if so
266 blr // return fastest time in r3