]> git.saurik.com Git - apple/xnu.git/blame - osfmk/ppc/commpage/commpage_asm.s
xnu-344.21.74.tar.gz
[apple/xnu.git] / osfmk / ppc / commpage / commpage_asm.s
CommitLineData
d7e50217
A
1/*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25
26#include <sys/appleapiopts.h>
27#include <ppc/asm.h>
28#include <ppc/proc_reg.h>
29#include <machine/cpu_capabilities.h>
30#include <machine/commpage.h>
31
32
33// commpage_time_dcba() uses a stack frame as follows:
34
35#define kBufSiz 1024 // Size in bytes of the buffer we use to do DCBA timing on G4 (walked as kBufSiz/32 blocks)
36#define kSFSize (kBufSiz+128+16) // Stack frame size: the buffer, 128 bytes of slack to 128-byte-align it, and 16 bytes skipped at the frame base
37#define kLoopCnt 5 // Iterations of the timing loop (the fastest iteration is the one reported)
38#define kDCBA 22 // CR bit number (a bit in cr5) used as the use-DCBA flag in timing loop
39
40
41// commpage_set_timestamp() uses the red zone for temporary storage:
42
43#define rzSaveF1 -8 // caller's FPR1 (all rz offsets are negative displacements from r1)
44#define rzSaveF2 -16 // caller's FPR2
45#define rzSaveF3 -24 // caller's FPR3
46#define rzSaveF4 -32 // caller's FPR4
47#define rzSaveF5 -40 // caller's FPR5
48#define rzNewTimeBase -48 // 8-byte slot: new TBR is stored here as two words, then loaded into a FPR with one lfd
49
50
51// commpage_set_timestamp() uses the following data. kkTicksPerSec remembers
52// the number used to compute _COMM_PAGE_SEC_PER_TICK. Since this constant
53// rarely changes, we use it to avoid needless recomputation. It is a double
54// value, pre-initialized with the exponent of 2**52 (see the 0x43300000 word in Ldata).
55
56#define kkBinary0 0 // offset in data to long long 0 (a constant)
57#define kkDouble1 8 // offset in data to double 1.0 (a constant)
58#define kkTicksPerSec 16 // offset in data to double(2**52 + ticks_per_sec); the word at +4 holds the integer ticks_per_sec
59
60 .data
61 .align 3 // three doubleword fields
62Ldata:
63 .long 0 // kkBinary0: upper word of the 64-bit zero
64 .long 0 // kkBinary0: lower word of the 64-bit zero
65 .double 1.0e0 // kkDouble1
66 .long 0x43300000 // kkTicksPerSec: upper word, ie the sign/exponent of double 2**52
67 .long 0 // kkTicksPerSec: lower word, where commpage_set_timestamp stores the integer ticks_per_sec so it can be converted to float
68
69 .text
70 .align 2
71 .globl EXT(commpage_time_dcba)
72 .globl EXT(commpage_set_timestamp)
73
74
75/* ***********************************************
76 * * C O M M P A G E _ S E T _ T I M E S T A M P *
77 * ***********************************************
78 *
79 * Update the gettimeofday() shared data on the commpage, as follows:
80 * _COMM_PAGE_TIMESTAMP = a BSD-style pair of uint_32's for secs and usecs
81 * _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
82 * _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
83 * The convention is that if the timebase is 0, the data is invalid. Because other
84 * CPUs are reading the three values asynchronously and must get a consistent set,
85 * it is critical that we update them with the following protocol:
86 * 1. set timebase to 0 (atomically), to invalidate all three values
87 * 2. eieio (to create a barrier in stores to cacheable memory)
88 * 3. change timestamp and "secs per tick"
89 * 4. eieio
90 * 5. set timebase nonzero (atomically)
91 * This works because readers read the timebase, then the timestamp and divisor, sync
92 * if MP, then read the timebase a second time and check to be sure it is equal to the first.
93 *
94 * We could save a few cycles on 64-bit machines by special casing them, but it probably
95 * isn't necessary because this routine shouldn't be called very often.
96 *
97 * When called:
98 * r3 = upper half of timebase (timebase is disabled if 0)
99 * r4 = lower half of timebase
100 * r5 = seconds part of timestamp
101 * r6 = useconds part of timestamp
102 * r7 = divisor (ie, timebase ticks per sec)
103 * We set up:
104 * r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
105 * r9 = ptr to comm page in kernel map
106 *
107 * --> Interrupts must be disabled and rtclock locked when called. <--
108 */
109
110 .align 5
111LEXT(commpage_set_timestamp) // void commpage_set_timestamp(tbr,secs,usecs,divisor)
112 mfmsr r11 // r11 <- current MSR (kept so it can be restored on exit)
113 ori r2,r11,MASK(MSR_FP) // r2 <- copy of MSR with the FP bit set (turn FP on)
114 mtmsr r2 // install the FP-enabled MSR
115 isync // wait until MSR changes take effect
116
117 or. r0,r3,r4 // is timebase 0? (thus disabled) -- result stays in cr0 for the beq after the eieio
118 lis r8,hi16(Ldata) // point to our data (upper half of address)
119 lis r9,ha16(EXT(commPagePtr)) // get ptr to address of commpage in kernel map
120 stfd f1,rzSaveF1(r1) // save caller's f1 in the red zone (restored at 3:)
121 ori r8,r8,lo16(Ldata) // r8 <- ptr to Ldata (kkBinary0, kkDouble1, kkTicksPerSec)
122 lwz r9,lo16(EXT(commPagePtr))(r9) // r9 <- commPagePtr
123 lfd f1,kkBinary0(r8) // get fixed 0s (64 bits of zero used to invalidate the timebase)
124 li r0,_COMM_PAGE_BASE_ADDRESS // get va in user space of commpage
125 cmpwi cr1,r9,0 // is commpage allocated yet? (cr1, so cr0 from the or. is preserved)
126 sub r9,r9,r0 // r9 <- commpage address, biased by user va, so _COMM_PAGE_xxx(r9) addresses field xxx via the kernel ptr
127 beq-- cr1,3f // skip if not allocated
128 stfd f1,_COMM_PAGE_TIMEBASE(r9) // step 1: turn off the timestamp (atomically, one 8-byte store of zero)
129 eieio // step 2: make sure all CPUs see it is off
130 beq 3f // new timebase is 0 (cr0 from the or.): all we had to do is turn off timestamp
131
132 lwz r0,kkTicksPerSec+4(r8) // get last ticks_per_sec (or 0 if first)
133 stw r3,rzNewTimeBase(r1) // store new timebase so we can lfd (upper half)
134 stw r4,rzNewTimeBase+4(r1) // (lower half)
135 cmpw r0,r7 // do we need to recompute _COMM_PAGE_SEC_PER_TICK?
136 stw r5,_COMM_PAGE_TIMESTAMP(r9) // step 3: store the new timestamp (seconds)
137 stw r6,_COMM_PAGE_TIMESTAMP+4(r9) // (useconds)
138 lfd f1,rzNewTimeBase(r1) // get timebase in a FPR so we can store atomically
139 beq++ 2f // same ticks_per_sec, no need to recompute
140
141 stw r7,kkTicksPerSec+4(r8) // must recompute SEC_PER_TICK: remember new ticks_per_sec in low word of the 2**52 double
142 stfd f2,rzSaveF2(r1) // we'll need a few more temp FPRs
143 stfd f3,rzSaveF3(r1)
144 stfd f4,rzSaveF4(r1)
145 stfd f5,rzSaveF5(r1)
146 lfd f2,_COMM_PAGE_2_TO_52(r9) // f2 <- double(2**52)
147 lfd f3,kkTicksPerSec(r8) // f3 <- double(2**52 + new ticks_per_sec)
148 lfd f4,kkDouble1(r8) // f4 <- double(1.0)
149 mffs f5 // save caller's FPSCR
150 mtfsfi 7,0 // zero FPSCR field 7: clear Inexact Exception enable bit, set round-to-nearest
151 fsub f3,f3,f2 // get ticks_per_sec as a double (subtract the 2**52 bias)
152 fdiv f3,f4,f3 // divide 1 by ticks_per_sec to get SEC_PER_TICK
153 stfd f3,_COMM_PAGE_SEC_PER_TICK(r9) // step 3 (cont): store the new SEC_PER_TICK
154 mtfsf 0xFF,f5 // restore FPSCR
155 lfd f2,rzSaveF2(r1) // restore FPRs
156 lfd f3,rzSaveF3(r1)
157 lfd f4,rzSaveF4(r1)
158 lfd f5,rzSaveF5(r1)
1592: // f1 == new timebase (loaded from rzNewTimeBase above)
160 eieio // step 4: wait until the timestamp and SEC_PER_TICK stores take
161 stfd f1,_COMM_PAGE_TIMEBASE(r9) // step 5: then turn the timestamp back on (atomically)
1623: // here once all fields updated (also when commpage is not allocated or timebase is disabled)
163 lfd f1,rzSaveF1(r1) // restore last FPR
164 mtmsr r11 // restore the MSR saved on entry (turn FP back off)
165 isync
166 blr
167
168
169/* ***************************************
170 * * C O M M P A G E _ T I M E _ D C B A *
171 * ***************************************
172 *
173 * Not all processors that support the DCBA opcode actually benefit from it.
174 * Some store-gather and read-cancel well enough that there is no need to use
175 * DCBA to avoid fetching cache lines that will be completely overwritten, while
176 * others have this feature disabled (to work around errata etc), and so benefit
177 * from DCBA. Since it is hard to tell the one group from the other, we just
178 * time loops with and without DCBA, and pick the fastest. Thus we avoid
179 * delicate dependence on processor and/or platform revisions.
180 *
181 * We return either kDcbaRecommended or zero.
182 *
183 * int commpage_time_dcba( void );
184 */
185
186LEXT(commpage_time_dcba) // int commpage_time_dcba(void): returns kDcbaRecommended or 0
187 mflr r12 // get return (kept in r12 across both LTest calls, which do not touch r12)
188 stw r12,8(r1) // save in the caller's frame (not reloaded here; LR is restored from r12)
189 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack (old r1 is stored at the new 0(r1))
190 addi r11,r1,127+16 // get base address... (skip 16 bytes at the frame base, +127 to round up)
191 rlwinm r11,r11,0,0,24 // ...of our buffer, 128-byte aligned (keep bits 0-24, clearing the low 7 bits)
192 crset kDCBA // first, use DCBA
193 bl LTest // time it with DCBA (r3 <- fastest time)
194 srwi r0,r3,3 // bias 12 pct in favor of not using DCBA... (r0 <- time/8)
195 add r10,r3,r0 // ...because DCBA is always slower with warm cache (r10 <- biased DCBA time; LTest leaves r10 alone)
196 crclr kDCBA // second, do not use DCBA
197 bl LTest // time without DCBA (r3 <- fastest time)
198 cmplw r10,r3 // which is better? (unsigned compare, result in cr0 for the bltlr)
199 mtlr r12 // restore return
200 lwz r1,0(r1) // pop off our stack frame
201 li r3,kDcbaRecommended // assume using DCBA is faster
202 bltlr // return kDcbaRecommended if biased DCBA time < time without DCBA
203 li r3,0 // no DCBA is faster
204 blr
205
206
207// Subroutine to time a loop with or without DCBA.
208// kDCBA = set if we should use DCBA
209// r11 = base of buffer to use for test (kBufSiz bytes); only read, never modified
210//
211// We return TBR ticks in r3 (the fastest of kLoopCnt timed passes).
212// We use r0,r3,r4,r6-r9, CTR and cr0. r10 and r12 are not touched.
213
214LTest:
215 li r4,kLoopCnt // number of times to loop
216 li r3,-1 // initialize fastest time (all ones, the largest value for the unsigned compare below)
2171: // top of one timing pass (also the retry point after a timebase carry)
218 mr r6,r11 // initialize buffer ptr
219 li r0,kBufSiz/32 // r0 <- cache blocks to test
220 mtctr r0
2212: // flush loop, one 32-byte block per iteration
222 dcbf 0,r6 // first, force the blocks out of the cache
223 addi r6,r6,32 // advance to next block
224 bdnz 2b
225 sync // make sure all the flushes take
226 mr r6,r11 // re-initialize buffer ptr
227 mtctr r0 // reset cache-block count
228 mftbu r7 // remember upper half so we can check for carry
229 mftb r8 // start the timer
2303: // loop over cache blocks
231 bf kDCBA,4f // should we DCBA? (skip the dcba when the flag bit is clear)
232 dcba 0,r6 // hint that the block will be completely overwritten, so it need not be fetched
2334: // fill one 32-byte block with eight word stores
234 stw r0,0(r6) // store the entire cache block (the value in r0 is irrelevant)
235 stw r0,4(r6)
236 stw r0,8(r6)
237 stw r0,12(r6)
238 stw r0,16(r6)
239 stw r0,20(r6)
240 stw r0,24(r6)
241 stw r0,28(r6)
242 addi r6,r6,32 // advance to next block
243 bdnz 3b
244 mftb r9 // stop the timer
245 mftbu r0 // re-read upper half (r0 is reloaded at 1:, so it is free here)
246 cmpw r0,r7 // did timebase carry?
247 bne 1b // yes, retest rather than fuss (r4 is unchanged, so the pass is not counted)
248 sub r9,r9,r8 // r9 <- time for this loop
249 cmplw r9,r3 // faster than current best? (unsigned)
250 bge 5f // no
251 mr r3,r9 // remember fastest time through loop
2525: // end of one counted pass
253 subi r4,r4,1 // decrement outer loop count
254 cmpwi r4,0 // more to go?
255 bne 1b // loop if so
256 blr // return fastest time in r3