]> git.saurik.com Git - apple/xnu.git/blob - osfmk/ppc/commpage/commpage_asm.s
5ec82596b104870dac94a1af9e2ae8906189bc2e
[apple/xnu.git] / osfmk / ppc / commpage / commpage_asm.s
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25
26 #include <sys/appleapiopts.h>
27 #include <ppc/asm.h>
28 #include <ppc/proc_reg.h>
29 #include <machine/cpu_capabilities.h>
30 #include <machine/commpage.h>
31
32
33 // commpage_time_dcba() uses a stack frame as follows:
34
35 #define kBufSiz 1024 // Size in bytes of the buffer we use to do DCBA timing on G4; LTest walks it in 32-byte blocks (kBufSiz/32 iterations)
36 #define kSFSize (kBufSiz+128+16) // Stack frame size: the buffer, plus 128 bytes of slack so it can be 128-byte aligned, plus 16 bytes skipped at the base of the frame
37 #define kLoopCnt 5 // Iterations of the timing loop; LTest returns the fastest of these passes
38 #define kDCBA 22 // CR bit (in cr5) used as a flag in timing loop: set = issue DCBA before the stores
39
40 .data
41 .align 3 // three doubleword fields
42 Ldata: // NOTE(review): no code visible in this file references Ldata -- confirm whether it is still needed
43 .long 0 // kkBinary0: first word of an all-zero doubleword
44 .long 0 // kkBinary0: second word
45 .double 1.0e0 // kkDouble1: the constant 1.0
46 .long 0x43300000 // kkTicksPerSec (plus 2**52): first word; presumably the high word of a double with value 2**52 -- TODO confirm
47 .long 0 // second word: this is where ticks_per_sec gets stored, so the doubleword can be converted to float
48
49 .text
50 .align 2
51 .globl EXT(commpage_time_dcba)
52
53 /* ***************************************
54 * * C O M M P A G E _ T I M E _ D C B A *
55 * ***************************************
56 *
57 * Not all processors that support the DCBA opcode actually benefit from it.
58 * Some store-gather and read-cancel well enough that there is no need to use
59 * DCBA to avoid fetching cache lines that will be completely overwritten, while
60 * others have this feature disabled (to work around errata etc), and so benefit
61 * from DCBA. Since it is hard to tell the one group from the other, we just
62 * time loops with and without DCBA, and pick the fastest. Thus we avoid
63 * delicate dependence on processor and/or platform revisions.
64 *
65 * We return either kDcbaRecommended or zero.
66 *
67 * int commpage_time_dcba( void );
68 */
69
70 LEXT(commpage_time_dcba)
71 mflr r12 // get return address; r12 keeps it across both calls to LTest
72 stw r12,8(r1) // save it at 8(r1) of the caller's frame (never reloaded here; r12 is used to restore)
73 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack; old r1 is stored at 0(new r1)
74 addi r11,r1,127+16 // get base address: skip 16 bytes, add 127 to round up...
75 rlwinm r11,r11,0,0,24 // ...then clear the low 7 bits: r11 = our buffer, 128-byte aligned
76 crset kDCBA // first, use DCBA
77 bl LTest // time it with DCBA; r3 = fastest tick count
78 srwi r0,r3,3 // r0 = r3/8: bias 12.5 pct in favor of not using DCBA...
79 add r10,r3,r0 // ...because DCBA is always slower with warm cache; r10 = biased DCBA time
80 crclr kDCBA // now clear the flag for the run without DCBA
81 bl LTest // time without DCBA; r3 = fastest tick count (LTest does not touch r10-r12)
82 cmplw r10,r3 // which is better? unsigned compare of biased DCBA time against no-DCBA time
83 mtlr r12 // restore return
84 lwz r1,0(r1) // pop off our stack frame by reloading the r1 value saved by the stwu
85 li r3,kDcbaRecommended // assume using DCBA is faster
86 bltlr // return kDcbaRecommended if biased DCBA time < no-DCBA time
87 li r3,0 // no DCBA is faster
88 blr
89
90
91 // Subroutine to time a loop with or without DCBA.
92 // kDCBA = set if we should use DCBA
93 // r11 = base of buffer to use for test (kBufSiz bytes)
94 // Each of the kLoopCnt passes flushes the buffer from the cache, then times storing to all of it.
95 // We return TBR ticks in r3: the fastest pass, as a difference of the lower timebase half.
96 // We use r0,r3-r9, and also modify CTR and cr0.
97
98 LTest:
99 li r4,kLoopCnt // number of times to loop
100 li r3,-1 // initialize fastest time to all ones, so the first unsigned compare below replaces it
101 1: // start (or restart) one timed pass
102 mr r6,r11 // initialize buffer ptr
103 li r0,kBufSiz/32 // r0 <- cache blocks to test (32 bytes each)
104 mtctr r0 // CTR counts blocks for the flush loop
105 2: // flush loop, one block per iteration
106 dcbf 0,r6 // first, force the blocks out of the cache
107 addi r6,r6,32 // advance to next 32-byte block
108 bdnz 2b // loop until CTR reaches zero
109 sync // make sure all the flushes take
110 mr r6,r11 // re-initialize buffer ptr
111 mtctr r0 // reset cache-block count
112 mftbu r7 // remember upper half so we can check for carry
113 mftb r8 // start the timer (lower half of timebase)
114 3: // loop over cache blocks
115 bf kDCBA,4f // should we DCBA? skip it if the flag bit is clear
116 dcba 0,r6 // DCBA the block we are about to overwrite completely
117 4:
118 stw r0,0(r6) // store the entire cache block (eight words; the value stored is just whatever r0 holds)
119 stw r0,4(r6)
120 stw r0,8(r6)
121 stw r0,12(r6)
122 stw r0,16(r6)
123 stw r0,20(r6)
124 stw r0,24(r6)
125 stw r0,28(r6)
126 addi r6,r6,32 // advance to next 32-byte block
127 bdnz 3b // loop until CTR reaches zero
128 mftb r9 // stop the timer (lower half of timebase)
129 mftbu r0 // re-read upper half (r0 is reloaded at 1: before its next use as a count)
130 cmpw r0,r7 // did timebase carry?
131 bne 1b // yes, retest rather than fuss (r4 is not decremented, so this pass is not counted)
132 sub r9,r9,r8 // r9 <- time for this loop
133 cmplw r9,r3 // faster than current best? (unsigned)
134 bge 5f // no
135 mr r3,r9 // remember fastest time through loop
136 5:
137 subi r4,r4,1 // decrement outer loop count
138 cmpwi r4,0 // more to go?
139 bne 1b // loop if so
140 blr // return fastest time in r3