osfmk/ppc/commpage/commpage_asm.s

   1 /*
   2  * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License.  The rights granted to you under the
  10  * License may not be used to create, or enable the creation or
  11  * redistribution of, unlawful or unlicensed copies of an Apple operating
  12  * system, or to circumvent, violate, or enable the circumvention or
  13  * violation of, any terms of an Apple operating system software license
  14  * agreement.
  15  *
  16  * Please obtain a copy of the License at
  17  * http://www.opensource.apple.com/apsl/ and read it before using this
  18  * file.
  19  *
  20  * The Original Code and all software distributed under the License are
  21  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  22  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  23  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  24  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  25  * Please see the License for the specific language governing rights and
  26  * limitations under the License.
  27  *
  28  * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  29  */
  30
  31 #include <sys/appleapiopts.h>
  32 #include <ppc/asm.h>
  33 #include <ppc/proc_reg.h>
  34 #include <machine/cpu_capabilities.h>
  35 #include <machine/commpage.h>
  36
  37
  38 // commpage_time_dcba() uses a stack frame as follows:
  39
   40 #define kBufSiz         1024                            // Size in bytes of the buffer we use to do DCBA timing on G4
   41 #define kSFSize         (kBufSiz+128+16)        // Stack frame size: the buffer, plus 128 bytes of slack so it can be 128-byte aligned, plus 16 bytes below it
   42 #define kLoopCnt        5                                       // Iterations of the timing loop (the fastest one is kept)
   43 #define kDCBA           22                                      // Bit in cr5 used as a flag in timing loop (set = use DCBA)
  44
  45
  46 // commpage_set_timestamp() uses the red zone for temporary storage:
  47
   48 #define rzSaveF1                        -8              // caller's FPR1 (offsets are relative to r1)
   49 #define rzSaveF2                        -16             // caller's FPR2
   50 #define rzSaveF3                        -24             // caller's FPR3
   51 #define rzSaveF4                        -32             // caller's FPR4
   52 #define rzSaveF5                        -40             // caller's FPR5
   53 #define rzNewTimeBase           -48             // 8-byte scratch: TBR is stored here as two words, then loaded into a FPR
  54
  55
  56 // commpage_set_timestamp() uses the following data.  kkTicksPerSec remembers
  57 // the number used to compute _COMM_PAGE_SEC_PER_TICK.  Since this constant
  58 // rarely changes, we use it to avoid needless recomputation.  It is a double
   59 // value, pre-initialized with an exponent of 2**52.
  60
   61 #define kkBinary0               0                                       // offset in data to long long 0 (a constant)
   62 #define kkDouble1               8                                       // offset in data to double 1.0 (a constant)
   63 #define kkTicksPerSec   16                                      // offset in data to double(2**52 + ticks_per_sec)
   64
   65         .data
   66         .align  3                                                       // three doubleword fields
   67 Ldata:
   68         .long   0                                                       // kkBinary0 (upper word)
   69         .long   0                                                       // kkBinary0 (lower word)
   70         .double 1.0e0                                           // kkDouble1
   71         .long   0x43300000                                      // kkTicksPerSec upper word (plus 2**52)
   72         .long   0                                                       // kkTicksPerSec lower word: this is where we store ticks_per_sec, to float (0 until first set)
   73
   74         .text
   75         .align  2
   76         .globl  EXT(commpage_time_dcba)                 // both routines in this file are exported
   77         .globl  EXT(commpage_set_timestamp)
  78
  79
  80 /*      ***********************************************
  81  *      * C O M M P A G E _ S E T _ T I M E S T A M P *
  82  *      ***********************************************
  83  *
  84  *      Update the gettimeofday() shared data on the commpages, as follows:
  85  *              _COMM_PAGE_TIMESTAMP = the clock offset at timebase (seconds)
  86  *              _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
  87  *              _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
  88  *      The convention is that if the timebase is 0, the data is invalid.  Because other
  89  *      CPUs are reading the three values asynchronously and must get a consistent set,
  90  *      it is critical that we update them with the following protocol:
  91  *              1. set timebase to 0 (atomically), to invalidate all three values
  92  *              2. eieio (to create a barrier in stores to cacheable memory)
  93  *              3. change timestamp and "secs per tick"
  94  *              4. eieio
  95  *              5. set timebase nonzero (atomically)
  96  *      This works because readers read the timebase, then the timestamp and divisor, sync
  97  *      if MP, then read the timebase a second time and check to be sure it is equal to the first.
  98  *
  99  *      We could save a few cycles on 64-bit machines by special casing them, but it probably
 100  *      isn't necessary because this routine shouldn't be called very often.
 101  *
 102  *      When called:
 103  *              r3 = upper half of timebase (timebase is disabled if 0)
 104  *              r4 = lower half of timebase
 105  *              r5 = upper half of timestamp
 106  *              r6 = lower half of timestamp
 107  *              r7 = divisor (ie, timebase ticks per sec)
 108  *      We set up:
 109  *              r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
 110  *              r9 = ptr to 32-bit commpage in kernel map
 111  *     r10 = ptr to 64-bit commpage in kernel map
 112  *
 113  *      --> Interrupts must be disabled and rtclock locked when called.  <--
 114  */
 115
  116         .align  5
  117 LEXT(commpage_set_timestamp)                            // void commpage_set_timestamp(tbr,secs,divisor): r3:r4 = tbr, r5:r6 = secs, r7 = divisor
  118         mfmsr   r11                                                     // r11 <- MSR at entry (put back before we return)
  119         ori             r2,r11,MASK(MSR_FP)                     // r2 <- same MSR with FP turned on
  120         mtmsr   r2                                                      // enable FP: we use FPRs to load/store 64 bits atomically
  121         isync                                                           // wait until MSR changes take effect
  122
  123         or.             r0,r3,r4                                        // cr0 <- is timebase 0? (thus disabled); tested by the "beq 3f" after label 1
  124         lis             r8,hi16(Ldata)                          // point to our data (upper half)
  125         lis             r9,ha16(EXT(commPagePtr32))     // get ptrs to address of commpages in kernel map
  126                 lis             r10,ha16(EXT(commPagePtr64))
  127         stfd    f1,rzSaveF1(r1)                         // save caller's FPR1 in the red zone
  128         ori             r8,r8,lo16(Ldata)                       // r8 <- ptr to Ldata (kkBinary0, kkDouble1, kkTicksPerSec)
  129         lwz             r9,lo16(EXT(commPagePtr32))(r9) // r9 <- 32-bit commpage ptr
  130                 lwz             r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr
  131         lfd             f1,kkBinary0(r8)                        // f1 <- 64 bits of 0
  132         li              r0,_COMM_PAGE_BASE_ADDRESS      // get va in user space of commpage
  133         cmpwi   cr1,r9,0                                        // is 32-bit commpage allocated yet?
  134                 cmpwi   cr6,r10,0                                       // is 64-bit commpage allocated yet?
  135         sub             r9,r9,r0                                        // r9 <- 32-bit commpage address, biased by user va (_COMM_PAGE_xxx are then used as displacements)
  136                 sub             r10,r10,r0                                      // r10<- 64-bit commpage address, biased the same way
  137         beq--   cr1,3f                                          // skip if 32-bit commpage not allocated (64-bit won't be either)
  138                 bne++   cr6,1f                                          // skip if 64-bit commpage is allocated
  139                 mr              r10,r9                                          // if no 64-bit commpage, point to 32-bit version with r10 too (paired stores below just repeat)
  140 1:
  141         stfd    f1,_COMM_PAGE_TIMEBASE(r9)      // protocol step 1: turn off the 32-bit-commpage timestamp (atomically)
  142                 stfd    f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too
  143         eieio                                                           // protocol step 2: make sure all CPUs see it is off
  144         beq             3f                                                      // cr0 from "or." above: new timebase is 0, so all we had to do is turn off timestamp
  145
  146         lwz             r0,kkTicksPerSec+4(r8)          // get last ticks_per_sec (or 0 if first)
  147         stw             r3,rzNewTimeBase(r1)            // store new timebase so we can lfd
  148         stw             r4,rzNewTimeBase+4(r1)
  149         cmpw    r0,r7                                           // do we need to recompute _COMM_PAGE_SEC_PER_TICK?
  150         stw             r5,_COMM_PAGE_TIMESTAMP(r9)     // protocol step 3: store the new timestamp in the 32-bit page
  151         stw             r6,_COMM_PAGE_TIMESTAMP+4(r9)
  152         stw             r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage
  153         stw             r6,_COMM_PAGE_TIMESTAMP+4(r10)
  154         lfd             f1,rzNewTimeBase(r1)            // get timebase in a FPR so we can store atomically
  155         beq++   2f                                                      // same ticks_per_sec, no need to recompute
  156
  157         stw             r7,kkTicksPerSec+4(r8)          // must recompute SEC_PER_TICK; remember the new divisor in the low word of the double
  158         stfd    f2,rzSaveF2(r1)                         // we'll need a few more temp FPRs
  159         stfd    f3,rzSaveF3(r1)
  160         stfd    f4,rzSaveF4(r1)
  161         stfd    f5,rzSaveF5(r1)
  162         lfd             f2,_COMM_PAGE_2_TO_52(r9)       // f2 <- double(2**52)
  163         lfd             f3,kkTicksPerSec(r8)            // float new ticks_per_sec + 2**52
  164         lfd             f4,kkDouble1(r8)                        // f4 <- double(1.0)
  165         mffs    f5                                                      // save caller's FPSCR
  166         mtfsfi  7,1                                                     // clear Inexact Exception bit, set round-to-zero
  167         fsub    f3,f3,f2                                        // get ticks_per_sec (subtract the 2**52 bias)
  168         fdiv    f3,f4,f3                                        // divide 1 by ticks_per_sec to get SEC_PER_TICK
  169         stfd    f3,_COMM_PAGE_SEC_PER_TICK(r9)  // publish SEC_PER_TICK in the 32-bit commpage
  170         stfd    f3,_COMM_PAGE_SEC_PER_TICK(r10) // and the 64-bit commpage
  171         mtfsf   0xFF,f5                                         // restore FPSCR
  172         lfd             f2,rzSaveF2(r1)                         // restore FPRs
  173         lfd             f3,rzSaveF3(r1)
  174         lfd             f4,rzSaveF4(r1)
  175         lfd             f5,rzSaveF5(r1)
  176 2:                                                                                      // f1 == new timebase (loaded from rzNewTimeBase above)
  177         eieio                                                           // protocol step 4: wait until the stores take
  178         stfd    f1,_COMM_PAGE_TIMEBASE(r9)      // protocol step 5: then turn the timestamp back on (atomically)
  179         stfd    f1,_COMM_PAGE_TIMEBASE(r10)     // both
  180 3:                                                                                      // here once all fields updated
  181         lfd             f1,rzSaveF1(r1)                         // restore last FPR
  182         mtmsr   r11                                                     // put back the MSR we had at entry (turn FP back off)
  183         isync                                                           // wait until the MSR change takes effect
  184         blr
 185
 186
 187 /*      ***************************************
 188  *      * C O M M P A G E _ T I M E _ D C B A *
 189  *      ***************************************
 190  *
 191  *      Not all processors that support the DCBA opcode actually benefit from it.
 192  *      Some store-gather and read-cancel well enough that there is no need to use
 193  *      DCBA to avoid fetching cache lines that will be completely overwritten, while
 194  *      others have this feature disabled (to work around errata etc), and so benefit
 195  *      from DCBA.  Since it is hard to tell the one group from the other, we just
 196  *      time loops with and without DCBA, and pick the fastest.  Thus we avoid
 197  *      delicate dependence on processor and/or platform revisions.
 198  *
 199  *      We return either kDcbaRecommended or zero.
 200  *
 201  *              int commpage_time_dcba( void );
 202  */
 203
  204 LEXT(commpage_time_dcba)                        // int commpage_time_dcba(void): returns kDcbaRecommended or 0 in r3
  205         mflr    r12                                     // get return address (kept in r12 across both LTest calls)
  206         stw             r12,8(r1)                       // save it at offset 8 from the caller's stack pointer
  207         stwu    r1,-kSFSize(r1)         // carve our temp buffer from the stack
  208         addi    r11,r1,127+16           // get base address...
  209         rlwinm  r11,r11,0,0,24          // ...of our buffer, 128-byte aligned (clear low 7 bits)
  210         crset   kDCBA                           // first, use DCBA
  211         bl              LTest                           // time it with DCBA; r3 <- fastest time
  212         srwi    r0,r3,3                         // bias 12 pct in favor of not using DCBA...
  213         add             r10,r3,r0                       // ...because DCBA is always slower with warm cache (r10 <- time + time/8)
  214         crclr   kDCBA                           // second, do not use DCBA
  215         bl              LTest                           // time without DCBA; r3 <- fastest time
  216         cmplw   r10,r3                          // which is better? (biased DCBA time vs. no-DCBA time, unsigned)
  217         mtlr    r12                                     // restore return
  218         lwz             r1,0(r1)                        // pop off our stack frame (reload the old r1 that stwu stored)
  219         li              r3,kDcbaRecommended             // assume using DCBA is faster
  220         bltlr                                           // return kDcbaRecommended if the biased DCBA time was lower
  221         li              r3,0                    // no DCBA is faster
  222         blr
 223
 224
 225 // Subroutine to time a loop with or without DCBA.
 226 //              kDCBA = set if we should use DCBA
 227 //              r11 = base of buffer to use for test (kBufSiz bytes)
 228 //
 229 //              We return TBR ticks in r3.
  230 //              We use r0,r3,r4,r6-r9, plus CTR and cr0 (r5 is not touched).
 231
  232 LTest:
  233         li              r4,kLoopCnt                     // number of times to loop
  234         li              r3,-1                           // initialize fastest time (largest unsigned value)
  235 1:                                                                      // start (or restart) one timed pass
  236         mr              r6,r11                          // initialize buffer ptr
  237         li              r0,kBufSiz/32           // r0 <- cache blocks to test (32 bytes each)
  238         mtctr   r0
  239 2:
  240         dcbf    0,r6                            // first, force the blocks out of the cache
  241         addi    r6,r6,32
  242         bdnz    2b
  243         sync                                            // make sure all the flushes take
  244         mr              r6,r11                          // re-initialize buffer ptr
  245         mtctr   r0                                      // reset cache-block count
  246         mftbu   r7                                      // remember upper half so we can check for carry
  247         mftb    r8                                      // start the timer
  248 3:                                                                      // loop over cache blocks
  249         bf              kDCBA,4f                        // should we DCBA?
  250         dcba    0,r6
  251 4:
  252         stw             r0,0(r6)                        // store the entire cache block (8 words; the value stored is irrelevant)
  253         stw             r0,4(r6)
  254         stw             r0,8(r6)
  255         stw             r0,12(r6)
  256         stw             r0,16(r6)
  257         stw             r0,20(r6)
  258         stw             r0,24(r6)
  259         stw             r0,28(r6)
  260         addi    r6,r6,32                        // advance to the next 32-byte block
  261         bdnz    3b
  262         mftb    r9                                      // stop the timer
  263         mftbu   r0                                      // re-read upper half of timebase
  264         cmpw    r0,r7                           // did timebase carry?
  265         bne             1b                                      // yes, retest rather than fuss (r4 is not decremented, so the pass is redone)
  266         sub             r9,r9,r8                        // r9 <- time for this loop
  267         cmplw   r9,r3                           // faster than current best? (unsigned compare)
  268         bge             5f                                      // no
  269         mr              r3,r9                           // remember fastest time through loop
  270 5:
  271         subi    r4,r4,1                         // decrement outer loop count
  272         cmpwi   r4,0                            // more to go?
  273         bne             1b                                      // loop if so
  274         blr                                                     // return fastest time in r3