osfmk/ppc/commpage/commpage_asm.s

   1 /*
   2  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22
  23 #include <sys/appleapiopts.h>
  24 #include <ppc/asm.h>
  25 #include <ppc/proc_reg.h>
  26 #include <machine/cpu_capabilities.h>
  27 #include <machine/commpage.h>
  28
  29
   30 // commpage_time_dcba() uses a stack frame as follows:
   31
   32 #define kBufSiz         1024                            // Size in bytes of the scratch buffer the DCBA timing loop writes (timing on G4)
   33 #define kSFSize         (kBufSiz+128+16)        // Stack frame size: the buffer, 128 bytes of slack so it can be 128-byte aligned, and the 16 bytes skipped at the frame base
   34 #define kLoopCnt        5                                       // Timed passes per measurement; the fastest pass is the one returned
   35 #define kDCBA           22                                      // CR bit number (it lies in cr5) used as the "issue dcba" flag in the timing loop
   36
   37
   38 // commpage_set_timestamp() uses the red zone for temporary storage.
   39 // Each value below is a displacement from r1; all are negative, ie below the stack pointer.
   40 #define rzSaveF1                        -8              // caller's FPR1
   41 #define rzSaveF2                        -16             // caller's FPR2
   42 #define rzSaveF3                        -24             // caller's FPR3
   43 #define rzSaveF4                        -32             // caller's FPR4
   44 #define rzSaveF5                        -40             // caller's FPR5
   45 #define rzNewTimeBase           -48             // 8-byte slot: the two 32-bit timebase halves are stored here so one lfd can load the 64-bit TBR into a FPR
  46
  47
   48 // commpage_set_timestamp() uses the following data.  kkTicksPerSec remembers
   49 // the number used to compute _COMM_PAGE_SEC_PER_TICK.  Since this constant
   50 // rarely changes, we use it to avoid needless recomputation.  It is a double
   51 // value, pre-initialized with the exponent of 2**52: the 32-bit ticks_per_sec is
   52 // stored into its low word, and subtracting 2**52 then yields it as a double.
   53 #define kkBinary0               0                                       // offset in data to long long 0 (a constant)
   54 #define kkDouble1               8                                       // offset in data to double 1.0 (a constant)
   55 #define kkTicksPerSec   16                                      // offset in data to double(ticks_per_sec) + 2**52
   56
   57         .data
   58         .align  3                                                       // doubleword (2**3 byte) alignment for the three doubleword fields
   59 Ldata:
   60         .long   0                                                       // kkBinary0: upper word of the 64-bit zero
   61         .long   0                                                       // kkBinary0: lower word
   62         .double 1.0e0                                           // kkDouble1
   63         .long   0x43300000                                      // kkTicksPerSec: upper word, the exponent of double 2**52
   64         .long   0                                                       // kkTicksPerSec: lower word, where we store the integer ticks_per_sec to float it
   65
   66         .text
   67         .align  2                                                       // word (2**2 byte) alignment for instructions
   68         .globl  EXT(commpage_time_dcba)
   69         .globl  EXT(commpage_set_timestamp)
  70
  71
   72 /*      ***********************************************
   73  *      * C O M M P A G E _ S E T _ T I M E S T A M P *
   74  *      ***********************************************
   75  *
   76  *      Update the gettimeofday() shared data on the commpages, as follows:
   77  *              _COMM_PAGE_TIMESTAMP = a BSD-style pair of uint_32's for secs and usecs
   78  *              _COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
   79  *              _COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
   80  *      The convention is that if the timebase is 0, the data is invalid.  Because other
   81  *      CPUs are reading the three values asynchronously and must get a consistent set,
   82  *      it is critical that we update them with the following protocol:
   83  *              1. set timebase to 0 (atomically), to invalidate all three values
   84  *              2. eieio (to create a barrier in stores to cacheable memory)
   85  *              3. change timestamp and "secs per tick"
   86  *              4. eieio
   87  *              5. set timebase nonzero (atomically)
   88  *      This works because readers read the timebase, then the timestamp and divisor, sync
   89  *      if MP, then read the timebase a second time and check to be sure it is equal to the first.
   90  *
   91  *      We could save a few cycles on 64-bit machines by special casing them, but it probably
   92  *      isn't necessary because this routine shouldn't be called very often.
   93  *
   94  *      When called:
   95  *              r3 = upper half of timebase (timebase is disabled if both halves are 0)
   96  *              r4 = lower half of timebase
   97  *              r5 = seconds part of timestamp
   98  *              r6 = useconds part of timestamp
   99  *              r7 = divisor (ie, timebase ticks per sec)
  100  *      We set up:
  101  *              r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
  102  *              r9 = ptr to 32-bit commpage in kernel map
  103  *     r10 = ptr to 64-bit commpage in kernel map (or a copy of r9 if there is no 64-bit commpage)
  104  *      We also use r0 and r2 as scratch, r11 for the caller's MSR, cr0/cr1/cr6, and f1-f5 (saved and restored via the red zone).
  105  *      --> Interrupts must be disabled and rtclock locked when called.  <--
  106  */
  107
  108         .align  5
  109 LEXT(commpage_set_timestamp)                            // void commpage_set_timestamp(tbr,secs,usecs,divisor)
  110         mfmsr   r11                                                     // r11 <- caller's MSR, kept so we can restore it on exit
  111         ori             r2,r11,MASK(MSR_FP)                     // r2 <- same MSR with FP turned on
  112         mtmsr   r2                                                      // enable FP so we can use the FPRs below
  113         isync                                                           // wait until MSR changes take effect
  114
  115         or.             r0,r3,r4                                        // is timebase 0? (thus disabled)  cr0 stays live until the "beq 3f" below
  116         lis             r8,hi16(Ldata)                          // point to our data
  117         lis             r9,ha16(EXT(commPagePtr32))     // get ptrs to address of commpages in kernel map
  118                 lis             r10,ha16(EXT(commPagePtr64))
  119         stfd    f1,rzSaveF1(r1)                         // save a FPR in the red zone
  120         ori             r8,r8,lo16(Ldata)                       // r8 <- full address of Ldata
  121         lwz             r9,lo16(EXT(commPagePtr32))(r9) // r9 <- 32-bit commpage ptr
  122                 lwz             r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr
  123         lfd             f1,kkBinary0(r8)                        // f1 <- 64 bits of 0, used to invalidate the timebase
  124         li              r0,_COMM_PAGE_BASE_ADDRESS      // get va in user space of commpage
  125         cmpwi   cr1,r9,0                                        // is 32-bit commpage allocated yet?
  126                 cmpwi   cr6,r10,0                                       // is 64-bit commpage allocated yet?
  127         sub             r9,r9,r0                                        // r9 <- 32-bit commpage address, biased by user va (so _COMM_PAGE_xxx can be the displacement)
  128                 sub             r10,r10,r0                                      // r10<- 64-bit commpage address, biased the same way
  129         beq--   cr1,3f                                          // skip if 32-bit commpage not allocated (64-bit won't be either)
  130                 bne++   cr6,1f                                          // skip if 64-bit commpage is allocated
  131                 mr              r10,r9                                          // if no 64-bit commpage, point to 32-bit version with r10 too
  132 1:
  133         stfd    f1,_COMM_PAGE_TIMEBASE(r9)      // turn off the 32-bit-commpage timestamp (atomically)
  134                 stfd    f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too
  135         eieio                                                           // make sure all CPUs see it is off
  136         beq             3f                                                      // cr0 from the "or." above: timebase is 0, so all we had to do is turn off timestamp
  137
  138         lwz             r0,kkTicksPerSec+4(r8)          // get last ticks_per_sec (or 0 if first)
  139         stw             r3,rzNewTimeBase(r1)            // store new timebase so we can lfd
  140         stw             r4,rzNewTimeBase+4(r1)          // (lower half)
  141         cmpw    r0,r7                                           // do we need to recompute _COMM_PAGE_SEC_PER_TICK?  cr0 is tested at "beq++ 2f"
  142         stw             r5,_COMM_PAGE_TIMESTAMP(r9)     // store the new timestamp in the 32-bit page
  143         stw             r6,_COMM_PAGE_TIMESTAMP+4(r9)   // (useconds)
  144         stw             r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage
  145         stw             r6,_COMM_PAGE_TIMESTAMP+4(r10)  // (useconds)
  146         lfd             f1,rzNewTimeBase(r1)            // get timebase in a FPR so we can store atomically
  147         beq++   2f                                                      // same ticks_per_sec, no need to recompute
  148
  149         stw             r7,kkTicksPerSec+4(r8)          // must recompute SEC_PER_TICK; remember the new divisor in the low word
  150         stfd    f2,rzSaveF2(r1)                         // we'll need a few more temp FPRs
  151         stfd    f3,rzSaveF3(r1)
  152         stfd    f4,rzSaveF4(r1)
  153         stfd    f5,rzSaveF5(r1)
  154         lfd             f2,_COMM_PAGE_2_TO_52(r9)       // f2 <- double(2**52), read from the 32-bit commpage
  155         lfd             f3,kkTicksPerSec(r8)            // float new ticks_per_sec + 2**52
  156         lfd             f4,kkDouble1(r8)                        // f4 <- double(1.0)
  157         mffs    f5                                                      // save caller's FPSCR
  158         mtfsfi  7,0                                                     // zero FPSCR field 7: clear Inexact Exception enable bit, set round-to-nearest
  159         fsub    f3,f3,f2                                        // get ticks_per_sec
  160         fdiv    f3,f4,f3                                        // divide 1 by ticks_per_sec to get SEC_PER_TICK
  161         stfd    f3,_COMM_PAGE_SEC_PER_TICK(r9)  // publish it in the 32-bit commpage
  162         stfd    f3,_COMM_PAGE_SEC_PER_TICK(r10) // and in the 64-bit one
  163         mtfsf   0xFF,f5                                         // restore FPSCR
  164         lfd             f2,rzSaveF2(r1)                         // restore FPRs
  165         lfd             f3,rzSaveF3(r1)
  166         lfd             f4,rzSaveF4(r1)
  167         lfd             f5,rzSaveF5(r1)
  168 2:                                                                                      // f1 == new timebase
  169         eieio                                                           // wait until the stores take
  170         stfd    f1,_COMM_PAGE_TIMEBASE(r9)      // then turn the timestamp back on (atomically)
  171         stfd    f1,_COMM_PAGE_TIMEBASE(r10)     // both
  172 3:                                                                                      // here once all fields updated
  173         lfd             f1,rzSaveF1(r1)                         // restore last FPR
  174         mtmsr   r11                                                     // turn FP back off (restore the MSR we entered with)
  175         isync
  176         blr
 177
 178
  179 /*      ***************************************
  180  *      * C O M M P A G E _ T I M E _ D C B A *
  181  *      ***************************************
  182  *
  183  *      Not all processors that support the DCBA opcode actually benefit from it.
  184  *      Some store-gather and read-cancel well enough that there is no need to use
  185  *      DCBA to avoid fetching cache lines that will be completely overwritten, while
  186  *      others have this feature disabled (to work around errata etc), and so benefit
  187  *      from DCBA.  Since it is hard to tell the one group from the other, we just
  188  *      time loops with and without DCBA, and pick the fastest.  Thus we avoid
  189  *      delicate dependence on processor and/or platform revisions.
  190  *
  191  *      We return either kDcbaRecommended or zero.  DCBA is recommended only if its
  192  *      time plus one eighth is still below the non-DCBA time.  Uses r0, r3-r4, r6-r12, LR, CTR, cr0 and the kDCBA CR bit.
  193  *              int commpage_time_dcba( void );
  194  */
  195
  196 LEXT(commpage_time_dcba)
  197         mflr    r12                                     // get return (r12 holds it across both LTest calls)
  198         stw             r12,8(r1)                       // save (in the caller's frame; we restore from r12, not from here)
  199         stwu    r1,-kSFSize(r1)         // carve our temp buffer from the stack
  200         addi    r11,r1,127+16           // get base address...
  201         rlwinm  r11,r11,0,0,24          // ...of our buffer, 128-byte aligned (low 7 bits cleared)
  202         crset   kDCBA                           // first, use DCBA
  203         bl              LTest                           // time it with DCBA; r3 <- fastest time
  204         srwi    r0,r3,3                         // bias 12 pct in favor of not using DCBA (r0 <- time/8)...
  205         add             r10,r3,r0                       // ...because DCBA is always slower with warm cache
  206         crclr   kDCBA                           // now test without DCBA
  207         bl              LTest                           // time without DCBA; r3 <- fastest time
  208         cmplw   r10,r3                          // which is better?  (unsigned: biased DCBA time vs non-DCBA time)
  209         mtlr    r12                                     // restore return
  210         lwz             r1,0(r1)                        // pop off our stack frame (reload r1 from the back chain stwu stored)
  211         li              r3,kDcbaRecommended             // assume using DCBA is faster
  212         bltlr                                           // return it if biased DCBA time < non-DCBA time
  213         li              r3,0                    // no DCBA is faster
  214         blr
 215
 216
  217 // Subroutine to time a loop with or without DCBA.
  218 //              kDCBA = set if we should use DCBA
  219 //              r11 = base of buffer to use for test (kBufSiz bytes)
  220 //      Each pass flushes the buffer from the cache, then times storing to every 32-byte block.
  221 //              We return TBR ticks in r3 (the fastest of kLoopCnt passes).
  222 //              We use r0,r3-r9 (r5 is not actually touched), plus CTR and cr0.
  223
  224 LTest:
  225         li              r4,kLoopCnt                     // number of times to loop
  226         li              r3,-1                           // initialize fastest time (largest unsigned value)
  227 1:                                                                      // start (or restart) one timed pass
  228         mr              r6,r11                          // initialize buffer ptr
  229         li              r0,kBufSiz/32           // r0 <- cache blocks to test
  230         mtctr   r0
  231 2:
  232         dcbf    0,r6                            // first, force the blocks out of the cache
  233         addi    r6,r6,32                        // advance to the next 32-byte block
  234         bdnz    2b
  235         sync                                            // make sure all the flushes take
  236         mr              r6,r11                          // re-initialize buffer ptr
  237         mtctr   r0                                      // reset cache-block count
  238         mftbu   r7                                      // remember upper half so we can check for carry
  239         mftb    r8                                      // start the timer
  240 3:                                                                      // loop over cache blocks
  241         bf              kDCBA,4f                        // should we DCBA?
  242         dcba    0,r6                            // yes
  243 4:
  244         stw             r0,0(r6)                        // store the entire cache block (the data is just whatever is in r0, the block count)
  245         stw             r0,4(r6)
  246         stw             r0,8(r6)
  247         stw             r0,12(r6)
  248         stw             r0,16(r6)
  249         stw             r0,20(r6)
  250         stw             r0,24(r6)
  251         stw             r0,28(r6)
  252         addi    r6,r6,32                        // advance to the next 32-byte block
  253         bdnz    3b
  254         mftb    r9                                      // stop the timer
  255         mftbu   r0                                      // re-read upper half
  256         cmpw    r0,r7                           // did timebase carry?
  257         bne             1b                                      // yes, retest rather than fuss (pass is not counted)
  258         sub             r9,r9,r8                        // r9 <- time for this loop
  259         cmplw   r9,r3                           // faster than current best? (unsigned compare)
  260         bge             5f                                      // no
  261         mr              r3,r9                           // remember fastest time through loop
  262 5:
  263         subi    r4,r4,1                         // decrement outer loop count
  264         cmpwi   r4,0                            // more to go?
  265         bne             1b                                      // loop if so
  266         blr                                                     // return fastest time in r3