xnu-792.12.6.tar.gz

[apple/xnu.git] / osfmk / i386 / rtclock.c
diff --git a/osfmk/i386/rtclock.c b/osfmk/i386/rtclock.c

index 8193417b63d704fc49a4ae22b6ebd16cf7008065..ee179f3b128a0f24b31a47c8bef792060af4b46c 100644 (file)
--- a/osfmk/i386/rtclock.c
+++ b/osfmk/i386/rtclock.c
@@ -1,23 +1,31 @@
  /*
   * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   *
- * @APPLE_LICENSE_HEADER_START@
+ * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   * 
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- * 
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
+ * This file contains Original Code and/or Modifications of Original Code 
+ * as defined in and that are subject to the Apple Public Source License 
+ * Version 2.0 (the 'License'). You may not use this file except in 
+ * compliance with the License.  The rights granted to you under the 
+ * License may not be used to create, or enable the creation or 
+ * redistribution of, unlawful or unlicensed copies of an Apple operating 
+ * system, or to circumvent, violate, or enable the circumvention or 
+ * violation of, any terms of an Apple operating system software license 
+ * agreement.
+ *
+ * Please obtain a copy of the License at 
+ * http://www.opensource.apple.com/apsl/ and read it before using this 
+ * file.
+ *
+ * The Original Code and all software distributed under the License are 
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and 
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
   */
  /*
   * @OSF_COPYRIGHT@
@@ -51,7 +59,7 @@
  #include <vm/vm_kern.h>                /* for kernel_map */
  #include <i386/ipl.h>
  #include <i386/pit.h>
-#include <architecture/i386/pio.h>
+#include <i386/pio.h>
  #include <i386/misc_protos.h>
  #include <i386/proc_reg.h>
  #include <i386/machine_cpu.h>
@@ -61,12 +69,11 @@
  #include <i386/cpu_threads.h>
  #include <i386/perfmon.h>
  #include <i386/machine_routines.h>
+#include <i386/AT386/bbclock_entries.h>
  #include <pexpert/pexpert.h>
  #include <machine/limits.h>
  #include <machine/commpage.h>
  #include <sys/kdebug.h>
-#include <i386/tsc.h>
-#include <i386/hpet.h>
  
  #define MAX(a,b) (((a)>(b))?(a):(b))
  #define MIN(a,b) (((a)>(b))?(b):(a))
@@ -75,62 +82,429 @@
  
  #define UI_CPUFREQ_ROUNDING_FACTOR     10000000
  
-int            rtclock_config(void);
+int            sysclk_config(void);
+
+int            sysclk_init(void);
+
+kern_return_t  sysclk_gettime(
+       mach_timespec_t                 *cur_time);
+
+kern_return_t  sysclk_getattr(
+       clock_flavor_t                  flavor,
+       clock_attr_t                    attr,
+       mach_msg_type_number_t  *count);
+
+void           sysclk_setalarm(
+       mach_timespec_t                 *alarm_time);
+
+/*
+ * Lists of clock routines.
+ */
+struct clock_ops  sysclk_ops = {
+       sysclk_config,                  sysclk_init,
+       sysclk_gettime,                 0,
+       sysclk_getattr,                 0,
+       sysclk_setalarm,
+};
+
+int            calend_config(void);
  
-int            rtclock_init(void);
+int            calend_init(void);
  
-uint64_t       rtc_decrementer_min;
+kern_return_t  calend_gettime(
+       mach_timespec_t                 *cur_time);
  
-void                   rtclock_intr(x86_saved_state_t *regs);
-static uint64_t                maxDec;                 /* longest interval our hardware timer can handle (nsec) */
+kern_return_t  calend_getattr(
+       clock_flavor_t                  flavor,
+       clock_attr_t                    attr,
+       mach_msg_type_number_t  *count);
  
-/* XXX this should really be in a header somewhere */
-extern clock_timer_func_t      rtclock_timer_expire;
+struct clock_ops calend_ops = {
+       calend_config,                  calend_init,
+       calend_gettime,                 0,
+       calend_getattr,                 0,
+       0,
+};
  
-static void    rtc_set_timescale(uint64_t cycles);
-static uint64_t        rtc_export_speed(uint64_t cycles);
+/* local data declarations */
  
-extern void            rtc_nanotime_store(
-                                       uint64_t                tsc,
-                                       uint64_t                nsec,
-                                       uint32_t                scale,
-                                       uint32_t                shift,
-                                       rtc_nanotime_t  *dst);
+static clock_timer_func_t      rtclock_timer_expire;
  
-extern void            rtc_nanotime_load(
-                                       rtc_nanotime_t  *src,
-                                       rtc_nanotime_t  *dst);
+static timer_call_data_t       rtclock_alarm_timer;
  
-rtc_nanotime_t rtc_nanotime_info;
+static void    rtclock_alarm_expire(
+                       timer_call_param_t      p0,
+                       timer_call_param_t      p1);
+
+struct {
+       mach_timespec_t                 calend_offset;
+       boolean_t                       calend_is_set;
+
+       int64_t                         calend_adjtotal;
+       int32_t                         calend_adjdelta;
+
+       uint32_t                        boottime;
+
+        mach_timebase_info_data_t      timebase_const;
+
+       decl_simple_lock_data(,lock)    /* real-time clock device lock */
+} rtclock;
+
+boolean_t              rtc_initialized = FALSE;
+clock_res_t            rtc_intr_nsec = NSEC_PER_HZ;    /* interrupt res */
+uint64_t               rtc_cycle_count;        /* clocks in 1/20th second */
+uint64_t               rtc_cyc_per_sec;        /* processor cycles per sec */
+uint32_t               rtc_boot_frequency;     /* provided by 1st speed-step */
+uint32_t               rtc_quant_scale;        /* clock to nanos multiplier */
+uint32_t               rtc_quant_shift;        /* clock to nanos right shift */
+uint64_t               rtc_decrementer_min;
+
+static mach_timebase_info_data_t       rtc_lapic_scale; /* nsec to lapic count */
  
  /*
- * tsc_to_nanoseconds:
+ *     Macros to lock/unlock real-time clock data.
+ */
+#define RTC_INTRS_OFF(s)               \
+       (s) = splclock()
+
+#define RTC_INTRS_ON(s)                        \
+       splx(s)
+
+#define RTC_LOCK(s)                    \
+MACRO_BEGIN                            \
+       RTC_INTRS_OFF(s);               \
+       simple_lock(&rtclock.lock);     \
+MACRO_END
+
+#define RTC_UNLOCK(s)                  \
+MACRO_BEGIN                            \
+       simple_unlock(&rtclock.lock);   \
+       RTC_INTRS_ON(s);                \
+MACRO_END
+
+/*
+ * i8254 control.  ** MONUMENT **
+ *
+ * The i8254 is a traditional PC device with some arbitrary characteristics.
+ * Basically, it is a register that counts at a fixed rate and can be
+ * programmed to generate an interrupt every N counts.  The count rate is
+ * clknum counts per sec (see pit.h), historically 1193167=14.318MHz/12
+ * but the more accurate value is 1193182=14.31818MHz/12. [14.31818 MHz being
+ * the master crystal oscillator reference frequency since the very first PC.]
+ * Various constants are computed based on this value, and we calculate
+ * them at init time for execution efficiency.  To obtain sufficient
+ * accuracy, some of the calculations are most easily done in floating
+ * point and then converted to int.
   *
- * Basic routine to convert a raw 64 bit TSC value to a
- * 64 bit nanosecond value.  The conversion is implemented
- * based on the scale factor and an implicit 32 bit shift.
   */
-static inline uint64_t
-_tsc_to_nanoseconds(uint64_t value)
+
+/*
+ * Forward decl.
+ */
+
+static uint64_t        rtc_set_cyc_per_sec(uint64_t cycles);
+uint64_t       rtc_nanotime_read(void);
+
+/*
+ * create_mul_quant_GHZ
+ *   create a constant used to multiply the TSC by to convert to nanoseconds.
+ *   This is a 32 bit number and the TSC *MUST* have a frequency higher than
+ *   1000Mhz for this routine to work.
+ *
+ * The theory here is that we know how many TSCs-per-sec the processor runs at.
+ * Normally to convert this to nanoseconds you would multiply the current
+ * timestamp by 1000000000 (a billion) then divide by TSCs-per-sec.
+ * Unfortunately the TSC is 64 bits which would leave us with 96 bit intermediate
+ * results from the multiply that must then be divided.
+ * Usually that's
+ *   uint96 = tsc * numer
+ *   nanos = uint96 / denom
+ * Instead, we create this quant constant and it becomes the numerator,
+ * the denominator can then be 0x100000000 which makes our division as simple as
+ * forgetting the lower 32 bits of the result. We can also pass this number to
+ * user space as the numer and pass 0xFFFFFFFF (RTC_FAST_DENOM) as the denom to
+ * convert raw counts to nanos. The difference is so small as to be
+ * undetectable by anything.
+ *
+ * Unfortunately we can not do this for sub GHZ processors. In this case, all
+ * we do is pass the CPU speed in raw as the denom and we pass in 1000000000
+ * as the numerator. No short cuts allowed.
+ */
+#define RTC_FAST_DENOM 0xFFFFFFFF
+inline static uint32_t
+create_mul_quant_GHZ(int shift, uint32_t quant)
  {
-    asm volatile("movl %%edx,%%esi     ;"
-                                "mull  %%ecx           ;"
-                                "movl  %%edx,%%edi     ;"
-                                "movl  %%esi,%%eax     ;"
-                                "mull  %%ecx           ;"
-                                "addl  %%edi,%%eax     ;"      
-                                "adcl  $0,%%edx         "
-                                               : "+A" (value) : "c" (rtc_nanotime_info.scale) : "esi", "edi");
+       return (uint32_t)((((uint64_t)NSEC_PER_SEC/20) << shift) / quant);
+}
+/*
+ * This routine takes a value of raw TSC ticks and applies the passed mul_quant
+ * generated by create_mul_quant_GHZ(). This is our internal routine for creating
+ * nanoseconds.
+ * Since we don't really have uint96_t this routine basically does this....
+ *   uint96_t intermediate = (*value) * scale
+ *   return (intermediate >> 32)
+ */
+inline static uint64_t
+fast_get_nano_from_abs(uint64_t value, int scale)
+{
+    asm ("     movl    %%edx,%%esi     \n\t"
+         "      mull   %%ecx           \n\t"
+         "      movl   %%edx,%%edi     \n\t"
+         "      movl   %%esi,%%eax     \n\t"
+         "      mull   %%ecx           \n\t"
+         "      xorl   %%ecx,%%ecx     \n\t"   
+         "      addl   %%edi,%%eax     \n\t"   
+         "      adcl   %%ecx,%%edx         "
+               : "+A" (value)
+               : "c" (scale)
+               : "%esi", "%edi");
+    return value;
+}
  
-    return (value);
+/*
+ * This routine basically does this...
+ * ts.tv_sec = nanos / 1000000000;     create seconds
+ * ts.tv_nsec = nanos % 1000000000;    create remainder nanos
+ */
+inline static mach_timespec_t 
+nanos_to_timespec(uint64_t nanos)
+{
+       union {
+               mach_timespec_t ts;
+               uint64_t u64;
+       } ret;
+        ret.u64 = nanos;
+        asm volatile("divl %1" : "+A" (ret.u64) : "r" (NSEC_PER_SEC));
+        return ret.ts;
  }
  
-uint64_t
-tsc_to_nanoseconds(uint64_t value)
+/*
+ * The following two routines perform the 96 bit arithmetic we need to
+ * convert generic absolute<->nanoseconds
+ * The multiply routine takes a uint64_t and a uint32_t and returns the result
+ * in a uint32_t[3] array.
+ * The divide routine takes this uint32_t[3] array and divides it by a uint32_t
+ * returning a uint64_t
+ */
+inline static void
+longmul(uint64_t       *abstime, uint32_t multiplicand, uint32_t *result)
  {
-       return _tsc_to_nanoseconds(value);
+    asm volatile(
+        " pushl        %%ebx                   \n\t"   
+        " movl %%eax,%%ebx             \n\t"
+        " movl (%%eax),%%eax           \n\t"
+        " mull %%ecx                   \n\t"
+        " xchg %%eax,%%ebx             \n\t"
+        " pushl        %%edx                   \n\t"
+        " movl 4(%%eax),%%eax          \n\t"
+        " mull %%ecx                   \n\t"
+        " movl %2,%%ecx                \n\t"
+        " movl %%ebx,(%%ecx)           \n\t"
+        " popl %%ebx                   \n\t"
+        " addl %%ebx,%%eax             \n\t"
+        " popl %%ebx                   \n\t"
+        " movl %%eax,4(%%ecx)          \n\t"
+        " adcl $0,%%edx                \n\t"
+        " movl %%edx,8(%%ecx)  // and save it"
+        : : "a"(abstime), "c"(multiplicand), "m"(result));
+    
  }
  
+inline static uint64_t
+longdiv(uint32_t *numer, uint32_t denom)
+{
+    uint64_t   result;
+    asm volatile(
+        " pushl        %%ebx                   \n\t"
+        " movl %%eax,%%ebx             \n\t"
+        " movl 8(%%eax),%%edx          \n\t"
+        " movl 4(%%eax),%%eax          \n\t"
+        " divl %%ecx                   \n\t"
+        " xchg %%ebx,%%eax             \n\t"
+        " movl (%%eax),%%eax           \n\t"
+        " divl %%ecx                   \n\t"
+        " xchg %%ebx,%%edx             \n\t"
+        " popl %%ebx                   \n\t"
+        : "=A"(result) : "a"(numer),"c"(denom));
+    return result;
+}
+
+/*
+ * Enable or disable timer 2.
+ * Port 0x61 controls timer 2:
+ *   bit 0 gates the clock,
+ *   bit 1 gates output to speaker.
+ */
+inline static void
+enable_PIT2(void)
+{
+    asm volatile(
+        " inb   $0x61,%%al      \n\t"
+        " and   $0xFC,%%al       \n\t"
+        " or    $1,%%al         \n\t"
+        " outb  %%al,$0x61      \n\t"
+        : : : "%al" );
+}
+
+inline static void
+disable_PIT2(void)
+{
+    asm volatile(
+        " inb   $0x61,%%al      \n\t"
+        " and   $0xFC,%%al      \n\t"
+        " outb  %%al,$0x61      \n\t"
+        : : : "%al" );
+}
+
+inline static void
+set_PIT2(int value)
+{
+/*
+ * First, tell the clock we are going to write 16 bits to the counter
+ *   and enable one-shot mode (command 0xB8 to port 0x43)
+ * Then write the two bytes into the PIT2 clock register (port 0x42).
+ * Loop until the value is "realized" in the clock,
+ * this happens on the next tick.
+ */
+    asm volatile(
+        " movb  $0xB8,%%al      \n\t"
+        " outb %%al,$0x43      \n\t"
+        " movb %%dl,%%al       \n\t"
+        " outb %%al,$0x42      \n\t"
+        " movb %%dh,%%al       \n\t"
+        " outb %%al,$0x42      \n"
+"1:      inb   $0x42,%%al      \n\t" 
+        " inb  $0x42,%%al      \n\t"
+        " cmp  %%al,%%dh       \n\t"
+        " jne  1b"
+        : : "d"(value) : "%al");
+}
+
+inline static uint64_t
+get_PIT2(unsigned int *value)
+{
+    register uint64_t  result;
+/*
+ * This routine first latches the time (command 0x80 to port 0x43),
+ * then gets the time stamp so we know how long the read will take later.
+ * Read (from port 0x42) and return the current value of the timer.
+ */
+    asm volatile(
+        " xorl %%ecx,%%ecx     \n\t"
+        " movb $0x80,%%al      \n\t"
+        " outb %%al,$0x43      \n\t"
+        " rdtsc                        \n\t"
+        " pushl        %%eax           \n\t"
+        " inb  $0x42,%%al      \n\t"
+        " movb %%al,%%cl       \n\t"
+        " inb  $0x42,%%al      \n\t"
+        " movb %%al,%%ch       \n\t"
+        " popl %%eax   "
+        : "=A"(result), "=c"(*value));
+    return result;
+}
+
+/*
+ * timeRDTSC()
+ * This routine sets up PIT counter 2 to count down 1/20 of a second.
+ * It pauses until the value is latched in the counter
+ * and then reads the time stamp counter to return to the caller.
+ */
+static uint64_t
+timeRDTSC(void)
+{
+    int                attempts = 0;
+    uint64_t   latchTime;
+    uint64_t   saveTime,intermediate;
+    unsigned int timerValue, lastValue;
+    boolean_t   int_enabled;
+    /*
+     * Table of correction factors to account for
+     *   - timer counter quantization errors, and
+     *   - undercounts 0..5
+     */
+#define        SAMPLE_CLKS_EXACT       (((double) CLKNUM) / 20.0)
+#define        SAMPLE_CLKS_INT         ((int) CLKNUM / 20)
+#define SAMPLE_NSECS           (2000000000LL)
+#define SAMPLE_MULTIPLIER      (((double)SAMPLE_NSECS)*SAMPLE_CLKS_EXACT)
+#define ROUND64(x)             ((uint64_t)((x) + 0.5))
+    uint64_t   scale[6] = {
+       ROUND64(SAMPLE_MULTIPLIER/(double)(SAMPLE_CLKS_INT-0)), 
+       ROUND64(SAMPLE_MULTIPLIER/(double)(SAMPLE_CLKS_INT-1)), 
+       ROUND64(SAMPLE_MULTIPLIER/(double)(SAMPLE_CLKS_INT-2)), 
+       ROUND64(SAMPLE_MULTIPLIER/(double)(SAMPLE_CLKS_INT-3)), 
+       ROUND64(SAMPLE_MULTIPLIER/(double)(SAMPLE_CLKS_INT-4)), 
+       ROUND64(SAMPLE_MULTIPLIER/(double)(SAMPLE_CLKS_INT-5))
+    };
+                            
+    int_enabled = ml_set_interrupts_enabled(FALSE);
+    
+restart:
+    if (attempts >= 2)
+       panic("timeRDTSC() calibation failed with %d attempts\n", attempts);
+    attempts++;
+    enable_PIT2();      // turn on PIT2
+    set_PIT2(0);       // reset timer 2 to be zero
+    latchTime = rdtsc64();     // get the time stamp to time 
+    latchTime = get_PIT2(&timerValue) - latchTime; // time how long this takes
+    set_PIT2(SAMPLE_CLKS_INT); // set up the timer for (almost) 1/20th a second
+    saveTime = rdtsc64();      // now time how long a 20th a second is...
+    get_PIT2(&lastValue);
+    get_PIT2(&lastValue);      // read twice, first value may be unreliable
+    do {
+        intermediate = get_PIT2(&timerValue);
+        if (timerValue > lastValue) {
+           printf("Hey we are going backwards! %u -> %u, restarting timing\n",
+                       timerValue,lastValue);
+           set_PIT2(0);
+           disable_PIT2();
+           goto restart;
+       }
+        lastValue = timerValue;
+    } while (timerValue > 5);
+    kprintf("timerValue   %d\n",timerValue);
+    kprintf("intermediate 0x%016llx\n",intermediate);
+    kprintf("saveTime     0x%016llx\n",saveTime);
+    
+    intermediate -= saveTime;          // raw count for about 1/20 second
+    intermediate *= scale[timerValue]; // rescale measured time spent
+    intermediate /= SAMPLE_NSECS;      // so its exactly 1/20 a second
+    intermediate += latchTime;         // add on our save fudge
+    
+    set_PIT2(0);                       // reset timer 2 to be zero
+    disable_PIT2();                    // turn off PIT 2
+
+    ml_set_interrupts_enabled(int_enabled);
+    return intermediate;
+}
+
+static uint64_t
+tsc_to_nanoseconds(uint64_t abstime)
+{
+        uint32_t       numer;
+        uint32_t       denom;
+        uint32_t       intermediate[3];
+        
+        numer = rtclock.timebase_const.numer;
+        denom = rtclock.timebase_const.denom;
+        if (denom == RTC_FAST_DENOM) {
+            abstime = fast_get_nano_from_abs(abstime, numer);
+        } else {
+            longmul(&abstime, numer, intermediate);
+            abstime = longdiv(intermediate, denom);
+        }
+        return abstime;
+}
+
+inline static mach_timespec_t 
+tsc_to_timespec(void)
+{
+        uint64_t       currNanos;
+        currNanos = rtc_nanotime_read();
+        return nanos_to_timespec(currNanos);
+}
+
+#define        DECREMENTER_MAX         UINT_MAX
  static uint32_t
  deadline_to_decrementer(
         uint64_t        deadline,
@@ -142,28 +516,86 @@ deadline_to_decrementer(
                 return rtc_decrementer_min;
         else {
                 delta = deadline - now;
-               return MIN(MAX(rtc_decrementer_min,delta),maxDec); 
+               return MIN(MAX(rtc_decrementer_min,delta),DECREMENTER_MAX); 
         }
  }
  
+static inline uint64_t
+lapic_time_countdown(uint32_t initial_count)
+{
+       boolean_t               state;
+       uint64_t                start_time;
+       uint64_t                stop_time;
+       lapic_timer_count_t     count;
+
+       state = ml_set_interrupts_enabled(FALSE);
+       lapic_set_timer(FALSE, one_shot, divide_by_1, initial_count);
+       start_time = rdtsc64();
+       do {
+               lapic_get_timer(NULL, NULL, NULL, &count);
+       } while (count > 0);
+       stop_time = rdtsc64();
+       ml_set_interrupts_enabled(state);
+
+       return tsc_to_nanoseconds(stop_time - start_time);
+}
+
  static void
-rtc_lapic_start_ticking(void)
+rtc_lapic_timer_calibrate(void)
  {
-       uint64_t        abstime;
-       uint64_t        first_tick;
-       cpu_data_t      *cdp = current_cpu_datap();
+       uint32_t        nsecs;
+       uint64_t        countdown;
  
-       abstime = mach_absolute_time();
-       rtclock_tick_interval = NSEC_PER_HZ;
+       if (!(cpuid_features() & CPUID_FEATURE_APIC))
+               return;
  
-       first_tick = abstime + rtclock_tick_interval;
-       cdp->rtclock_intr_deadline = first_tick;
+       /*
+        * Set the local apic timer counting down to zero without an interrupt.
+        * Use the timestamp to calculate how long this takes.
+        */ 
+       nsecs = (uint32_t) lapic_time_countdown(rtc_intr_nsec);
  
         /*
-        * Force a complete re-evaluation of timer deadlines.
+        * Compute a countdown ratio for a given time in nanoseconds.
+        * That is, countdown = time * numer / denom.
          */
-       cdp->rtcPop = EndOfAllTime;
-       etimer_resync_deadlines();
+       countdown = (uint64_t)rtc_intr_nsec * (uint64_t)rtc_intr_nsec / nsecs;
+
+       nsecs = (uint32_t) lapic_time_countdown((uint32_t) countdown);
+
+       rtc_lapic_scale.numer = countdown;
+       rtc_lapic_scale.denom = nsecs;
+
+       kprintf("rtc_lapic_timer_calibrate() scale: %d/%d\n",
+               (uint32_t) countdown, nsecs);
+}
+
+static void
+rtc_lapic_set_timer(
+       uint32_t        interval)
+{
+       uint64_t        count;
+
+       assert(rtc_lapic_scale.denom);
+
+       count = interval * (uint64_t) rtc_lapic_scale.numer;
+       count /= rtc_lapic_scale.denom;
+
+       lapic_set_timer(TRUE, one_shot, divide_by_1, (uint32_t) count);
+}
+
+static void
+rtc_lapic_start_ticking(void)
+{
+       uint64_t        abstime;
+       uint64_t        first_tick;
+       uint64_t        decr;
+
+       abstime = mach_absolute_time();
+       first_tick = abstime + NSEC_PER_HZ;
+       current_cpu_datap()->cpu_rtc_tick_deadline = first_tick;
+       decr = deadline_to_decrementer(first_tick, abstime);
+       rtc_lapic_set_timer(decr);
  }
  
  /*
@@ -172,9 +604,20 @@ rtc_lapic_start_ticking(void)
   */
  
  int
-rtclock_config(void)
+sysclk_config(void)
  {
-       /* nothing to do */
+
+       mp_disable_preemption();
+       if (cpu_number() != master_cpu) {
+               mp_enable_preemption();
+               return(1);
+       }
+       mp_enable_preemption();
+
+       timer_call_setup(&rtclock_alarm_timer, rtclock_alarm_expire, NULL);
+
+       simple_lock_init(&rtclock.lock, 0);
+
         return (1);
  }
  
@@ -182,169 +625,235 @@ rtclock_config(void)
  /*
   * Nanotime/mach_absolutime_time
   * -----------------------------
- * The timestamp counter (TSC) - which counts cpu clock cycles and can be read
- * efficiently by the kernel and in userspace - is the reference for all timing.
- * The cpu clock rate is platform-dependent and may stop or be reset when the
- * processor is napped/slept.  As a result, nanotime is the software abstraction
- * used to maintain a monotonic clock, adjusted from an outside reference as needed.
+ * The timestamp counter (tsc) - which counts cpu clock cycles and can be read
+ * efficiently by the kernel and in userspace - is the reference for all timing.
+ * However, the cpu clock rate is not only platform-dependent but can change
+ * (speed-step) dynamically. Hence tsc is converted into nanoseconds which is
+ * identical to mach_absolute_time. The conversion of tsc to nanoseconds is
+ * encapsulated by nanotime.
   *
   * The kernel maintains nanotime information recording:
- *     - the ratio of tsc to nanoseconds
+ *     - the current ratio of tsc to nanoseconds
   *       with this ratio expressed as a 32-bit scale and shift
   *       (power of 2 divider);
- *     - { tsc_base, ns_base } pair of corresponding timestamps.
+ *     - the tsc (step_tsc) and nanotime (step_ns) at which the current
+ *       ratio (clock speed) began.
+ * So a tsc value can be converted to nanotime by:
+ *
+ *     nanotime = (((tsc - step_tsc)*scale) >> shift) + step_ns
+ *
+ * In general, (tsc - step_tsc) is a 64-bit quantity with the scaling
+ * involving a 96-bit intermediate value. However, by saving the converted 
+ * values at each tick (or at any intervening speed-step) - base_tsc and
+ * base_ns - we can perform conversions relative to these and be assured that
+ * (tsc - base_tsc) is 32-bits. Hence:
   *
- * The tuple {tsc_base, ns_base, scale, shift} is exported in the commpage 
- * for the userspace nanotime routine to read.
+ *     fast_nanotime = (((tsc - base_tsc)*scale) >> shift) + base_ns  
   *
- * All of the routines which update the nanotime data are non-reentrant.  This must
- * be guaranteed by the caller.
+ * The tuple {base_tsc, base_ns, scale, shift} is exported in the commpage 
+ * for the userspace nanotime routine to read. A duplicate check_tsc is
+ * appended so that the consistency of the read can be verified. Note that
+ * this scheme is essential for MP systems in which the commpage is updated
+ * by the master cpu but may be read concurrently by other cpus.
+ * 
   */
  static inline void
  rtc_nanotime_set_commpage(rtc_nanotime_t *rntp)
  {
-       commpage_set_nanotime(rntp->tsc_base, rntp->ns_base, rntp->scale, rntp->shift);
-}
+       commpage_nanotime_t     cp_nanotime;
  
-/*
- * rtc_nanotime_init:
- *
- * Intialize the nanotime info from the base time.  Since
- * the base value might be from a lower resolution clock,
- * we compare it to the TSC derived value, and use the
- * greater of the two values.
- */
-static inline void
-_rtc_nanotime_init(rtc_nanotime_t *rntp, uint64_t base)
-{
-       uint64_t        nsecs, tsc = rdtsc64();
+       /* Only the master cpu updates the commpage */
+       if (cpu_number() != master_cpu)
+               return;
+
+       cp_nanotime.nt_base_tsc = rntp->rnt_tsc;
+       cp_nanotime.nt_base_ns = rntp->rnt_nanos;
+       cp_nanotime.nt_scale = rntp->rnt_scale;
+       cp_nanotime.nt_shift = rntp->rnt_shift;
  
-       nsecs = _tsc_to_nanoseconds(tsc);
-       rtc_nanotime_store(tsc, MAX(nsecs, base), rntp->scale, rntp->shift, rntp);
+       commpage_set_nanotime(&cp_nanotime);
  }
  
  static void
-rtc_nanotime_init(uint64_t base)
+rtc_nanotime_init(void)
  {
-       rtc_nanotime_t  *rntp = &rtc_nanotime_info;
+       rtc_nanotime_t  *rntp = &current_cpu_datap()->cpu_rtc_nanotime;
+       rtc_nanotime_t  *master_rntp = &cpu_datap(master_cpu)->cpu_rtc_nanotime;
  
-       _rtc_nanotime_init(rntp, base);
-       rtc_nanotime_set_commpage(rntp);
+       if (cpu_number() == master_cpu) {
+               rntp->rnt_tsc = rdtsc64();
+               rntp->rnt_nanos = tsc_to_nanoseconds(rntp->rnt_tsc);
+               rntp->rnt_scale = rtc_quant_scale;
+               rntp->rnt_shift = rtc_quant_shift;
+               rntp->rnt_step_tsc = 0ULL;
+               rntp->rnt_step_nanos = 0ULL;
+       } else {
+               /*
+                * Copy master processor's nanotime info.
+                * Loop required in case this changes while copying.
+                */
+               do {
+                       *rntp = *master_rntp;
+               } while (rntp->rnt_tsc != master_rntp->rnt_tsc);
+       }
  }
  
-/*
- * rtc_nanotime_init:
- *
- * Call back from the commpage initialization to
- * cause the commpage data to be filled in once the
- * commpages have been created.
- */
-void
-rtc_nanotime_init_commpage(void)
+static inline void
+_rtc_nanotime_update(rtc_nanotime_t *rntp, uint64_t    tsc)
  {
-       spl_t                   s = splclock();
-
-       rtc_nanotime_set_commpage(&rtc_nanotime_info);
+       uint64_t        tsc_delta;
+       uint64_t        ns_delta;
  
-       splx(s);
+       tsc_delta = tsc - rntp->rnt_step_tsc;
+       ns_delta = tsc_to_nanoseconds(tsc_delta);
+       rntp->rnt_nanos = rntp->rnt_step_nanos + ns_delta;
+       rntp->rnt_tsc = tsc;
  }
  
-/*
- * rtc_nanotime_update:
- *
- * Update the nanotime info from the base time.  Since
- * the base value might be from a lower resolution clock,
- * we compare it to the TSC derived value, and use the
- * greater of the two values.
- *
- * N.B. In comparison to the above init routine, this assumes
- * that the TSC has remained monotonic compared to the tsc_base
- * value, which is not the case after S3 sleep.
- */
-static inline void
-_rtc_nanotime_update(rtc_nanotime_t *rntp, uint64_t    base)
+static void
+rtc_nanotime_update(void)
  {
-       uint64_t        nsecs, tsc = rdtsc64();
+       rtc_nanotime_t  *rntp = &current_cpu_datap()->cpu_rtc_nanotime;
  
-       nsecs = rntp->ns_base + _tsc_to_nanoseconds(tsc - rntp->tsc_base);
-       rtc_nanotime_store(tsc, MAX(nsecs, base), rntp->scale, rntp->shift, rntp);
+       assert(get_preemption_level() > 0);
+       assert(!ml_get_interrupts_enabled());
+        
+       _rtc_nanotime_update(rntp, rdtsc64());
+       rtc_nanotime_set_commpage(rntp);
  }
  
  static void
-rtc_nanotime_update(
-       uint64_t                base)
+rtc_nanotime_scale_update(void)
  {
-       rtc_nanotime_t  *rntp = &rtc_nanotime_info;
+       rtc_nanotime_t  *rntp = &current_cpu_datap()->cpu_rtc_nanotime;
+       uint64_t        tsc = rdtsc64();
  
         assert(!ml_get_interrupts_enabled());
          
-       _rtc_nanotime_update(rntp, base);
+       /*
+        * Update time based on past scale.
+        */
+       _rtc_nanotime_update(rntp, tsc);
+
+       /*
+        * Update scale and timestamp this update.
+        */
+       rntp->rnt_scale = rtc_quant_scale;
+       rntp->rnt_shift = rtc_quant_shift;
+       rntp->rnt_step_tsc = rntp->rnt_tsc;
+       rntp->rnt_step_nanos = rntp->rnt_nanos;
+
+       /* Export update to userland */
         rtc_nanotime_set_commpage(rntp);
  }
  
-/*
- * rtc_nanotime_read:
- *
- * Returns the current nanotime value, accessable from any
- * context.
- */
  static uint64_t
+_rtc_nanotime_read(void)
+{
+       rtc_nanotime_t  *rntp = &current_cpu_datap()->cpu_rtc_nanotime;
+       uint64_t        rnt_tsc;
+       uint32_t        rnt_scale;
+       uint32_t        rnt_shift;
+       uint64_t        rnt_nanos;
+       uint64_t        tsc;
+       uint64_t        tsc_delta;
+
+       rnt_scale = rntp->rnt_scale;
+       if (rnt_scale == 0)
+               return 0ULL;
+
+       rnt_shift = rntp->rnt_shift;
+       rnt_nanos = rntp->rnt_nanos;
+       rnt_tsc = rntp->rnt_tsc;
+       tsc = rdtsc64();
+
+       tsc_delta = tsc - rnt_tsc;
+       if ((tsc_delta >> 32) != 0)
+               return rnt_nanos + tsc_to_nanoseconds(tsc_delta);
+
+       /* Let the compiler optimize(?): */
+       if (rnt_shift == 32)
+               return rnt_nanos + ((tsc_delta * rnt_scale) >> 32);     
+       else 
+               return rnt_nanos + ((tsc_delta * rnt_scale) >> rnt_shift);
+}
+
+uint64_t
  rtc_nanotime_read(void)
  {
-       rtc_nanotime_t  rnt, *rntp = &rtc_nanotime_info;
-       uint64_t                result;
+       uint64_t        result;
+       uint64_t        rnt_tsc;
+       rtc_nanotime_t  *rntp = &current_cpu_datap()->cpu_rtc_nanotime;
  
+       /*
+        * Use timestamp to ensure the uptime record isn't changed.
+        * This avoids disabling interrupts.
+        * And note this is a per-cpu structure, hence no locking.
+        */
         do {
-               rtc_nanotime_load(rntp, &rnt);
-               result = rnt.ns_base + _tsc_to_nanoseconds(rdtsc64() - rnt.tsc_base);
-       } while (rntp->tsc_base != rnt.tsc_base);
+               rnt_tsc = rntp->rnt_tsc;
+               result = _rtc_nanotime_read();
+       } while (rnt_tsc != rntp->rnt_tsc);
  
-       return (result);
+       return result;
  }
  
+
  /*
- * rtc_clock_napped:
- *
- * Invoked from power manangement when we have awoken from a nap (C3/C4)
- * during which the TSC lost counts.  The nanotime data is updated according
- * to the provided nanosecond base value.
- *
- * The caller must guarantee non-reentrancy.
+ * This function is called by the speed-step driver when a
+ * change of cpu clock frequency is about to occur.
+ * The scale is not changed until rtc_clock_stepped() is called.
+ * Between these times there is an uncertainty in exactly when
+ * the change takes effect. FIXME: by using another timing source
+ * we could eliminate this error.
   */
-void
-rtc_clock_napped(
-       uint64_t                base)
-{
-       rtc_nanotime_update(base);
-}
-
  void
  rtc_clock_stepping(__unused uint32_t new_frequency,
                    __unused uint32_t old_frequency)
  {
-       panic("rtc_clock_stepping unsupported");
+       boolean_t       istate;
+
+       istate = ml_set_interrupts_enabled(FALSE);
+       rtc_nanotime_scale_update();
+       ml_set_interrupts_enabled(istate);
  }
  
+/*
+ * This function is called by the speed-step driver when a
+ * change of cpu clock frequency has just occurred. This change
+ * is expressed as a ratio relative to the boot clock rate.
+ */
  void
-rtc_clock_stepped(__unused uint32_t new_frequency,
-                 __unused uint32_t old_frequency)
+rtc_clock_stepped(uint32_t new_frequency, uint32_t old_frequency)
  {
-       panic("rtc_clock_stepping unsupported");
+       boolean_t       istate;
+
+       istate = ml_set_interrupts_enabled(FALSE);
+       if (rtc_boot_frequency == 0) {
+               /*
+                * At the first ever stepping, old frequency is the real
+                * initial clock rate. This step and all others are based
+                * relative to this initial frequency at which the tsc
+                * calibration was made. Hence we must remember this base
+                * frequency as reference.
+                */
+               rtc_boot_frequency = old_frequency;
+       }
+       rtc_set_cyc_per_sec(rtc_cycle_count * new_frequency /
+                               rtc_boot_frequency);
+       rtc_nanotime_scale_update();
+       ml_set_interrupts_enabled(istate);
  }
  
  /*
- * rtc_sleep_wakeup:
- *
- * Invoked from power manageent when we have awoken from a sleep (S3)
- * and the TSC has been reset.  The nanotime data is updated based on
- * the HPET value.
- *
- * The caller must guarantee non-reentrancy.
+ * rtc_sleep_wakeup() is called from acpi on awakening from a S3 sleep
   */
  void
  rtc_sleep_wakeup(void)
  {
-       boolean_t               istate;
+       rtc_nanotime_t  *rntp = &current_cpu_datap()->cpu_rtc_nanotime;
+
+       boolean_t       istate;
  
         istate = ml_set_interrupts_enabled(FALSE);
  
@@ -352,8 +861,12 @@ rtc_sleep_wakeup(void)
          * Reset nanotime.
          * The timestamp counter will have been reset
          * but nanotime (uptime) marches onward.
+        * We assume that we're still at the former cpu frequency.
          */
-       rtc_nanotime_init(tmrCvt(rdHPET(), hpetCvtt2n));
+       rntp->rnt_tsc = rdtsc64();
+       rntp->rnt_step_tsc = 0ULL;
+       rntp->rnt_step_nanos = rntp->rnt_nanos;
+       rtc_nanotime_set_commpage(rntp);
  
         /* Restart tick interrupts from the LAPIC timer */
         rtc_lapic_start_ticking();
@@ -366,68 +879,100 @@ rtc_sleep_wakeup(void)
   * In addition, various variables used to support the clock are initialized.
   */
  int
-rtclock_init(void)
+sysclk_init(void)
  {
         uint64_t        cycles;
  
-       assert(!ml_get_interrupts_enabled());
-
+       mp_disable_preemption();
         if (cpu_number() == master_cpu) {
-
-               assert(tscFreq);
-               rtc_set_timescale(tscFreq);
-
                 /*
-                * Adjust and set the exported cpu speed.
+                * Perform calibration.
+                * The PIT is used as the reference to compute how many
+                * TSC counts (cpu clock cycles) occur per second.
                  */
-               cycles = rtc_export_speed(tscFreq);
+               rtc_cycle_count = timeRDTSC();
+               cycles = rtc_set_cyc_per_sec(rtc_cycle_count);
  
                 /*
                  * Set min/max to actual.
                  * ACPI may update these later if speed-stepping is detected.
                  */
-               gPEClockFrequencyInfo.cpu_frequency_min_hz = cycles;
-               gPEClockFrequencyInfo.cpu_frequency_max_hz = cycles;
+               gPEClockFrequencyInfo.cpu_frequency_min_hz = cycles;
+               gPEClockFrequencyInfo.cpu_frequency_max_hz = cycles;
+               printf("[RTCLOCK] frequency %llu (%llu)\n",
+                      cycles, rtc_cyc_per_sec);
  
-               /*
-                * Compute the longest interval we can represent.
-                */
-               maxDec = tmrCvt(0x7fffffffULL, busFCvtt2n);
-               kprintf("maxDec: %lld\n", maxDec);
+               rtc_lapic_timer_calibrate();
  
                 /* Minimum interval is 1usec */
-               rtc_decrementer_min = deadline_to_decrementer(NSEC_PER_USEC, 0ULL);
+               rtc_decrementer_min = deadline_to_decrementer(NSEC_PER_USEC,
+                                                               0ULL);
                 /* Point LAPIC interrupts to hardclock() */
                 lapic_set_timer_func((i386_intr_func_t) rtclock_intr);
  
                 clock_timebase_init();
-               ml_init_lock_timeout();
+               rtc_initialized = TRUE;
         }
  
+       rtc_nanotime_init();
+
         rtc_lapic_start_ticking();
  
+       mp_enable_preemption();
+
         return (1);
  }
  
-// utility routine 
-// Code to calculate how many processor cycles are in a second...
+/*
+ * Get the clock device time. This routine is responsible
+ * for converting the device's machine dependent time value
+ * into a canonical mach_timespec_t value.
+ */
+static kern_return_t
+sysclk_gettime_internal(
+       mach_timespec_t *cur_time)      /* OUT */
+{
+       *cur_time = tsc_to_timespec();
+       return (KERN_SUCCESS);
+}
  
-static void
-rtc_set_timescale(uint64_t cycles)
+kern_return_t
+sysclk_gettime(
+       mach_timespec_t *cur_time)      /* OUT */
  {
-       rtc_nanotime_info.scale = ((uint64_t)NSEC_PER_SEC << 32) / cycles;
-       rtc_nanotime_info.shift = 32;
+       return sysclk_gettime_internal(cur_time);
+}
  
-       rtc_nanotime_init(0);
+void
+sysclk_gettime_interrupts_disabled(
+       mach_timespec_t *cur_time)      /* OUT */
+{
+       (void) sysclk_gettime_internal(cur_time);
  }
  
+// utility routine 
+// Code to calculate how many processor cycles are in a second...
+
  static uint64_t
-rtc_export_speed(uint64_t cyc_per_sec)
+rtc_set_cyc_per_sec(uint64_t cycles)
  {
-       uint64_t        cycles;
  
-       /* Round: */
-        cycles = ((cyc_per_sec + (UI_CPUFREQ_ROUNDING_FACTOR/2))
+        if (cycles > (NSEC_PER_SEC/20)) {
+            // we can use just a "fast" multiply to get nanos
+           rtc_quant_shift = 32;
+            rtc_quant_scale = create_mul_quant_GHZ(rtc_quant_shift, cycles);
+            rtclock.timebase_const.numer = rtc_quant_scale; // timeRDTSC is 1/20
+           rtclock.timebase_const.denom = RTC_FAST_DENOM;
+        } else {
+           rtc_quant_shift = 26;
+            rtc_quant_scale = create_mul_quant_GHZ(rtc_quant_shift, cycles);
+            rtclock.timebase_const.numer = NSEC_PER_SEC/20; // timeRDTSC is 1/20
+            rtclock.timebase_const.denom = cycles;
+        }
+       rtc_cyc_per_sec = cycles*20;    // multiply it by 20 and we are done..
+                                       // BUT we also want to calculate...
+
+        cycles = ((rtc_cyc_per_sec + (UI_CPUFREQ_ROUNDING_FACTOR/2))
                         / UI_CPUFREQ_ROUNDING_FACTOR)
                                 * UI_CPUFREQ_ROUNDING_FACTOR;
  
@@ -441,7 +986,7 @@ rtc_export_speed(uint64_t cyc_per_sec)
          }
          gPEClockFrequencyInfo.cpu_frequency_hz = cycles;
  
-       kprintf("[RTCLOCK] frequency %llu (%llu)\n", cycles, cyc_per_sec);
+       kprintf("[RTCLOCK] frequency %llu (%llu)\n", cycles, rtc_cyc_per_sec);
         return(cycles);
  }
  
@@ -450,17 +995,12 @@ clock_get_system_microtime(
         uint32_t                        *secs,
         uint32_t                        *microsecs)
  {
-       uint64_t        now = rtc_nanotime_read();
-       uint32_t        remain;
+       mach_timespec_t         now;
  
-       asm volatile(
-                       "divl %3"
-                               : "=a" (*secs), "=d" (remain)
-                               : "A" (now), "r" (NSEC_PER_SEC));
-       asm volatile(
-                       "divl %3"
-                               : "=a" (*microsecs)
-                               : "0" (remain), "d" (0), "r" (NSEC_PER_USEC));
+       (void) sysclk_gettime_internal(&now);
+
+       *secs = now.tv_sec;
+       *microsecs = now.tv_nsec / NSEC_PER_USEC;
  }
  
  void
@@ -468,39 +1008,291 @@ clock_get_system_nanotime(
         uint32_t                        *secs,
         uint32_t                        *nanosecs)
  {
-       uint64_t        now = rtc_nanotime_read();
+       mach_timespec_t         now;
  
-       asm volatile(
-                       "divl %3"
-                               : "=a" (*secs), "=d" (*nanosecs)
-                               : "A" (now), "r" (NSEC_PER_SEC));
+       (void) sysclk_gettime_internal(&now);
+
+       *secs = now.tv_sec;
+       *nanosecs = now.tv_nsec;
+}
+
+/*
+ * Get clock device attributes.
+ */
+kern_return_t
+sysclk_getattr(
+       clock_flavor_t          flavor,
+       clock_attr_t            attr,           /* OUT */
+       mach_msg_type_number_t  *count)         /* IN/OUT */
+{
+       if (*count != 1)
+               return (KERN_FAILURE);
+       switch (flavor) {
+
+       case CLOCK_GET_TIME_RES:        /* >0 res */
+               *(clock_res_t *) attr = rtc_intr_nsec;
+               break;
+
+       case CLOCK_ALARM_CURRES:        /* =0 no alarm */
+       case CLOCK_ALARM_MAXRES:
+       case CLOCK_ALARM_MINRES:
+               *(clock_res_t *) attr = 0;
+               break;
+
+       default:
+               return (KERN_INVALID_VALUE);
+       }
+       return (KERN_SUCCESS);
  }
  
+/*
+ * Set next alarm time for the clock device. This call
+ * always resets the time to deliver an alarm for the
+ * clock.
+ */
  void
-clock_gettimeofday_set_commpage(
-       uint64_t                                abstime,
-       uint64_t                                epoch,
-       uint64_t                                offset,
-       uint32_t                                *secs,
-       uint32_t                                *microsecs)
-{
-       uint64_t        now = abstime;
-       uint32_t        remain;
+sysclk_setalarm(
+       mach_timespec_t *alarm_time)
+{
+       timer_call_enter(&rtclock_alarm_timer,
+                        (uint64_t) alarm_time->tv_sec * NSEC_PER_SEC
+                               + alarm_time->tv_nsec);
+}
+
+/*
+ * Configure the calendar clock.
+ */
+int
+calend_config(void)
+{
+       return bbc_config();
+}
  
-       now += offset;
+/*
+ * Initialize calendar clock.
+ */
+int
+calend_init(void)
+{
+       return (1);
+}
  
-       asm volatile(
-                       "divl %3"
-                               : "=a" (*secs), "=d" (remain)
-                               : "A" (now), "r" (NSEC_PER_SEC));
-       asm volatile(
-                       "divl %3"
-                               : "=a" (*microsecs)
-                               : "0" (remain), "d" (0), "r" (NSEC_PER_USEC));
+/*
+ * Get the current clock time.
+ */
+kern_return_t
+calend_gettime(
+       mach_timespec_t *cur_time)      /* OUT */
+{
+       spl_t           s;
+
+       RTC_LOCK(s);
+       if (!rtclock.calend_is_set) {
+               RTC_UNLOCK(s);
+               return (KERN_FAILURE);
+       }
+
+       (void) sysclk_gettime_internal(cur_time);
+       ADD_MACH_TIMESPEC(cur_time, &rtclock.calend_offset);
+       RTC_UNLOCK(s);
+
+       return (KERN_SUCCESS);
+}
+
+void
+clock_get_calendar_microtime(
+       uint32_t                        *secs,
+       uint32_t                        *microsecs)
+{
+       mach_timespec_t         now;
+
+       calend_gettime(&now);
+
+       *secs = now.tv_sec;
+       *microsecs = now.tv_nsec / NSEC_PER_USEC;
+}
+
+void
+clock_get_calendar_nanotime(
+       uint32_t                        *secs,
+       uint32_t                        *nanosecs)
+{
+       mach_timespec_t         now;
+
+       calend_gettime(&now);
+
+       *secs = now.tv_sec;
+       *nanosecs = now.tv_nsec;
+}
+
+void
+clock_set_calendar_microtime(
+       uint32_t                        secs,
+       uint32_t                        microsecs)
+{
+       mach_timespec_t         new_time, curr_time;
+       uint32_t                        old_offset;
+       spl_t           s;
+
+       new_time.tv_sec = secs;
+       new_time.tv_nsec = microsecs * NSEC_PER_USEC;
+
+       RTC_LOCK(s);
+       old_offset = rtclock.calend_offset.tv_sec;
+       (void) sysclk_gettime_internal(&curr_time);
+       rtclock.calend_offset = new_time;
+       SUB_MACH_TIMESPEC(&rtclock.calend_offset, &curr_time);
+       rtclock.boottime += rtclock.calend_offset.tv_sec - old_offset;
+       rtclock.calend_is_set = TRUE;
+       RTC_UNLOCK(s);
+
+       (void) bbc_settime(&new_time);
  
-       *secs += epoch;
+       host_notify_calendar_change();
+}
+
+/*
+ * Get clock device attributes.
+ */
+kern_return_t
+calend_getattr(
+       clock_flavor_t          flavor,
+       clock_attr_t            attr,           /* OUT */
+       mach_msg_type_number_t  *count)         /* IN/OUT */
+{
+       if (*count != 1)
+               return (KERN_FAILURE);
+       switch (flavor) {
+
+       case CLOCK_GET_TIME_RES:        /* >0 res */
+               *(clock_res_t *) attr = rtc_intr_nsec;
+               break;
+
+       case CLOCK_ALARM_CURRES:        /* =0 no alarm */
+       case CLOCK_ALARM_MINRES:
+       case CLOCK_ALARM_MAXRES:
+               *(clock_res_t *) attr = 0;
+               break;
+
+       default:
+               return (KERN_INVALID_VALUE);
+       }
+       return (KERN_SUCCESS);
+}
+
+#define tickadj                (40*NSEC_PER_USEC)      /* "standard" skew, ns / tick */
+#define        bigadj          (NSEC_PER_SEC)          /* use 10x skew above bigadj ns */
+
+uint32_t
+clock_set_calendar_adjtime(
+       int32_t                         *secs,          /* IN/OUT */
+       int32_t                         *microsecs)     /* IN/OUT */
+{
+       int64_t                 total, ototal;
+       uint32_t                interval = 0;
+       spl_t                   s;
+       /* Start skewing the calendar by the given amount (converted here to ns). */
+       total = (int64_t)*secs * NSEC_PER_SEC + *microsecs * NSEC_PER_USEC;
+
+       RTC_LOCK(s);
+       ototal = rtclock.calend_adjtotal;
+
+       if (total != 0) {
+               int32_t         delta = tickadj;
+               /* Per-step skew: 10x beyond bigadj, never past the total. */
+               if (total > 0) {
+                       if (total > bigadj)
+                               delta *= 10;
+                       if (delta > total)
+                               delta = total;
+               }
+               else {
+                       if (total < -bigadj)
+                               delta *= 10;
+                       delta = -delta;
+                       if (delta < total)
+                               delta = total;
+               }
+
+               rtclock.calend_adjtotal = total;
+               rtclock.calend_adjdelta = delta;
+
+               interval = NSEC_PER_HZ;
+       }
+       else
+               rtclock.calend_adjdelta = rtclock.calend_adjtotal = 0;
+
+       RTC_UNLOCK(s);
+       /* Hand back the superseded adjustment; ototal is in ns, so scale to us. */
+       if (ototal == 0)
+               *secs = *microsecs = 0;
+       else {
+               *secs = ototal / NSEC_PER_SEC;
+               *microsecs = (ototal % NSEC_PER_SEC) / NSEC_PER_USEC;
+       }
+
+       return (interval);      /* NSEC_PER_HZ if an adjustment was started, else 0 */
+}
+
+uint32_t
+clock_adjust_calendar(void)
+{
+       uint32_t                interval = 0;
+       int32_t                 delta;
+       spl_t                   s;
+
+       RTC_LOCK(s);
+       delta = rtclock.calend_adjdelta;
+       ADD_MACH_TIMESPEC_NSEC(&rtclock.calend_offset, delta);
+
+       rtclock.calend_adjtotal -= delta;
+
+       if (delta > 0) {
+               if (delta > rtclock.calend_adjtotal)
+                       rtclock.calend_adjdelta = rtclock.calend_adjtotal;
+       }
+       else
+       if (delta < 0) {
+               if (delta < rtclock.calend_adjtotal)
+                       rtclock.calend_adjdelta = rtclock.calend_adjtotal;
+       }
  
-       commpage_set_timestamp(abstime - remain, *secs, NSEC_PER_SEC);
+       if (rtclock.calend_adjdelta != 0)
+               interval = NSEC_PER_HZ;
+
+       RTC_UNLOCK(s);
+
+       return (interval);
+}
+
+void
+clock_initialize_calendar(void)
+{
+       mach_timespec_t bbc_time, curr_time;
+       spl_t           s;
+
+       if (bbc_gettime(&bbc_time) != KERN_SUCCESS)
+               return;
+
+       RTC_LOCK(s);
+       if (rtclock.boottime == 0)
+               rtclock.boottime = bbc_time.tv_sec;
+       (void) sysclk_gettime_internal(&curr_time);
+       rtclock.calend_offset = bbc_time;
+       SUB_MACH_TIMESPEC(&rtclock.calend_offset, &curr_time);
+       rtclock.calend_is_set = TRUE;
+       RTC_UNLOCK(s);
+
+       host_notify_calendar_change();
+}
+
+void
+clock_get_boottime_nanotime(
+       uint32_t                        *secs,
+       uint32_t                        *nanosecs)
+{
+       *secs = rtclock.boottime;
+       *nanosecs = 0;
  }
  
  void
@@ -510,6 +1302,37 @@ clock_timebase_info(
         info->numer = info->denom =  1;
  }      
  
+void
+clock_set_timer_deadline(
+       uint64_t                        deadline)
+{
+       spl_t           s;
+       cpu_data_t      *pp = current_cpu_datap();
+       rtclock_timer_t *mytimer = &pp->cpu_rtc_timer;
+       uint64_t        abstime;
+       uint64_t        decr;
+
+       assert(get_preemption_level() > 0);
+       assert(rtclock_timer_expire);
+
+       RTC_INTRS_OFF(s);
+       mytimer->deadline = deadline;
+       mytimer->is_set = TRUE;
+       if (!mytimer->has_expired) {
+               abstime = mach_absolute_time();
+               if (mytimer->deadline < pp->cpu_rtc_tick_deadline) {
+                       decr = deadline_to_decrementer(mytimer->deadline,
+                                                      abstime);
+                       rtc_lapic_set_timer(decr);
+                       pp->cpu_rtc_intr_deadline = mytimer->deadline;
+                       KERNEL_DEBUG_CONSTANT(
+                               MACHDBG_CODE(DBG_MACH_EXCP_DECI, 1) |
+                                       DBG_FUNC_NONE, decr, 2, 0, 0, 0);
+               }
+       }
+       RTC_INTRS_ON(s);
+}
+
  void
  clock_set_timer_func(
         clock_timer_func_t              func)
@@ -522,83 +1345,87 @@ clock_set_timer_func(
   * Real-time clock device interrupt.
   */
  void
-rtclock_intr(
-       x86_saved_state_t       *tregs)
+rtclock_intr(struct i386_interrupt_state *regs)
  {
-        uint64_t       rip;
-       boolean_t       user_mode = FALSE;
         uint64_t        abstime;
         uint32_t        latency;
+       uint64_t        decr;
+       uint64_t        decr_tick;
+       uint64_t        decr_timer;
         cpu_data_t      *pp = current_cpu_datap();
+       rtclock_timer_t *mytimer = &pp->cpu_rtc_timer;
  
         assert(get_preemption_level() > 0);
         assert(!ml_get_interrupts_enabled());
  
-       abstime = rtc_nanotime_read();
-       latency = (uint32_t) abstime - pp->rtcPop;
-
-       if (is_saved_state64(tregs) == TRUE) {
-               x86_saved_state64_t     *regs;
-                 
-               regs = saved_state64(tregs);
-
-               user_mode = TRUE;
-               rip = regs->isf.rip;
-       } else {
-               x86_saved_state32_t     *regs;
-
-               regs = saved_state32(tregs);
+        abstime = _rtc_nanotime_read();
+       latency = (uint32_t) abstime - pp->cpu_rtc_intr_deadline;
+       if (pp->cpu_rtc_tick_deadline <= abstime) {
+               rtc_nanotime_update();
+               clock_deadline_for_periodic_event(
+                       NSEC_PER_HZ, abstime, &pp->cpu_rtc_tick_deadline);
+               hertz_tick(
+#if STAT_TIME
+                          NSEC_PER_HZ,
+#endif
+                          (regs->efl & EFL_VM) || ((regs->cs & 0x03) != 0),
+                          regs->eip);
+       }
  
-               if (regs->cs & 0x03)
-                       user_mode = TRUE;
-               rip = regs->eip;
+       abstime = _rtc_nanotime_read();
+       if (mytimer->is_set && mytimer->deadline <= abstime) {
+               mytimer->has_expired = TRUE;
+               mytimer->is_set = FALSE;
+               (*rtclock_timer_expire)(abstime);
+               assert(!ml_get_interrupts_enabled());
+               mytimer->has_expired = FALSE;
         }
  
         /* Log the interrupt service latency (-ve value expected by tool) */
         KERNEL_DEBUG_CONSTANT(
                 MACHDBG_CODE(DBG_MACH_EXCP_DECI, 0) | DBG_FUNC_NONE,
-               -latency, (uint32_t)rip, user_mode, 0, 0);
-
-       /* call the generic etimer */
-       etimer_intr(user_mode, rip);
-}
+               -latency, (uint32_t)regs->eip, 0, 0, 0);
  
-/*
- *     Request timer pop from the hardware 
- */
+       abstime = _rtc_nanotime_read();
+       decr_tick = deadline_to_decrementer(pp->cpu_rtc_tick_deadline, abstime);
+       decr_timer = (mytimer->is_set) ?
+                       deadline_to_decrementer(mytimer->deadline, abstime) :
+                       DECREMENTER_MAX;
+       decr = MIN(decr_tick, decr_timer);
+       pp->cpu_rtc_intr_deadline = abstime + decr;
  
-int
-setPop(
-       uint64_t time)
-{
-       uint64_t now;
-       uint32_t decr;
-       uint64_t count;
-       
-       now = rtc_nanotime_read();              /* The time in nanoseconds */
-       decr = deadline_to_decrementer(time, now);
+       rtc_lapic_set_timer(decr);
  
-       count = tmrCvt(decr, busFCvtn2t);
-       lapic_set_timer(TRUE, one_shot, divide_by_1, (uint32_t) count);
+       /* Log the new decrementer value */
+       KERNEL_DEBUG_CONSTANT(
+               MACHDBG_CODE(DBG_MACH_EXCP_DECI, 1) | DBG_FUNC_NONE,
+               decr, 3, 0, 0, 0);
  
-       return decr;                            /* Pass back what we set */
  }
  
+static void
+rtclock_alarm_expire(
+       __unused timer_call_param_t     p0,
+       __unused timer_call_param_t     p1)
+{
+       mach_timespec_t clock_time;
  
+       (void) sysclk_gettime_internal(&clock_time);
  
-uint64_t
-mach_absolute_time(void)
-{
-       return rtc_nanotime_read();
+       clock_alarm_intr(SYSTEM_CLOCK, &clock_time);
  }
  
  void
-clock_interval_to_absolutetime_interval(
-       uint32_t                interval,
-       uint32_t                scale_factor,
+clock_get_uptime(
         uint64_t                *result)
  {
-       *result = (uint64_t)interval * scale_factor;
+        *result = rtc_nanotime_read();
+}
+
+uint64_t
+mach_absolute_time(void)
+{
+        return rtc_nanotime_read();
  }
  
  void
@@ -620,24 +1447,37 @@ absolutetime_to_microtime(
  }
  
  void
-absolutetime_to_nanotime(
-       uint64_t                        abstime,
-       uint32_t                        *secs,
-       uint32_t                        *nanosecs)
+clock_interval_to_deadline(
+       uint32_t                interval,
+       uint32_t                scale_factor,
+       uint64_t                *result)
  {
-       asm volatile(
-                       "divl %3"
-                       : "=a" (*secs), "=d" (*nanosecs)
-                       : "A" (abstime), "r" (NSEC_PER_SEC));
+       uint64_t                abstime;
+
+       clock_get_uptime(result);
+
+       clock_interval_to_absolutetime_interval(interval, scale_factor, &abstime);
+
+       *result += abstime;
  }
  
  void
-nanotime_to_absolutetime(
-       uint32_t                        secs,
-       uint32_t                        nanosecs,
-       uint64_t                        *result)
+clock_interval_to_absolutetime_interval(
+       uint32_t                interval,
+       uint32_t                scale_factor,
+       uint64_t                *result)
+{
+       *result = (uint64_t)interval * scale_factor;
+}
+
+void
+clock_absolutetime_interval_to_deadline(
+       uint64_t                abstime,
+       uint64_t                *result)
  {
-       *result = ((uint64_t)secs * NSEC_PER_SEC) + nanosecs;
+       clock_get_uptime(result);
+
+       *result += abstime;
  }
  
  void