/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <i386/asm.h>
#include <i386/rtclock_asm.h>
#include <i386/proc_reg.h>
#include <i386/eflags.h>

#include <i386/postcode.h>
#include <i386/apic.h>
#include <i386/vmx/vmx_asm.h>
#include <assym.s>

/*
** ml_get_timebase()
**
** Returns the 64-bit TSC value in RAX.
**
*/
ENTRY(ml_get_timebase)

	lfence				/* serialize: earlier loads complete before rdtsc */
	rdtsc				/* TSC -> %edx:%eax */
	lfence				/* rdtsc completes before later loads */
	shlq	$32,%rdx
	orq	%rdx,%rax		/* combine into 64-bit result in %rax */

	ret

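/*
 * For reference, a hedged C-level sketch of the sequence above
 * (illustrative only; it uses the <immintrin.h> compiler intrinsics,
 * which are not what the kernel itself calls):
 *
 *	_mm_lfence();			// earlier loads complete first
 *	uint64_t tsc = __rdtsc();	// EDX:EAX already combined
 *	_mm_lfence();			// rdtsc completes before later loads
 *	return tsc;
 */
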
/*
 * Convert between various timer units.
 *
 * This code converts 64-bit time units to other units.
 * For example, the TSC is converted to HPET units.
 *
 * Time is a 64-bit integer that is some number of ticks.
 * The conversion factor is a 64-bit fixed-point number composed
 * of a 32-bit integer part and a 32-bit fraction.
 *
 * The time ticks are multiplied by the conversion factor. The
 * calculation is done as a 128-bit value, from which the high and
 * low 32 bits are both dropped: the high word is overflow and the
 * low word is the fractional part of the result.
 *
 * We return a 64-bit value.
 *
 * Note that we can use this function to multiply 2 conversion factors.
 * We do this in order to calculate the multiplier used to convert
 * directly between any two units.
 *
 * uint64_t tmrCvt(uint64_t time,	// %rdi
 *	uint64_t conversion)		// %rsi
 *
 */
ENTRY(tmrCvt)
	cmpq	$1,%rsi			/* check for unity fastpath */
	je	1f
	movq	%rdi,%rax
	mulq	%rsi			/* result is %rdx:%rax */
	shrdq	$32,%rdx,%rax		/* %rdx:%rax >>= 32 */
	ret
1:
	mov	%rdi,%rax		/* unity conversion: return time unchanged */
	ret

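/*
 * A hedged C sketch of the computation above (illustrative, not kernel
 * code; assumes a compiler with __uint128_t support):
 *
 *	uint64_t tmrCvt(uint64_t time, uint64_t conversion)
 *	{
 *		if (conversion == 1)		// unity fastpath
 *			return time;
 *		__uint128_t p = (__uint128_t)time * conversion;
 *		return (uint64_t)(p >> 32);	// keep the middle 64 bits
 *	}
 */
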
/*
 * void _rtc_nanotime_adjust(
 *	uint64_t	tsc_base_delta,	// %rdi
 *	rtc_nanotime_t	*dst);		// %rsi
 */
ENTRY(_rtc_nanotime_adjust)
	movl	RNT_GENERATION(%rsi),%eax	/* get current generation */
	movl	$0,RNT_GENERATION(%rsi)		/* flag data as being updated */
	addq	%rdi,RNT_TSC_BASE(%rsi)

	incl	%eax				/* next generation */
	jnz	1f
	incl	%eax				/* skip 0, which is a flag */
1:	movl	%eax,RNT_GENERATION(%rsi)	/* update generation */

	ret

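/*
 * A hedged C sketch of the writer-side protocol above (illustrative;
 * the field names mirror the RNT_* assembly offsets and are assumed):
 *
 *	void _rtc_nanotime_adjust(uint64_t tsc_base_delta, rtc_nanotime_t *dst)
 *	{
 *		uint32_t gen = dst->generation;
 *		dst->generation = 0;		// readers spin while zero
 *		dst->tsc_base += tsc_base_delta;
 *		if (++gen == 0)			// skip 0, which is the
 *			gen = 1;		// "update in progress" flag
 *		dst->generation = gen;
 *	}
 */
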
/*
 * uint64_t _rtc_nanotime_read(rtc_nanotime_t *rntp);
 *
 * This is the same as the commpage nanotime routine, except that it uses the
 * kernel internal "rtc_nanotime_info" data instead of the commpage data.
 * These two copies of data are kept in sync by rtc_clock_napped().
 *
 * Warning! There are several copies of this code in the trampolines found in
 * osfmk/x86_64/idt64.s, coming from the various TIMER macros in rtclock_asm.h.
 * They're all kept in sync by using the RTC_NANOTIME_READ() macro.
 *
 * The algorithm we use is:
 *
 *	ns = ((((rdtsc - rnt_tsc_base)<<rnt_shift)*rnt_tsc_scale) / 2**32) + rnt_ns_base;
 *
 * rnt_shift, a constant computed during initialization, is the smallest value for which:
 *
 *	(tscFreq << rnt_shift) > SLOW_TSC_THRESHOLD
 *
 * where SLOW_TSC_THRESHOLD is about 10**9. Since most processors' tscFreqs are greater
 * than 1GHz, rnt_shift is usually 0. rnt_tsc_scale is also a 32-bit constant:
 *
 *	rnt_tsc_scale = (10**9 * 2**32) / (tscFreq << rnt_shift);
 *
 * On 64-bit processors this algorithm could be simplified by doing a 64x64 bit
 * multiply of rdtsc by tscFCvtt2n:
 *
 *	ns = (((rdtsc - rnt_tsc_base) * tscFCvtt2n) / 2**32) + rnt_ns_base;
 *
 * We don't do so in order to use the same algorithm in 32- and 64-bit mode.
 * When U32 goes away, we should reconsider.
 *
 * Since this routine is not synchronized and can be called in any context,
 * we use a generation count to guard against seeing partially updated data.
 * In addition, the _rtc_nanotime_store() routine zeroes the generation before
 * updating the data, and stores the nonzero generation only after all fields
 * have been stored. Because IA32 guarantees that stores by one processor
 * must be seen in order by another, we can avoid using a lock. We spin while
 * the generation is zero.
 *
 * uint64_t _rtc_nanotime_read(
 *	rtc_nanotime_t *rntp);		// %rdi
 *
 */
ENTRY(_rtc_nanotime_read)

	PAL_RTC_NANOTIME_READ_FAST()

	ret

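/*
 * A hedged C sketch of the reader side (illustrative only; the real fast
 * path is the PAL_RTC_NANOTIME_READ_FAST() macro, and the field names
 * mirroring the RNT_* offsets are assumed):
 *
 *	uint64_t _rtc_nanotime_read(rtc_nanotime_t *rntp)
 *	{
 *		uint32_t gen;
 *		uint64_t ns;
 *		do {
 *			while ((gen = rntp->generation) == 0)
 *				;	// spin: an update is in progress
 *			uint64_t t = (rdtsc64() - rntp->tsc_base) << rntp->shift;
 *			ns = (uint64_t)(((__uint128_t)t * rntp->scale) >> 32);
 *			ns += rntp->ns_base;
 *		} while (rntp->generation != gen);	// retry if it changed
 *		return ns;
 *	}
 */
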
/*
 * extern uint64_t _rtc_tsc_to_nanoseconds(
 *	uint64_t value,			// %rdi
 *	pal_rtc_nanotime_t *rntp);	// %rsi
 *
 * Converts TSC units to nanoseconds, using an abbreviated form of the above
 * algorithm. Note that while we could have simply used tmrCvt(value,tscFCvtt2n),
 * which would avoid the need for this asm, doing so is a bit more risky since
 * we'd be using a different algorithm with possibly different rounding etc.
 */

ENTRY(_rtc_tsc_to_nanoseconds)
	movq	%rdi,%rax	/* copy value (in TSC units) to convert */
	movl	RNT_SHIFT(%rsi),%ecx
	movl	RNT_SCALE(%rsi),%edx
	shlq	%cl,%rax	/* tscUnits << shift */
	mulq	%rdx		/* (tscUnits << shift) * scale */
	shrdq	$32,%rdx,%rax	/* %rdx:%rax >>= 32 */
	ret


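/*
 * Hedged C equivalent of the routine above (illustrative; needs a 128-bit
 * intermediate, and the field names are assumed from the RNT_* offsets):
 *
 *	uint64_t _rtc_tsc_to_nanoseconds(uint64_t value, pal_rtc_nanotime_t *rntp)
 *	{
 *		__uint128_t p = (__uint128_t)(value << rntp->shift) * rntp->scale;
 *		return (uint64_t)(p >> 32);
 *	}
 */
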
/*
 * void call_continuation(
 *	thread_continue_t continuation,	// %rdi
 *	void *parameter,		// %rsi
 *	wait_result_t wresult);		// %rdx
 *
 * Resets to the top of the kernel stack, calls the continuation, and
 * terminates the thread if the continuation ever returns.
 */
Entry(call_continuation)
	movq	%rdi,%rcx			/* get continuation */
	movq	%rsi,%rdi			/* continuation param */
	movq	%rdx,%rsi			/* wait result */
	movq	%gs:CPU_KERNEL_STACK,%rsp	/* set the stack */
	xorq	%rbp,%rbp			/* zero frame pointer */
	call	*%rcx				/* call continuation */
	movq	%gs:CPU_ACTIVE_THREAD,%rdi
	call	EXT(thread_terminate)

/*
 * void x86_init_wrapper(uintptr_t func, uintptr_t stack);	// %rdi, %rsi
 *
 * Switches to the supplied stack with a zeroed frame pointer and
 * calls func; not expected to return.
 */
Entry(x86_init_wrapper)
	xor	%rbp, %rbp	/* zero frame pointer */
	movq	%rsi, %rsp	/* switch to the new stack */
	callq	*%rdi		/* call the init function */

#if CONFIG_VMX

/*
 * __vmxon -- Enter VMX Operation
 * int __vmxon(addr64_t v);
 */
Entry(__vmxon)
	FRAME
	push	%rdi			/* vmxon takes its m64 operand from the stack */

	mov	$(VMX_FAIL_INVALID), %ecx
	mov	$(VMX_FAIL_VALID), %edx
	mov	$(VMX_SUCCEED), %eax
	vmxon	(%rsp)
	cmovcl	%ecx, %eax		/* CF = 1, ZF = 0: VMfailInvalid */
	cmovzl	%edx, %eax		/* CF = 0, ZF = 1: VMfailValid */

	pop	%rdi
	EMARF
	ret

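/*
 * Hedged pseudo-C sketch of the VMX status convention used by __vmxon
 * above and __vmxoff below (per the Intel SDM; illustrative only):
 *
 *	// after the VMX instruction executes:
 *	if (CF)		return VMX_FAIL_INVALID;	// VMfailInvalid
 *	else if (ZF)	return VMX_FAIL_VALID;		// VMfailValid
 *	else		return VMX_SUCCEED;		// VMsucceed
 */
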
/*
 * __vmxoff -- Leave VMX Operation
 * int __vmxoff(void);
 */
Entry(__vmxoff)
	FRAME

	mov	$(VMX_FAIL_INVALID), %ecx
	mov	$(VMX_FAIL_VALID), %edx
	mov	$(VMX_SUCCEED), %eax
	vmxoff
	cmovcl	%ecx, %eax		/* CF = 1, ZF = 0: VMfailInvalid */
	cmovzl	%edx, %eax		/* CF = 0, ZF = 1: VMfailValid */

	EMARF
	ret

#endif /* CONFIG_VMX */

/*
 * mfence -- Memory Barrier
 * Use out-of-line assembly to get
 * standard x86-64 ABI guarantees
 * about what the caller's codegen
 * has in registers vs. memory
 */
Entry(do_mfence)
	mfence
	ret
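
/*
 * Hedged usage note: because do_mfence is an opaque external call, the
 * compiler must assume any memory may be read or written across it, so
 * the caller gets a compiler barrier in addition to the mfence itself:
 *
 *	extern void do_mfence(void);	// assumed C-side declaration
 *	...
 *	do_mfence();	// all prior loads/stores globally visible first
 */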