[apple/xnu.git] / osfmk / i386 / machine_routines_asm.s

/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
 
#include <i386/asm.h>
#include <i386/apic.h>
#include <i386/eflags.h>
#include <i386/rtclock_asm.h>
#include <i386/postcode.h>
#include <i386/proc_reg.h>
#include <assym.s>

/*
 *	ml_get_timebase()
 *
 *	Read the processor time-stamp counter and store it through the
 *	pointer passed as the first argument (S_ARG0).
 *
 *	Entry	- S_ARG0 is a pointer to a 64 bit structure.
 *
 *	Exit	- 64 bit structure filled in: the %edx half of the rdtsc
 *		  result is written at offset 0 and the %eax half at
 *		  offset 4.  %eax, %ecx and %edx are overwritten.
 *
 *	NOTE(review): rdtsc returns the high 32 bits in %edx, so the
 *	high word lands at the lower address -- confirm that callers
 *	expect this layout rather than a native little-endian uint64_t.
 */
ENTRY(ml_get_timebase)

			movl	S_ARG0, %ecx		/* %ecx = destination pointer */

			lfence				/* counter read is fenced on both sides */
			rdtsc				/* time-stamp counter -> %edx:%eax */
			lfence

			movl	%edx, (%ecx)		/* destination + 0 */
			movl	%eax, 4(%ecx)		/* destination + 4 */

			ret

/*
 *	Convert between various timer units
 *
 *		uint64_t tmrCvt(uint64_t time, uint64_t conversion)
 *
 *		Both arguments are passed by value on the stack.
 *
 *		This code converts 64-bit time units to other units.
 *		For example, the TSC is converted to HPET units.
 *
 *		Time is a 64-bit integer that is some number of ticks.
 *		Conversion is a 64-bit fixed point number which is composed
 *		of a 32 bit integer and a 32 bit fraction.
 *
 *		The time ticks are multiplied by the conversion factor.  The
 *		product is 128 bits wide, but only bits 32..95 are returned:
 *		the top word is overflow and the bottom word is the fraction
 *		part of the result, so both are dropped.
 *
 *		The 64-bit result is returned in %edx:%eax.
 *
 *		Note that we can use this function to multiply 2 conversion factors.
 *		We do this in order to calculate the multiplier used to convert
 *		directly between any two units.
 *
 *		With ts = th:tl and cvt = ch:cl the four partial products are
 *
 *			P0 = tl * cl	(only its high word is needed)
 *			P1 = th * cl
 *			P2 = tl * ch
 *			P3 = th * ch	(only its low word is needed)
 *
 *		result.lo = hi(P0) + lo(P1) + lo(P2)
 *		result.hi = hi(P1) + hi(P2) + lo(P3) + carries out of result.lo
 */

			.globl	EXT(tmrCvt)
			.align	FALIGN

LEXT(tmrCvt)

			pushl	%ebp				// Build a frame so the arguments
			movl	%esp,%ebp			//  sit at fixed offsets from %ebp
			pushl	%edi				// Callee-saved registers used below
			pushl	%esi
			pushl	%ebx

//			 8(%ebp)	- tl, low-order ts
//			12(%ebp)	- th, high-order ts
//			16(%ebp)	- cl, low-order cvt
//			20(%ebp)	- ch, high-order cvt

			movl	16(%ebp),%eax			// cl
			mull	8(%ebp)				// P0 = cl * tl
			movl	%edx,%edi			// %edi = hi(P0); lo(P0) is discarded

			movl	16(%ebp),%eax			// cl
			mull	12(%ebp)			// P1 = cl * th
			addl	%eax,%edi			// %edi = hi(P0) + lo(P1)
			adcl	$0,%edx				// fold that carry into hi(P1)
			movl	%edx,%esi			// %esi:%edi = upper 64 bits of ts * cl

			movl	20(%ebp),%eax			// ch
			mull	8(%ebp)				// P2 = ch * tl
			movl	%eax,%ebx			// %ebx = lo(P2)
			movl	%edx,%ecx			// %ecx = hi(P2)

			movl	20(%ebp),%eax			// ch
			mull	12(%ebp)			// P3 = ch * th; %edx (overflow) is ignored

			addl	%edi,%ebx			// result.lo
			adcl	%ecx,%esi			// result.hi  = hi(P1) + hi(P2) + carry
			addl	%eax,%esi			// result.hi += lo(P3)

			movl	%ebx,%eax			// Return value, low word
			movl	%esi,%edx			//  and high word

			popl	%ebx				// Undo the saves in reverse order
			popl	%esi
			popl	%edi
			popl	%ebp

			ret


/*
 * void  _rtc_nanotime_adjust(
 *		uint64_t         tsc_base_delta,
 *		rtc_nanotime_t  *dst);
 *
 * Add the full 64-bit tsc_base_delta to dst's 64-bit TSC base.
 *
 * Stack on entry:
 *	 4(%esp)	low  32 bits of tsc_base_delta
 *	 8(%esp)	high 32 bits of tsc_base_delta
 *	12(%esp)	dst
 *
 * The generation word is zeroed while the base is being modified and a
 * new non-zero generation is stored afterwards, so that a reader which
 * samples the generation before and after reading the data can detect
 * the update.  Uses only %eax, %ecx and %edx.
 */
	.globl	EXT(_rtc_nanotime_adjust)
	.align	FALIGN

LEXT(_rtc_nanotime_adjust)
	movl	12(%esp),%edx			/* ptr to rtc_nanotime_info */

	movl	RNT_GENERATION(%edx),%ecx	/* get current generation */
	movl	$0,RNT_GENERATION(%edx)		/* flag data as being updated */

	movl	4(%esp),%eax			/* get lower 32-bits of delta */
	addl	%eax,RNT_TSC_BASE(%edx)
	movl	8(%esp),%eax			/* get upper 32-bits of delta (mov leaves CF intact) */
	adcl	%eax,RNT_TSC_BASE+4(%edx)	/* add upper half plus carry from the lower half */

	incl	%ecx				/* next generation */
	jnz	1f
	incl	%ecx				/* skip 0, which is a flag */
1:	movl	%ecx,RNT_GENERATION(%edx)	/* update generation and make usable */

	ret


/* uint64_t _rtc_nanotime_read( rtc_nanotime_t *rntp, int slow );
 *
 * This is the same as the commpage nanotime routine, except that it uses the
 * kernel internal "rtc_nanotime_info" data instead of the commpage data.  The two copies
 * of data (one in the kernel and one in user space) are kept in sync by rtc_clock_napped().
 *
 * Warning!  There is another copy of this code in osfmk/i386/locore.s.  The
 * two versions must be kept in sync with each other!
 *
 * There are actually two versions of the algorithm, one each for "slow" and "fast"
 * processors.  The more common "fast" algorithm is:
 *
 *	nanoseconds = (((rdtsc - rnt_tsc_base) * rnt_tsc_scale) / 2**32) + rnt_ns_base;
 *
 * Of course, the divide by 2**32 is a nop.  rnt_tsc_scale is a constant computed during initialization:
 *
 *	rnt_tsc_scale = (1e9 * 2**32) / tscFreq;
 *
 * The "slow" algorithm uses long division:
 *
 *	nanoseconds = (((rdtsc - rnt_tsc_base) * 1e9) / tscFreq) + rnt_ns_base;
 *
 * NOTE(review): the slow path below adds rnt_ns_base (addl/adcl).  The fast path
 * lives in PAL_RTC_NANOTIME_READ_FAST(), which is defined elsewhere; it is
 * presumed to add rnt_ns_base as well -- confirm against the macro.
 *
 * Since this routine is not synchronized and can be called in any context, 
 * we use a generation count to guard against seeing partially updated data.  In addition,
 * the updating routines -- _rtc_nanotime_store() and _rtc_nanotime_adjust() (just above) --
 * zero the generation before updating the data, and store the nonzero generation only
 * after all other data has been stored.  Because IA32 guarantees that stores by one
 * processor must be seen in order by another, we can avoid using a lock.  We spin while
 * the generation is zero.
 *
 * In accordance with the ABI, we return the 64-bit nanotime in %edx:%eax.
 */
 
		.globl	EXT(_rtc_nanotime_read)
		.align	FALIGN
LEXT(_rtc_nanotime_read)
		pushl		%ebp
		movl		%esp,%ebp
		pushl		%esi					/* callee-saved registers used below */
		pushl		%edi
		pushl		%ebx
		movl		8(%ebp),%edi				/* get ptr to rtc_nanotime_info */
		movl		12(%ebp),%eax				/* get "slow" flag */
		testl		%eax,%eax
		jnz		Lslow					/* non-zero flag selects the long-division path */
		
		/* Processor whose TSC frequency is faster than SLOW_TSC_THRESHOLD */
		/* NOTE(review): macro body is not in this file; presumably it expects the */
		/* info pointer in %edi and leaves the result in %edx:%eax -- confirm. */
		PAL_RTC_NANOTIME_READ_FAST()

		popl		%ebx
		popl		%edi
		popl		%esi
		popl		%ebp
		ret

		/* Processor whose TSC frequency is slower than or equal to SLOW_TSC_THRESHOLD */
Lslow:
		movl		RNT_GENERATION(%edi),%esi		/* get generation (0 if being changed) */
		testl		%esi,%esi				/* if being changed, loop until stable */
		jz		Lslow
		pushl		%esi					/* save generation (%esi is reused as scratch below) */
		pushl		RNT_SHIFT(%edi)				/* save low 32 bits of tscFreq */

		lfence
		rdtsc	  						/* get TSC in %edx:%eax */
		lfence
		subl		RNT_TSC_BASE(%edi),%eax			/* %edx:%eax = TSC - tsc_base (64-bit subtract) */
		sbbl		RNT_TSC_BASE+4(%edi),%edx

		/*
		* Do the math to convert tsc ticks to nanoseconds.  We first
		* do long multiply of 1 billion times the tsc.  Then we do
		* long division by the tsc frequency
		*
		* The 96-bit product is built from two 32x32 multiplies and is
		* then divided one 32-bit word at a time, most significant word
		* first, each remainder becoming the high half of the next
		* dividend.  %edi and %esi are overwritten here, which is why
		* the generation was pushed and the pointer is reloaded later.
		*/
		mov		$1000000000, %ecx			/* number of nanoseconds in a second */
		mov		%edx, %ebx				/* keep high word of the tick delta */
		mul		%ecx					/* %edx:%eax = low word of delta * 1e9 */
		mov		%edx, %edi				/* upper half of that partial product */
		mov		%eax, %esi				/* lowest word of the 96-bit product */
		mov		%ebx, %eax
		mul		%ecx					/* %edx:%eax = high word of delta * 1e9 */
		add		%edi, %eax				/* middle word of the product */
		adc		$0, %edx				/* result in edx:eax:esi */
		mov		%eax, %edi				/* park the middle word */
		popl		%ecx					/* get low 32 tscFreq */
		xor		%eax, %eax
		xchg		%edx, %eax				/* dividend = 0:top word */
		div		%ecx					/* quotient discarded; remainder stays in %edx */
		xor		%eax, %eax				/* redundant: %eax is overwritten by the next mov */
		mov		%edi, %eax				/* dividend = remainder:middle word */
		div		%ecx
		mov		%eax, %ebx				/* high 32 bits of the result */
		mov		%esi, %eax				/* dividend = remainder:lowest word */
		div		%ecx					/* %eax = low 32 bits of the result */
		mov		%ebx, %edx				/* result in edx:eax */
		
		movl		8(%ebp),%edi				/* recover ptr to rtc_nanotime_info */
		popl		%esi					/* recover generation */

		addl		RNT_NS_BASE(%edi),%eax			/* add the 64-bit nanosecond base */
		adcl		RNT_NS_BASE+4(%edi),%edx

		cmpl		RNT_GENERATION(%edi),%esi		/* have the parameters changed? */
		jne		Lslow					/* yes, loop until stable */

		pop		%ebx
		pop		%edi
		pop		%esi
		pop		%ebp
		ret							/* result in edx:eax */


/*
 * Timing routines.
 */
/*
 * timer_update: store a new two-word value into a timer.
 *
 *	 4(%esp)	timer to update
 *	 8(%esp)	word written to TIMER_HIGHCHK and TIMER_HIGH
 *	12(%esp)	word written to TIMER_LOW
 *
 * The three stores are issued in the order HIGHCHK, LOW, HIGH; that order
 * is part of the contract with timer_grab and must not be rearranged.
 */
Entry(timer_update)
	movl	8(%esp),%eax			/* high word */
	movl	12(%esp),%edx			/* low word */
	movl	4(%esp),%ecx			/* timer */
	movl	%eax,TIMER_HIGHCHK(%ecx)	/* first:  check copy of the high word */
	movl	%edx,TIMER_LOW(%ecx)		/* second: low word */
	movl	%eax,TIMER_HIGH(%ecx)		/* last:   high word */
	ret

/*
 * timer_grab: read a timer without locking.
 *
 *	4(%esp)		timer to read
 *
 * Returns TIMER_HIGH in %edx and TIMER_LOW in %eax.  The fields are read
 * in the order HIGH, LOW, HIGHCHK (the reverse of timer_update's store
 * order), and the read is retried until HIGH and HIGHCHK agree.
 */
Entry(timer_grab)
	movl	4(%esp),%ecx			/* timer */
1:
	movl	TIMER_HIGH(%ecx),%edx		/* high word first */
	movl	TIMER_LOW(%ecx),%eax		/* then the low word */
	cmpl	TIMER_HIGHCHK(%ecx),%edx	/* does the check copy match the high word? */
	jne	1b				/* no - an update intervened, read again */
	ret


/*
 * call_continuation: run a continuation on a reset kernel stack.
 *
 *	S_ARG0	continuation function
 *	S_ARG1	parameter passed to the continuation
 *	S_ARG2	wait result passed to the continuation
 *
 * All three arguments are loaded into registers before %esp is replaced
 * with the per-cpu CPU_KERNEL_STACK value, because nothing on the old
 * stack is used afterwards.  The continuation is called as
 * continuation(param, wait_result) with a zeroed frame pointer.  If it
 * returns, thread_terminate() is called on the per-cpu CPU_ACTIVE_THREAD.
 *
 * Both calls are made with 16 bytes of arguments plus padding below the
 * reset stack pointer, so the stack has the same alignment at each call.
 * NOTE(review): assumes CPU_KERNEL_STACK is 16-byte aligned -- confirm.
 */
Entry(call_continuation)
	movl	S_ARG0,%eax			/* get continuation */
	movl	S_ARG1,%edx			/* continuation param */
	movl	S_ARG2,%ecx			/* wait result */
	movl	%gs:CPU_KERNEL_STACK,%esp	/* pop the stack */
	xorl	%ebp,%ebp			/* zero frame pointer */
	subl	$8,%esp				/* align the stack: 8 pad + 2 args = 16 */
	pushl	%ecx
	pushl	%edx
	call	*%eax				/* call continuation */
	addl	$16,%esp			/* discard args and padding */
	movl	%gs:CPU_ACTIVE_THREAD,%eax
	subl	$12,%esp			/* align the stack: 12 pad + 1 arg = 16 */
	pushl	%eax
	call	EXT(thread_terminate)
Commit	Line	Data
1c79356b	1	/*
0b4c1975	2	* Copyright (c) 2000-2010 Apple Inc. All rights reserved.
1c79356b	3	*
2d21ac55	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b	5	*
2d21ac55 A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
8f6c56a5	14	*
2d21ac55 A	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5 A	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
8f6c56a5 A	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55 A	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
8f6c56a5	25	*
2d21ac55	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b	27	*/
0c530ab8	28
1c79356b	29	#include <i386/asm.h>
6d2010ae	30	#include <i386/apic.h>
0c530ab8	31	#include <i386/eflags.h>
6d2010ae	32	#include <i386/rtclock_asm.h>
0c530ab8	33	#include <i386/postcode.h>
6d2010ae	34	#include <i386/proc_reg.h>
0c530ab8 A	35	#include <assym.s>
0c530ab8 A	36
1c79356b A	37	/*
	38	** ml_get_timebase()
	39	**
	40	** Entry - %esp contains pointer to 64 bit structure.
	41	**
	42	** Exit - 64 bit structure filled in.
	43	**
	44	*/
	45	ENTRY(ml_get_timebase)
	46
0c530ab8 A	47	movl S_ARG0, %ecx
0c530ab8 A	48
c910b4d9	49	lfence
0c530ab8	50	rdtsc
593a1d5f	51	lfence
0c530ab8 A	52
	53	movl %edx, 0(%ecx)
	54	movl %eax, 4(%ecx)
	55
	56	ret
	57
0c530ab8 A	58	/*
	59	* Convert between various timer units
	60	*
	61	* uint64_t tmrCvt(uint64_t time, uint64_t *conversion)
	62	*
	63	* This code converts 64-bit time units to other units.
	64	* For example, the TSC is converted to HPET units.
	65	*
	66	* Time is a 64-bit integer that is some number of ticks.
	67	* Conversion is 64-bit fixed point number which is composed
	68	* of a 32 bit integer and a 32 bit fraction.
	69	*
	70	* The time ticks are multiplied by the conversion factor. The
	71	* calculations are done as a 128-bit value but both the high
	72	* and low words are dropped. The high word is overflow and the
	73	* low word is the fraction part of the result.
	74	*
	75	* We return a 64-bit value.
	76	*
	77	* Note that we can use this function to multiply 2 conversion factors.
	78	* We do this in order to calculate the multiplier used to convert
	79	* directly between any two units.
	80	*
	81	*/
	82
	83	.globl EXT(tmrCvt)
	84	.align FALIGN
	85
	86	LEXT(tmrCvt)
	87
	88	pushl %ebp // Save a volatile
	89	movl %esp,%ebp // Get the parameters - 8
	90	pushl %ebx // Save a volatile
	91	pushl %esi // Save a volatile
	92	pushl %edi // Save a volatile
	93
	94	// %ebp + 8 - low-order ts
	95	// %ebp + 12 - high-order ts
	96	// %ebp + 16 - low-order cvt
	97	// %ebp + 20 - high-order cvt
	98
	99	movl 8(%ebp),%eax // Get low-order ts
	100	mull 16(%ebp) // Multiply by low-order conversion
	101	movl %edx,%edi // Need to save only the high order part
	102
	103	movl 12(%ebp),%eax // Get the high-order ts
	104	mull 16(%ebp) // Multiply by low-order conversion
	105	addl %eax,%edi // Add in the overflow from the low x low calculation
	106	adcl $0,%edx // Add in any overflow to high high part
	107	movl %edx,%esi // Save high high part
	108
	109	// We now have the upper 64 bits of the 96 bit multiply of ts and the low half of cvt
	110	// in %esi:%edi
	111
	112	movl 8(%ebp),%eax // Get low-order ts
	113	mull 20(%ebp) // Multiply by high-order conversion
	114	movl %eax,%ebx // Need to save the low order part
	115	movl %edx,%ecx // Need to save the high order part
	116
	117	movl 12(%ebp),%eax // Get the high-order ts
	118	mull 20(%ebp) // Multiply by high-order conversion
	119
	120	// Now have %ecx:%ebx as low part of high low and %edx:%eax as high part of high high
	121	// We don't care about the highest word since it is overflow
122
123	addl %edi,%ebx // Add the low words
124	adcl %ecx,%esi // Add in the high plus carry from low
125	addl %eax,%esi // Add in the rest of the high
126
127	movl %ebx,%eax // Pass back low word
128	movl %esi,%edx // and the high word
129
130	popl %edi // Restore a volatile
131	popl %esi // Restore a volatile
132	popl %ebx // Restore a volatile
133	popl %ebp // Restore a volatile
134
2d21ac55	135	ret // Leave...
0c530ab8	136
b0d623f7	137
0b4c1975 A	138	/* void _rtc_nanotime_adjust(
	139	uint64_t tsc_base_delta,
	140	rtc_nanotime_t *dst);
	141	*/
	142	.globl EXT(_rtc_nanotime_adjust)
	143	.align FALIGN
	144
	145	LEXT(_rtc_nanotime_adjust)
	146	mov 12(%esp),%edx /* ptr to rtc_nanotime_info */
	147
	148	movl RNT_GENERATION(%edx),%ecx /* get current generation */
	149	movl $0,RNT_GENERATION(%edx) /* flag data as being updated */
	150
	151	movl 4(%esp),%eax /* get lower 32-bits of delta */
	152	addl %eax,RNT_TSC_BASE(%edx)
	153	adcl $0,RNT_TSC_BASE+4(%edx) /* propagate carry */
	154
	155	incl %ecx /* next generation */
	156	jnz 1f
	157	incl %ecx /* skip 0, which is a flag */
	158	1: movl %ecx,RNT_GENERATION(%edx) /* update generation and make usable */
	159
	160	ret
	161
	162
2d21ac55 A	163	/* unint64_t _rtc_nanotime_read( rtc_nanotime_t *rntp, int slow );
	164	*
	165	* This is the same as the commpage nanotime routine, except that it uses the
	166	* kernel internal "rtc_nanotime_info" data instead of the commpage data. The two copies
cf7d32b8 A	167	* of data (one in the kernel and one in user space) are kept in sync by rtc_clock_napped().
	168	*
	169	* Warning! There is another copy of this code in osfmk/i386/locore.s. The
	170	* two versions must be kept in sync with each other!
2d21ac55 A	171	*
	172	* There are actually two versions of the algorithm, one each for "slow" and "fast"
	173	* processors. The more common "fast" algorithm is:
	174	*
	175	* nanoseconds = (((rdtsc - rnt_tsc_base) * rnt_tsc_scale) / 2**32) - rnt_ns_base;
	176	*
	177	* Of course, the divide by 2**32 is a nop. rnt_tsc_scale is a constant computed during initialization:
	178	*
	179	* rnt_tsc_scale = (10e9 * 2**32) / tscFreq;
	180	*
	181	* The "slow" algorithm uses long division:
	182	*
	183	* nanoseconds = (((rdtsc - rnt_tsc_base) * 10e9) / tscFreq) - rnt_ns_base;
	184	*
	185	* Since this routine is not synchronized and can be called in any context,
	186	* we use a generation count to guard against seeing partially updated data. In addition,
	187	* the _rtc_nanotime_store() routine -- just above -- zeroes the generation before
	188	* updating the data, and stores the nonzero generation only after all other data has been
	189	* stored. Because IA32 guarantees that stores by one processor must be seen in order
	190	* by another, we can avoid using a lock. We spin while the generation is zero.
	191	*
	192	* In accordance with the ABI, we return the 64-bit nanotime in %edx:%eax.
	193	*/
	194
	195	.globl EXT(_rtc_nanotime_read)
	196	.align FALIGN
	197	LEXT(_rtc_nanotime_read)
	198	pushl %ebp
	199	movl %esp,%ebp
	200	pushl %esi
	201	pushl %edi
	202	pushl %ebx
	203	movl 8(%ebp),%edi /* get ptr to rtc_nanotime_info */
	204	movl 12(%ebp),%eax /* get "slow" flag */
	205	testl %eax,%eax
	206	jnz Lslow
	207
	208	/* Processor whose TSC frequency is faster than SLOW_TSC_THRESHOLD */
6d2010ae	209	PAL_RTC_NANOTIME_READ_FAST()
1c79356b	210
2d21ac55 A	211	popl %ebx
	212	popl %edi
	213	popl %esi
	214	popl %ebp
	215	ret
	216
	217	/* Processor whose TSC frequency is slower than or equal to SLOW_TSC_THRESHOLD */
	218	Lslow:
	219	movl RNT_GENERATION(%edi),%esi /* get generation (0 if being changed) */
	220	testl %esi,%esi /* if being changed, loop until stable */
	221	jz Lslow
	222	pushl %esi /* save generation */
	223	pushl RNT_SHIFT(%edi) /* save low 32 bits of tscFreq */
	224
c910b4d9 A	225	lfence
	226	rdtsc /* get TSC in %edx:%eax */
	227	lfence
2d21ac55 A	228	subl RNT_TSC_BASE(%edi),%eax
	229	sbbl RNT_TSC_BASE+4(%edi),%edx
	230
	231	/*
	232	* Do the math to convert tsc ticks to nanoseconds. We first
	233	* do long multiply of 1 billion times the tsc. Then we do
	234	* long division by the tsc frequency
	235	*/
	236	mov $1000000000, %ecx /* number of nanoseconds in a second */
	237	mov %edx, %ebx
	238	mul %ecx
	239	mov %edx, %edi
	240	mov %eax, %esi
	241	mov %ebx, %eax
	242	mul %ecx
	243	add %edi, %eax
	244	adc $0, %edx /* result in edx:eax:esi */
	245	mov %eax, %edi
	246	popl %ecx /* get low 32 tscFreq */
	247	xor %eax, %eax
	248	xchg %edx, %eax
	249	div %ecx
	250	xor %eax, %eax
	251	mov %edi, %eax
	252	div %ecx
	253	mov %eax, %ebx
	254	mov %esi, %eax
	255	div %ecx
	256	mov %ebx, %edx /* result in edx:eax */
	257
	258	movl 8(%ebp),%edi /* recover ptr to rtc_nanotime_info */
	259	popl %esi /* recover generation */
43866e37	260
2d21ac55 A	261	addl RNT_NS_BASE(%edi),%eax
	262	adcl RNT_NS_BASE+4(%edi),%edx
	263
	264	cmpl RNT_GENERATION(%edi),%esi /* have the parameters changed? */
	265	jne Lslow /* yes, loop until stable */
	266
	267	pop %ebx
	268	pop %edi
	269	pop %esi
0c530ab8	270	pop %ebp
2d21ac55 A	271	ret /* result in edx:eax */
2d21ac55 A	272
6d2010ae A	273
	274
	275	/*
	276	* Timing routines.
	277	*/
	278	Entry(timer_update)
	279	movl 4(%esp),%ecx
	280	movl 8(%esp),%eax
	281	movl 12(%esp),%edx
	282	movl %eax,TIMER_HIGHCHK(%ecx)
	283	movl %edx,TIMER_LOW(%ecx)
	284	movl %eax,TIMER_HIGH(%ecx)
	285	ret
	286
	287	Entry(timer_grab)
	288	movl 4(%esp),%ecx
	289	0: movl TIMER_HIGH(%ecx),%edx
	290	movl TIMER_LOW(%ecx),%eax
	291	cmpl TIMER_HIGHCHK(%ecx),%edx
	292	jne 0b
	293	ret
	294
	295
	296	Entry(call_continuation)
	297	movl S_ARG0,%eax /* get continuation */
	298	movl S_ARG1,%edx /* continuation param */
	299	movl S_ARG2,%ecx /* wait result */
	300	movl %gs:CPU_KERNEL_STACK,%esp /* pop the stack */
	301	xorl %ebp,%ebp /* zero frame pointer */
	302	subl $8,%esp /* align the stack */
	303	pushl %ecx
	304	pushl %edx
	305	call *%eax /* call continuation */
	306	addl $16,%esp
	307	movl %gs:CPU_ACTIVE_THREAD,%eax
	308	pushl %eax
	309	call EXT(thread_terminate)
	310
	311