/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
 
#include <i386/asm.h>
#include <i386/apic.h>
#include <i386/eflags.h>
#include <i386/rtclock_asm.h>
#include <i386/postcode.h>
#include <i386/proc_reg.h>
#include <assym.s>

/*
**      ml_get_timebase()
**
**      Reads the processor time-stamp counter and stores it through the
**      pointer passed as the first stack argument.
**
**      Entry   - S_ARG0 is a pointer to an 8-byte structure.
**
**      Exit    - offset 0 of the structure holds the high 32 bits of the
**                TSC (%edx), offset 4 holds the low 32 bits (%eax).
**                %eax, %ecx and %edx are overwritten.
**
**      NOTE(review): the high word is written at the lower address, so
**      reading the same 8 bytes as a little-endian 64-bit integer would
**      see the halves swapped - confirm the layout callers expect.
**
*/
ENTRY(ml_get_timebase)

	movl	S_ARG0, %ecx		/* %ecx = destination structure */

	lfence				/* fence ahead of the counter read */
	rdtsc				/* %edx:%eax = time-stamp counter */
	lfence				/* fence behind the counter read */

	movl	%edx, (%ecx)		/* high half -> offset 0 */
	movl	%eax, 4(%ecx)		/* low half  -> offset 4 */

	ret

/*
 *  	Convert between various timer units 
 *
 *		uint64_t tmrCvt(uint64_t time, uint64_t conversion)
 *
 *		This code converts 64-bit time units to other units.
 *		For example, the TSC is converted to HPET units.
 *
 *		Time is a 64-bit integer that is some number of ticks.
 *		Conversion is 64-bit fixed point number which is composed
 *		of a 32 bit integer and a 32 bit fraction. 
 *
 *		The time ticks are multiplied by the conversion factor.  The
 *		calculations are done as a 128-bit value but both the high
 *		and low words are dropped.  The high word is overflow and the
 *		low word is the fraction part of the result.
 *
 *		We return a 64-bit value.
 *
 *		Note that we can use this function to multiply 2 conversion factors.
 *		We do this in order to calculate the multiplier used to convert
 *		directly between any two units.
 *
 */

			.globl	EXT(tmrCvt)
			.align FALIGN

LEXT(tmrCvt)

//			Stack arguments once the frame is built:
//
//			%ebp + 8	- low-order ts
//			%ebp + 12	- high-order ts
//			%ebp + 16	- low-order cvt
//			%ebp + 20	- high-order cvt
//
//			The value returned in %edx:%eax is bits 32..95 of the 128-bit
//			product ts * cvt.  It is accumulated in %esi:%ebx, one 32x32
//			partial product at a time.  %ebx and %esi are callee-saved, so
//			they are preserved around the body.

			pushl	%ebp					// Build a frame
			movl	%esp,%ebp
			pushl	%ebx					// Preserve: low result word lives here
			pushl	%esi					// Preserve: high result word lives here

//			ts.low x cvt.low - only the upper half reaches the result

			movl	8(%ebp),%eax
			mull	16(%ebp)
			movl	%edx,%ebx				// Start the low result word

//			ts.high x cvt.low

			movl	12(%ebp),%eax
			mull	16(%ebp)
			addl	%eax,%ebx				// Lower half goes into the low result word
			adcl	$0,%edx					// Carry moves up into the upper half
			movl	%edx,%esi				// Start the high result word

//			ts.low x cvt.high

			movl	8(%ebp),%eax
			mull	20(%ebp)
			addl	%eax,%ebx				// Lower half goes into the low result word
			adcl	%edx,%esi				// Upper half plus carry goes into the high result word

//			ts.high x cvt.high - the upper half is overflow and is dropped

			movl	12(%ebp),%eax
			mull	20(%ebp)
			addl	%eax,%esi				// Lower half goes into the high result word

			movl	%ebx,%eax				// Return low word
			movl	%esi,%edx				// Return high word

			popl	%esi
			popl	%ebx
			popl	%ebp

			ret


/*
 * void  _rtc_nanotime_adjust(
 *		uint64_t         tsc_base_delta,
 *		rtc_nanotime_t  *dst);
 *
 * Adds the full 64-bit tsc_base_delta to the TSC base stored in *dst.
 * The generation word is zeroed while the base is being modified and is
 * then replaced by the next generation value, skipping zero because zero
 * is the "update in progress" flag.
 *
 * Stack on entry (no frame is built):
 *	 4(%esp)	low  32 bits of tsc_base_delta
 *	 8(%esp)	high 32 bits of tsc_base_delta
 *	12(%esp)	dst
 *
 * Overwrites %eax, %ecx, %edx and the flags.
 */
	.globl	EXT(_rtc_nanotime_adjust)
	.align	FALIGN

LEXT(_rtc_nanotime_adjust)
	movl	12(%esp),%edx			/* ptr to rtc_nanotime_info */

	movl	RNT_GENERATION(%edx),%ecx	/* get current generation */
	movl	$0,RNT_GENERATION(%edx)		/* flag data as being updated */

	movl	4(%esp),%eax			/* get lower 32-bits of delta */
	addl	%eax,RNT_TSC_BASE(%edx)
	movl	8(%esp),%eax			/* get upper 32-bits of delta (movl leaves the carry intact) */
	adcl	%eax,RNT_TSC_BASE+4(%edx)	/* add upper half and propagate carry */

	incl	%ecx				/* next generation */
	jnz	1f
	incl	%ecx				/* skip 0, which is a flag */
1:	movl	%ecx,RNT_GENERATION(%edx)	/* update generation and make usable */

	ret


/* uint64_t _rtc_nanotime_read( rtc_nanotime_t *rntp, int slow );
 *
 * This is the same as the commpage nanotime routine, except that it uses the
 * kernel internal "rtc_nanotime_info" data instead of the commpage data.  The two copies
 * of data (one in the kernel and one in user space) are kept in sync by rtc_clock_napped().
 *
 * Warning!  There is another copy of this code in osfmk/i386/locore.s.  The
 * two versions must be kept in sync with each other!
 *
 * There are actually two versions of the algorithm, one each for "slow" and "fast"
 * processors.  The more common "fast" algorithm is:
 *
 *	nanoseconds = (((rdtsc - rnt_tsc_base) * rnt_tsc_scale) / 2**32) + rnt_ns_base;
 *
 * Of course, the divide by 2**32 is a nop.  rnt_tsc_scale is a constant computed during initialization:
 *
 *	rnt_tsc_scale = (1e9 * 2**32) / tscFreq;
 *
 * The "slow" algorithm uses long division:
 *
 *	nanoseconds = (((rdtsc - rnt_tsc_base) * 1e9) / tscFreq) + rnt_ns_base;
 *
 * Since this routine is not synchronized and can be called in any context,
 * we use a generation count to guard against seeing partially updated data.  In addition,
 * the update routines -- _rtc_nanotime_store(), and _rtc_nanotime_adjust() above -- zero
 * the generation before updating the data, and store the nonzero generation only after
 * all other data has been stored.  Because IA32 guarantees that stores by one processor
 * must be seen in order by another, we can avoid using a lock.  We spin while the
 * generation is zero.
 *
 * In accordance with the ABI, we return the 64-bit nanotime in %edx:%eax.
 */
 
		.globl	EXT(_rtc_nanotime_read)
		.align	FALIGN
/*
 * Stack arguments once the frame is built:
 *	 8(%ebp)	pointer to the rtc_nanotime_info data
 *	12(%ebp)	"slow" flag; nonzero selects the long-division path at Lslow
 *
 * The 64-bit result is returned in %edx:%eax.  %ebp, %esi, %edi and %ebx are
 * pushed on entry and popped again before each ret.
 */
LEXT(_rtc_nanotime_read)
		pushl		%ebp
		movl		%esp,%ebp
		pushl		%esi
		pushl		%edi
		pushl		%ebx
		movl		8(%ebp),%edi				/* get ptr to rtc_nanotime_info */
		movl		12(%ebp),%eax				/* get "slow" flag */
		testl		%eax,%eax
		jnz		Lslow
		
		/* Processor whose TSC frequency is faster than SLOW_TSC_THRESHOLD */
		/*
		 * NOTE(review): the macro body is defined outside this file.  Since only
		 * register restores follow it, it presumably leaves the result in
		 * %edx:%eax and expects the info pointer in %edi -- confirm in its header.
		 */
		PAL_RTC_NANOTIME_READ_FAST()

		popl		%ebx
		popl		%edi
		popl		%esi
		popl		%ebp
		ret

		/* Processor whose TSC frequency is slower than or equal to SLOW_TSC_THRESHOLD */
Lslow:
		movl		RNT_GENERATION(%edi),%esi		/* get generation (0 if being changed) */
		testl		%esi,%esi				/* if being changed, loop until stable */
		jz		Lslow
		pushl		%esi					/* save generation; %esi is reused by the math below */
		/*
		 * NOTE(review): the RNT_SHIFT field is used below as the divisor, i.e. it
		 * is treated as the TSC frequency -- confirm against the code that fills
		 * in the structure for slow processors.
		 */
		pushl		RNT_SHIFT(%edi)				/* save low 32 bits of tscFreq */

		lfence
		rdtsc	  						/* get TSC in %edx:%eax */
		lfence
		subl		RNT_TSC_BASE(%edi),%eax			/* %edx:%eax = TSC - tsc_base (64-bit subtract) */
		sbbl		RNT_TSC_BASE+4(%edi),%edx

		/*
		* Do the math to convert tsc ticks to nanoseconds.  We first
		* do long multiply of 1 billion times the tsc.  Then we do
		* long division by the tsc frequency
		*/
		mov		$1000000000, %ecx			/* number of nanoseconds in a second */
		mov		%edx, %ebx				/* set aside high word of the tick delta */
		mul		%ecx					/* %edx:%eax = delta.low * 1e9 */
		mov		%edx, %edi				/* upper half (this overwrites the info pointer) */
		mov		%eax, %esi				/* lowest word of the 96-bit product */
		mov		%ebx, %eax
		mul		%ecx					/* %edx:%eax = delta.high * 1e9 */
		add		%edi, %eax
		adc		$0, %edx				/* result in edx:eax:esi */
		mov		%eax, %edi				/* middle word of the product */
		popl		%ecx					/* get low 32 tscFreq */
		xor		%eax, %eax
		xchg		%edx, %eax				/* %edx:%eax = 0:top word */
		div		%ecx					/* quotient is discarded; remainder stays in %edx */
		xor		%eax, %eax				/* redundant: %eax is overwritten by the next instruction */
		mov		%edi, %eax
		div		%ecx					/* remainder:middle word / tscFreq -> high word of result */
		mov		%eax, %ebx
		mov		%esi, %eax
		div		%ecx					/* remainder:lowest word / tscFreq -> low word of result */
		mov		%ebx, %edx				/* result in edx:eax */
		
		movl		8(%ebp),%edi				/* recover ptr to rtc_nanotime_info */
		popl		%esi					/* recover generation */

		addl		RNT_NS_BASE(%edi),%eax			/* add the 64-bit nanosecond base */
		adcl		RNT_NS_BASE+4(%edi),%edx

		cmpl		RNT_GENERATION(%edi),%esi		/* have the parameters changed? */
		jne		Lslow					/* yes, loop until stable */

		pop		%ebx
		pop		%edi
		pop		%esi
		pop		%ebp
		ret							/* result in edx:eax */


/*
 * Timing routines.
 */
/*
 * timer_update: store a new high/low pair into a timer.
 *
 *	 4(%esp)	pointer to the timer
 *	 8(%esp)	new high word
 *	12(%esp)	new low word
 *
 * The three stores are issued in a fixed order: check word, low word,
 * high word.  timer_grab reads high, low, then the check word, and
 * retries when high and check disagree.
 */
Entry(timer_update)
	movl	4(%esp),%ecx
	movl	8(%esp),%eax
	movl	12(%esp),%edx
	movl	%eax,TIMER_HIGHCHK(%ecx)	/* check copy of the high word goes first */
	movl	%edx,TIMER_LOW(%ecx)
	movl	%eax,TIMER_HIGH(%ecx)		/* high word goes last */
	ret

/*
 * timer_grab: read a timer's high/low pair.
 *
 *	4(%esp)		pointer to the timer
 *
 * Returns high:low in %edx:%eax.  The high word is read first and the
 * check word last; if they differ the read is repeated.
 */
Entry(timer_grab)
	movl	4(%esp),%ecx
0:	movl	TIMER_HIGH(%ecx),%edx
	movl	TIMER_LOW(%ecx),%eax
	cmpl	TIMER_HIGHCHK(%ecx),%edx	/* does the high word match its check copy? */
	jne	0b				/* no, read again */
	ret


/*
 * call_continuation: reset the stack and call a continuation.
 *
 *	S_ARG0	continuation (called indirectly)
 *	S_ARG1	continuation parameter
 *	S_ARG2	wait result
 *
 * %esp is reloaded from the %gs-relative CPU_KERNEL_STACK slot, so
 * everything on the current stack, including the return address, is
 * abandoned.  The continuation is called with (parameter, wait result)
 * on the new stack.  If it returns, thread_terminate is called with the
 * value read from the %gs-relative CPU_ACTIVE_THREAD slot.
 *
 * NOTE(review): no instruction follows the final call in this view;
 * presumably thread_terminate does not return here -- confirm.
 */
Entry(call_continuation)
	movl	S_ARG0,%eax			/* get continuation */
	movl	S_ARG1,%edx			/* continuation param */
	movl	S_ARG2,%ecx			/* wait result */
	movl	%gs:CPU_KERNEL_STACK,%esp	/* pop the stack */
	xorl	%ebp,%ebp			/* zero frame pointer */
	subl	$8,%esp				/* align the stack */
	pushl	%ecx				/* second argument: wait result */
	pushl	%edx				/* first argument: continuation param */
	call	*%eax				/* call continuation */
	addl	$16,%esp			/* drop the 8 pad bytes and the two arguments */
	movl	%gs:CPU_ACTIVE_THREAD,%eax
	pushl	%eax				/* argument for thread_terminate */
	call	EXT(thread_terminate)