[apple/xnu.git] / osfmk / i386 / commpage / bzero_sse3_64.s

/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE3
 * and 64-byte cache lines.  This is the 64-bit version.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %rax before returning.
 */

// Lengths above kShort take the SSE path.  That path may first spend up to
// 15 bytes aligning the pointer and then always stores at least one full
// 64-byte chunk, so kShort has to leave at least 64 bytes after alignment.
#define kShort		80		// too short to bother with SSE (must be >=80)
// Threshold, in bytes, applied to the 64-byte-multiple part of the length:
// at or above it the chunks are written with non-temporal stores.
#define	kVeryLong	(1024*1024)


        .text
	.code64
        .align  5, 0x90
Lbzero_sse3_64:                         // void	bzero(void *b, size_t len);
// Entry point.
//      In:   %rdi = b (destination), %rsi = len (unsigned byte count)
//      Sets: %eax = 0 (fill data), %r11 = original b (moved to %rax on exit)
// Lengths above kShort branch to the SSE path at LNotShort; all others fall
// through into Lshort.  Because len is a size_t the branch must be unsigned:
// Lshort only examines the low 32 bits (%esi), so a signed test would send
// any len >= 2^63 there and zero a truncated number of bytes.
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        xorl    %eax,%eax               // set fill data to 0
        movq    %rdi,%r11               // save original ptr as return value
        cmpq    $(kShort),%rsi          // long enough for SSE?
        ja      LNotShort               // yes (unsigned compare: len is a size_t)
        
// Short path: entered directly for short requests and again to finish off
// the residue of long ones.
//      %esi = bytes still to zero (<= kShort)
//      %rdi = next byte to zero
//      %eax = zero
//      %r11 = original ptr, handed back in %rax
// Fewer than 16 bytes are stored one at a time.  Otherwise single bytes are
// stored until %rdi is doubleword aligned, then aligned doublewords, then
// the 0..3 bytes left over.

Lshort:
        cmpl    $16,%esi                // enough bytes to be worth aligning?
        jge     Lshort_align_test       // yes
        testl   %esi,%esi               // nothing at all to do?
        jz      Lshort_done
Lshort_bytes:
        movb    %al,(%rdi)              // store one zero byte
        incq    %rdi
        decl    %esi
        jnz     Lshort_bytes
        jmp     Lshort_done
Lshort_align_byte:
        movb    %al,(%rdi)              // store one zero byte toward alignment
        incq    %rdi
        decl    %esi
Lshort_align_test:
        testl   $3,%edi                 // low two address bits clear?
        jnz     Lshort_align_byte       // not yet
        movl    %esi,%ecx               // keep the byte count for the tail
        shrl    $2,%esi                 // %esi = number of doublewords
Lshort_dwords:
        movl    %eax,(%rdi)             // store one aligned zero doubleword
        addq    $4,%rdi
        decl    %esi
        jnz     Lshort_dwords
        andl    $3,%ecx                 // %ecx = leftover bytes (0..3)
        jz      Lshort_done             // nothing left over
Lshort_tail:
        movb    %al,(%rdi)              // store one leftover zero byte
        incq    %rdi
        decl    %ecx
        jnz     Lshort_tail
Lshort_done:
        movq    %r11,%rax               // return original ptr in case the caller was memset()
        popq    %rbp
        ret
		
        
// SSE path, step one: advance the destination to a 16-byte boundary.
//      %rsi = length (> kShort)
//      %rdi = ptr
//      %eax = zero
// Falls through into LDestAligned once %rdi is aligned.

LNotShort:
        movl    %edi,%ecx               // low 32 bits of ptr; clears upper %rcx
        negl    %ecx
        andl    $15,%ecx                // %ecx = (-ptr) & 15 = bytes up to the boundary
        jz      LDestAligned            // zero: ptr is already aligned
        subq    %rcx,%rsi               // take the alignment bytes off the length
LNotShort_align:                        // store single bytes until aligned
        movb    %al,(%rdi)              // one zero byte
        incq    %rdi
        decl    %ecx
        jnz     LNotShort_align
	
// %rdi is 16-byte aligned here.  Split the length into whole 64-byte chunks,
// zeroed below with SSE stores, and a 0..63 byte residue left for Lshort.
//      %rsi = length (> (kShort-15))
//      %rdi = ptr (aligned)
//      %eax = zero

LDestAligned:
        movq    %rsi,%rcx
        andl    $63,%esi                // %esi = residue (0..63); upper %rsi cleared
        andq    $-64,%rcx               // %rcx = bytes covered by whole chunks
        pxor    %xmm0,%xmm0             // %xmm0 = 16 bytes of zero
        addq    %rcx,%rdi               // %rdi = end of the chunked region
        cmpq    $(kVeryLong),%rcx       // big enough for non-temporal stores?
        jae     LVeryLong               // yes
        negq    %rcx                    // %rcx = negative offset from %rdi, counts up to 0
        jmp     LDestAligned_loop       // hop over the alignment padding

// Zero 64 bytes per pass using ordinary (cached) aligned stores.

        .align  4,0x90                  // keep inner loops 16-byte aligned
LDestAligned_loop:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     LDestAligned_loop       // loop until the offset reaches 0

        jmp     Lshort                  // zero the residue and return
	
// Chunked length is at least kVeryLong: write with non-temporal stores so
// the data bypasses the cache.
//      %rcx = bytes covered by whole 64-byte chunks (not yet negated)
//      %rdi = end of the chunked region
//      %esi = residue (0..63) for Lshort
//      %xmm0 = zero

LVeryLong:
        negq    %rcx                    // %rcx = negative offset from %rdi, counts up to 0
        jmp     LVeryLong_loop          // hop over the alignment padding

        .align  4,0x90                  // keep inner loops 16-byte aligned
LVeryLong_loop:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     LVeryLong_loop          // loop until the offset reaches 0

        sfence                          // required by non-temporal stores
        jmp     Lshort                  // zero the residue and return


	// NOTE(review): COMMPAGE_DESCRIPTOR is defined outside this file
	// (presumably in <machine/commpage.h>).  The arguments appear to be the
	// routine name, its commpage slot (_COMM_PAGE_BZERO), required CPU
	// capability bits (kHasSSE3) and prohibited bits (0) -- confirm against
	// the macro definition.
	COMMPAGE_DESCRIPTOR(bzero_sse3_64,_COMM_PAGE_BZERO,kHasSSE3,0)
Commit	Line	Data
89b3af67 A	1	/*
	2	* Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	#include <machine/cpu_capabilities.h>
	30	#include <machine/commpage.h>
	31
	32	/*
	33	* Bzero, tuned for Pentium-M class processors with SSE3
	34	* and 64-byte cache lines. This is the 64-bit version.
	35	*
	36	* This routine is also used for memset(p,0,n), which is a common case
	37	* since gcc sometimes silently maps bzero() into memset(). As a result,
	38	* we always load the original ptr into %eax before returning.
	39	*/
	40
	41	#define kShort 80 // too short to bother with SSE (must be >=80)
	42	#define kVeryLong (1024*1024)
	43
	44
	45	.text
	46	.code64
	47	.align 5, 0x90
	48	Lbzero_sse3_64: // void bzero(void *b, size_t len);
	49	pushq %rbp // set up a frame for backtraces
	50	movq %rsp,%rbp
	51	xorl %eax,%eax // set fill data to 0
	52	movq %rdi,%r11 // save original ptr as return value
	53	cmpq $(kShort),%rsi // long enough for SSE?
	54	jg LNotShort // yes
	55
	56	// Here for short operands or the end of long ones.
	57	// %esi = length (<= kShort)
	58	// %rdi = ptr
	59	// %eax = zero
	60
	61	Lshort:
	62	cmpl $16,%esi // long enough to word align?
	63	jge 3f // yes
	64	test %esi,%esi // length==0?
65	jz 6f
66	1:
67	movb %al,(%rdi) // zero a byte
68	incq %rdi
69	decl %esi
70	jnz 1b
71	jmp 6f
72	2:
73	movb %al,(%rdi) // zero a byte
74	incq %rdi
75	decl %esi
76	3:
77	testl $3,%edi // is ptr doubleword aligned?
78	jnz 2b // no
79	movl %esi,%ecx // copy length
80	shrl $2,%esi // #doublewords to store
81	4:
82	movl %eax,(%rdi) // zero an aligned doubleword
83	addq $4,%rdi
84	decl %esi
85	jnz 4b
86	andl $3,%ecx // mask down to #bytes at end (0..3)
87	jz 6f // none
88	5:
89	movb %al,(%rdi) // zero a byte
90	incq %rdi
91	decl %ecx
92	jnz 5b
93	6:
94	movq %r11,%rax // set return value in case this was a call of memset()
95	popq %rbp
96	ret
97
98
99	// We will be using SSE, so align ptr.
100	// %rsi = length (> kShort)
101	// %rdi = ptr
102	// %eax = zero
103
104	LNotShort:
105	movl %edi,%ecx // get #bytes to 16-byte align ptr
106	negl %ecx
107	andl $15,%ecx
108	jz LDestAligned // already aligned
109	subq %rcx,%rsi // decrement length
110	0: // loop storing bytes to align the ptr
111	movb %al,(%rdi) // pack in a byte
112	incq %rdi
113	decl %ecx
114	jnz 0b
115
116	// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
117	// %rsi = length (> (kShort-15))
118	// %rdi = ptr (aligned)
119	// %eax = zero
120
121	LDestAligned:
122	movq %rsi,%rcx
123	andl $63,%esi // mask down to residual length (0..63)
124	andq $-64,%rcx // get #bytes we will zero in this loop
125	pxor %xmm0,%xmm0 // zero an SSE register
126	addq %rcx,%rdi // increment ptr by length to move
127	cmpq $(kVeryLong),%rcx // long enough to justify non-temporal stores?
128	jae LVeryLong // yes
129	negq %rcx // negate length to move
130	jmp 1f
131
132	// Loop over 64-byte chunks, storing into cache.
133
134	.align 4,0x90 // keep inner loops 16-byte aligned
135	1:
136	movdqa %xmm0,(%rdi,%rcx)
137	movdqa %xmm0,16(%rdi,%rcx)
138	movdqa %xmm0,32(%rdi,%rcx)
139	movdqa %xmm0,48(%rdi,%rcx)
140	addq $64,%rcx
141	jne 1b
142
143	jmp Lshort
144
145	// Very long operands: use non-temporal stores to bypass cache.
146
147	LVeryLong:
148	negq %rcx // negate length to move
149	jmp 1f
150
151	.align 4,0x90 // keep inner loops 16-byte aligned
152	1:
153	movntdq %xmm0,(%rdi,%rcx)
154	movntdq %xmm0,16(%rdi,%rcx)
155	movntdq %xmm0,32(%rdi,%rcx)
156	movntdq %xmm0,48(%rdi,%rcx)
157	addq $64,%rcx
158	jne 1b
159
160	sfence // required by non-temporal stores
161	jmp Lshort
162
163
164	COMMPAGE_DESCRIPTOR(bzero_sse3_64,_COMM_PAGE_BZERO,kHasSSE3,0)