/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE3
 * and 64-byte cache lines.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %eax before returning.
 */

#define kShort      80              // too short to bother with SSE (must be >=80)
#define kVeryLong   (1024*1024)     // large enough to justify non-temporal stores


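/*
 * Illustrative note (not part of the original source): a minimal C sketch of
 * the dispatch logic below, assuming hypothetical helpers tail(), zero64(),
 * and zero64_nt() standing in for the labeled assembly loops.  kShort must be
 * >= 80 so that, after peeling at most 15 alignment bytes off an operand that
 * took the long path, at least one full 64-byte chunk remains for the SSE
 * loop.  In the real code, %eax is also loaded with the original ptr before
 * returning, so the same body can serve memset(p,0,n).
 *
 *     void bzero_sketch(char *p, size_t len) {
 *         if (len <= kShort) { tail(p, len); return; }  // Lshort path
 *         size_t pre = (-(uintptr_t)p) & 15;            // bytes to 16-byte align
 *         tail(p, pre);  p += pre;  len -= pre;
 *         size_t chunk = len & ~(size_t)63;             // bytes done 64 at a time
 *         if (chunk >= kVeryLong) zero64_nt(p, chunk);  // movntdq loop
 *         else                    zero64(p, chunk);     // movdqa loop
 *         tail(p + chunk, len & 63);                    // 0..63 residual bytes
 *     }
 */
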
        .text
        .align  5, 0x90
Lbzero_sse3:                            // void bzero(void *b, size_t len);
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %edi
        movl    8(%ebp),%edi            // get ptr
        movl    12(%ebp),%edx           // get length

        xorl    %eax,%eax               // set fill data to 0
        cmpl    $(kShort),%edx          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %edx = length
//      %edi = ptr
//      %eax = zero

Lshort:
        cmpl    $16,%edx                // long enough to doubleword align?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%edi)             // zero an aligned doubleword
        addl    $4,%edi
        dec     %edx
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %ecx
        jnz     5b
6:
        movl    8(%ebp),%eax            // get return value in case this was a call of memset()
        popl    %edi
        popl    %ebp
        ret
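
/*
 * Illustrative note (not part of the original source): the short path above
 * is equivalent in effect to the following C sketch, where short_zero is a
 * hypothetical name.  Lengths below 16 are zeroed a byte at a time; longer
 * ones are byte-stored up to a 4-byte boundary, zeroed a doubleword at a
 * time, then finished with 0..3 trailing bytes.
 *
 *     static void short_zero(char *p, size_t len) {
 *         if (len >= 16) {
 *             while ((uintptr_t)p & 3) { *p++ = 0; len--; }    // loops 2/3
 *             size_t words = len >> 2;
 *             while (words--) { *(uint32_t *)p = 0; p += 4; }  // loop 4
 *             len &= 3;
 *         }
 *         while (len--) *p++ = 0;                              // loops 1 and 5
 *     }
 */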


// We will be using SSE, so align ptr.

LNotShort:
        movl    %edi,%ecx
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
0:                                      // loop storing bytes to align the ptr
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %ecx
        jnz     0b
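
/*
 * Illustrative note (not part of the original source): negating the pointer
 * and masking with 15 yields the byte count needed to reach the next 16-byte
 * boundary, because (-p) & 15 == (16 - (p & 15)) & 15.  For example, with
 * p = 0x1003:  p & 15 = 3, and (-0x1003) & 15 = 13, the 13 bytes needed to
 * reach 0x1010.  An already-aligned pointer yields 0 and skips the loop.
 */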

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero

LDestAligned:
        movl    %edx,%ecx
        andl    $63,%edx                // mask down to residual length (0..63)
        andl    $-64,%ecx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addl    %ecx,%edi               // increment ptr by length to move
        cmpl    $(kVeryLong),%ecx       // long enough to justify non-temporal stores?
        jae     LVeryLong               // yes
        negl    %ecx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%edi,%ecx)
        movdqa  %xmm0,16(%edi,%ecx)
        movdqa  %xmm0,32(%edi,%ecx)
        movdqa  %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        jmp     Lshort
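
/*
 * Illustrative note (not part of the original source): %edi was advanced to
 * the end of the 64-byte region before the loop, and %ecx holds the negated
 * byte count, so the effective address (%edi,%ecx) starts at the beginning
 * of the region and walks forward as %ecx counts up toward zero.  This lets
 * the single "addl $64,%ecx / jne 1b" pair serve as both the induction step
 * and the loop test, keeping a separate compare out of the inner loop.
 */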

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
        negl    %ecx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%edi,%ecx)
        movntdq %xmm0,16(%edi,%ecx)
        movntdq %xmm0,32(%edi,%ecx)
        movntdq %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     Lshort
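
/*
 * Illustrative note (not part of the original source): movntdq writes
 * through write-combining buffers instead of allocating cache lines, which
 * avoids evicting a megabyte or more of useful data when zeroing very long
 * operands.  Non-temporal stores are weakly ordered, so the sfence above
 * guarantees they are globally visible before any later stores, including
 * those done by the Lshort residual path.
 */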


        COMMPAGE_DESCRIPTOR(bzero_sse3,_COMM_PAGE_BZERO,kHasSSE2,0)
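
/*
 * Illustrative note (not part of the original source): the
 * COMMPAGE_DESCRIPTOR macro (from machine/commpage.h) registers this routine
 * for installation at the fixed commpage slot _COMM_PAGE_BZERO on processors
 * whose capability bits include kHasSSE2.  Requiring only SSE2 is consistent
 * with the instructions actually used here: pxor, movdqa, and movntdq on XMM
 * registers are all SSE2, while the "sse3" in the name reflects the class of
 * processor the routine is tuned for.
 */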