[apple/libc.git] / i386 / string / bzero_sse2.s

/*
 * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <platfunc.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE2
 * and 64-byte cache lines.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %eax before returning.
 */

#define kShort		80		// too short to bother with SSE (must be >=80)
#define	kVeryLong	(1024*1024)

// void	bzero(void *b, size_t len);
//
// Zero len bytes starting at b.
//
// In:       8(%ebp)  = b    (read after the frame is set up)
//           12(%ebp) = len
// Out:      %eax = b, so that a memset(b,0,len) call routed here still
//           gets its expected return value
// Saved:    %ebp, %edi (pushed on entry, popped on exit)
// Modified: %eax, %ecx, %edx, %xmm0, flags
//
// len is a size_t, so all length comparisons use the unsigned condition
// codes (ja/jae).  With signed codes a length of 0x80000000 or more looks
// negative and the whole buffer is cleared by the byte-at-a-time loop.
//
// Strategy:
//   len <= kShort : byte / aligned-doubleword stores (Lshort)
//   len >  kShort : byte stores up to a 16-byte boundary, then 64-byte
//                   chunks with SSE stores (non-temporal when the chunked
//                   part is >= kVeryLong), then Lshort for the 0..63 byte tail

PLATFUNC_FUNCTION_START(bzero, sse2, 32, 5)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl   %edi			// %edi is used as the store ptr; restored at 6:
	movl    8(%ebp),%edi            // get ptr
	movl    12(%ebp),%edx           // get length

	xorl    %eax,%eax               // set fill data to 0
	cmpl    $(kShort),%edx          // long enough for SSE?
	ja	LNotShort               // yes (unsigned: length is a size_t)

// Here for short operands or the end of long ones.
//      %edx = length (<= kShort from the entry path, 0..63 from the SSE loops)
//      %edi = ptr
//      %eax = zero

Lshort:
	cmpl	$16,%edx		// long enough to word align?
	jae	3f			// yes (unsigned, matching the entry test)
	test	%edx,%edx		// length==0?
	jz	6f
1:					// 1..15 bytes: store them one at a time
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%edx
	jnz	1b
	jmp	6f
2:					// 16 or more bytes: first align ptr to 4
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%edx
3:
	test	$3,%edi			// is ptr doubleword aligned?
	jnz	2b			// no
	// At most 3 bytes were consumed by the loop at 2:, and the length was
	// at least 16 on arrival at 3:, so at least 13 bytes remain.  The
	// doubleword count below is therefore never zero, which the
	// store-then-test loop at 4: depends on.
	movl	%edx,%ecx		// copy length
	shrl	$2,%edx			// #doublewords to store
4:      
	movl	%eax,(%edi)		// zero an aligned doubleword
	addl	$4,%edi
	dec	%edx
	jnz	4b
	andl	$3,%ecx			// mask down to #bytes at end (0..3)
	jz	6f			// none
5:
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%ecx
	jnz	5b
6:
	movl	8(%ebp),%eax		// get return value in case this was a call of memset()
	popl    %edi
	popl	%ebp
	ret


// We will be using SSE, so align ptr.
//      %edx = length (> kShort)
//      %edi = ptr
//      %eax = zero

LNotShort:
	movl    %edi,%ecx
	negl    %ecx
	andl    $15,%ecx                // (-ptr) & 15 = #bytes to 16-byte align (0..15)
	jz	LDestAligned		// already aligned
	subl    %ecx,%edx               // decrement length
0:					// loop storing bytes to align the ptr
	movb	%al,(%edi)		// pack in a byte
	inc	%edi
	dec	%ecx
	jnz	0b

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero
//
// The length exceeded kShort (80) and at most 15 bytes went to alignment,
// so at least 64 bytes remain and the chunk byte count in %ecx is nonzero.
// Both chunk loops store before testing and rely on that.
//
// The loops index with a negative offset that counts up to zero: %edi is
// advanced past the chunked region and %ecx holds minus its size, so
// (%edi,%ecx) walks the region and the addl that steps the offset also
// produces the loop-exit flag.

LDestAligned:
	movl    %edx,%ecx
	andl    $63,%edx                // mask down to residual length (0..63)
	andl    $-64,%ecx               // get #bytes we will zero in this loop
	pxor    %xmm0,%xmm0             // zero an SSE register
	addl    %ecx,%edi               // increment ptr by length to move
	cmpl	$(kVeryLong),%ecx	// long enough to justify non-temporal stores?
	jae	LVeryLong		// yes
	negl    %ecx			// negate length to move
	jmp	1f			// jump over the alignment padding

// Loop over 64-byte chunks, storing into cache.

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movdqa  %xmm0,(%edi,%ecx)
	movdqa  %xmm0,16(%edi,%ecx)
	movdqa  %xmm0,32(%edi,%ecx)
	movdqa  %xmm0,48(%edi,%ecx)
	addl    $64,%ecx
	jne     1b

	jmp	Lshort			// finish the 0..63 residual bytes in %edx

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
	negl    %ecx			// negate length to move
	jmp	1f			// jump over the alignment padding

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movntdq %xmm0,(%edi,%ecx)
	movntdq %xmm0,16(%edi,%ecx)
	movntdq %xmm0,32(%edi,%ecx)
	movntdq %xmm0,48(%edi,%ecx)
	addl    $64,%ecx
	jne     1b

	sfence                          // required by non-temporal stores
	jmp	Lshort			// finish the 0..63 residual bytes in %edx

// NOTE(review): the last two arguments look like the CPU capability bits
// this variant requires (kHasSSE2) and the bits that rule it out
// (kHasSSE4_2) -- confirm against platfunc.h.
PLATFUNC_DESCRIPTOR(bzero,sse2,kHasSSE2,kHasSSE4_2)
Commit	Line	Data
1f2f436a A	1	/*
	2	* Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	#include <machine/cpu_capabilities.h>
	30	#include <platfunc.h>
	31
	32	/*
	33	* Bzero, tuned for Pentium-M class processors with SSE2
	34	* and 64-byte cache lines.
	35	*
	36	* This routine is also used for memset(p,0,n), which is a common case
	37	* since gcc sometimes silently maps bzero() into memset(). As a result,
	38	* we always load the original ptr into %eax before returning.
	39	*/
	40
	41	#define kShort 80 // too short to bother with SSE (must be >=80)
	42	#define kVeryLong (1024*1024)
	43
	44	// void bzero(void *b, size_t len);
	45
	46	PLATFUNC_FUNCTION_START(bzero, sse2, 32, 5)
	47	pushl %ebp // set up a frame for backtraces
	48	movl %esp,%ebp
	49	pushl %edi
	50	movl 8(%ebp),%edi // get ptr
	51	movl 12(%ebp),%edx // get length
	52
	53	xorl %eax,%eax // set fill data to 0
	54	cmpl $(kShort),%edx // long enough for SSE?
	55	jg LNotShort // yes
	56
	57	// Here for short operands or the end of long ones.
	58	// %edx = length
	59	// %edi = ptr
	60	// %eax = zero
	61
	62	Lshort:
	63	cmpl $16,%edx // long enough to word align?
	64	jge 3f // yes
65	test %edx,%edx // length==0?
66	jz 6f
67	1:
68	movb %al,(%edi) // zero a byte
69	inc %edi
70	dec %edx
71	jnz 1b
72	jmp 6f
73	2:
74	movb %al,(%edi) // zero a byte
75	inc %edi
76	dec %edx
77	3:
78	test $3,%edi // is ptr doubleword aligned?
79	jnz 2b // no
80	movl %edx,%ecx // copy length
81	shrl $2,%edx // #doublewords to store
82	4:
83	movl %eax,(%edi) // zero an aligned doubleword
84	addl $4,%edi
85	dec %edx
86	jnz 4b
87	andl $3,%ecx // mask down to #bytes at end (0..3)
88	jz 6f // none
89	5:
90	movb %al,(%edi) // zero a byte
91	inc %edi
92	dec %ecx
93	jnz 5b
94	6:
95	movl 8(%ebp),%eax // get return value in case this was a call of memset()
96	popl %edi
97	popl %ebp
98	ret
99
100
101	// We will be using SSE, so align ptr.
102
103	LNotShort:
104	movl %edi,%ecx
105	negl %ecx
106	andl $15,%ecx // mask down to #bytes to 16-byte align
107	jz LDestAligned // already aligned
108	subl %ecx,%edx // decrement length
109	0: // loop storing bytes to align the ptr
110	movb %al,(%edi) // pack in a byte
111	inc %edi
112	dec %ecx
113	jnz 0b
114
115	// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
116	// %edx = length
117	// %edi = ptr
118	// %eax = zero
119
120	LDestAligned:
121	movl %edx,%ecx
122	andl $63,%edx // mask down to residual length (0..63)
123	andl $-64,%ecx // get #bytes we will zero in this loop
124	pxor %xmm0,%xmm0 // zero an SSE register
125	addl %ecx,%edi // increment ptr by length to move
126	cmpl $(kVeryLong),%ecx // long enough to justify non-temporal stores?
127	jae LVeryLong // yes
128	negl %ecx // negate length to move
129	jmp 1f
130
131	// Loop over 64-byte chunks, storing into cache.
132
133	.align 4,0x90 // keep inner loops 16-byte aligned
134	1:
135	movdqa %xmm0,(%edi,%ecx)
136	movdqa %xmm0,16(%edi,%ecx)
137	movdqa %xmm0,32(%edi,%ecx)
138	movdqa %xmm0,48(%edi,%ecx)
139	addl $64,%ecx
140	jne 1b
141
142	jmp Lshort
143
144	// Very long operands: use non-temporal stores to bypass cache.
145
146	LVeryLong:
147	negl %ecx // negate length to move
148	jmp 1f
149
150	.align 4,0x90 // keep inner loops 16-byte aligned
151	1:
152	movntdq %xmm0,(%edi,%ecx)
153	movntdq %xmm0,16(%edi,%ecx)
154	movntdq %xmm0,32(%edi,%ecx)
155	movntdq %xmm0,48(%edi,%ecx)
156	addl $64,%ecx
157	jne 1b
158
159	sfence // required by non-temporal stores
160	jmp Lshort
161
162	PLATFUNC_DESCRIPTOR(bzero,sse2,kHasSSE2,kHasSSE4_2)