/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>


/* This file contains the following functions:
 *
 *	void *memset(void *b, int c, size_t len);
 *	void memset_pattern4(void *b, const void *c4, size_t len);
 *	void memset_pattern8(void *b, const void *c8, size_t len);
 *	void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Calls of memset() with c==0 are routed to the bzero() routine. Most of the
 * others go to _memset_pattern, which is entered as follows:
 *	%edi = ptr to memory to set (aligned)
 *	%edx = length (which can be short, though we bias in favor of long operands)
 *	%xmm0 = the pattern to store
 * Return conditions:
 *	%eax, %edi, %esi, %ecx, and %edx all trashed
 *
 * NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
 * on P4s and probably other processors.
 */
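
/* A minimal usage sketch of the pattern routines (illustrative, not part of
 * this file; "buf", "buflen", and the pattern value are hypothetical):
 *
 *	uint32_t pat = 0xDEADBEEF;
 *	memset_pattern4(buf, &pat, buflen);	// repeats the 4-byte pattern
 *						// through the first buflen bytes
 */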
44 | ||
45 | #define kShort 255 // for nonzero memset(), too short for commpage | |
46 | ||
47 | ||
48 | .text | |
49 | .globl _memset | |
50 | .align 2 | |
51 | _memset: // void *memset(void *b, int c, size_t len); | |
52 | movl 8(%esp),%eax // get 1-byte pattern | |
53 | movl 12(%esp),%edx // get length | |
54 | andl $0xFF,%eax // (c==0) ? | |
55 | jnz LNonzero // not a bzero | |
56 | ||
eb1cde05 | 57 | movl %edx,8(%esp) // put count where bzero() expects it |
1f2f436a | 58 | jmp _bzero // enter _bzero |
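					// (tail jump, not a call: with the count moved into
					// bzero()'s argument slot, bzero() runs on our frame
					// and returns directly to memset's caller)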


// Handle memset of a nonzero value.

LNonzero:
	pushl	%edi			// save a few nonvolatiles
	pushl	%esi
	movl	%eax,%esi		// replicate byte in %al into all four bytes
	movl	12(%esp),%edi		// point to operand
	shll	$8,%esi
	orl	%esi,%eax
	movl	%eax,%esi
	shll	$16,%esi
	orl	%esi,%eax		// now %eax has "c" in all 4 bytes
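					// (illustrative: for c==0x5A the value grows
					//  0x0000005A -> 0x00005A5A -> 0x5A5A5A5A)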
	cmpl	$(kShort),%edx		// is operand too short for SSE?
	ja	LCallCommpage		// no

// Nonzero memset() too short to call commpage.
//	%eax = replicated 4-byte pattern
//	%edi = ptr
//	%edx = length (<= kShort)
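// (worked example, hypothetical values: ptr%4==1 and len==22 stores 3 single
// bytes to reach doubleword alignment, then 4 aligned doublewords, then the
// remaining 19%4==3 bytes one at a time)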

	cmpl	$16,%edx		// long enough to word align?
	jge	3f			// yes
	test	%edx,%edx		// length==0?
	jz	6f
1:
	movb	%al,(%edi)		// pack in a byte
	inc	%edi
	dec	%edx
	jnz	1b
	jmp	6f
2:
	movb	%al,(%edi)		// pack in a byte
	inc	%edi
	dec	%edx
3:
	test	$3,%edi			// is ptr doubleword aligned?
	jnz	2b			// no
	movl	%edx,%ecx		// copy length
	shrl	$2,%edx			// #doublewords to store
4:
	movl	%eax,(%edi)		// store aligned doubleword
	addl	$4,%edi
	dec	%edx
	jnz	4b
	andl	$3,%ecx			// any leftover bytes?
	jz	6f			// no
5:
	movb	%al,(%edi)		// pack in a byte
	inc	%edi
	dec	%ecx
	jnz	5b
6:
	movl	12(%esp),%eax		// get return value (ie, original ptr)
	popl	%esi
	popl	%edi
	ret

// Nonzero memset() is long enough to call commpage.
//	%eax = replicated 4-byte pattern
//	%edi = ptr
//	%edx = length (> kShort)

LCallCommpage:
	movd	%eax,%xmm0		// move %eax to low 4 bytes of %xmm0
	pshufd	$(0x00),%xmm0,%xmm0	// replicate across the vector
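					// (mask 0x00 selects doubleword 0 for all four
					// lanes, broadcasting the 4-byte pattern)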
	movl	%edi,%ecx		// copy dest ptr
	negl	%ecx
	andl	$15,%ecx		// get #bytes to align ptr
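					// ((-ptr) & 15 == distance to the next 16-byte
					// boundary, 0 if already aligned)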
	jz	2f			// skip if already aligned
	subl	%ecx,%edx		// decrement length
1:
	movb	%al,(%edi)		// pack in a byte
	inc	%edi
	dec	%ecx
	jnz	1b
2:					// ptr aligned, length long enough to justify
	call	_memset_pattern		// call commpage to do the heavy lifting
	movl	12(%esp),%eax		// get return value (ie, original ptr)
	popl	%esi
	popl	%edi
	ret


// Handle memset of a 16-byte pattern.

	.globl _memset_pattern16
	.align 2, 0x90
_memset_pattern16:			// void memset_pattern16(void *b, const void *c16, size_t len);
	pushl	%edi
	pushl	%esi
	movl	20(%esp),%edx		// get length
	movl	16(%esp),%esi		// get ptr to 16-byte pattern
	movl	12(%esp),%edi		// point to operand
	movdqu	(%esi),%xmm0		// load the pattern
	jmp	LAlignPtr


// Handle memset of an 8-byte pattern.

	.globl _memset_pattern8
	.align 2, 0x90
_memset_pattern8:			// void memset_pattern8(void *b, const void *c8, size_t len);
	pushl	%edi
	pushl	%esi
	movl	20(%esp),%edx		// get length
	movl	16(%esp),%esi		// get ptr to 8-byte pattern
	movl	12(%esp),%edi		// point to operand
	movq	(%esi),%xmm0		// load pattern into low 8 bytes
	punpcklqdq %xmm0,%xmm0		// replicate into all 16
	jmp	LAlignPtr

// Handle memset of a 4-byte pattern.

	.globl _memset_pattern4
	.align 2, 0x90
_memset_pattern4:			// void memset_pattern4(void *b, const void *c4, size_t len);
	pushl	%edi
	pushl	%esi
	movl	20(%esp),%edx		// get length
	movl	16(%esp),%esi		// get ptr to 4-byte pattern
	movl	12(%esp),%edi		// point to operand
	movd	(%esi),%xmm0		// load pattern into low 4 bytes
	pshufd	$(0x00),%xmm0,%xmm0	// replicate the 4 bytes across the vector


// Align ptr if necessary. We must rotate the pattern right for each byte we
// store while aligning the ptr. Since there is no rotate instruction in SSE3,
// we have to synthesize the rotates.
//	%edi = ptr
//	%edx = length
//	%xmm0 = pattern
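// (illustrative: with a 16-byte pattern P[0..15] and a ptr 3 bytes short of
// alignment, we store P[0..2] and rotate the vector right 3 bytes so it now
// begins with P[3]; the aligned 16-byte stores that follow then continue the
// pattern in memory without a seam)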

LAlignPtr:				// NB: can drop down to here!
	cmpl	$100,%edx		// long enough to bother aligning ptr?
	movl	%edi,%ecx		// copy ptr
	jb	LReady			// not long enough
	negl	%ecx
	andl	$15,%ecx		// get #bytes to align ptr
	jz	LReady			// already aligned
	subl	%ecx,%edx		// adjust length

	test	$1,%cl			// 1-byte store required?
	movd	%xmm0,%eax		// get 4 low bytes in %eax
	jz	2f			// no
	movdqa	%xmm0,%xmm1		// copy pattern so we can shift in both directions
	movb	%al,(%edi)		// pack in the low-order byte
	psrldq	$1,%xmm0		// shift pattern right 1 byte
	inc	%edi
	pslldq	$15,%xmm1		// shift pattern left 15 bytes
	shrl	$8,%eax			// in case 2-byte store is required
	por	%xmm1,%xmm0		// complete right rotate of pattern by 1 byte
2:
	test	$2,%cl			// 2-byte store required?
	jz	4f			// no
	psrldq	$2,%xmm0		// shift pattern down 2 bytes
	movw	%ax,(%edi)		// pack in next two bytes
	pinsrw	$7,%eax,%xmm0		// insert low word of %eax into high word of %xmm0
	addl	$2,%edi			// adjust ptr
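					// (psrldq then pinsrw together rotate the pattern
					// right by the 2 bytes just stored)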
4:
	test	$4,%cl			// 4-byte store required?
	jz	8f			// no
	movd	%xmm0,(%edi)		// store low 4 bytes of %xmm0
	pshufd	$(0x39),%xmm0,%xmm0	// rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
	addl	$4,%edi			// adjust ptr
8:
	test	$8,%cl			// 8-byte store required?
	jz	LReady			// no
	movq	%xmm0,(%edi)		// store low 8 bytes of %xmm0
	pshufd	$(0x4e),%xmm0,%xmm0	// rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
	addl	$8,%edi			// adjust ptr

// Ptr is aligned if practical; we're ready to call commpage to do the heavy lifting.

LReady:
	call	_memset_pattern		// call commpage to do the heavy lifting
	popl	%esi
	popl	%edi
	ret