/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C P Y *
// ***************
//
// char *strcpy(char *dst, const char *src);
//
// We optimize the copy by moving 16 bytes at a time with vector instructions.
// This introduces a complication: if we blindly did vector loads and stores
// until finding a 0, we might take a spurious page fault by touching bytes
// past it.  To avoid this, we never do a load that crosses a page boundary,
// and we never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.
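//
// As a rough C sketch of the same idea (illustrative only: the sketch, its
// name, and the SSE2 intrinsics are assumptions for exposition, not part of
// this file):
//
//      #include <emmintrin.h>                  // SSE2 intrinsics
//      #include <stdint.h>
//      #include <stddef.h>
//
//      char *strcpy_sketch(char *dst, const char *src) {
//          char *d = dst;
//          size_t n = (0 - (uintptr_t)d) & 15; // bytes needed to align dst
//          for (;;) {
//              while (n--)                     // byte-at-a-time loop
//                  if ((*d++ = *src++) == 0)
//                      return dst;
//              // whole 16-byte chunks left before the source page ends
//              size_t chunks = (4096 - ((uintptr_t)src & 4095)) >> 4;
//              while (chunks--) {              // vector loop
//                  __m128i v = _mm_loadu_si128((const __m128i *)src);
//                  int mask = _mm_movemask_epi8(
//                      _mm_cmpeq_epi8(v, _mm_setzero_si128()));
//                  if (mask) {                 // v contains the terminating 0
//                      int k = __builtin_ctz(mask);
//                      for (int i = 0; i <= k; i++)
//                          d[i] = src[i];      // store up to and including the 0
//                      return dst;
//                  }
//                  _mm_store_si128((__m128i *)d, v);   // aligned store
//                  src += 16; d += 16;
//              }
//              n = 16;                         // cross the page byte by byte
//          }
//      }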

        .text
        .globl _strcpy

        .align  4
_strcpy:                                // char *strcpy(char *dst, const char *src);
        pushl   %edi
        movl    8(%esp),%edi            // get dest ptr
        movl    12(%esp),%ecx           // get source ptr
        movl    %edi,%edx               // copy dest ptr
        negl    %edx
        andl    $15,%edx                // how many bytes to align dest ptr?
        jnz     LLoopOverBytes          // not aligned, so copy bytes until it is


// In order to avoid spurious page faults, we loop over 16-byte chunks only
// while a whole chunk remains within the source page.  Near the page end we
// revert to a byte-by-byte loop for 16 bytes until the page is crossed, then
// resume the vector loop.
// %ecx = source ptr (unaligned)
// %edi = dest ptr (aligned)
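//
// For example, if %ecx = 0x1FF5 then the offset into the page is 0xFF5 = 4085,
// leaving 4096 - 4085 = 11 bytes in the page and 11>>4 = 0 whole chunks, so we
// cross the boundary with the 16-byte byte loop and then recompute.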

LNextChunk:
        movl    %ecx,%eax               // copy source ptr
        movl    $4096,%edx
        andl    $4095,%eax              // get offset into source page
        subl    %eax,%edx               // get #bytes remaining in source page
        shrl    $4,%edx                 // get #chunks till end of page
        jnz     LLoopOverChunks         // enter vector loop
        movl    $16,%edx                // move 16 bytes to cross page but keep dest aligned
        jmp     LLoopOverBytes


// Loop over bytes.
// %ecx = source ptr
// %edi = dest ptr
// %edx = byte count
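//
// Roughly equivalent C (an illustrative sketch, not part of the original):
//      do { if ((*dst++ = *src++) == 0) goto done; } while (--count);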

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverBytes:
        movzb   (%ecx),%eax             // get source byte
        inc     %ecx
        movb    %al,(%edi)              // pack into dest
        inc     %edi
        testl   %eax,%eax               // 0?
        jz      LDone                   // yes, we're done
        dec     %edx                    // more to go?
        jnz     LLoopOverBytes

        jmp     LNextChunk              // we've come to end of page


// Loop over 16-byte chunks.
// %ecx = source ptr (unaligned)
// %edi = dest ptr (aligned)
// %edx = chunk count
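//
// In SSE2 intrinsics terms, each iteration does roughly the following
// (an illustrative sketch, not part of the original):
//      __m128i v = _mm_loadu_si128((const __m128i *)src);      // movdqu
//      int mask = _mm_movemask_epi8(
//          _mm_cmpeq_epi8(v, _mm_setzero_si128()));            // pcmpeqb + pmovmskb
//      if (mask) goto found0;                  // some byte of v is 0
//      _mm_store_si128((__m128i *)dst, v);     // movdqa (dst is 16-byte aligned)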

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverChunks:
        movdqu  (%ecx),%xmm1            // get source
        pxor    %xmm0,%xmm0             // get some 0s
        addl    $16,%ecx
        pcmpeqb %xmm1,%xmm0             // compare source to 0s
        pmovmskb %xmm0,%eax             // get result mask for 0 check
        testl   %eax,%eax               // any 0s?
        jnz     LFound0                 // yes, exit loop
        movdqa  %xmm1,(%edi)            // no 0s so do aligned store into destination
        addl    $16,%edi
        dec     %edx                    // more to go?
        jnz     LLoopOverChunks

        movl    $16,%edx                // move 16 bytes
        jmp     LLoopOverBytes          // cross page but keep dest aligned


// Found a zero in the vector.  Figure out where it is, and store the bytes
// up to and including it.
// %edi = dest ptr (aligned)
// %eax = result mask
// %xmm1 = source vector
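//
// For example, if the first 0 is at byte index 10 then %edx = 11 = 0b01011:
// bit 4 is clear (no whole-vector store), bit 3 is set (one 8-byte store),
// bit 2 is clear (no 4-byte store), and 11 & 3 = 3 trailing bytes come out of
// %eax, for 8 + 3 = 11 bytes stored in all, including the terminating 0.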

LFound0:
        bsf     %eax,%edx               // find first 0
        inc     %edx                    // we need to store the 0 too
        test    $16,%dl                 // was 0 last byte?
        jz      8f                      // no
        movdqa  %xmm1,(%edi)            // yes, store entire vector
        jmp     LDone
8:
        test    $8,%dl                  // 8-byte store required?
        jz      4f                      // no
        movq    %xmm1,(%edi)            // pack in 8 low bytes
        psrldq  $8,%xmm1                // then shift vector down 8 bytes
        addl    $8,%edi
4:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm1,(%edi)            // pack in 4 low bytes
        psrldq  $4,%xmm1                // then shift vector down 4 bytes
        addl    $4,%edi
3:
        andl    $3,%edx                 // more to go?
        jz      LDone                   // no
        movd    %xmm1,%eax              // move remainders out of vector into %eax
1:                                      // loop on up to three bytes
        movb    %al,(%edi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        inc     %edi
        dec     %edx
        jnz     1b

LDone:
        movl    8(%esp),%eax            // original dest ptr is return value
        popl    %edi
        ret