[apple/libc.git] / i386 / string / strncpy.s

/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>


// *****************
// * S T R N C P Y *
// *****************
//
// char  *strncpy(char *dst, const char *src, size_t n);
//
// We optimize the move by doing it vector parallel.  This introduces
// a complication: if we blindly did vector load/stores until finding
// a 0, we might get a spurious page fault by touching bytes past it.
// To avoid this, we never do a load that crosses a page boundary,
// and never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.
//
// Recall that strncpy() zero fills the remainder of the dest buffer,
// and does not terminate the string if its length is greater than or
// equal to n.

#define	kShort	31			// too short to bother with vector loop

        .text
        .globl _strncpy

        .align 	4
// _strncpy -- copy at most n bytes of the 0-terminated string at src into dst,
// then zero-fill whatever is left of the n-byte destination buffer.
//
// In (stack offsets as seen after the two register saves below):
//	12(%esp) = dst
//	16(%esp) = src
//	20(%esp) = n
// Out:
//	%eax = original dst
// Saved and restored: %edi, %esi
// Written without saving: %eax, %ecx, %edx, %xmm0, %xmm1, flags (plus whatever
// the routine at _COMM_PAGE_BZERO modifies -- not visible from this file).
_strncpy:				// char *strncpy(char *dst, const char *src, size_t n);
	pushl	%edi
	pushl	%esi
	// Stack now: 0=saved %esi, 4=saved %edi, 8=return address, 12=dst, 16=src, 20=n
	movl	12(%esp),%edi		// get dest ptr
	movl	16(%esp),%esi		// get source ptr
	movl	20(%esp),%ecx		// get length
	movl	%edi,%edx		// copy dest ptr
	negl	%edx
	andl	$15,%edx		// (-dst) & 15 = #bytes up to next 16-byte boundary
	jnz	LCheckShortCopy		// nonzero => dest unaligned, byte-copy %edx bytes first
	
	
// In order to avoid spurious page faults, we loop until nearing the source page
// end.  Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
// then resume the vector loop. 
// The page size is hard-coded here as 4096 bytes.
//	%esi = source ptr (unaligned)
//	%edi = dest ptr (aligned)
//	%ecx = buffer length remaining

LNextChunk:				// NB: can drop down to here
	movl	%esi,%eax		// copy source ptr
	movl	$4096,%edx
	andl	$4095,%eax		// get offset into source page
	subl	%eax,%edx		// get #bytes remaining in source page
	cmpl	%ecx,%edx		// will buffer run out before the page end?
	cmova	%ecx,%edx		// unsigned min(length remaining, bytes to page end)
	shrl	$4,%edx			// get #whole 16-byte chunks that fit in both limits
	jnz	LLoopOverChunks		// enter vector loop (flags are from the shrl)
	
// We can't use the chunk loop yet.  Check for short and empty buffers, then use byte loop.
// Two ways in: falling through / jumping to LCrossPage proposes 16 bytes; the
// function entry jumps straight to LCheckShortCopy proposing the 1..15 bytes
// needed to align the destination.  Either way, if more than kShort bytes of
// buffer remain the proposal is accepted (it is <= 16 < %ecx); otherwise the
// whole remainder is handled by the byte loop and the vector loop is never
// (re)entered.

LCrossPage:				// if buffer is large enough, cross source page
	movl	$16,%edx		// move 16 bytes to cross page but keep dest aligned
LCheckShortCopy:			// we propose to copy %edx bytes in byte loop
	cmpl	$(kShort),%ecx		// much left?
	ja	LLoopOverBytes		// yes (unsigned >), loop over bytes then more chunks
	movl	%ecx,%edx		// no, use the byte loop for everything
	testl	%ecx,%ecx		// have we filled buffer?
	jnz	LLoopOverBytes		// no
	jmp	LDone			// n == 0 or buffer exactly filled: nothing to zero


// Loop over bytes.
//	%esi = source ptr
//	%edi = dest ptr
//	%ecx = buffer length remaining
//	%edx = count of bytes to loop over (<= buffer length)
//
// When a 0 byte is read it has already been stored and counted in %ecx before
// the branch, so LZeroBuffer receives exactly the bytes that follow it (%ecx
// may be 0 there if the terminator landed on the last byte of the buffer).

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverBytes:
	movzb	(%esi),%eax		// get source byte, zero-extended so testl sees only it
	inc	%esi
	dec	%ecx			// decrement length
	movb	%al,(%edi)		// pack into dest
	inc	%edi
	testl	%eax,%eax		// 0?
	jz	LZeroBuffer		// yes, we're done copying string
	dec	%edx			// more to go?
	jnz	LLoopOverBytes
	
	// Byte run finished without seeing a 0.  On every path that gets here
	// with %ecx != 0 the destination is 16-byte aligned (either the
	// alignment prefix or a 16-byte page-crossing run was just copied).
	testl	%ecx,%ecx		// at end of buffer?
	jnz	LNextChunk		// no, xfer chunks
	jmp	LDone			// yes: buffer full, string left unterminated


// Loop over 16-byte chunks.
//	%esi = source ptr (unaligned)
//	%edi = dest ptr (aligned)
//	%ecx = buffer length remaining
//	%edx = chunk count
//
// The chunk count was derived from min(%ecx, bytes to source page end), so each
// 16-byte load stays inside the current source page and each 16-byte store stays
// inside the destination buffer (%ecx >= 16 at the top of every iteration).

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverChunks:
	movdqu	(%esi),%xmm1		// get source (unaligned load)
	pxor	%xmm0,%xmm0		// get some 0s (redone each pass: pcmpeqb overwrites %xmm0)
	addl	$16,%esi		// advanced before the 0 check; %esi is not read again on the LFound0 path
	pcmpeqb	%xmm1,%xmm0		// compare source to 0s: 0xFF in each byte lane that is 0
	pmovmskb %xmm0,%eax		// get result mask for 0 check (bit i set <=> source byte i == 0)
	testl	%eax,%eax		// any 0s?
	jnz	LFound0			// yes, exit loop
	movdqa	%xmm1,(%edi)		// no 0s so do aligned store into destination
	addl	$16,%edi
	subl	$16,%ecx		// decrement length remaining
	dec	%edx			// more to go?
	jnz	LLoopOverChunks
	
	jmp	LCrossPage		// cross page but keep dest aligned
	

// Found a zero in the vector.  Figure out where it is, and store the bytes
// up to it.  It is possible that we should check to be sure (%ecx >= 16), and
// just do an aligned store of %xmm1 if so.  But if we did, we'd be doing byte
// stores into the same double quadword in bzero(), which might hit a hazard.
// Experimentation needed.
//	%edi = dest ptr (aligned)
//	%eax = result mask
//	%ecx = buffer length remaining
//	%xmm1 = source vector
//
// %edx is reused here: it becomes the index (0..15) of the first 0 byte, which
// is also the number of non-zero bytes to store.  That count is split by its
// bits into an optional 8-byte store, an optional 4-byte store and 0..3 single
// byte stores.  The terminator itself is not stored here; it is the first
// byte written by the zero fill (%ecx >= 16 on arrival, so >= 1 afterwards).

LFound0:
	bsf	%eax,%edx		// find first 0
	subl	%edx,%ecx		// decrement remaining buffer length
	test	$8,%dl			// 8-byte store required?
	jz	4f			// no
	movq	%xmm1,(%edi)		// pack in 8 low bytes
	psrldq	$8,%xmm1		// then shift vector down 8 bytes
	addl	$8,%edi
4:
	test	$4,%dl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm1,(%edi)		// pack in 4 low bytes
	psrldq	$4,%xmm1		// then shift vector down 4 bytes
	addl	$4,%edi
3:
	andl	$3,%edx			// more to go?
	jz	LZeroBuffer		// no
	movd	%xmm1,%eax		// move remainders out of vector into %eax
1:					// loop on up to three bytes
	movb	%al,(%edi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	inc	%edi
	dec	%edx
	jnz	1b

// We've copied the string.  Now zero the rest of the buffer, using commpage bzero().
//	%edi = dest ptr
//	%ecx = buffer length remaining
//
// The two arguments are passed on the stack (pointer at the lower address,
// length above it) and popped by us afterwards.  _COMM_PAGE_BZERO is not
// defined in this file; presumably it comes from <machine/cpu_capabilities.h>.
// NOTE(review): %esp at the call is (entry %esp - 16), so it is 16-byte aligned
// only if %esp already was on entry to _strncpy.  TODO confirm the commpage
// routine has no stack-alignment requirement and accepts a length of 0.

LZeroBuffer:
	pushl	%ecx			// remaining buffer size
	pushl	%edi			// ptr to 1st unstored byte
	movl	$(_COMM_PAGE_BZERO),%eax
	call	*%eax			// indirect call through the absolute address in %eax
	addl	$8,%esp			// pop off the arguments

// Common exit.  Only the two register saves are on the stack here, so the
// original dst argument is again at 12(%esp).
LDone:
	movl	12(%esp),%eax		// original dest ptr is return value
	popl	%esi
	popl	%edi
	ret
Commit	Line	Data
eb1cde05 A	1	/*
	2	* Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. Please obtain a copy of the License at
	10	* http://www.opensource.apple.com/apsl/ and read it before using this
	11	* file.
	12	*
	13	* The Original Code and all software distributed under the License are
	14	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	15	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	16	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	17	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	18	* Please see the License for the specific language governing rights and
	19	* limitations under the License.
	20	*
	21	* @APPLE_LICENSE_HEADER_END@
	22	*/
	23
	24	#include <machine/cpu_capabilities.h>
	25
	26
	27	// *****************
	28	// * S T R N C P Y *
	29	// *****************
	30	//
	31	// char *strncpy(char *dst, const char *src, size_t n);
	32	//
	33	// We optimize the move by doing it vector parallel. This introduces
	34	// a complication: if we blindly did vector load/stores until finding
	35	// a 0, we might get a spurious page fault by touching bytes past it.
	36	// To avoid this, we never do a load that crosses a page boundary,
	37	// and never store a byte we don't have to.
	38	//
	39	// We align the destination, because unaligned vector stores are slow.
	40	//
	41	// Recall that strncpy() zero fills the remainder of the dest buffer,
	42	// and does not terminate the string if its length is greater than or
	43	// equal to n.
	44
	45	#define kShort 31 // too short to bother with vector loop
	46
	47	.text
	48	.globl _strncpy
	49
	50	.align 4
	51	_strncpy: // char *strncpy(char *dst, const char *src, size_t n);
	52	pushl %edi
	53	pushl %esi
	54	movl 12(%esp),%edi // get dest ptr
	55	movl 16(%esp),%esi // get source ptr
	56	movl 20(%esp),%ecx // get length
	57	movl %edi,%edx // copy dest ptr
	58	negl %edx
	59	andl $15,%edx // how many bytes to align dest ptr?
	60	jnz LCheckShortCopy // align destination first
	61
	62
	63	// In order to avoid spurious page faults, we loop until nearing the source page
	64	// end. Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
65	// then resume the vector loop.
66	// %esi = source ptr (unaligned)
67	// %edi = dest ptr (aligned)
68	// %ecx = buffer length remaining
69
70	LNextChunk: // NB: can drop down to here
71	movl %esi,%eax // copy source ptr
72	movl $4096,%edx
73	andl $4095,%eax // get offset into source page
74	subl %eax,%edx // get #bytes remaining in source page
75	cmpl %ecx,%edx // will buffer run out before the page end?
76	cmova %ecx,%edx // get min(length remaining, bytes to page end)
77	shrl $4,%edx // get #chunks till end of page
78	jnz LLoopOverChunks // enter vector loop
79
80	// We can't use the chunk loop yet. Check for short and empty buffers, then use byte loop.
81
82	LCrossPage: // if buffer is large enough, cross source page
83	movl $16,%edx // move 16 bytes to cross page but keep dest aligned
84	LCheckShortCopy: // we propose to copy %edx bytes in byte loop
85	cmpl $(kShort),%ecx // much left?
86	ja LLoopOverBytes // yes, loop over bytes then more chunks
87	movl %ecx,%edx // no, use the byte loop for everything
88	testl %ecx,%ecx // have we filled buffer?
89	jnz LLoopOverBytes // no
90	jmp LDone
91
92
93	// Loop over bytes.
94	// %esi = source ptr
95	// %edi = dest ptr
96	// %ecx = buffer length remaining
97	// %edx = count of bytes to loop over (<= buffer length)
98
99	.align 4,0x90 // align inner loops to optimize I-fetch
100	LLoopOverBytes:
101	movzb (%esi),%eax // get source byte
102	inc %esi
103	dec %ecx // decrement length
104	movb %al,(%edi) // pack into dest
105	inc %edi
106	testl %eax,%eax // 0?
107	jz LZeroBuffer // yes, we're done copying string
108	dec %edx // more to go?
109	jnz LLoopOverBytes
110
111	testl %ecx,%ecx // at end of buffer?
112	jnz LNextChunk // no, xfer chunks
113	jmp LDone // yes
114
115
116	// Loop over 16-byte chunks.
117	// %esi = source ptr (unaligned)
118	// %edi = dest ptr (aligned)
119	// %ecx = buffer length remaining
120	// %edx = chunk count
121
122	.align 4,0x90 // align inner loops to optimize I-fetch
123	LLoopOverChunks:
124	movdqu (%esi),%xmm1 // get source
125	pxor %xmm0,%xmm0 // get some 0s
126	addl $16,%esi
127	pcmpeqb %xmm1,%xmm0 // compare source to 0s
128	pmovmskb %xmm0,%eax // get result mask for 0 check
129	testl %eax,%eax // any 0s?
130	jnz LFound0 // yes, exit loop
131	movdqa %xmm1,(%edi) // no 0s so do aligned store into destination
132	addl $16,%edi
133	subl $16,%ecx // decrement length remaining
134	dec %edx // more to go?
135	jnz LLoopOverChunks
136
137	jmp LCrossPage // cross page but keep dest aligned
138
139
140	// Found a zero in the vector. Figure out where it is, and store the bytes
141	// up to it. It is possible that we should check to be sure (%ecx >= 16), and
142	// just do an aligned store of %xmm1 if so. But if we did, we'd be doing byte
143	// stores into the same double quadword in bzero(), which might hit a hazard.
144	// Experimentation needed.
145	// %edi = dest ptr (aligned)
146	// %eax = result mask
147	// %ecx = buffer length remaining
148	// %xmm1 = source vector
149
150	LFound0:
151	bsf %eax,%edx // find first 0
152	subl %edx,%ecx // decrement remaining buffer length
153	test $8,%dl // 8-byte store required?
154	jz 4f // no
155	movq %xmm1,(%edi) // pack in 8 low bytes
156	psrldq $8,%xmm1 // then shift vector down 8 bytes
157	addl $8,%edi
158	4:
159	test $4,%dl // 4-byte store required?
160	jz 3f // no
161	movd %xmm1,(%edi) // pack in 4 low bytes
162	psrldq $4,%xmm1 // then shift vector down 4 bytes
163	addl $4,%edi
164	3:
165	andl $3,%edx // more to go?
166	jz LZeroBuffer // no
167	movd %xmm1,%eax // move remainders out of vector into %eax
168	1: // loop on up to three bytes
169	movb %al,(%edi) // pack in next byte
170	shrl $8,%eax // shift next byte into position
171	inc %edi
172	dec %edx
173	jnz 1b
174
175	// We've copied the string. Now zero the rest of the buffer, using commpage bzero().
176	// %edi = dest ptr
177	// %ecx = buffer length remaining
178
179	LZeroBuffer:
180	pushl %ecx // remaining buffer size
181	pushl %edi // ptr to 1st unstored byte
182	movl $(_COMM_PAGE_BZERO),%eax
8e029c65	183	call *%eax
eb1cde05 A	184	addl $8,%esp // pop off the arguments
	185
	186	LDone:
	187	movl 12(%esp),%eax // original dest ptr is return value
	188	popl %esi
	189	popl %edi
	190	ret