[apple/libc.git] / i386 / string / strlcpy.s

/*
 * Copyright (c) 2007 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */


// *****************
// * S T R L C P Y *
// *****************
//
// size_t  strlcpy(char *dst, const char *src, size_t size);
//
// We optimize the move by doing it word parallel.  This introduces
// a complication: if we blindly did word load/stores until finding
// a 0, we might get a spurious page fault by touching bytes past it.
// To avoid this, we never do a load that crosses a page boundary,
// or store unnecessary bytes.
//
// The test for 0s relies on the following inobvious but very efficient
// word-parallel test:
//		x =  dataWord + 0xFEFEFEFF
//		y = ~dataWord & 0x80808080
//		if (x & y) == 0 then no zero found
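// The test maps any non-zero byte to zero, and any zero byte to 0x80,
// with one exception: 0x01 bytes immediately following a zero byte
// (the next more-significant bytes on little-endian x86) are also
// mapped to 0x80.  That is harmless here, because the bytes of a
// flagged word are stored one at a time from the low end up to the
// first zero.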
//
// On Core2 class machines, this word-parallel implementation seems to
// be slightly faster than using SSE up to about 100 bytes.
// It is faster than the naive byte-by-byte implementation for
// operands longer than about 8 bytes.

        .text
        .globl _strlcpy

        .align 	4
// size_t strlcpy(char *dst, const char *src, size_t size);
//
// Copies src into the size-byte buffer at dst, storing at most size bytes
// including the terminating 0.  When size is non-zero the result is always
// 0-terminated (truncating if necessary); when size is 0 nothing is stored.
// Returns the length of src (not counting its 0-byte) in %eax, whether or
// not the copy was truncated.
//
// Arguments are read from the stack: after the three pushes below they are
// at 16(%esp) = dst, 20(%esp) = src, 24(%esp) = size.
// %edi, %esi and %ebx are saved on entry and restored at LExit;
// %eax, %ecx and %edx are overwritten and not restored.
_strlcpy:				// size_t strlcpy(char *dst, const char *src, size_t size);
	pushl	%edi			// save the registers that LExit restores
	pushl	%esi
	pushl	%ebx
	movl	16(%esp),%edi		// get dest ptr (args start at 16(%esp) after the 3 pushes)
	movl	20(%esp),%esi		// get source ptr
	movl	24(%esp),%ecx		// get length of buffer
	movl	%esi,%edx		// copy source ptr
	negl	%edx
	andl	$3,%edx			// (-src) & 3 = bytes needed to reach a 4-byte boundary
	jz	LAligned		// already aligned


// Loop over bytes.  Entered from the prologue (to align the source), from
// LAligned (fewer than 4 bytes of buffer left) and from the word loop
// (leftover buffer space).
//	%edi = dest ptr
//	%esi = source ptr
//	%ecx = length remaining in buffer
//	%edx = number of bytes to copy (may not fit in buffer; it is 0 only
//	       when %ecx is also 0, in which case we leave via L0NotFound
//	       before %edx is decremented)

LLoopOverBytes:
	movzb	(%esi),%eax		// get source byte before checking buffer length
	testl	%ecx,%ecx		// buffer full?
	jz	L0NotFound		// yes (the byte just loaded is reread there)
	inc	%esi
	dec	%ecx
	movb	%al,(%edi)		// pack into dest
	inc	%edi
	testl	%eax,%eax		// was the byte just stored the 0 terminator?
	jz	LDone			// yes, done
	dec	%edx			// more to go?
	jnz	LLoopOverBytes
	

// Source is aligned.  Loop over words until end of buffer.  We
// align the source, rather than the dest, to avoid getting spurious page faults.
// Reached by fall-through from the byte loop or directly from the prologue.
//	%edi = dest ptr (unaligned)
//	%esi = source ptr (word aligned)
//	%ecx = length remaining in buffer

LAligned:
	movl	$5,%edx			// byte count larger than the <4 bytes that can remain, so the
					// byte loop ends on its buffer-full or 0-byte test instead
	cmpl	$4,%ecx			// enough for at least one word?
	jb	LLoopOverBytes		// no (unsigned compare), finish byte-by-byte
	

// Loop over words.
//	%edi = dest ptr (unaligned)
//	%esi = source ptr (word aligned)
//	%ecx = length remaining in buffer (>=4)

LLoopOverWords:
	movl	(%esi),%eax		// get next 4 bytes of source
	subl	$4,%ecx			// account for the word now; %ecx and %esi are not
	addl	$4,%esi			// used again if we leave through L0Found
	movl	%eax,%edx		// make 2 copies of word
	movl	%eax,%ebx
	notl	%edx			// use magic word-parallel test for 0s
	addl	$0xFEFEFEFF,%ebx	// %ebx = word - 0x01010101 (mod 2^32)
	andl	$0x80808080,%edx	// %edx = ~word & 0x80808080
	testl	%ebx,%edx
	jnz	L0Found			// one of the bytes of %eax is a 0
	movl	%eax,(%edi)		// pack 4 bytes into destination
	addl	$4,%edi
	cmpl	$4,%ecx			// room in buffer for another word?
	jae	LLoopOverWords		// yes
	
	movl	%ecx,%edx		// copy leftovers (0..3 bytes of buffer) in byte loop
	jmp	LLoopOverBytes
	
// Found a 0-byte in the word of source.  Store a byte at a time until the 0.
// The buffer had room for at least 4 bytes when this word was loaded, so the
// stores below (at most 4) stay inside it.
//	%edi = dest ptr (unaligned)
//	%eax = last word of source, known to have a 0-byte

LNextByte:
	shrl	$8,%eax			// next byte (shift the next higher byte down into %al)
L0Found:
	movb	%al,(%edi)		// pack in next byte
	incl	%edi
	testb	%al,%al			// 0?
	jnz	LNextByte		// no, keep going until the 0 has been stored
	
// Done storing string.  Reached from the byte loop or by fall-through from
// L0Found, in both cases just after the 0-byte has been stored.
//	%edi = ptr to byte after 0-byte

LDone:
	subl	16(%esp),%edi		// subtract original dest ptr to get length stored
	decl	%edi			// don't count the 0-byte
	movl	%edi,%eax		// copy to return value
LExit:
	popl	%ebx			// restore saved registers in reverse order of the pushes
	popl	%esi
	popl	%edi
	ret

// Buffer filled but 0-byte not found.  We return the length of the source string.
// This is not optimized, as it is an error condition.
// Also taken when size is 0, in which case nothing has been stored.
//	%edi = dest ptr (ie, 1 past end of buffer)
//	%esi = source ptr (ptr to 1st byte that does not fit)
	
L0NotFound:
	movl	24(%esp),%eax		// reload buffer length; %ecx has counted down to 0, so this
					// is also the number of (non-zero) source bytes consumed so far
	testl	%eax,%eax		// null?
	jz	1f			// yes, cannot store a 0
	xorl	%edx,%edx		// get a 0
	movb	%dl,-1(%edi)		// store a 0 at end of buffer to delimit string
					// (overwrites the last byte copied, truncating the result)
1:
	movzb	(%esi),%edx		// get next byte of source
	incl	%esi
	incl	%eax			// count it (including, for now, the final 0-byte)
	testl	%edx,%edx		// 0?
	jnz	1b
	decl	%eax			// don't count the 0-byte
	jmp	LExit			// %eax = length of source string
Commit	Line	Data
224c7076 A	1	/*
	2	* Copyright (c) 2007 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. Please obtain a copy of the License at
	10	* http://www.opensource.apple.com/apsl/ and read it before using this
	11	* file.
	12	*
	13	* The Original Code and all software distributed under the License are
	14	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	15	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	16	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	17	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	18	* Please see the License for the specific language governing rights and
	19	* limitations under the License.
	20	*
	21	* @APPLE_LICENSE_HEADER_END@
	22	*/
	23
	24
	25	// *****************
	26	// * S T R L C P Y *
	27	// *****************
	28	//
	29	// size_t strlcpy(char *dst, const char *src, size_t size);
	30	//
	31	// We optimize the move by doing it word parallel. This introduces
	32	// a complication: if we blindly did word load/stores until finding
	33	// a 0, we might get a spurious page fault by touching bytes past it.
	34	// To avoid this, we never do a load that crosses a page boundary,
	35	// or store unnecessary bytes.
	36	//
	37	// The test for 0s relies on the following inobvious but very efficient
	38	// word-parallel test:
	39	// x = dataWord + 0xFEFEFEFF
	40	// y = ~dataWord & 0x80808080
	41	// if (x & y) == 0 then no zero found
	42	// The test maps any non-zero byte to zero, and any zero byte to 0x80,
	43	// with one exception: 0x01 bytes immediately following a zero byte (in
	44	// memory order on little-endian x86) are also mapped to 0x80.
	45	//
	46	// On Core2 class machines, this word-parallel implementation seems to
	47	// be slightly faster than using SSE up to about 100 bytes.
	48	// It is faster than the naive byte-by-byte implementation for
	49	// operands longer than about 8 bytes.
	50
	51	.text
	52	.globl _strlcpy
	53
	54	.align 4
	55	_strlcpy: // size_t strlcpy(char *dst, const char *src, size_t size);
	56	pushl %edi
	57	pushl %esi
	58	pushl %ebx
	59	movl 16(%esp),%edi // get dest ptr
	60	movl 20(%esp),%esi // get source ptr
	61	movl 24(%esp),%ecx // get length of buffer
	62	movl %esi,%edx // copy source ptr
	63	negl %edx
	64	andl $3,%edx // how many bytes to align source ptr?
65	jz LAligned // already aligned
66
67
68	// Loop over bytes.
69	// %edi = dest ptr
70	// %esi = source ptr
71	// %ecx = length remaining in buffer
72	// %edx = number of bytes to copy (>0, may not fit in buffer)
73
74	LLoopOverBytes:
75	movzb (%esi),%eax // get source byte before checking buffer length
76	testl %ecx,%ecx // buffer full?
77	jz L0NotFound // yes
78	inc %esi
79	dec %ecx
80	movb %al,(%edi) // pack into dest
81	inc %edi
82	testl %eax,%eax // 0?
83	jz LDone // yes, done
84	dec %edx // more to go?
85	jnz LLoopOverBytes
86
87
88	// Source is aligned. Loop over words until end of buffer. We
89	// align the source, rather than the dest, to avoid getting spurious page faults.
90	// %edi = dest ptr (unaligned)
91	// %esi = source ptr (word aligned)
92	// %ecx = length remaining in buffer
93
94	LAligned:
95	movl $5,%edx // if buffer almost exhausted, prepare to copy rest byte-by-byte
96	cmpl $4,%ecx // enough for at least one word?
97	jb LLoopOverBytes
98
99
100	// Loop over words.
101	// %edi = dest ptr (unaligned)
102	// %esi = source ptr (word aligned)
103	// %ecx = length remaining in buffer (>=4)
104
105	LLoopOverWords:
106	movl (%esi),%eax // get next 4 bytes of source
107	subl $4,%ecx
108	addl $4,%esi
109	movl %eax,%edx // make 2 copies of word
110	movl %eax,%ebx
111	notl %edx // use magic word-parallel test for 0s
112	addl $0xFEFEFEFF,%ebx
113	andl $0x80808080,%edx
114	testl %ebx,%edx
115	jnz L0Found // one of the bytes of %eax is a 0
116	movl %eax,(%edi) // pack 4 bytes into destination
117	addl $4,%edi
118	cmpl $4,%ecx // room in buffer for another word?
119	jae LLoopOverWords // yes
120
121	movl %ecx,%edx // copy leftovers in byte loop
122	jmp LLoopOverBytes
123
124	// Found a 0-byte in the word of source. Store a byte at a time until the 0.
125	// %edi = dest ptr (unaligned)
126	// %eax = last word of source, known to have a 0-byte
127
128	LNextByte:
129	shrl $8,%eax // next byte
130	L0Found:
131	movb %al,(%edi) // pack in next byte
132	incl %edi
133	testb %al,%al // 0?
134	jnz LNextByte
135
136	// Done storing string.
137	// %edi = ptr to byte after 0-byte
138
139	LDone:
140	subl 16(%esp),%edi // subtract original dest ptr to get length stored
141	decl %edi // don't count the 0-byte
142	movl %edi,%eax // copy to return value
143	LExit:
144	popl %ebx
145	popl %esi
146	popl %edi
147	ret
148
149	// Buffer filled but 0-byte not found. We return the length of the source string.
150	// This is not optimized, as it is an error condition.
151	// %edi = dest ptr (ie, 1 past end of buffer)
152	// %esi = source ptr (ptr to 1st byte that does not fit)
153
154	L0NotFound:
155	movl 24(%esp),%eax // reload buffer length
156	testl %eax,%eax // null?
157	jz 1f // yes, cannot store a 0
158	xorl %edx,%edx // get a 0
159	movb %dl,-1(%edi) // store a 0 at end of buffer to delimit string
160	1:
161	movzb (%esi),%edx // get next byte of source
162	incl %esi
163	incl %eax
164	testl %edx,%edx // 0?
165	jnz 1b
166	decl %eax // don't count the 0-byte
167	jmp LExit