/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C P Y *
// ***************
//
// char *strcpy(char *dst, const char *src);
//
// We optimize the move by doing it 16 bytes at a time with vector
// instructions. This introduces a complication: if we blindly did vector
// load/stores until finding a 0, we might get a spurious page fault by
// touching bytes past it. To avoid this, we never do a load that crosses
// a page boundary, and never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.

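// The same strategy as a hedged C sketch (illustrative only, not part of
// the original file; strcpy_sketch, the SSE2 intrinsics, __builtin_ctz
// (a GCC/Clang builtin), and memcpy standing in for the tail stores are
// all expository assumptions):
//
//      #include <emmintrin.h>                  // SSE2 intrinsics
//      #include <stdint.h>
//      #include <string.h>
//
//      char *strcpy_sketch(char *dst, const char *src) {
//          char *const ret = dst;
//          size_t n = (-(uintptr_t)dst) & 15;  // bytes until dst is 16-aligned
//          for (;;) {
//              while (n) {                     // byte loop: align dst, or
//                  n--;                        //  step across a page boundary
//                  if ((*dst++ = *src++) == '\0')
//                      return ret;
//              }
//              // whole 16-byte chunks left before src crosses a page
//              size_t chunks = (4096 - ((uintptr_t)src & 4095)) / 16;
//              while (chunks) {
//                  chunks--;
//                  __m128i v = _mm_loadu_si128((const __m128i *)src);
//                  int mask = _mm_movemask_epi8(
//                      _mm_cmpeq_epi8(v, _mm_setzero_si128()));
//                  if (mask) {                 // chunk contains a 0 byte
//                      size_t len = (size_t)__builtin_ctz(mask) + 1;
//                      memcpy(dst, src, len);  // stands in for the tail
//                      return ret;             //  stores done at LFound0
//                  }
//                  _mm_store_si128((__m128i *)dst, v); // aligned store
//                  src += 16;
//                  dst += 16;
//              }
//              n = 16;                         // cross the page byte by byte
//          }
//      }
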
        .text
        .globl _strcpy

        .align  4
_strcpy:                                // char *strcpy(char *dst, const char *src);
        pushl   %edi
        movl    8(%esp),%edi            // get dest ptr
        movl    12(%esp),%ecx           // get source ptr
        movl    %edi,%edx               // copy dest ptr
        negl    %edx
        andl    $15,%edx                // how many bytes to align dest ptr?
        jnz     LLoopOverBytes          // not aligned, so copy bytes until it is


// In order to avoid spurious page faults, we loop until nearing the source page
// end. Then we revert to a byte-by-byte loop for 16 bytes until the page is
// crossed, then resume the vector loop.
//      %ecx = source ptr (unaligned)
//      %edi = dest ptr (aligned)

LNextChunk:
        movl    %ecx,%eax               // copy source ptr
        movl    $4096,%edx
        andl    $4095,%eax              // get offset into source page
        subl    %eax,%edx               // get #bytes remaining in source page
        shrl    $4,%edx                 // get #chunks till end of page
        jnz     LLoopOverChunks         // enter vector loop
        movl    $16,%edx                // move 16 bytes to cross page but keep dest aligned
        jmp     LLoopOverBytes
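
// Worked example (illustrative, not in the original): if %ecx = 0x1FF5, the
// page offset is 0xFF5 = 4085, so 4096-4085 = 11 bytes remain in the source
// page and 11>>4 = 0 chunks. We then fall through, set %edx = 16, and copy
// 16 bytes one at a time; that carries %ecx to 0x2005 in the next page,
// while %edi advances by 16 and so stays 16-byte aligned.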


// Loop over bytes.
//      %ecx = source ptr
//      %edi = dest ptr
//      %edx = byte count

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverBytes:
        movzb   (%ecx),%eax             // get source byte
        inc     %ecx
        movb    %al,(%edi)              // pack into dest
        inc     %edi
        testl   %eax,%eax               // 0?
        jz      LDone                   // yes, we're done
        dec     %edx                    // more to go?
        jnz     LLoopOverBytes

        jmp     LNextChunk              // we've come to end of page


// Loop over 16-byte chunks.
//      %ecx = source ptr (unaligned)
//      %edi = dest ptr (aligned)
//      %edx = chunk count
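//
// As an illustrative example (not in the original): if the 16 source bytes
// are "abc\0" followed by garbage, pcmpeqb sets only byte 3 of %xmm0 to
// 0xFF, pmovmskb packs that into %eax = 0x0008, and the testl comes up
// nonzero, so we exit to LFound0 with the terminator's position encoded
// in the mask.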

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverChunks:
        movdqu  (%ecx),%xmm1            // get source
        pxor    %xmm0,%xmm0             // get some 0s
        addl    $16,%ecx
        pcmpeqb %xmm1,%xmm0             // compare source to 0s
        pmovmskb %xmm0,%eax             // get result mask for 0 check
        testl   %eax,%eax               // any 0s?
        jnz     LFound0                 // yes, exit loop
        movdqa  %xmm1,(%edi)            // no 0s so do aligned store into destination
        addl    $16,%edi
        dec     %edx                    // more to go?
        jnz     LLoopOverChunks

        movl    $16,%edx                // move 16 bytes
        jmp     LLoopOverBytes          // cross page but keep dest aligned


// Found a zero in the vector. Figure out where it is, and store the bytes
// up to and including it.
//      %edi = dest ptr (aligned)
//      %eax = result mask
//      %xmm1 = source vector
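//
// Worked example (illustrative, continuing the "abc\0" case from above):
// the mask is 0x0008, bsf yields 3, and the inc makes %edx = 4, so only
// the 4-byte branch fires: a single movd stores "abc\0", then andl $3
// leaves 0 and the trailing byte loop is skipped.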

LFound0:
        bsf     %eax,%edx               // find first 0
        inc     %edx                    // we need to store the 0 too
        test    $16,%dl                 // was 0 last byte?
        jz      8f                      // no
        movdqa  %xmm1,(%edi)            // yes, store entire vector
        jmp     LDone
8:
        test    $8,%dl                  // 8-byte store required?
        jz      4f                      // no
        movq    %xmm1,(%edi)            // pack in 8 low bytes
        psrldq  $8,%xmm1                // then shift vector down 8 bytes
        addl    $8,%edi
4:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm1,(%edi)            // pack in 4 low bytes
        psrldq  $4,%xmm1                // then shift vector down 4 bytes
        addl    $4,%edi
3:
        andl    $3,%edx                 // more to go?
        jz      LDone                   // no
        movd    %xmm1,%eax              // move remaining bytes out of vector into %eax
1:                                      // loop on up to three bytes
        movb    %al,(%edi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        inc     %edi
        dec     %edx
        jnz     1b

LDone:
        movl    8(%esp),%eax            // original dest ptr is return value
        popl    %edi
        ret