[apple/libc.git] / x86_64 / string / memcmp.s

/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************     ***********
// * M E M C M P * and * B C M P *
// ***************     ***********
//
// int	memcmp(const char *s1, const char *s2, size_t len);
// int	  bcmp(const char *s1, const char *s2, size_t len);
//
// Bcmp need only return zero or nonzero, whereas memcmp returns the difference
// between the first differing bytes (compared as unsigned chars).  The latter
// satisfies both contracts, so we treat them identically.
//
// We optimize the compare by doing it with SSE.  This introduces
// a complication: if we blindly did vector loads from both sides until
// finding a difference, we might get a spurious page fault by
// reading bytes past the difference.  To avoid this, we never do a load
// that crosses a page boundary.

#define	kShort	18			// too short for vectors (must be >16)

        .text
        .align 	4

        .globl _memcmp
        .globl _bcmp

// Shared entry point: _memcmp and _bcmp label the same code.
//   In:   %rdi = LHS ptr (s1), %rsi = RHS ptr (s2), %rdx = length in bytes
//   Out:  %eax = 0 if every byte matches, otherwise (LHS byte - RHS byte) for
//         the first mismatching pair, each byte zero-extended before the subtract
//   Uses: %rax, %rcx, %rdx, %rdi, %rsi, %xmm0, %xmm1 and the flags as scratch;
//         no stack is used.
// Lengths above kShort go to the vector path at LNotShort; everything else
// falls through into the byte loop below.

_memcmp:				// int memcmp(const char *s1,const char *s2,size_t len);
_bcmp:					// int   bcmp(const char *s1,const char *s2,size_t len);
	cmpq	$(kShort),%rdx		// worth accelerating?
	ja	LNotShort		// yes
	

// Too short to bother with parallel compares.  Loop over bytes.
//	%rdi = LHS ptr
//	%rsi = RHS ptr
//	%edx = length (<= kShort)

LShort:
	testl	%edx,%edx		// 0-length?
	jnz	LShortLoop		// no
	xorl	%eax,%eax		// return 0 (a 32-bit write also clears the top half of %rax)
	ret				// return directly; no need to hop to LExit just for its ret
	.align	4,0x90			// align inner loops to optimize I-fetch
LShortLoop:				// loop over bytes; must be entered with %rdx != 0
	movzb	(%rdi),%eax		// get LHS byte
	movzb	(%rsi),%ecx		// get RHS byte
	addq	$1,%rdi
	addq	$1,%rsi
	subl	%ecx,%eax		// compare them; %eax = difference, 0 if equal
	jnz	LExit			// done if not equal
	subq	$1,%rdx			// decrement length
	jnz	LShortLoop
					// length exhausted: %eax is 0 from the last (equal) subtract
LExit:					// return value is in %eax
	ret
	
// Mismatch exit for the page-end byte loop.  LLoopOverBytes branches here
// before it advances %rsi, so (%rsi) is still the RHS byte that differed,
// while %eax already holds the zero-extended LHS byte.
LNotEqual:				// here from LLoopOverBytes with LHS in eax
	movzb	(%rsi),%ecx		// get RHS byte
	subl	%ecx,%eax		// generate return value (nonzero)
	ret

	
// Loop over bytes until we reach end of a page.
//	%rdi = LHS ptr
//	%rsi = RHS ptr
//	%rdx = length remaining after end of loop (i.e., already adjusted)
//	%ecx = #bytes until next page (1..15)
// On a mismatch we leave through LNotEqual with the LHS byte in %eax and
// %rsi not yet advanced.  If all %ecx bytes match, control falls through
// into LNotShort, which recomputes the distance to the next page crossing.

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverBytes:
	movzb	(%rdi),%eax		// get LHS byte
	addq	$1,%rdi
	cmpb	(%rsi),%al		// compare to RHS byte
	jnz	LNotEqual			// done if not equal
	addq	$1,%rsi			// advance RHS only after a match (LNotEqual rereads (%rsi))
	subl	$1,%ecx			// more to go?
	jnz	LLoopOverBytes
	

// Long enough to justify overhead of setting up vector compares.  In order to
// avoid spurious page faults, we loop over:
//
//	min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
//
// 16-byte chunks.  When we near a page end, we have to revert to a byte-by-byte
// comparison until reaching the next page, then resume the vector comparison.
//	%rdi = LHS ptr
//	%rsi = RHS ptr
//	%rdx = length (> kShort)
// Leaves via LLoopOverChunks (%ecx = chunk count), LLoopOverBytes (%ecx = bytes
// to page end, %rdx already reduced by that amount) or LShortLoop (%rdx = total
// bytes left, nonzero).

#define	kPageSize	4096		// page size assumed when avoiding cross-page vector loads

LNotShort:
	movq	%rdi,%rax		// copy ptrs
	movq	%rsi,%rcx
	andq	$(kPageSize-1),%rax	// mask down to page offsets
	andq	$(kPageSize-1),%rcx
	cmpq	%rax,%rcx		// which is bigger?
	cmova	%rcx,%rax		// %eax = max(LHS offset, RHS offset);
	movl	$(kPageSize),%ecx
	subl	%eax,%ecx		// get #bytes to next page crossing (1..kPageSize)
	cmpq	%rdx,%rcx		// will operand run out first? (64-bit compare: length is a size_t)
	cmova	%edx,%ecx		// get min(length remaining, bytes to page end); %edx suffices since %rdx < %rcx here
	movl	%ecx,%eax		// keep the byte count; %ecx becomes the chunk count
	shrl	$4,%ecx			// get #chunks till end of operand or page
	jnz	LLoopOverChunks		// enter vector loop
	
// Too near page end for vectors.  Length is > kShort, so the minimum computed
// above (< 16 here) must be the distance to the page end, not the length.

	subq	%rax,%rdx		// adjust length remaining
	movl	%eax,%ecx		// %ecx <- #bytes to page end
	cmpq	$(kShort),%rdx		// will there be enough after we cross page for vectors?
	ja	LLoopOverBytes		// yes
	addq	%rax,%rdx		// no, restore total length remaining
	jmp	LShortLoop		// compare rest byte-by-byte (%rdx != 0, as LShortLoop requires)


// Loop over 16-byte chunks.
//	%rdi = LHS ptr
//	%rsi = RHS ptr
//	%rdx = length remaining
//	%ecx = chunk count
// Pointers and %rdx are updated before the result is tested, so on a mismatch
// LDifferent sees both pointers already 16 bytes past the failing chunk.
// When the chunk count runs out with no mismatch, the remainder is routed to
// LShort (which handles a zero remainder) or back to LNotShort.

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverChunks:
	movdqu	(%rdi),%xmm0		// get LHS
	movdqu	(%rsi),%xmm1		// get RHS
	addq	$16,%rdi
	pcmpeqb	%xmm1,%xmm0		// compare LHS to RHS
	addq	$16,%rsi
	pmovmskb %xmm0,%eax		// collect comparison result bits (1 if equal)
	subq	$16,%rdx		// adjust length remaining
	xorl	$0xFFFF,%eax		// all equal?  (flips the 16 mask bits: a set bit now marks a differing byte)
	jne	LDifferent		// no, we found differing bytes
	subl	$1,%ecx			// more to go?
	jnz	LLoopOverChunks
	
	cmpq	$(kShort),%rdx		// a lot more to compare?
	jbe	LShort			// no
	jmp	LNotShort		// compute distance to next page crossing etc


// Found a difference.  
//	%rdi = LHS ptr, already advanced by 16
//	%rsi = RHS ptr, already advanced by 16
//	%eax = complemented compare vector (ie, 0 == equal)
// The remaining length in %rdx is no longer needed and is overwritten with
// the index of the differing byte.  Returns (LHS byte - RHS byte) in %eax.

LDifferent:
	bsf	%eax,%edx		// which byte differed?  (lowest set bit = first differing byte; upper half of %rdx is zeroed)
	subq	$16,%rdi		// point to byte 0 while we wait for bit scan
	subq	$16,%rsi
	movzb	(%rdi,%rdx),%eax	// get LHS byte
	movzb	(%rsi,%rdx),%ecx	// get RHS byte
	subl	%ecx,%eax		// compute difference (ie, return value)
	ret
Commit	Line	Data
8e029c65 A	1	/*
	2	* Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. Please obtain a copy of the License at
	10	* http://www.opensource.apple.com/apsl/ and read it before using this
	11	* file.
	12	*
	13	* The Original Code and all software distributed under the License are
	14	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	15	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	16	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	17	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	18	* Please see the License for the specific language governing rights and
	19	* limitations under the License.
	20	*
	21	* @APPLE_LICENSE_HEADER_END@
	22	*/
	23
	24
	25	// ************* *********
	26	// * M E M C M P * and * B C M P *
	27	// ************* *********
	28	//
	29	// int memcmp(const char *s1, const char *s2, size_t len);
	30	// int bcmp(const char *s1, const char *s2, size_t len);
	31	//
	32	// Bcmp returns (+,0,-), whereas memcmp returns the true difference
	33	// between the first differing bytes, but we treat them identically.
	34	//
	35	// We optimize the compare by doing it with SSE. This introduces
	36	// a complication: if we blindly did vector loads from both sides until
	37	// finding a difference, we might get a spurious page fault by
	38	// reading bytes past the difference. To avoid this, we never do a load
	39	// that crosses a page boundary.
	40
	41	#define kShort 18 // too short for vectors (must be >16)
	42
	43	.text
	44	.align 4
	45
	46	.globl _memcmp
	47	.globl _bcmp
	48
	49	_memcmp: // int memcmp(const char *s1,const char *s2,size_t len);
	50	_bcmp: // int bcmp(const char *s1,const char *s2,size_t len);
	51	cmpq $(kShort),%rdx // worth accelerating?
	52	ja LNotShort // yes
	53
	54
	55	// Too short to bother with parallel compares. Loop over bytes.
	56	// %rdi = LHS ptr
	57	// %rsi = RHS ptr
	58	// %edx = length (<= kShort)
	59
	60	LShort:
	61	testl %edx,%edx // 0-length?
	62	jnz LShortLoop // no
	63	xorq %rax,%rax // return 0
	64	jmp LExit
65	.align 4,0x90 // align inner loops to optimize I-fetch
66	LShortLoop: // loop over bytes
67	movzb (%rdi),%eax // get LHS byte
68	movzb (%rsi),%ecx // get RHS byte
69	addq $1,%rdi
70	addq $1,%rsi
71	subl %ecx,%eax // compare them
72	jnz LExit // done if not equal
73	subq $1,%rdx // decrement length
74	jnz LShortLoop
75	LExit: // return value is in %eax
76	ret
77
78	LNotEqual: // here from LLoopOverBytes with LHS in eax
79	movzb (%rsi),%ecx // get RHS byte
80	subl %ecx,%eax // generate return value (nonzero)
81	ret
82
83
84	// Loop over bytes until we reach end of a page.
85	// %rdi = LHS ptr
86	// %edi = RHS ptr
87	// %rdx = length remaining after end of loop (i.e., already adjusted)
88	// %ecx = #bytes until next page (1..15)
89
90	.align 4,0x90 // align inner loops to optimize I-fetch
91	LLoopOverBytes:
92	movzb (%rdi),%eax // get LHS byte
93	addq $1,%rdi
94	cmpb (%rsi),%al // compare to RHS byte
95	jnz LNotEqual // done if not equal
96	addq $1,%rsi
97	subl $1,%ecx // more to go?
98	jnz LLoopOverBytes
99
100
101	// Long enough to justify overhead of setting up vector compares. In order to
102	// avoid spurious page faults, we loop over:
103	//
104	// min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
105	//
106	// 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte
107	// comparison until reaching the next page, then resume the vector comparison.
108	// %rdi = LHS ptr
109	// %rsi = RHS ptr
110	// %rdx = length (> kShort)
111
112	LNotShort:
113	movq %rdi,%rax // copy ptrs
114	movq %rsi,%rcx
115	andq $4095,%rax // mask down to page offsets
116	andq $4095,%rcx
117	cmpq %rax,%rcx // which is bigger?
118	cmova %rcx,%rax // %eax = max(LHS offset, RHS offset);
119	movl $4096,%ecx
120	subl %eax,%ecx // get #bytes to next page crossing
34e8f829	121	cmpq %rdx,%rcx // will operand run out first?
8e029c65 A	122	cmova %edx,%ecx // get min(length remaining, bytes to page end)
	123	movl %ecx,%eax
	124	shrl $4,%ecx // get #chunks till end of operand or page
	125	jnz LLoopOverChunks // enter vector loop
	126
	127	// Too near page end for vectors.
	128
	129	subq %rax,%rdx // adjust length remaining
	130	movl %eax,%ecx // %ecx <- #bytes to page end
	131	cmpq $(kShort),%rdx // will there be enough after we cross page for vectors?
	132	ja LLoopOverBytes // yes
	133	addq %rax,%rdx // no, restore total length remaining
	134	jmp LShortLoop // compare rest byte-by-byte (%ecx != 0)
	135
	136
	137	// Loop over 16-byte chunks.
	138	// %rdi = LHS ptr
	139	// %rsi = RHS ptr
	140	// %rdx = length remaining
	141	// %ecx = chunk count
	142
	143	.align 4,0x90 // align inner loops to optimize I-fetch
	144	LLoopOverChunks:
	145	movdqu (%rdi),%xmm0 // get LHS
	146	movdqu (%rsi),%xmm1 // get RHS
	147	addq $16,%rdi
	148	pcmpeqb %xmm1,%xmm0 // compare LHS to RHS
	149	addq $16,%rsi
	150	pmovmskb %xmm0,%eax // collect comparison result bits (1 if equal)
	151	subq $16,%rdx // adjust length remaining
	152	xorl $0xFFFF,%eax // all equal?
	153	jne LDifferent // no, we found differing bytes
	154	subl $1,%ecx // more to go?
	155	jnz LLoopOverChunks
	156
	157	cmpq $(kShort),%rdx // a lot more to compare?
	158	jbe LShort // no
	159	jmp LNotShort // compute distance to next page crossing etc
	160
	161
	162	// Found a difference.
	163	// %rdi = LHS ptr, already advanced by 16
	164	// %rsi = RHS ptr, already advanced by 16
	165	// %eax = complemented compare vector (ie, 0 == equal)
	166
	167	LDifferent:
	168	bsf %eax,%edx // which byte differed?
	169	subq $16,%rdi // point to byte 0 while we wait for bit scan
	170	subq $16,%rsi
	171	movzb (%rdi,%rdx),%eax // get LHS byte
	172	movzb (%rsi,%rdx),%ecx // get RHS byte
	173	subl %ecx,%eax // compute difference (ie, return value)
	174	ret