--- /dev/null
+/*
+ * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+
+// *************** ***********
+// * M E M C M P * and * B C M P *
+// *************** ***********
+//
+// int memcmp(const char *s1, const char *s2, size_t len);
+// int bcmp(const char *s1, const char *s2, size_t len);
+//
+// Bcmp need only return zero/nonzero and memcmp only a (-,0,+) value, but
+// we serve both from one routine: it returns the true difference between
+// the first differing bytes (taken as unsigned), or 0 if there are none.
+//
+// We optimize the compare by doing it with SSE. This introduces
+// a complication: if we blindly did vector loads from both sides until
+// finding a difference, we might get a spurious page fault by
+// reading bytes past the difference. To avoid this, we never do a load
+// that crosses a page boundary.
+
+// Operands of kShort bytes or fewer are compared bytewise, without vectors.
+// Must be greater than 16, the size of one vector chunk.
+#define kShort 18
+
+ .text
+ .globl _memcmp
+ .globl _bcmp
+ .align 4
+
+// int memcmp(const char *s1,const char *s2,size_t len);
+// int bcmp(const char *s1,const char *s2,size_t len);
+//
+// Both symbols share this entry point. Arguments are read from the stack;
+// %esi and %edi are saved here and restored on every return path.
+// After the two pushes: 12(%esp) = s1, 16(%esp) = s2, 20(%esp) = len.
+// Result is returned in %eax.
+_memcmp:
+_bcmp:
+ pushl %esi // saved first, so it is popped last
+ pushl %edi
+ movl 12(%esp),%esi // %esi = LHS ptr (s1)
+ movl 16(%esp),%edi // %edi = RHS ptr (s2)
+ movl 20(%esp),%ecx // %ecx = length
+ cmpl $(kShort),%ecx // unsigned compare: long enough for the vector path?
+ ja LNotShort // yes; otherwise fall into the short path
+
+
+// Short operands: a plain byte loop, no vector setup.
+// On entry at LShort:
+// %esi = LHS ptr
+// %edi = RHS ptr
+// %ecx = length (<= kShort, possibly 0)
+// LShortLoop itself may be entered directly with any nonzero %ecx.
+
+LShort:
+ testl %ecx,%ecx // anything to compare?
+ jnz LShortLoop // yes
+ xorl %eax,%eax // no: empty operands compare equal, return 0
+ popl %edi // return in place instead of jumping to LExit
+ popl %esi
+ ret
+ .align 4,0x90 // align inner loops to optimize I-fetch
+LShortLoop: // one byte from each side per iteration
+ movzb (%esi),%eax // %eax = LHS byte, zero-extended
+ movzb (%edi),%edx // %edx = RHS byte, zero-extended
+ incl %edi
+ incl %esi
+ subl %edx,%eax // %eax = LHS - RHS; doubles as the return value
+ jnz LExit // bytes differ: return the difference
+ decl %ecx // one fewer byte to go
+ jnz LShortLoop // when this falls through, %eax is 0 (all equal)
+LExit: // common return; result already in %eax
+ popl %edi
+ popl %esi
+ ret
+
+// Mismatch exit for LLoopOverBytes.
+// %eax = LHS byte (zero-extended)
+// %edi = address of the RHS byte that differed (not yet advanced)
+LNotEqual:
+ movzb (%edi),%edx // reload RHS byte, zero-extended
+ subl %edx,%eax // return value = LHS - RHS (nonzero here)
+ popl %edi
+ popl %esi
+ ret
+
+
+// Bytewise compare up to the next page boundary, then fall through into
+// LNotShort to resume vector compares on the far side.
+// %esi = LHS ptr
+// %edi = RHS ptr
+// %ecx = length that will remain once this loop is done (pre-adjusted)
+// %edx = number of bytes to compare here, i.e. bytes to page end (1..15)
+
+ .align 4,0x90 // align inner loops to optimize I-fetch
+LLoopOverBytes:
+ movzb (%esi),%eax // %eax = LHS byte (LNotEqual expects it here)
+ cmpb (%edi),%al // same as RHS byte?
+ jne LNotEqual // no; %edi must still point at the RHS byte
+ incl %esi // advance both pointers only after a match
+ incl %edi
+ decl %edx // bytes left before the page boundary
+ jnz LLoopOverBytes
+
+
+// Vector path setup. To keep a 16-byte load from touching a page that the
+// byte-by-byte semantics would never reach, the vector loop is limited to
+//
+// min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
+//
+// chunks. If that count is zero we are within 15 bytes of a page end and
+// must go bytewise up to the boundary (or to the end, if little remains).
+// %esi = LHS ptr
+// %edi = RHS ptr
+// %ecx = length (> kShort)
+
+LNotShort:
+ movl %edi,%edx // %edx = RHS ptr
+ movl %esi,%eax // %eax = LHS ptr
+ andl $4095,%edx // reduce both to offsets within a 4096-byte page
+ andl $4095,%eax
+ cmpl %edx,%eax // LHS offset below RHS offset (unsigned)?
+ cmovb %edx,%eax // %eax = max(LHS offset, RHS offset)
+ movl $4096,%edx
+ subl %eax,%edx // %edx = bytes until the nearer page crossing (1..4096)
+ cmpl %ecx,%edx // more than the length remaining?
+ cmova %ecx,%edx // %edx = min(length remaining, bytes to page end)
+ movl %edx,%eax // keep the byte count; %edx becomes a chunk count
+ shrl $4,%edx // %edx = whole 16-byte chunks available
+ jnz LLoopOverChunks // at least one: run the vector loop
+
+// No full chunk fits before the page end; %eax = bytes to page end.
+
+ movl %eax,%edx // %edx = byte count for LLoopOverBytes
+ subl %eax,%ecx // %ecx = length left after reaching the boundary
+ cmpl $(kShort),%ecx // enough beyond the boundary to justify vectors again?
+ ja LLoopOverBytes // yes: walk to the boundary, then come back here
+ addl %edx,%ecx // no: undo the adjustment (%edx == %eax here)
+ jmp LShortLoop // finish everything bytewise (%ecx != 0)
+
+
+// Loop over 16-byte chunks, comparing 16 bytes per iteration with SSE.
+// Unaligned loads (movdqu) are used, so neither pointer needs alignment.
+// On entry:
+// %esi = LHS ptr
+// %edi = RHS ptr
+// %ecx = length remaining
+// %edx = chunk count (nonzero)
+// Exits to LDifferent on a mismatch (pointers already advanced by 16),
+// otherwise to LShort or LNotShort depending on how much is left.
+
+ .align 4,0x90 // align inner loops to optimize I-fetch
+LLoopOverChunks:
+ movdqu (%esi),%xmm0 // get 16 LHS bytes
+ movdqu (%edi),%xmm1 // get 16 RHS bytes
+ addl $16,%esi // pointer/length updates are interleaved with the SSE ops
+ pcmpeqb %xmm1,%xmm0 // bytewise compare: each equal byte becomes 0xFF
+ addl $16,%edi
+ pmovmskb %xmm0,%eax // gather one bit per byte into %eax (1 if equal)
+ subl $16,%ecx // adjust length remaining
+ xorl $0xFFFF,%eax // complement the 16 bits: result is 0 iff all equal
+ jne LDifferent // no, we found differing bytes (set bits mark them)
+ dec %edx // more chunks to go?
+ jnz LLoopOverChunks
+
+ cmpl $(kShort),%ecx // a lot more to compare?
+ jbe LShort // no (LShort also handles 0 remaining)
+ jmp LNotShort // yes: recompute distance to next page crossing etc
+
+
+// A chunk compare found at least one mismatching byte; compute the result.
+// %esi = LHS ptr, already advanced by 16 past the chunk
+// %edi = RHS ptr, already advanced by 16 past the chunk
+// %eax = complemented compare mask (a set bit marks a differing byte)
+
+LDifferent:
+ bsf %eax,%edx // %edx = index of the lowest set bit = first differing byte
+ movzb -16(%esi,%edx),%eax // LHS byte; -16 rewinds to the start of the chunk
+ movzb -16(%edi,%edx),%ecx // RHS byte
+ subl %ecx,%eax // return value = LHS - RHS
+ popl %edi
+ popl %esi
+ ret