/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
*
* @APPLE_LICENSE_HEADER_END@
*/
-.text
-.globl _strcmp
-_strcmp:
- movl 0x04(%esp),%eax
- movl 0x08(%esp),%edx
- jmp L2 /* Jump into the loop! */
-
- .align 2,0x90
-L1: incl %eax
- incl %edx
-L2: movb (%eax),%cl
- testb %cl,%cl /* null terminator??? */
- jz L3
- cmpb %cl,(%edx) /* chars match??? */
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- je L1
- .align 2, 0x90
-L3: movzbl (%eax),%eax /* unsigned comparison */
- movzbl (%edx),%edx
- subl %edx,%eax
- ret
+
+
+// ***************
+// * S T R C M P *
+// ***************
+//
+// int strcmp(const char *s1, const char *s2);
+//
+// We optimize the compare by doing it in parallel, using SSE. This introduces
+// a complication: if we blindly did vector loads from both sides until
+// finding a difference (or 0), we might get a spurious page fault by
+// reading bytes past the difference. To avoid this, we never do a load
+// that crosses a page boundary.
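+//
+// As a hedged illustration (not part of the assembled code), the page-safety
+// test can be written in C roughly as follows, assuming 4 KB pages; the helper
+// names are hypothetical:
+//
+//     #include <stdint.h>
+//     #include <stddef.h>
+//
+//     // bytes remaining in the 4 KB page containing p
+//     static size_t bytes_to_page_end(const void *p) {
+//         return 4096 - ((uintptr_t)p & 4095);
+//     }
+//
+//     // a 16-byte load at p stays inside p's page, so it cannot fault
+//     // past the end of the string
+//     static int vector_load_is_safe(const void *p) {
+//         return bytes_to_page_end(p) >= 16;
+//     }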
+
+ .text
+ .globl _strcmp
+
+ .align 4
+_strcmp: // int strcmp(const char *s1,const char *s2);
+ pushl %esi // save the callee-saved registers we use
+ pushl %edi
+ movl 12(%esp),%esi // get LHS ptr (1st arg; offsets allow for the two pushes)
+ movl 16(%esp),%edi // get RHS ptr (2nd arg)
+
+
+// In order to avoid spurious page faults, we loop over:
+//
+// min(bytes_left_in_LHS_page, bytes_left_in_RHS_page) >> 4
+//
+// 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte
+// comparison until reaching the next page, then resume the vector comparison.
+// %esi = LHS ptr
+// %edi = RHS ptr
+
+LNextChunk:
+ movl %esi,%eax // copy ptrs
+ movl %edi,%edx
+ andl $4095,%eax // mask down to page offsets
+ andl $4095,%edx
+ cmpl %eax,%edx // which is bigger?
+ cmova %edx,%eax // %eax = max(LHS offset, RHS offset);
+ movl $4096,%edx
+ subl %eax,%edx // get #bytes to next page crossing
+ movl %edx,%eax // save the byte count, in case there is no full chunk
+ shrl $4,%edx // get #chunks till end of operand or page
+ jnz LLoopOverChunks // enter vector loop
+ movl %eax,%edx // no chunks...
+ jmp LLoopOverBytes // ...so loop over bytes until page end
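+//
+// Illustrative C equivalent of the computation above (a sketch, not assembled;
+// variable names are hypothetical):
+//
+//     size_t lhs_off = (uintptr_t)lhs & 4095;          // page offset of LHS
+//     size_t rhs_off = (uintptr_t)rhs & 4095;          // page offset of RHS
+//     size_t max_off = (rhs_off > lhs_off) ? rhs_off : lhs_off;
+//     size_t bytes   = 4096 - max_off;                 // min bytes to either page end
+//     size_t chunks  = bytes >> 4;                     // whole 16-byte chunks
+//     // e.g. 40 bytes to the nearer page end gives 2 chunks; the last 8 bytes
+//     // (and the page crossing itself) are then handled by the byte loop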
+
+
+// Loop over bytes.
+// %esi = LHS ptr
+// %edi = RHS ptr
+// %edx = byte count
+
+ .align 4,0x90 // align inner loops to optimize I-fetch
+LLoopOverBytes:
+ movzb (%esi),%eax // get LHS byte
+ movzb (%edi),%ecx // get RHS byte
+ inc %esi
+ inc %edi
+ testl %eax,%eax // 0?
+ jz LExit0 // yes, we're done
+ subl %ecx,%eax // compare them
+ jnz LExit // done if not equal
+ dec %edx // more to go?
+ jnz LLoopOverBytes
+
+ jmp LNextChunk // we've come to end of page
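+//
+// Rough C equivalent of this scalar loop (an illustrative sketch only; the
+// function name is hypothetical):
+//
+//     // writes the strcmp result to *result and returns 1 if a difference or
+//     // NUL is found within count bytes; returns 0 if the caller should go on
+//     static int compare_bytes(const unsigned char *a, const unsigned char *b,
+//                              size_t count, int *result) {
+//         while (count--) {
+//             unsigned char ca = *a++, cb = *b++;
+//             if (ca == 0 || ca != cb) { *result = (int)ca - (int)cb; return 1; }
+//         }
+//         return 0;                      // reached the page end; recompute chunks
+//     }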
+
+
+// Loop over 16-byte chunks.
+// %esi = LHS ptr
+// %edi = RHS ptr
+// %edx = chunk count
+
+ .align 4,0x90 // align inner loops to optimize I-fetch
+LLoopOverChunks:
+ movdqu (%esi),%xmm1 // get LHS
+ movdqu (%edi),%xmm2 // get RHS
+ pxor %xmm0,%xmm0 // get some 0s in the shadow of the loads
+ addl $16,%esi
+ pcmpeqb %xmm1,%xmm2 // compare LHS to RHS
+ pcmpeqb %xmm1,%xmm0 // compare LHS to 0s
+ addl $16,%edi
+ pmovmskb %xmm2,%eax // get result mask for comparison of LHS and RHS
+ pmovmskb %xmm0,%ecx // get result mask for 0 check
+ xorl $0xFFFF,%eax // complement compare mask so 1 means "not equal"
+ orl %ecx,%eax // combine the masks and check for 1-bits
+ jnz LFoundDiffOr0 // we found differing bytes or a 0-byte
+ dec %edx // more to go?
+ jnz LLoopOverChunks
+
+ jmp LNextChunk // compare up to next page boundary
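+//
+// With SSE2 intrinsics, one iteration of this loop corresponds roughly to the
+// sketch below (illustrative only, not the code assembled here):
+//
+//     #include <emmintrin.h>
+//
+//     __m128i lhs16 = _mm_loadu_si128((const __m128i *)lhs);  // movdqu
+//     __m128i rhs16 = _mm_loadu_si128((const __m128i *)rhs);  // movdqu
+//     __m128i zero  = _mm_setzero_si128();                    // pxor
+//     int eq   = _mm_movemask_epi8(_mm_cmpeq_epi8(lhs16, rhs16)); // pcmpeqb/pmovmskb
+//     int nul  = _mm_movemask_epi8(_mm_cmpeq_epi8(lhs16, zero));
+//     int mask = (eq ^ 0xFFFF) | nul;  // bit n set: byte n differs or LHS byte n is 0
+//     if (mask) { /* difference or terminator in this chunk; see LFoundDiffOr0 */ }
+//     lhs += 16; rhs += 16;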
+
+
+// Found a zero and/or a difference in vector compare.
+// %esi = LHS ptr, already advanced by 16
+// %edi = RHS ptr, already advanced by 16
+// %eax = bit n set if byte n differed or LHS byte n was 0
+
+LFoundDiffOr0:
+ bsf %eax,%edx // which byte differed or was 0?
+ subl $16,%esi // point to start of vectors while we wait for bit scan
+ subl $16,%edi
+ movzb (%esi,%edx),%eax // get LHS byte
+ movzb (%edi,%edx),%ecx // get RHS byte
+ subl %ecx,%eax // compute difference (i.e., the return value)
+ popl %edi
+ popl %esi
+ ret
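+//
+// In C terms, the tail above does roughly this (a sketch; __builtin_ctz is a
+// GCC-style builtin used here only to mirror the bsf instruction):
+//
+//     int idx  = __builtin_ctz(mask);           // index of lowest set bit, as bsf does
+//     int diff = (int)lhs[idx] - (int)rhs[idx]; // lhs/rhs point at the chunk start
+//     return diff;                              // 0 means both strings ended together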
+
+
+// Found a zero and/or difference in byte loop.
+// %eax = LHS byte
+// %ecx = RHS byte
+
+LExit0:
+ subl %ecx,%eax // compute difference (i.e., the return value)
+LExit: // here with difference already in %eax
+ popl %edi
+ popl %esi
+ ret