/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
*
* @APPLE_LICENSE_HEADER_END@
*/
-.text
-.globl _strcmp
-_strcmp:
- movl 0x04(%esp),%eax
- movl 0x08(%esp),%edx
- jmp L2 /* Jump into the loop! */
-
- .align 2,0x90
-L1: incl %eax
- incl %edx
-L2: movb (%eax),%cl
- testb %cl,%cl /* null terminator??? */
- jz L3
- cmpb %cl,(%edx) /* chars match??? */
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- jne L3
- incl %eax
- incl %edx
- movb (%eax),%cl
- testb %cl,%cl
- jz L3
- cmpb %cl,(%edx)
- je L1
- .align 2, 0x90
-L3: movzbl (%eax),%eax /* unsigned comparison */
- movzbl (%edx),%edx
- subl %edx,%eax
- ret
+
+
+// ***************
+// * S T R C M P *
+// ***************
+//
+// int strcmp(const char *s1, const char *s2);
+//
+// We optimize the compare by doing it in parallel, using SSE. This introduces
+// a complication: if we blindly did vector loads from both sides until
+// finding a difference (or 0), we might get a spurious page fault by
+// reading bytes past the difference. To avoid this, we never do a load
+// that crosses a page boundary.
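+//
+// As a hedged illustration (not part of the assembled code), the page-safety
+// test can be written in C roughly as follows, assuming 4 KB pages; the helper
+// names are hypothetical:
+//
+//     #include <stdint.h>
+//     #include <stddef.h>
+//
+//     // bytes remaining in the 4 KB page containing p
+//     static size_t bytes_to_page_end(const void *p) {
+//         return 4096 - ((uintptr_t)p & 4095);
+//     }
+//
+//     // a 16-byte load at p stays inside p's page, so it cannot fault
+//     // past the end of the string
+//     static int vector_load_is_safe(const void *p) {
+//         return bytes_to_page_end(p) >= 16;
+//     }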
+
+ .text
+ .globl _strcmp
+
+ .align 4
+_strcmp: // int strcmp(const char *s1,const char *s2);
+ pushl %esi // save the callee-saved registers we use
+ pushl %edi
+ movl 12(%esp),%esi // get LHS ptr (1st arg; offsets allow for the two pushes)
+ movl 16(%esp),%edi // get RHS ptr (2nd arg)
+
+
+// In order to avoid spurious page faults, we loop over:
+//
+// min(bytes_left_in_LHS_page, bytes_left_in_RHS_page) >> 4
+//
+// 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte
+// comparison until reaching the next page, then resume the vector comparison.
+// %esi = LHS ptr
+// %edi = RHS ptr
+
+LNextChunk:
+ movl %esi,%eax // copy ptrs
+ movl %edi,%edx
+ andl $4095,%eax // mask down to page offsets
+ andl $4095,%edx
+ cmpl %eax,%edx // which is bigger?
+ cmova %edx,%eax // %eax = max(LHS offset, RHS offset);
+ movl $4096,%edx
+ subl %eax,%edx // get #bytes to next page crossing
+ movl %edx,%eax // save the byte count, in case there is no full chunk
+ shrl $4,%edx // get #chunks till end of operand or page
+ jnz LLoopOverChunks // enter vector loop
+ movl %eax,%edx // no chunks...
+ jmp LLoopOverBytes // ...so loop over bytes until page end
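+//
+// Illustrative C equivalent of the computation above (a sketch, not assembled;
+// variable names are hypothetical):
+//
+//     size_t lhs_off = (uintptr_t)lhs & 4095;          // page offset of LHS
+//     size_t rhs_off = (uintptr_t)rhs & 4095;          // page offset of RHS
+//     size_t max_off = (rhs_off > lhs_off) ? rhs_off : lhs_off;
+//     size_t bytes   = 4096 - max_off;                 // min bytes to either page end
+//     size_t chunks  = bytes >> 4;                     // whole 16-byte chunks
+//     // e.g. 40 bytes to the nearer page end gives 2 chunks; the last 8 bytes
+//     // (and the page crossing itself) are then handled by the byte loop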
+
+
+// Loop over bytes.
+// %esi = LHS ptr
+// %edi = RHS ptr
+// %edx = byte count
+
+ .align 4,0x90 // align inner loops to optimize I-fetch
+LLoopOverBytes:
+ movzb (%esi),%eax // get LHS byte
+ movzb (%edi),%ecx // get RHS byte
+ inc %esi
+ inc %edi
+ testl %eax,%eax // 0?
+ jz LExit0 // yes, we're done
+ subl %ecx,%eax // compare them
+ jnz LExit // done if not equal
+ dec %edx // more to go?
+ jnz LLoopOverBytes
+
+ jmp LNextChunk // we've come to end of page
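+//
+// Rough C equivalent of this scalar loop (an illustrative sketch only; the
+// function name is hypothetical):
+//
+//     // writes the strcmp result to *result and returns 1 if a difference or
+//     // NUL is found within count bytes; returns 0 if the caller should go on
+//     static int compare_bytes(const unsigned char *a, const unsigned char *b,
+//                              size_t count, int *result) {
+//         while (count--) {
+//             unsigned char ca = *a++, cb = *b++;
+//             if (ca == 0 || ca != cb) { *result = (int)ca - (int)cb; return 1; }
+//         }
+//         return 0;                      // reached the page end; recompute chunks
+//     }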
+
+
+// Loop over 16-byte chunks.
+// %esi = LHS ptr
+// %edi = RHS ptr
+// %edx = chunk count
+
+ .align 4,0x90 // align inner loops to optimize I-fetch
+LLoopOverChunks:
+ movdqu (%esi),%xmm1 // get LHS
+ movdqu (%edi),%xmm2 // get RHS
+ pxor %xmm0,%xmm0 // get some 0s in the shadow of the loads
+ addl $16,%esi
+ pcmpeqb %xmm1,%xmm2 // compare LHS to RHS
+ pcmpeqb %xmm1,%xmm0 // compare LHS to 0s
+ addl $16,%edi
+ pmovmskb %xmm2,%eax // get result mask for comparison of LHS and RHS
+ pmovmskb %xmm0,%ecx // get result mask for 0 check
+ xorl $0xFFFF,%eax // complement compare mask so 1 means "not equal"
+ orl %ecx,%eax // combine the masks and check for 1-bits
+ jnz LFoundDiffOr0 // we found differing bytes or a 0-byte
+ dec %edx // more to go?
+ jnz LLoopOverChunks
+
+ jmp LNextChunk // compare up to next page boundary
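+//
+// With SSE2 intrinsics, one iteration of this loop corresponds roughly to the
+// sketch below (illustrative only, not the code assembled here):
+//
+//     #include <emmintrin.h>
+//
+//     __m128i lhs16 = _mm_loadu_si128((const __m128i *)lhs);  // movdqu
+//     __m128i rhs16 = _mm_loadu_si128((const __m128i *)rhs);  // movdqu
+//     __m128i zero  = _mm_setzero_si128();                    // pxor
+//     int eq   = _mm_movemask_epi8(_mm_cmpeq_epi8(lhs16, rhs16)); // pcmpeqb/pmovmskb
+//     int nul  = _mm_movemask_epi8(_mm_cmpeq_epi8(lhs16, zero));
+//     int mask = (eq ^ 0xFFFF) | nul;  // bit n set: byte n differs or LHS byte n is 0
+//     if (mask) { /* difference or terminator in this chunk; see LFoundDiffOr0 */ }
+//     lhs += 16; rhs += 16;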
+
+
+// Found a zero and/or a difference in vector compare.
+// %esi = LHS ptr, already advanced by 16
+// %edi = RHS ptr, already advanced by 16
+// %eax = bit n set if byte n differed or LHS byte n was 0
+
+LFoundDiffOr0:
+ bsf %eax,%edx // which byte differed or was 0?
+ subl $16,%esi // point to start of vectors while we wait for bit scan
+ subl $16,%edi
+ movzb (%esi,%edx),%eax // get LHS byte
+ movzb (%edi,%edx),%ecx // get RHS byte
+ subl %ecx,%eax // compute difference (i.e., the return value)
+ popl %edi
+ popl %esi
+ ret
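+//
+// In C terms, the tail above does roughly this (a sketch; __builtin_ctz is a
+// GCC-style builtin used here only to mirror the bsf instruction):
+//
+//     int idx  = __builtin_ctz(mask);           // index of lowest set bit, as bsf does
+//     int diff = (int)lhs[idx] - (int)rhs[idx]; // lhs/rhs point at the chunk start
+//     return diff;                              // 0 means both strings ended together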
+
+
+// Found a zero and/or difference in byte loop.
+// %eax = LHS byte
+// %ecx = RHS byte
+
+LExit0:
+ subl %ecx,%eax // compute difference (i.e., the return value)
+LExit: // here with difference already in %eax
+ popl %edi
+ popl %esi
+ ret