Libc-997.1.1.tar.gz

[apple/libc.git] / i386 / string / memcmp.s
diff --git a/i386/string/memcmp.s b/i386/string/memcmp.s

deleted file mode 100644 (file)

index a69e3ea..0000000
--- a/i386/string/memcmp.s
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_LICENSE_HEADER_END@
- */
-
-
-// ***************     ***********
-// * M E M C M P * and * B C M P *
-// ***************     ***********
-//
-// int memcmp(const char *s1, const char *s2, size_t len);
-// int   bcmp(const char *s1, const char *s2, size_t len);
-//
-// bcmp need only return zero/nonzero, whereas memcmp must return a value whose sign reflects
-// the first differing bytes.  We return (LHS byte - RHS byte) for both, so we treat them identically.
-//
-// We optimize the compare by doing it with SSE.  This introduces
-// a complication: if we blindly did vector loads from both sides until
-// finding a difference, we might get a spurious page fault by
-// reading bytes past the difference.  To avoid this, we never do a load
-// that crosses a page boundary.
-
-#define        kShort  18                      // lengths <= kShort use only the byte loop (must be >16)
-
-        .text
-        .align         4
-
-        .globl _memcmp
-        .globl _bcmp
-
-_memcmp:                               // int memcmp(const char *s1,const char *s2,size_t len);
-_bcmp:                                 // int   bcmp(const char *s1,const char *s2,size_t len);
-       pushl   %esi                    // save %esi (restored on every exit path); will hold LHS ptr
-       pushl   %edi                    // save %edi (restored on every exit path); will hold RHS ptr
-       movl    20(%esp),%ecx           // get length (arg offsets account for the two pushes above)
-       movl    12(%esp),%esi           // get LHS ptr (s1)
-       movl    16(%esp),%edi           // get RHS ptr (s2)
-       cmpl    $(kShort),%ecx          // worth accelerating? (unsigned compare against kShort)
-       ja      LNotShort               // yes; otherwise fall through into LShort
-       
-
-// Too short to bother with parallel compares.  Loop over bytes until a mismatch or the end.
-//     %esi = LHS ptr
-//     %edi = RHS ptr
-//     %ecx = length (<= kShort at LShort; LShortLoop may be entered with more, but never with 0)
-
-LShort:
-       testl   %ecx,%ecx               // 0-length?
-       jnz     LShortLoop              // no
-       xorl    %eax,%eax               // return 0
-       jmp     LExit
-       .align  4,0x90                  // align inner loops to optimize I-fetch
-LShortLoop:                            // loop over bytes
-       movzb   (%esi),%eax             // get LHS byte (zero-extended)
-       movzb   (%edi),%edx             // get RHS byte (zero-extended)
-       incl    %esi
-       incl    %edi
-       subl    %edx,%eax               // compare them: %eax = LHS - RHS, sets ZF if equal
-       jnz     LExit                   // done if not equal; the difference is the return value
-       decl    %ecx                    // decrement length
-       jnz     LShortLoop              // if length runs out, %eax is still 0 from the last subl
-LExit:                                 // return value is in %eax
-       popl    %edi                    // restore registers saved on entry
-       popl    %esi
-       ret
-       
-LNotEqual:                             // here from LLoopOverBytes with LHS in eax
-       movzb   (%edi),%edx             // get RHS byte (%edi has not yet advanced past the mismatch)
-       subl    %edx,%eax               // generate return value (nonzero): LHS byte - RHS byte
-       popl    %edi                    // restore registers saved on entry
-       popl    %esi
-       ret
-
-       
-// Loop over bytes until we reach end of a page, then fall through into LNotShort.
-//     %esi = LHS ptr
-//     %edi = RHS ptr
-//     %ecx = length remaining after end of loop (ie, already adjusted; > kShort)
-//     %edx = #bytes until next page (1..15)
-
-       .align  4,0x90                  // align inner loops to optimize I-fetch
-LLoopOverBytes:
-       movzb   (%esi),%eax             // get LHS byte (zero-extended; LNotEqual uses all of %eax)
-       inc     %esi
-       cmpb    (%edi),%al              // compare to RHS byte
-       jnz     LNotEqual               // done if not equal (%edi still points at the RHS byte)
-       inc     %edi                    // advance RHS ptr only after the bytes matched
-       dec     %edx                    // more to go?
-       jnz     LLoopOverBytes          // yes; when done, fall through into LNotShort
-       
-
-// Long enough to justify overhead of setting up vector compares.  In order to
-// avoid spurious page faults, we loop over:
-//
-//     min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
-//
-// 16-byte chunks.  When we near a page end, we have to revert to a byte-by-byte
-// comparison until reaching the next page, then resume the vector comparison.
-//     %esi = LHS ptr
-//     %edi = RHS ptr
-//     %ecx = length (> kShort)
-
-LNotShort:
-       movl    %esi,%eax               // copy ptrs
-       movl    %edi,%edx
-       andl    $4095,%eax              // mask down to page offsets (4096-byte pages)
-       andl    $4095,%edx
-       cmpl    %eax,%edx               // which is bigger?
-       cmova   %edx,%eax               // %eax = max(LHS offset, RHS offset);
-       movl    $4096,%edx
-       subl    %eax,%edx               // get #bytes to next page crossing (of whichever ptr is nearer)
-       cmpl    %ecx,%edx               // will operand run out first?
-       cmova   %ecx,%edx               // get min(length remaining, bytes to page end)
-       movl    %edx,%eax               // keep that byte count in %eax for the page-end path below
-       shrl    $4,%edx                 // get #chunks till end of operand or page
-       jnz     LLoopOverChunks         // enter vector loop if there is at least one full chunk
-       
-// Too near page end for vectors: %eax = #bytes to page end (<16, since length > kShort >= 16).
-
-       subl    %eax,%ecx               // adjust length remaining
-       movl    %eax,%edx               // %edx <- #bytes to page end
-       cmpl    $(kShort),%ecx          // will there be enough after we cross page for vectors?
-       ja      LLoopOverBytes          // yes
-       addl    %eax,%ecx               // no, restore total length remaining
-       jmp     LShortLoop              // compare rest byte-by-byte (%ecx != 0)
-
-
-// Loop over 16-byte chunks; the chunk count was chosen so no load crosses a page boundary.
-//     %esi = LHS ptr
-//     %edi = RHS ptr
-//     %ecx = length remaining
-//     %edx = chunk count
-
-       .align  4,0x90                  // align inner loops to optimize I-fetch
-LLoopOverChunks:
-       movdqu  (%esi),%xmm0            // get LHS (unaligned 16-byte load)
-       movdqu  (%edi),%xmm1            // get RHS (unaligned 16-byte load)
-       addl    $16,%esi
-       pcmpeqb %xmm1,%xmm0             // compare LHS to RHS
-       addl    $16,%edi
-       pmovmskb %xmm0,%eax             // collect comparison result bits (1 if equal)
-       subl    $16,%ecx                // adjust length remaining
-       xorl    $0xFFFF,%eax            // all equal? (inverts the 16 bits: now 1 = byte differs)
-       jne     LDifferent              // no, we found differing bytes
-       dec     %edx                    // more to go?
-       jnz     LLoopOverChunks
-       
-       cmpl    $(kShort),%ecx          // a lot more to compare?
-       jbe     LShort                  // no (may be 0; LShort checks for that)
-       jmp     LNotShort               // compute distance to next page crossing etc
-
-
-// Found a difference.  Locate the first differing byte and return LHS byte - RHS byte.
-//     %esi = LHS ptr, already advanced by 16
-//     %edi = RHS ptr, already advanced by 16
-//     %eax = complemented compare vector (ie, 0 == equal)
-
-LDifferent:
-       bsf     %eax,%edx               // which byte differed? (lowest set bit = first mismatch)
-       subl    $16,%esi                // point to byte 0 while we wait for bit scan
-       subl    $16,%edi
-       movzb   (%esi,%edx),%eax        // get LHS byte
-       movzb   (%edi,%edx),%ecx        // get RHS byte (%ecx reused; length no longer needed)
-       subl    %ecx,%eax               // compute difference (ie, return value)
-       popl    %edi                    // restore registers saved on entry
-       popl    %esi
-       ret