i386/string/memcmp.s

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24
  25 // ***************     ***********
  26 // * M E M C M P * and * B C M P *
  27 // ***************     ***********
  28 //
  29 // int  memcmp(const char *s1, const char *s2, size_t len);
  30 // int    bcmp(const char *s1, const char *s2, size_t len);
  31 //
  32 // Bcmp returns (+,0,-), whereas memcmp returns the true difference
  33 // between the first differing bytes, but we treat them identically.
  34 //
  35 // We optimize the compare by doing it with SSE.  This introduces
  36 // a complication: if we blindly did vector loads from both sides until
  37 // finding a difference, we might get a spurious page fault by
  38 // reading bytes past the difference.  To avoid this, we never do a load
  39 // that crosses a page boundary.
  40
  41 #define kShort  18                      // too short for vectors (must be >16)
  42
  43         .text
  44         .align  4
  45
  46         .globl _memcmp
  47         .globl _bcmp
  48
  49 _memcmp:                                // int memcmp(const char *s1,const char *s2,size_t len);
  50 _bcmp:                                  // int   bcmp(const char *s1,const char *s2,size_t len);
  51         pushl   %esi
  52         pushl   %edi
  53         movl    20(%esp),%ecx           // get length
  54         movl    12(%esp),%esi           // get LHS ptr
  55         movl    16(%esp),%edi           // get RHS ptr
  56         cmpl    $(kShort),%ecx          // worth accelerating?
  57         ja      LNotShort               // yes
  58
  59
  60 // Too short to bother with parallel compares.  Loop over bytes.
  61 //      %esi = LHS ptr
  62 //      %edi = RHS ptr
  63 //      %ecx = length (<= kShort)
  64
  65 LShort:
  66         testl   %ecx,%ecx               // 0-length?
  67         jnz     LShortLoop              // no
  68         xorl    %eax,%eax               // return 0
  69         jmp     LExit
  70         .align  4,0x90                  // align inner loops to optimize I-fetch
  71 LShortLoop:                             // loop over bytes
  72         movzb   (%esi),%eax             // get LHS byte
  73         movzb   (%edi),%edx             // get RHS byte
  74         incl    %esi
  75         incl    %edi
  76         subl    %edx,%eax               // compare them
  77         jnz     LExit                   // done if not equal
  78         decl    %ecx                    // decrement length
  79         jnz     LShortLoop
  80 LExit:                                  // return value is in %eax
  81         popl    %edi
  82         popl    %esi
  83         ret
  84
  85 LNotEqual:                              // here from LLoopOverBytes with LHS in eax
  86         movzb   (%edi),%edx             // get RHS byte
  87         subl    %edx,%eax               // generate return value (nonzero)
  88         popl    %edi
  89         popl    %esi
  90         ret
  91
  92
  93 // Loop over bytes until we reach end of a page.
  94 //      %esi = LHS ptr
  95 //      %edi = RHS ptr
  96 //      %ecx = length remaining after end of loop (ie, already adjusted)
  97 //      %edx = #bytes until next page (1..15)
  98
  99         .align  4,0x90                  // align inner loops to optimize I-fetch
 100 LLoopOverBytes:
 101         movzb   (%esi),%eax             // get LHS byte
 102         inc     %esi
 103         cmpb    (%edi),%al              // compare to RHS byte
 104         jnz     LNotEqual               // done if not equal
 105         inc     %edi
 106         dec     %edx                    // more to go?
 107         jnz     LLoopOverBytes
 108
 109
 110 // Long enough to justify overhead of setting up vector compares.  In order to
 111 // avoid spurious page faults, we loop over:
 112 //
 113 //      min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
 114 //
 115 // 16-byte chunks.  When we near a page end, we have to revert to a byte-by-byte
 116 // comparison until reaching the next page, then resume the vector comparison.
 117 //      %esi = LHS ptr
 118 //      %edi = RHS ptr
 119 //      %ecx = length (> kShort)
 120
 121 LNotShort:
 122         movl    %esi,%eax               // copy ptrs
 123         movl    %edi,%edx
 124         andl    $4095,%eax              // mask down to page offsets
 125         andl    $4095,%edx
 126         cmpl    %eax,%edx               // which is bigger?
 127         cmova   %edx,%eax               // %eax = max(LHS offset, RHS offset);
 128         movl    $4096,%edx
 129         subl    %eax,%edx               // get #bytes to next page crossing
 130         cmpl    %ecx,%edx               // will operand run out first?
 131         cmova   %ecx,%edx               // get min(length remaining, bytes to page end)
 132         movl    %edx,%eax
 133         shrl    $4,%edx                 // get #chunks till end of operand or page
 134         jnz     LLoopOverChunks         // enter vector loop
 135
 136 // Too near page end for vectors.
 137
 138         subl    %eax,%ecx               // adjust length remaining
 139         movl    %eax,%edx               // %edx <- #bytes to page end
 140         cmpl    $(kShort),%ecx          // will there be enough after we cross page for vectors?
 141         ja      LLoopOverBytes          // yes
 142         addl    %eax,%ecx               // no, restore total length remaining
 143         jmp     LShortLoop              // compare rest byte-by-byte (%ecx != 0)
 144
 145
 146 // Loop over 16-byte chunks.
 147 //      %esi = LHS ptr
 148 //      %edi = RHS ptr
 149 //      %ecx = length remaining
 150 //      %edx = chunk count
 151
 152         .align  4,0x90                  // align inner loops to optimize I-fetch
 153 LLoopOverChunks:
 154         movdqu  (%esi),%xmm0            // get LHS
 155         movdqu  (%edi),%xmm1            // get RHS
 156         addl    $16,%esi
 157         pcmpeqb %xmm1,%xmm0             // compare LHS to RHS
 158         addl    $16,%edi
 159         pmovmskb %xmm0,%eax             // collect comparison result bits (1 if equal)
 160         subl    $16,%ecx                // adjust length remaining
 161         xorl    $0xFFFF,%eax            // all equal?
 162         jne     LDifferent              // no, we found differing bytes
 163         dec     %edx                    // more to go?
 164         jnz     LLoopOverChunks
 165
 166         cmpl    $(kShort),%ecx          // a lot more to compare?
 167         jbe     LShort                  // no
 168         jmp     LNotShort               // compute distance to next page crossing etc
 169
 170
 171 // Found a difference.
 172 //      %esi = LHS ptr, already advanced by 16
 173 //      %edi = RHS ptr, already advanced by 16
 174 //      %eax = complemented compare vector (ie, 0 == equal)
 175
 176 LDifferent:
 177         bsf     %eax,%edx               // which byte differed?
 178         subl    $16,%esi                // point to byte 0 while we wait for bit scan
 179         subl    $16,%edi
 180         movzb   (%esi,%edx),%eax        // get LHS byte
 181         movzb   (%edi,%edx),%ecx        // get RHS byte
 182         subl    %ecx,%eax               // compute difference (ie, return value)
 183         popl    %edi
 184         popl    %esi
 185         ret