x86_64/string/memcmp.s

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24
  25 // ***************     ***********
  26 // * M E M C M P * and * B C M P *
  27 // ***************     ***********
  28 //
  29 // int  memcmp(const char *s1, const char *s2, size_t len);
  30 // int    bcmp(const char *s1, const char *s2, size_t len);
  31 //
  32 // Bcmp need only return zero/nonzero, whereas memcmp must return a value whose sign
  33 // orders the first differing bytes.  We return that byte difference for both.
  34 //
  35 // We optimize the compare by doing it with SSE.  This introduces
  36 // a complication: if we blindly did vector loads from both sides until
  37 // finding a difference, we might get a spurious page fault by
  38 // reading bytes past the difference.  To avoid this, we never do a load
  39 // that crosses a page boundary.
  40
  41 #define kShort  18                      // lengths <= kShort are compared byte-by-byte (must be >16)
  42
  43         .text                           // code section
  44         .align  4                       // align the entry point
  45
  46         .globl _memcmp                  // both names are exported and
  47         .globl _bcmp                    // share the one implementation below
  48
  49 _memcmp:                                // int memcmp(const char *s1,const char *s2,size_t len);
  50 _bcmp:                                  // int   bcmp(const char *s1,const char *s2,size_t len);
  51         cmpq    $(kShort),%rdx          // more than kShort bytes?  (unsigned compare on len)
  52         ja      LNotShort               // yes, worth setting up vector compares
  53
  54
  55 // Too short to bother with parallel compares.  Loop over bytes.
  56 //      %rdi = LHS ptr
  57 //      %rsi = RHS ptr
  58 //      %edx = length (<= kShort, so the low 32 bits hold all of it)
  59
  60 LShort:
  61         testl   %edx,%edx               // 0-length?
  62         jnz     LShortLoop              // no
  63         xorl    %eax,%eax               // return 0 (32-bit write also clears upper half of %rax)
  64         ret                             // return directly rather than branching to LExit
  65         .align  4,0x90                  // align inner loops to optimize I-fetch
  66 LShortLoop:                             // loop over bytes; %rdx = count, must be nonzero on entry
  67         movzb   (%rdi),%eax             // get LHS byte (zero-extended)
  68         movzb   (%rsi),%ecx             // get RHS byte (zero-extended)
  69         addq    $1,%rdi                 // advance both ptrs
  70         addq    $1,%rsi
  71         subl    %ecx,%eax               // compare them: %eax = LHS byte - RHS byte
  72         jnz     LExit                   // done if not equal; the difference is the return value
  73         subq    $1,%rdx                 // decrement length
  74         jnz     LShortLoop              // fall out with %eax == 0 when all bytes matched
  75 LExit:                                  // return value is in %eax
  76         ret
  77
  78 LNotEqual:                              // here from LLoopOverBytes with LHS byte in %eax, %rsi not yet advanced
  79         movzb   (%rsi),%ecx             // reload the differing RHS byte, zero-extended
  80         subl    %ecx,%eax               // generate return value = LHS byte - RHS byte (nonzero)
  81         ret
  82
  83
  84 // Loop over bytes until we reach end of a page.
  85 //      %rdi = LHS ptr
  86 //      %rsi = RHS ptr
  87 //      %rdx = length remaining after end of loop (i.e., already adjusted)
  88 //      %ecx = #bytes until next page (1..15)
  89
  90         .align  4,0x90                  // align inner loops to optimize I-fetch
  91 LLoopOverBytes:
  92         movzb   (%rdi),%eax             // get LHS byte
  93         addq    $1,%rdi                 // advance LHS ptr
  94         cmpb    (%rsi),%al              // compare to RHS byte
  95         jnz     LNotEqual               // done if not equal (LNotEqual rereads the byte at %rsi)
  96         addq    $1,%rsi                 // advance RHS ptr only after a match
  97         subl    $1,%ecx                 // more to go?
  98         jnz     LLoopOverBytes          // yes; otherwise fall through into LNotShort
  99
 100
 101 // Long enough to justify overhead of setting up vector compares.  In order to
 102 // avoid spurious page faults, we loop over:
 103 //
 104 //      min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
 105 //
 106 // 16-byte chunks.  When we near a page end, we have to revert to a byte-by-byte
 107 // comparison until reaching the next page, then resume the vector comparison.
 108 //      %rdi = LHS ptr
 109 //      %rsi = RHS ptr
 110 //      %rdx = length (> kShort)
 111
 112 LNotShort:
 113         movq    %rdi,%rax               // copy ptrs
 114         movq    %rsi,%rcx
 115         andq    $4095,%rax              // mask down to page offsets (4096-byte pages)
 116         andq    $4095,%rcx
 117         cmpq    %rax,%rcx               // which is bigger?
 118         cmova   %rcx,%rax               // %eax = max(LHS offset, RHS offset), i.e. the operand nearer its page end
 119         movl    $4096,%ecx
 120         subl    %eax,%ecx               // get #bytes to next page crossing
 121         cmpq    %rdx,%rcx               // will operand run out first?
 122         cmova   %edx,%ecx               // get min(length remaining, bytes to page end)
 123         movl    %ecx,%eax               // keep the byte count in %eax for the near-page-end path
 124         shrl    $4,%ecx                 // get #chunks till end of operand or page
 125         jnz     LLoopOverChunks         // enter vector loop
 126
 127 // Too near page end for vectors: %eax < 16, and since length > kShort it is the distance to the page end.
 128
 129         subq    %rax,%rdx               // adjust length remaining
 130         movl    %eax,%ecx               // %ecx <- #bytes to page end
 131         cmpq    $(kShort),%rdx          // will there be enough after we cross page for vectors?
 132         ja      LLoopOverBytes          // yes
 133         addq    %rax,%rdx               // no, restore total length remaining
 134         jmp     LShortLoop              // compare rest byte-by-byte (%rdx != 0)
 135
 136
 137 // Loop over 16-byte chunks.
 138 //      %rdi = LHS ptr
 139 //      %rsi = RHS ptr
 140 //      %rdx = length remaining
 141 //      %ecx = chunk count (nonzero)
 142
 143         .align  4,0x90                  // align inner loops to optimize I-fetch
 144 LLoopOverChunks:
 145         movdqu  (%rdi),%xmm0            // get LHS (unaligned 16-byte load)
 146         movdqu  (%rsi),%xmm1            // get RHS (unaligned 16-byte load)
 147         addq    $16,%rdi                // advance LHS ptr
 148         pcmpeqb %xmm1,%xmm0             // compare LHS to RHS, byte by byte
 149         addq    $16,%rsi                // advance RHS ptr
 150         pmovmskb %xmm0,%eax             // collect comparison result bits (1 if equal)
 151         subq    $16,%rdx                // adjust length remaining
 152         xorl    $0xFFFF,%eax            // all equal?  complements the 16 bits, so 1 now marks a differing byte
 153         jne     LDifferent              // no, we found differing bytes
 154         subl    $1,%ecx                 // more to go?
 155         jnz     LLoopOverChunks         // yes, %eax is 0 whenever we get here
 156
 157         cmpq    $(kShort),%rdx          // a lot more to compare?
 158         jbe     LShort                  // no (possibly 0 left; LShort handles that)
 159         jmp     LNotShort               // compute distance to next page crossing etc
 160
 161
 162 // Found a difference.
 163 //      %rdi = LHS ptr, already advanced by 16
 164 //      %rsi = RHS ptr, already advanced by 16
 165 //      %eax = complemented compare vector (ie, 0 == equal)
 166
 167 LDifferent:
 168         bsf     %eax,%edx               // which byte differed?  lowest set bit = first differing byte index
 169         subq    $16,%rdi                // point to byte 0 while we wait for bit scan
 170         subq    $16,%rsi
 171         movzb   (%rdi,%rdx),%eax        // get LHS byte (zero-extended)
 172         movzb   (%rsi,%rdx),%ecx        // get RHS byte (zero-extended)
 173         subl    %ecx,%eax               // compute difference (ie, return value), nonzero here
 174         ret