x86_64/string/strncmp.s

   1 /*
   2  * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24
  25 // *****************
  26 // * S T R N C M P *
  27 // *****************
  28 //
  29 // int  strncmp(const char *s1, const char *s2, size_t len);
  30 //
  31 // We optimize the compare by doing it vector parallel.  This introduces
  32 // a complication: if we blindly did vector loads from both sides until
  33 // finding a difference (or 0), we might get a spurious page fault by
  34 // reading bytes past the difference.  To avoid this, we never do a load
  35 // that crosses a page boundary.
  36
  37 #define kShort  20                      // too short for vectors (must be >16)
  38
  39         .text
  40         .globl _strncmp
  41
  42         .align  4
  43 _strncmp:                               // int strncmp(const char *s1, const char *s2, size_t len);
  44         cmpq    $(kShort),%rdx          // worth accelerating?
  45         ja      LNotShort               // yes
  46
  47
  48 // Too short to bother with parallel compares.  Loop over bytes.
  49 //      %rdi = LHS ptr
  50 //      %rsi = RHS ptr
  51 //      %edx = length (<= kShort)
  52
  53 LShort:
  54         testl   %edx,%edx               // 0-length?
  55         jnz     LShortLoop              // no
  56         jmp     LReturn0                // yes, return 0
  57         .align  4,0x90                  // align inner loops to optimize I-fetch
  58 LShortLoop:                             // loop over bytes
  59         movzb   (%rdi),%eax             // get LHS byte
  60         movzb   (%rsi),%ecx             // get RHS byte
  61         incq    %rdi
  62         incq    %rsi
  63         testl   %eax,%eax               // LHS==0 ?
  64         jz      LNotEqual               // yes, this terminates comparison
  65         cmpl    %ecx,%eax               // compare them (sub won't fuse, but cmp will)
  66         jnz     LNotEqual               // done if not equal
  67         decl    %edx                    // decrement length
  68         jnz     LShortLoop
  69 LReturn0:
  70         xorl    %eax,%eax               // all bytes equal, so return 0
  71         ret
  72
  73 LNotEqual:                              // LHS in eax, RHS in ecx
  74         subl    %ecx,%eax               // generate return value (nonzero)
  75         ret
  76
  77
  78 // Loop over bytes until we reach end of a page.
  79 //      %rdi = LHS ptr
  80 //      %rsi = RHS ptr
  81 //      %rdx = length remaining after end of loop (ie, already adjusted)
  82 //      %r8d = #bytes until next page (1..15)
  83
  84         .align  4,0x90                  // align inner loops to optimize I-fetch
  85 LLoopOverBytes:
  86         movzb   (%rdi),%eax             // get LHS byte
  87         movzb   (%rsi),%ecx             // get RHS byte
  88         incq    %rdi
  89         incq    %rsi
  90         testl   %eax,%eax               // LHS==0 ?
  91         jz      LNotEqual               // yes, this terminates comparison
  92         cmpl    %ecx,%eax               // compare them (sub won't fuse, but cmp will)
  93         jnz     LNotEqual               // done if not equal
  94         decl    %r8d                    // more to go?
  95         jnz     LLoopOverBytes
  96
  97
  98 // Long enough to justify overhead of setting up vector compares.  In order to
  99 // avoid spurious page faults, we loop over:
 100 //
 101 //      min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
 102 //
 103 // 16-byte chunks.  When we near a page end, we have to revert to a byte-by-byte
 104 // comparison until reaching the next page, then resume the vector comparison.
 105 //      %rdi = LHS ptr
 106 //      %rsi = RHS ptr
 107 //      %rdx = length (> kShort)
 108
 109 LNotShort:
 110         movl    %edi,%eax               // copy ptrs
 111         movl    %esi,%r8d
 112         andl    $4095,%eax              // mask down to page offsets
 113         andl    $4095,%r8d
 114         cmpl    %eax,%r8d               // which is bigger?
 115         cmova   %r8d,%eax               // %eax = max(LHS offset, RHS offset);
 116         movl    $4096,%r8d
 117         subl    %eax,%r8d               // get #bytes to next page crossing
 118         cmpq    %rdx,%r8                // will operand run out first?
 119         cmova   %rdx,%r8                // get min(length remaining, bytes to page end)
 120         movl    %r8d,%eax               // copy #bytes
 121         shrl    $4,%r8d                 // get #chunks till end of operand or page
 122         testl   %r8d,%r8d               // test %r8d for 0 without partial flag update stall
 123         jnz     LLoopOverChunks         // enter vector loop
 124
 125 // Too near page end for vectors.
 126
 127         subq    %rax,%rdx               // adjust length remaining
 128         movl    %eax,%r8d               // %r8d <- #bytes to page end
 129         cmpq    $(kShort),%rdx          // will there be enough after we cross page for vectors?
 130         ja      LLoopOverBytes          // yes
 131         addq    %rax,%rdx               // no, restore total length remaining
 132         jmp     LShortLoop              // compare rest byte-by-byte (%rdx != 0)
 133
 134
 135 // Loop over 16-byte chunks.
 136 //      %rdi = LHS ptr
 137 //      %rsi = RHS ptr
 138 //      %rdx = length remaining
 139 //      %r8d = chunk count
 140
 141         .align  4,0x90                  // align inner loops to optimize I-fetch
 142 LLoopOverChunks:
 143         movdqu  (%rdi),%xmm1            // get LHS
 144         movdqu  (%rsi),%xmm2            // get RHS
 145         pxor    %xmm0,%xmm0             // get some 0s in the shadow of the loads
 146         addq    $16,%rdi
 147         pcmpeqb %xmm1,%xmm2             // compare LHS to RHS
 148         pcmpeqb %xmm1,%xmm0             // compare LHS to 0s
 149         addq    $16,%rsi
 150         pmovmskb %xmm2,%eax             // get result mask for comparison of LHS and RHS
 151         pmovmskb %xmm0,%ecx             // get result mask for 0 check
 152         subq    $16,%rdx                // decrement length remaining
 153         xorl    $0xFFFF,%eax            // complement compare mask so 1 means "not equal"
 154         orl     %ecx,%eax               // combine the masks and check for 1-bits
 155         jnz     LFoundDiffOr0           // we found differing bytes or a 0-byte
 156         decl    %r8d                    // more to go?
 157         jnz     LLoopOverChunks         // yes
 158
 159         cmpq    $(kShort),%rdx          // a lot more to compare?
 160         jbe     LShort                  // no
 161         jmp     LNotShort               // compute distance to next page crossing etc
 162
 163
 164 // Found a zero and/or a difference in vector compare.
 165 //      %rdi = LHS ptr, already advanced by 16
 166 //      %rsi = RHS ptr, already advanced by 16
 167 //      %eax = bit n set if bytes n differed or were 0
 168
 169 LFoundDiffOr0:
 170         bsf     %eax,%edx               // which byte differed or was 0?
 171         movzb   -16(%rdi,%rdx),%eax     // get LHS byte
 172         movzb   -16(%rsi,%rdx),%edx     // get RHS byte
 173         subl    %edx,%eax               // compute difference (ie, return value)
 174         ret