i386/string/strncmp.s

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24
  25 // *****************
  26 // * S T R N C M P *
  27 // *****************
  28 //
  29 // int  strncmp(const char *s1, const char *s2, size_t len);
  30 //
  31 // We optimize the compare by doing it vector parallel.  This introduces
  32 // a complication: if we blindly did vector loads from both sides until
  33 // finding a difference (or 0), we might get a spurious page fault by
  34 // reading bytes past the difference.  To avoid this, we never do a load
  35 // that crosses a page boundary.
  36
  37 #define kShort  20                      // too short for vectors (must be >16)
  38
  39         .text
  40         .globl _strncmp
  41
  42         .align  4
  43 _strncmp:                               // int strncmp(const char *s1, const char *s2, size_t len);
  44         pushl   %esi
  45         pushl   %edi
  46         movl    20(%esp),%ecx           // get length
  47         movl    12(%esp),%esi           // get LHS ptr
  48         movl    16(%esp),%edi           // get RHS ptr
  49         push    %ebx
  50         cmpl    $(kShort),%ecx          // worth accelerating?
  51         ja      LNotShort               // yes
  52
  53
  54 // Too short to bother with parallel compares.  Loop over bytes.
  55 //      %esi = LHS ptr
  56 //      %edi = RHS ptr
  57 //      %ecx = length (<= kShort)
  58
  59 LShort:
  60         testl   %ecx,%ecx               // 0-length?
  61         jnz     LShortLoop              // no
  62         jmp     LReturn0                // yes, return 0
  63         .align  4,0x90                  // align inner loops to optimize I-fetch
  64 LShortLoop:                             // loop over bytes
  65         movzb   (%esi),%eax             // get LHS byte
  66         movzb   (%edi),%ebx             // get RHS byte
  67         incl    %esi
  68         incl    %edi
  69         testl   %eax,%eax               // LHS==0 ?
  70         jz      LNotEqual               // yes, this terminates comparison
  71         subl    %ebx,%eax               // compare them
  72         jnz     LExit                   // done if not equal
  73         decl    %ecx                    // decrement length
  74         jnz     LShortLoop
  75 LReturn0:
  76         xorl    %eax,%eax               // all bytes equal, so return 0
  77 LExit:                                  // return value is in %eax
  78         popl    %ebx
  79         popl    %edi
  80         popl    %esi
  81         ret
  82
  83 LNotEqual:                              // LHS in eax, RHS in ebx
  84         subl    %ebx,%eax               // generate return value (nonzero)
  85         popl    %ebx
  86         popl    %edi
  87         popl    %esi
  88         ret
  89
  90
  91 // Loop over bytes until we reach end of a page.
  92 //      %esi = LHS ptr
  93 //      %edi = RHS ptr
  94 //      %ecx = length remaining after end of loop (ie, already adjusted)
  95 //      %edx = #bytes until next page (1..15)
  96
  97         .align  4,0x90                  // align inner loops to optimize I-fetch
  98 LLoopOverBytes:
  99         movzb   (%esi),%eax             // get LHS byte
 100         movzb   (%edi),%ebx             // get RHS byte
 101         inc     %esi
 102         inc     %edi
 103         testl   %eax,%eax               // LHS==0 ?
 104         jz      LNotEqual               // yes, this terminates comparison
 105         subl    %ebx,%eax               // compare them
 106         jnz     LExit                   // done if not equal
 107         dec     %edx                    // more to go?
 108         jnz     LLoopOverBytes
 109
 110
 111 // Long enough to justify overhead of setting up vector compares.  In order to
 112 // avoid spurious page faults, we loop over:
 113 //
 114 //      min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
 115 //
 116 // 16-byte chunks.  When we near a page end, we have to revert to a byte-by-byte
 117 // comparison until reaching the next page, then resume the vector comparison.
 118 //      %esi = LHS ptr
 119 //      %edi = RHS ptr
 120 //      %ecx = length (> kShort)
 121
 122 LNotShort:
 123         movl    %esi,%eax               // copy ptrs
 124         movl    %edi,%edx
 125         andl    $4095,%eax              // mask down to page offsets
 126         andl    $4095,%edx
 127         cmpl    %eax,%edx               // which is bigger?
 128         cmova   %edx,%eax               // %eax = max(LHS offset, RHS offset);
 129         movl    $4096,%edx
 130         subl    %eax,%edx               // get #bytes to next page crossing
 131         cmpl    %ecx,%edx               // will operand run out first?
 132         cmova   %ecx,%edx               // get min(length remaining, bytes to page end)
 133         movl    %edx,%eax
 134         shrl    $4,%edx                 // get #chunks till end of operand or page
 135         jnz     LLoopOverChunks         // enter vector loop
 136
 137 // Too near page end for vectors.
 138
 139         subl    %eax,%ecx               // adjust length remaining
 140         movl    %eax,%edx               // %edx <- #bytes to page end
 141         cmpl    $(kShort),%ecx          // will there be enough after we cross page for vectors?
 142         ja      LLoopOverBytes          // yes
 143         addl    %eax,%ecx               // no, restore total length remaining
 144         jmp     LShortLoop              // compare rest byte-by-byte (%ecx != 0)
 145
 146
 147 // Loop over 16-byte chunks.
 148 //      %esi = LHS ptr
 149 //      %edi = RHS ptr
 150 //      %ecx = length remaining
 151 //      %edx = chunk count
 152
 153         .align  4,0x90                  // align inner loops to optimize I-fetch
 154 LLoopOverChunks:
 155         movdqu  (%esi),%xmm1            // get LHS
 156         movdqu  (%edi),%xmm2            // get RHS
 157         pxor    %xmm0,%xmm0             // get some 0s in the shadow of the loads
 158         addl    $16,%esi
 159         pcmpeqb %xmm1,%xmm2             // compare LHS to RHS
 160         pcmpeqb %xmm1,%xmm0             // compare LHS to 0s
 161         addl    $16,%edi
 162         pmovmskb %xmm2,%eax             // get result mask for comparison of LHS and RHS
 163         pmovmskb %xmm0,%ebx             // get result mask for 0 check
 164         subl    $16,%ecx                // decrement length remaining
 165         xorl    $0xFFFF,%eax            // complement compare mask so 1 means "not equal"
 166         orl     %ebx,%eax               // combine the masks and check for 1-bits
 167         jnz     LFoundDiffOr0           // we found differing bytes or a 0-byte
 168         dec     %edx                    // more to go?
 169         jnz     LLoopOverChunks         // yes
 170
 171         cmpl    $(kShort),%ecx          // a lot more to compare?
 172         jbe     LShort                  // no
 173         jmp     LNotShort               // compute distance to next page crossing etc
 174
 175
 176 // Found a zero and/or a difference in vector compare.
 177 //      %esi = LHS ptr, already advanced by 16
 178 //      %edi = RHS ptr, already advanced by 16
 179 //      %eax = bit n set if bytes n differed or were 0
 180
 181 LFoundDiffOr0:
 182         bsf     %eax,%edx               // which byte differed or was 0?
 183         subl    $16,%esi                // point to start of vectors while we wait for bit scan
 184         subl    $16,%edi
 185         movzb   (%esi,%edx),%eax        // get LHS byte
 186         movzb   (%edi,%edx),%ecx        // get RHS byte
 187         popl    %ebx
 188         popl    %edi
 189         subl    %ecx,%eax               // compute difference (ie, return value)
 190         popl    %esi
 191         ret