i386/string/strcmp.s

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24
  25 // ***************
  26 // * S T R C M P *
  27 // ***************
  28 //
  29 // int  strcmp(const char *s1, const char *s2);
  30 //
  31 // We optimize the compare by doing it in parallel, using SSE.  This introduces
  32 // a complication: if we blindly did vector loads from both sides until
  33 // finding a difference (or 0), we might get a spurious page fault by
  34 // reading bytes past the difference.  To avoid this, we never do a load
  35 // that crosses a page boundary.
  36
  37         .text
  38         .globl _strcmp
  39 // Returns in %eax: (LHS byte - RHS byte), both zero-extended, at the first position where they differ or the LHS byte is 0.
  40         .align  4
  41 _strcmp:                                // int strcmp(const char *s1,const char *s2);
  42         pushl   %esi                    // preserve caller's %esi and %edi; both are popped again before every ret
  43         pushl   %edi
  44         movl    12(%esp),%esi           // get LHS ptr (s1); the offset accounts for the two pushes above
  45         movl    16(%esp),%edi           // get RHS ptr (s2)
  46 // Scratch (written without being saved): %eax, %ecx, %edx, %xmm0-%xmm2.
  47
  48 // In order to avoid spurious page faults, we loop over:
  49 //
  50 //      min( bytes_in_LHS_page, bytes_in_RHS_page) >> 4
  51 //
  52 // 16-byte chunks.  When we near a page end, we have to revert to a byte-by-byte
  53 // comparison until reaching the next page, then resume the vector comparison.
  54 //      %esi = LHS ptr
  55 //      %edi = RHS ptr
  56
  57 LNextChunk:
  58         movl    %esi,%eax               // copy ptrs
  59         movl    %edi,%edx
  60         andl    $4095,%eax              // mask down to page offsets (the code assumes 4096-byte pages)
  61         andl    $4095,%edx
  62         cmpl    %eax,%edx               // which is bigger? (unsigned compare of RHS offset against LHS offset)
  63         cmova   %edx,%eax               // %eax = max(LHS offset, RHS offset), ie the ptr nearer its page end
  64         movl    $4096,%edx
  65         subl    %eax,%edx               // get #bytes until the first ptr reaches a page boundary (1..4096)
  66         movl    %edx,%eax               // keep the byte count in %eax; %edx becomes the chunk count
  67         shrl    $4,%edx                 // get #whole 16-byte chunks before that page boundary
  68         jnz     LLoopOverChunks         // at least one chunk: enter vector loop
  69         movl    %eax,%edx               // no chunks (fewer than 16 bytes left)...
  70         jmp     LLoopOverBytes          // ...so loop over bytes until page end
  71
  72
  73 // Loop over bytes: compare one byte per pass until a 0, a difference, or the count runs out.
  74 //      %esi = LHS ptr
  75 //      %edi = RHS ptr
  76 //      %edx = byte count (must be nonzero on entry: it is decremented before it is tested)
  77
  78         .align  4,0x90                  // align inner loops to optimize I-fetch (padding filled with 0x90 = NOP)
  79 LLoopOverBytes:
  80         movzb   (%esi),%eax             // get LHS byte, zero-extended
  81         movzb   (%edi),%ecx             // get RHS byte, zero-extended
  82         inc     %esi                    // advance both ptrs now; the exit paths do not use them again
  83         inc     %edi
  84         testl   %eax,%eax               // LHS byte 0?
  85         jz      LExit0                  // yes, we're done (LExit0 computes 0 - RHS byte)
  86         subl    %ecx,%eax               // compare them: %eax = LHS byte - RHS byte
  87         jnz     LExit                   // done if not equal; the difference is already in %eax
  88         dec     %edx                    // more to go?
  89         jnz     LLoopOverBytes
  90
  91         jmp     LNextChunk              // we've come to end of page; recompute the distance to the next one
  92
  93
  94 // Loop over 16-byte chunks: compare 16 bytes per pass until a 0, a difference, or the count runs out.
  95 //      %esi = LHS ptr
  96 //      %edi = RHS ptr
  97 //      %edx = chunk count (must be nonzero on entry: it is decremented before it is tested)
  98
  99         .align  4,0x90                  // align inner loops to optimize I-fetch (padding filled with 0x90 = NOP)
 100 LLoopOverChunks:
 101         movdqu  (%esi),%xmm1            // get LHS (unaligned 16-byte load)
 102         movdqu  (%edi),%xmm2            // get RHS (unaligned 16-byte load)
 103         pxor    %xmm0,%xmm0             // get some 0s in the shadow of the loads (redone each pass: pcmpeqb below overwrites %xmm0)
 104         addl    $16,%esi
 105         pcmpeqb %xmm1,%xmm2             // compare LHS to RHS: byte n of %xmm2 = 0xFF where equal, else 0
 106         pcmpeqb %xmm1,%xmm0             // compare LHS to 0s: byte n of %xmm0 = 0xFF where LHS byte is 0
 107         addl    $16,%edi
 108         pmovmskb %xmm2,%eax             // get result mask for comparison of LHS and RHS (bit n <- byte n, 16 bits)
 109         pmovmskb %xmm0,%ecx             // get result mask for 0 check
 110         xorl    $0xFFFF,%eax            // complement the 16 mask bits so 1 means "not equal"
 111         orl     %ecx,%eax               // combine the masks and check for 1-bits (a 0 only in RHS already shows as "not equal")
 112         jnz     LFoundDiffOr0           // we found differing bytes or a 0-byte; both ptrs are already 16 past the chunk
 113         dec     %edx                    // more to go?
 114         jnz     LLoopOverChunks
 115
 116         jmp     LNextChunk              // compare up to next page boundary
 117
 118
 119 // Found a zero and/or a difference in vector compare; return the byte difference at the first such position.
 120 //      %esi = LHS ptr, already advanced by 16
 121 //      %edi = RHS ptr, already advanced by 16
 122 //      %eax = bit n set if byte n differed or the LHS byte n was 0 (nonzero: we only get here via jnz)
 123
 124 LFoundDiffOr0:
 125         bsf     %eax,%edx               // which byte differed or was 0? (%edx = index of lowest set bit)
 126         subl    $16,%esi                // point to start of vectors while we wait for bit scan
 127         subl    $16,%edi
 128         movzb   (%esi,%edx),%eax        // get LHS byte, zero-extended
 129         movzb   (%edi,%edx),%ecx        // get RHS byte, zero-extended
 130         subl    %ecx,%eax               // compute difference (ie, return value)
 131         popl    %edi                    // restore caller's registers in reverse order of the entry pushes
 132         popl    %esi
 133         ret
 134
 135
 136 // Found a zero and/or difference in byte loop.
 137 //      LExit0: %eax = LHS byte (0 when reached from the byte loop), %ecx = RHS byte
 138 //      LExit:  %eax = LHS byte - RHS byte, already computed by the byte loop
 139
 140 LExit0:
 141         subl    %ecx,%eax               // compute difference (ie, return value)
 142 LExit:                                  // here with difference already in %eax
 143         popl    %edi                    // restore caller's registers in reverse order of the entry pushes
 144         popl    %esi
 145         ret