/* * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this * file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_LICENSE_HEADER_END@ */ // *************** // * S T R C M P * // *************** // // int strcmp(const char *s1, const char *s2); // // We optimize the compare by doing it in parallel, using SSE. This introduces // a complication: if we blindly did vector loads from both sides until // finding a difference (or 0), we might get a spurious page fault by // reading bytes past the difference. To avoid this, we never do a load // that crosses a page boundary. .text .globl _strcmp .align 4 _strcmp: // int strcmp(const char *s1,const char *s2); pushl %esi pushl %edi movl 12(%esp),%esi // get LHS ptr movl 16(%esp),%edi // get RHS ptr // In order to avoid spurious page faults, we loop over: // // min( bytes_in_LHS_page, bytes_in_RHS_page) >> 4 // // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte // comparison until reaching the next page, then resume the vector comparison. // %esi = LHS ptr // %edi = RHS ptr LNextChunk: movl %esi,%eax // copy ptrs movl %edi,%edx andl $4095,%eax // mask down to page offsets andl $4095,%edx cmpl %eax,%edx // which is bigger? cmova %edx,%eax // %eax = max(LHS offset, RHS offset); movl $4096,%edx subl %eax,%edx // get #bytes to next page crossing movl %edx,%eax shrl $4,%edx // get #chunks till end of operand or page jnz LLoopOverChunks // enter vector loop movl %eax,%edx // no chunks... jmp LLoopOverBytes // ...so loop over bytes until page end // Loop over bytes. // %esi = LHS ptr // %edi = RHS ptr // %edx = byte count .align 4,0x90 // align inner loops to optimize I-fetch LLoopOverBytes: movzb (%esi),%eax // get LHS byte movzb (%edi),%ecx // get RHS byte inc %esi inc %edi testl %eax,%eax // 0? jz LExit0 // yes, we're done subl %ecx,%eax // compare them jnz LExit // done if not equal dec %edx // more to go? jnz LLoopOverBytes jmp LNextChunk // we've come to end of page // Loop over 16-byte chunks. // %esi = LHS ptr // %edi = RHS ptr // %edx = chunk count .align 4,0x90 // align inner loops to optimize I-fetch LLoopOverChunks: movdqu (%esi),%xmm1 // get LHS movdqu (%edi),%xmm2 // get RHS pxor %xmm0,%xmm0 // get some 0s in the shadow of the loads addl $16,%esi pcmpeqb %xmm1,%xmm2 // compare LHS to RHS pcmpeqb %xmm1,%xmm0 // compare LHS to 0s addl $16,%edi pmovmskb %xmm2,%eax // get result mask for comparison of LHS and RHS pmovmskb %xmm0,%ecx // get result mask for 0 check xorl $0xFFFF,%eax // complement compare mask so 1 means "not equal" orl %ecx,%eax // combine the masks and check for 1-bits jnz LFoundDiffOr0 // we found differing bytes or a 0-byte dec %edx // more to go? jnz LLoopOverChunks jmp LNextChunk // compare up to next page boundary // Found a zero and/or a difference in vector compare. // %esi = LHS ptr, already advanced by 16 // %edi = RHS ptr, already advanced by 16 // %eax = bit n set if bytes n differed or were 0 LFoundDiffOr0: bsf %eax,%edx // which byte differed or was 0? subl $16,%esi // point to start of vectors while we wait for bit scan subl $16,%edi movzb (%esi,%edx),%eax // get LHS byte movzb (%edi,%edx),%ecx // get RHS byte subl %ecx,%eax // compute difference (ie, return value) popl %edi popl %esi ret // Found a zero and/or difference in byte loop. // %eax = LHS byte // %ecx = RHS byte LExit0: subl %ecx,%eax // compute difference (ie, return value) LExit: // here with difference already in %eax popl %edi popl %esi ret