/* * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this * file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_LICENSE_HEADER_END@ */ // *************** *********** // * M E M C M P * and * B C M P * // *************** *********** // // int memcmp(const char *s1, const char *s2, size_t len); // int bcmp(const char *s1, const char *s2, size_t len); // // Bcmp returns (+,0,-), whereas memcmp returns the true difference // between the first differing bytes, but we treat them identically. // // We optimize the compare by doing it with SSE. This introduces // a complication: if we blindly did vector loads from both sides until // finding a difference, we might get a spurious page fault by // reading bytes past the difference. To avoid this, we never do a load // that crosses a page boundary. #define kShort 18 // too short for vectors (must be >16) .text .align 4 .globl _memcmp .globl _bcmp _memcmp: // int memcmp(const char *s1,const char *s2,size_t len); _bcmp: // int bcmp(const char *s1,const char *s2,size_t len); pushl %esi pushl %edi movl 20(%esp),%ecx // get length movl 12(%esp),%esi // get LHS ptr movl 16(%esp),%edi // get RHS ptr cmpl $(kShort),%ecx // worth accelerating? ja LNotShort // yes // Too short to bother with parallel compares. Loop over bytes. // %esi = LHS ptr // %edi = RHS ptr // %ecx = length (<= kShort) LShort: testl %ecx,%ecx // 0-length? jnz LShortLoop // no xorl %eax,%eax // return 0 jmp LExit .align 4,0x90 // align inner loops to optimize I-fetch LShortLoop: // loop over bytes movzb (%esi),%eax // get LHS byte movzb (%edi),%edx // get RHS byte incl %esi incl %edi subl %edx,%eax // compare them jnz LExit // done if not equal decl %ecx // decrement length jnz LShortLoop LExit: // return value is in %eax popl %edi popl %esi ret LNotEqual: // here from LLoopOverBytes with LHS in eax movzb (%edi),%edx // get RHS byte subl %edx,%eax // generate return value (nonzero) popl %edi popl %esi ret // Loop over bytes until we reach end of a page. // %esi = LHS ptr // %edi = RHS ptr // %ecx = length remaining after end of loop (ie, already adjusted) // %edx = #bytes until next page (1..15) .align 4,0x90 // align inner loops to optimize I-fetch LLoopOverBytes: movzb (%esi),%eax // get LHS byte inc %esi cmpb (%edi),%al // compare to RHS byte jnz LNotEqual // done if not equal inc %edi dec %edx // more to go? jnz LLoopOverBytes // Long enough to justify overhead of setting up vector compares. In order to // avoid spurious page faults, we loop over: // // min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4 // // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte // comparison until reaching the next page, then resume the vector comparison. // %esi = LHS ptr // %edi = RHS ptr // %ecx = length (> kShort) LNotShort: movl %esi,%eax // copy ptrs movl %edi,%edx andl $4095,%eax // mask down to page offsets andl $4095,%edx cmpl %eax,%edx // which is bigger? cmova %edx,%eax // %eax = max(LHS offset, RHS offset); movl $4096,%edx subl %eax,%edx // get #bytes to next page crossing cmpl %ecx,%edx // will operand run out first? cmova %ecx,%edx // get min(length remaining, bytes to page end) movl %edx,%eax shrl $4,%edx // get #chunks till end of operand or page jnz LLoopOverChunks // enter vector loop // Too near page end for vectors. subl %eax,%ecx // adjust length remaining movl %eax,%edx // %edx <- #bytes to page end cmpl $(kShort),%ecx // will there be enough after we cross page for vectors? ja LLoopOverBytes // yes addl %eax,%ecx // no, restore total length remaining jmp LShortLoop // compare rest byte-by-byte (%ecx != 0) // Loop over 16-byte chunks. // %esi = LHS ptr // %edi = RHS ptr // %ecx = length remaining // %edx = chunk count .align 4,0x90 // align inner loops to optimize I-fetch LLoopOverChunks: movdqu (%esi),%xmm0 // get LHS movdqu (%edi),%xmm1 // get RHS addl $16,%esi pcmpeqb %xmm1,%xmm0 // compare LHS to RHS addl $16,%edi pmovmskb %xmm0,%eax // collect comparison result bits (1 if equal) subl $16,%ecx // adjust length remaining xorl $0xFFFF,%eax // all equal? jne LDifferent // no, we found differing bytes dec %edx // more to go? jnz LLoopOverChunks cmpl $(kShort),%ecx // a lot more to compare? jbe LShort // no jmp LNotShort // compute distance to next page crossing etc // Found a difference. // %esi = LHS ptr, already advanced by 16 // %edi = RHS ptr, already advanced by 16 // %eax = complemented compare vector (ie, 0 == equal) LDifferent: bsf %eax,%edx // which byte differed? subl $16,%esi // point to byte 0 while we wait for bit scan subl $16,%edi movzb (%esi,%edx),%eax // get LHS byte movzb (%edi,%edx),%ecx // get RHS byte subl %ecx,%eax // compute difference (ie, return value) popl %edi popl %esi ret