X-Git-Url: https://git.saurik.com/apple/libc.git/blobdiff_plain/51631861ddb16afcfcf748cee26c14481549065e..6990d062918770ee2431fb3310826c5aefbffccd:/i386/string/strcmp.s

diff --git a/i386/string/strcmp.s b/i386/string/strcmp.s
index a21cea6..fb1047f 100644
--- a/i386/string/strcmp.s
+++ b/i386/string/strcmp.s
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -20,72 +20,126 @@
  *
  * @APPLE_LICENSE_HEADER_END@
  */
-.text
-.globl _strcmp
-_strcmp:
-	movl	0x04(%esp),%eax
-	movl	0x08(%esp),%edx
-	jmp	L2			/* Jump into the loop! */
-
-	.align	2,0x90
-L1:	incl	%eax
-	incl	%edx
-L2:	movb	(%eax),%cl
-	testb	%cl,%cl		/* null terminator??? */
-	jz	L3
-	cmpb	%cl,(%edx)	/* chars match??? */
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	je	L1
-	.align 2, 0x90
-L3:	movzbl	(%eax),%eax	/* unsigned comparison */
-	movzbl	(%edx),%edx
-	subl	%edx,%eax
-	ret
+
+
+// ***************
+// * S T R C M P *
+// ***************
+//
+// int strcmp(const char *s1, const char *s2);
+//
+// Returns (LHS byte - RHS byte), both zero-extended, at the first position where
+// the bytes differ or are 0.  We compare 16 bytes at a time, using SSE.  Blind
+// vector loads from both sides could read past the difference (or 0) into the
+// next page and take a spurious page fault.  To avoid this, we never do a load
+// that crosses a page boundary.
+
+        .text
+        .globl _strcmp
+
+        .align 	4
+_strcmp:				// args are on the stack; result is returned in %eax
+	pushl	%esi			// %esi and %edi hold the string ptrs; restored at exit
+	pushl	%edi
+	movl	12(%esp),%esi		// get LHS ptr (s1; offset accounts for the two pushes)
+	movl	16(%esp),%edi		// get RHS ptr (s2)
+
+
+// In order to avoid spurious page faults, we loop over:
+//
+//	min( bytes_in_LHS_page, bytes_in_RHS_page) >> 4
+//
+// 16-byte chunks.  When fewer than 16 bytes remain before a page end, we revert
+// to a byte-by-byte comparison up to that page end, then recompute and resume.
+//	%esi = LHS ptr
+//	%edi = RHS ptr
+
+LNextChunk:
+	movl	%esi,%eax		// copy ptrs
+	movl	%edi,%edx
+	andl	$4095,%eax		// mask down to page offsets
+	andl	$4095,%edx
+	cmpl	%eax,%edx		// which is bigger? (unsigned compare of offsets)
+	cmova	%edx,%eax		// %eax = max(LHS offset, RHS offset)
+	movl	$4096,%edx
+	subl	%eax,%edx		// get #bytes until the nearer page end (1..4096)
+	movl	%edx,%eax		// keep the byte count in case there are no whole chunks
+	shrl	$4,%edx			// get #chunks till next page end; sets ZF if none
+	jnz	LLoopOverChunks		// at least one whole chunk: enter vector loop
+	movl	%eax,%edx		// no chunks: %edx = byte count (1..15)...
+	jmp	LLoopOverBytes		// ...so loop over bytes until page end
+
+
+// Loop over bytes.
+//	%esi = LHS ptr
+//	%edi = RHS ptr
+//	%edx = byte count (nonzero)
+
+	.align	4,0x90			// align inner loops to optimize I-fetch
+LLoopOverBytes:
+	movzb	(%esi),%eax		// get LHS byte, zero-extended
+	movzb	(%edi),%ecx		// get RHS byte, zero-extended
+	inc	%esi
+	inc	%edi
+	testl	%eax,%eax		// LHS byte 0?
+	jz	LExit0			// yes, we're done: result is 0 - RHS byte
+	subl	%ecx,%eax		// compare them; %eax = LHS - RHS
+	jnz	LExit			// done if not equal, difference is the result
+	dec	%edx			// more to go?
+	jnz	LLoopOverBytes
+
+	jmp	LNextChunk		// we've come to end of page
+
+
+// Loop over 16-byte chunks.
+//	%esi = LHS ptr
+//	%edi = RHS ptr
+//	%edx = chunk count (nonzero)
+
+	.align	4,0x90			// align inner loops to optimize I-fetch
+LLoopOverChunks:
+	movdqu	(%esi),%xmm1		// get LHS (unaligned 16-byte load)
+	movdqu	(%edi),%xmm2		// get RHS (unaligned 16-byte load)
+	pxor	%xmm0,%xmm0		// rezero %xmm0 in the shadow of the loads (pcmpeqb below overwrites it)
+	addl	$16,%esi
+	pcmpeqb	%xmm1,%xmm2		// compare LHS to RHS: 0xFF in each equal byte
+	pcmpeqb	%xmm1,%xmm0		// compare LHS to 0s: 0xFF in each 0 byte
+	addl	$16,%edi
+	pmovmskb %xmm2,%eax		// get result mask for comparison of LHS and RHS
+	pmovmskb %xmm0,%ecx		// get result mask for 0 check
+	xorl	$0xFFFF,%eax		// complement compare mask so 1 means "not equal"
+	orl	%ecx,%eax		// combine the masks and check for 1-bits
+	jnz	LFoundDiffOr0		// we found differing bytes or a 0-byte
+	dec	%edx			// more to go?
+	jnz	LLoopOverChunks
+
+	jmp	LNextChunk		// compare up to next page boundary
+
+
+// Found a zero and/or a difference in vector compare.
+//	%esi = LHS ptr, already advanced by 16
+//	%edi = RHS ptr, already advanced by 16
+//	%eax = bit n set if bytes n differed or were 0
+
+LFoundDiffOr0:
+	bsf	%eax,%edx		// lowest set bit = first byte that differed or was 0
+	subl	$16,%esi		// point to start of vectors while we wait for bit scan
+	subl	$16,%edi
+	movzb	(%esi,%edx),%eax	// get LHS byte, zero-extended
+	movzb	(%edi,%edx),%ecx	// get RHS byte, zero-extended
+	subl	%ecx,%eax		// compute difference (ie, return value)
+	popl	%edi
+	popl	%esi
+	ret
+
+
+// Found a zero and/or difference in byte loop.
+//	%eax = LHS byte at LExit0 (always 0 there); difference at LExit
+//	%ecx = RHS byte
+
+LExit0:
+	subl	%ecx,%eax		// compute difference (ie, return value)
+LExit:					// here with difference already in %eax
+	popl	%edi
+	popl	%esi
+	ret