]> git.saurik.com Git - apple/libc.git/blob - i386/string/strcmp.s
fb1047f43a491b8c71a75c070cb37ed5da36c981
[apple/libc.git] / i386 / string / strcmp.s
1 /*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24
25 // ***************
26 // * S T R C M P *
27 // ***************
28 //
29 // int strcmp(const char *s1, const char *s2);
30 //
31 // We optimize the compare by doing it in parallel, using SSE. This introduces
32 // a complication: if we blindly did vector loads from both sides until
33 // finding a difference (or 0), we might get a spurious page fault by
34 // reading bytes past the difference. To avoid this, we never do a load
35 // that crosses a page boundary.
36
37 .text
38 .globl _strcmp
39
40 .align 4
41 _strcmp: // int strcmp(const char *s1,const char *s2);
42 pushl %esi
43 pushl %edi
44 movl 12(%esp),%esi // get LHS ptr
45 movl 16(%esp),%edi // get RHS ptr
46
47
48 // In order to avoid spurious page faults, we loop over:
49 //
50 // min( bytes_in_LHS_page, bytes_in_RHS_page) >> 4
51 //
52 // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte
53 // comparison until reaching the next page, then resume the vector comparison.
54 // %esi = LHS ptr
55 // %edi = RHS ptr
56
57 LNextChunk:
58 movl %esi,%eax // copy ptrs
59 movl %edi,%edx
60 andl $4095,%eax // mask down to page offsets
61 andl $4095,%edx
62 cmpl %eax,%edx // which is bigger?
63 cmova %edx,%eax // %eax = max(LHS offset, RHS offset);
64 movl $4096,%edx
65 subl %eax,%edx // get #bytes to next page crossing
66 movl %edx,%eax
67 shrl $4,%edx // get #chunks till end of operand or page
68 jnz LLoopOverChunks // enter vector loop
69 movl %eax,%edx // no chunks...
70 jmp LLoopOverBytes // ...so loop over bytes until page end
71
72
// Byte-at-a-time compare, used for the last few bytes before a page boundary.
//	%esi = LHS ptr
//	%edi = RHS ptr
//	%edx = byte count (nonzero on entry)

	.align	4,0x90			// align loop entry (0x90 = NOP fill) to help I-fetch
LLoopOverBytes:
	movzbl	(%esi),%eax		// %eax = LHS byte, zero-extended
	movzbl	(%edi),%ecx		// %ecx = RHS byte, zero-extended
	incl	%esi
	incl	%edi
	testl	%eax,%eax		// did the LHS string end?
	jz	LExit0			// yes: result is 0 - RHS byte
	subl	%ecx,%eax		// %eax = LHS byte - RHS byte
	jnz	LExit			// bytes differ: that difference is the result
	decl	%edx			// anything left before the page boundary?
	jnz	LLoopOverBytes

	jmp	LNextChunk		// boundary reached: recompute the chunk count
92
93
// Vector compare, 16 bytes per iteration.
//	%esi = LHS ptr
//	%edi = RHS ptr
//	%edx = chunk count (nonzero on entry)

	.align	4,0x90			// align loop entry (0x90 = NOP fill) to help I-fetch
LLoopOverChunks:
	movdqu	(%esi),%xmm1		// %xmm1 = 16 LHS bytes
	movdqu	(%edi),%xmm2		// %xmm2 = 16 RHS bytes
	pxor	%xmm0,%xmm0		// %xmm0 = 0; redone each pass because the compare below overwrites it
	pcmpeqb	%xmm1,%xmm2		// %xmm2 byte n = 0xFF where LHS byte n == RHS byte n
	pcmpeqb	%xmm1,%xmm0		// %xmm0 byte n = 0xFF where LHS byte n == 0
	pmovmskb %xmm2,%eax		// %eax bit n = 1 if byte n matched
	pmovmskb %xmm0,%ecx		// %ecx bit n = 1 if LHS byte n was 0
	addl	$16,%esi		// step both pointers past this chunk
	addl	$16,%edi
	xorl	$0xFFFF,%eax		// flip the 16 mask bits so 1 means "not equal"
	orl	%ecx,%eax		// bit n = 1 if byte n differs or LHS byte n is 0
	jnz	LFoundDiffOr0		// some byte differs or terminates the string
	decl	%edx			// more chunks before the page boundary?
	jnz	LLoopOverChunks

	jmp	LNextChunk		// out of whole chunks: recompute distance to boundary
117
118
// Exit paths.  All three entry points end in the same epilogue, so they are
// laid out to fall through into one another.
//
// LFoundDiffOr0: the vector loop found a zero and/or a difference.
//	%esi = LHS ptr, already advanced by 16
//	%edi = RHS ptr, already advanced by 16
//	%eax = bit n set if bytes n differed or were 0
//
// LExit0: the byte loop found a zero in the LHS.
//	%eax = LHS byte
//	%ecx = RHS byte
//
// LExit: the byte loop found a difference.
//	%eax = difference (ie, return value)

LFoundDiffOr0:
	bsfl	%eax,%edx		// %edx = index of the first byte that differed or was 0
	movzbl	-16(%esi,%edx),%eax	// LHS byte; the -16 undoes the loop's pointer advance
	movzbl	-16(%edi,%edx),%ecx	// RHS byte at the same index
LExit0:
	subl	%ecx,%eax		// return value = LHS byte - RHS byte
LExit:					// here with the return value already in %eax
	popl	%edi			// restore the registers saved at entry
	popl	%esi
	ret