]>
Commit | Line | Data |
---|---|---|
eb1cde05 A |
1 | /* |
2 | * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | ||
25 | // ***************** | |
26 | // * S T R N C M P * | |
27 | // ***************** | |
28 | // | |
29 | // int strncmp(const char *s1, const char *s2, size_t len); | |
30 | // | |
31 | // We optimize the compare by doing it vector parallel. This introduces | |
32 | // a complication: if we blindly did vector loads from both sides until | |
33 | // finding a difference (or 0), we might get a spurious page fault by | |
34 | // reading bytes past the difference. To avoid this, we never do a load | |
35 | // that crosses a page boundary. | |
36 | ||
37 | #define kShort 20 // too short for vectors (must be >16) | |
38 | ||
39 | .text | |
40 | .globl _strncmp | |
41 | ||
42 | .align 4 | |
43 | _strncmp: // int strncmp(const char *s1, const char *s2, size_t len); | |
44 | pushl %esi | |
45 | pushl %edi | |
46 | movl 20(%esp),%ecx // get length | |
47 | movl 12(%esp),%esi // get LHS ptr | |
48 | movl 16(%esp),%edi // get RHS ptr | |
49 | push %ebx | |
50 | cmpl $(kShort),%ecx // worth accelerating? | |
51 | ja LNotShort // yes | |
52 | ||
53 | ||
54 | // Too short to bother with parallel compares. Loop over bytes. | |
55 | // %esi = LHS ptr | |
56 | // %edi = RHS ptr | |
57 | // %ecx = length (<= kShort) | |
58 | ||
59 | LShort: | |
60 | testl %ecx,%ecx // 0-length? | |
61 | jnz LShortLoop // no | |
62 | jmp LReturn0 // yes, return 0 | |
63 | .align 4,0x90 // align inner loops to optimize I-fetch | |
64 | LShortLoop: // loop over bytes | |
65 | movzb (%esi),%eax // get LHS byte | |
66 | movzb (%edi),%ebx // get RHS byte | |
67 | incl %esi | |
68 | incl %edi | |
69 | testl %eax,%eax // LHS==0 ? | |
70 | jz LNotEqual // yes, this terminates comparison | |
71 | subl %ebx,%eax // compare them | |
72 | jnz LExit // done if not equal | |
73 | decl %ecx // decrement length | |
74 | jnz LShortLoop | |
75 | LReturn0: | |
76 | xorl %eax,%eax // all bytes equal, so return 0 | |
77 | LExit: // return value is in %eax | |
78 | popl %ebx | |
79 | popl %edi | |
80 | popl %esi | |
81 | ret | |
82 | ||
83 | LNotEqual: // LHS in eax, RHS in ebx | |
84 | subl %ebx,%eax // generate return value (nonzero) | |
85 | popl %ebx | |
86 | popl %edi | |
87 | popl %esi | |
88 | ret | |
89 | ||
90 | ||
91 | // Loop over bytes until we reach end of a page. | |
92 | // %esi = LHS ptr | |
93 | // %edi = RHS ptr | |
94 | // %ecx = length remaining after end of loop (ie, already adjusted) | |
95 | // %edx = #bytes until next page (1..15) | |
96 | ||
97 | .align 4,0x90 // align inner loops to optimize I-fetch | |
98 | LLoopOverBytes: | |
99 | movzb (%esi),%eax // get LHS byte | |
100 | movzb (%edi),%ebx // get RHS byte | |
101 | inc %esi | |
102 | inc %edi | |
103 | testl %eax,%eax // LHS==0 ? | |
104 | jz LNotEqual // yes, this terminates comparison | |
105 | subl %ebx,%eax // compare them | |
106 | jnz LExit // done if not equal | |
107 | dec %edx // more to go? | |
108 | jnz LLoopOverBytes | |
109 | ||
110 | ||
111 | // Long enough to justify overhead of setting up vector compares. In order to | |
112 | // avoid spurious page faults, we loop over: | |
113 | // | |
114 | // min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4 | |
115 | // | |
116 | // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte | |
117 | // comparison until reaching the next page, then resume the vector comparison. | |
118 | // %esi = LHS ptr | |
119 | // %edi = RHS ptr | |
120 | // %ecx = length (> kShort) | |
121 | ||
122 | LNotShort: | |
123 | movl %esi,%eax // copy ptrs | |
124 | movl %edi,%edx | |
125 | andl $4095,%eax // mask down to page offsets | |
126 | andl $4095,%edx | |
127 | cmpl %eax,%edx // which is bigger? | |
128 | cmova %edx,%eax // %eax = max(LHS offset, RHS offset); | |
129 | movl $4096,%edx | |
130 | subl %eax,%edx // get #bytes to next page crossing | |
131 | cmpl %ecx,%edx // will operand run out first? | |
132 | cmova %ecx,%edx // get min(length remaining, bytes to page end) | |
133 | movl %edx,%eax | |
134 | shrl $4,%edx // get #chunks till end of operand or page | |
135 | jnz LLoopOverChunks // enter vector loop | |
136 | ||
137 | // Too near page end for vectors. | |
138 | ||
139 | subl %eax,%ecx // adjust length remaining | |
140 | movl %eax,%edx // %edx <- #bytes to page end | |
141 | cmpl $(kShort),%ecx // will there be enough after we cross page for vectors? | |
142 | ja LLoopOverBytes // yes | |
143 | addl %eax,%ecx // no, restore total length remaining | |
144 | jmp LShortLoop // compare rest byte-by-byte (%ecx != 0) | |
145 | ||
146 | ||
147 | // Loop over 16-byte chunks. | |
148 | // %esi = LHS ptr | |
149 | // %edi = RHS ptr | |
150 | // %ecx = length remaining | |
151 | // %edx = chunk count | |
152 | ||
153 | .align 4,0x90 // align inner loops to optimize I-fetch | |
154 | LLoopOverChunks: | |
155 | movdqu (%esi),%xmm1 // get LHS | |
156 | movdqu (%edi),%xmm2 // get RHS | |
157 | pxor %xmm0,%xmm0 // get some 0s in the shadow of the loads | |
158 | addl $16,%esi | |
159 | pcmpeqb %xmm1,%xmm2 // compare LHS to RHS | |
160 | pcmpeqb %xmm1,%xmm0 // compare LHS to 0s | |
161 | addl $16,%edi | |
162 | pmovmskb %xmm2,%eax // get result mask for comparison of LHS and RHS | |
163 | pmovmskb %xmm0,%ebx // get result mask for 0 check | |
164 | subl $16,%ecx // decrement length remaining | |
165 | xorl $0xFFFF,%eax // complement compare mask so 1 means "not equal" | |
166 | orl %ebx,%eax // combine the masks and check for 1-bits | |
167 | jnz LFoundDiffOr0 // we found differing bytes or a 0-byte | |
168 | dec %edx // more to go? | |
169 | jnz LLoopOverChunks // yes | |
170 | ||
171 | cmpl $(kShort),%ecx // a lot more to compare? | |
172 | jbe LShort // no | |
173 | jmp LNotShort // compute distance to next page crossing etc | |
174 | ||
175 | ||
176 | // Found a zero and/or a difference in vector compare. | |
177 | // %esi = LHS ptr, already advanced by 16 | |
178 | // %edi = RHS ptr, already advanced by 16 | |
179 | // %eax = bit n set if bytes n differed or were 0 | |
180 | ||
181 | LFoundDiffOr0: | |
182 | bsf %eax,%edx // which byte differed or was 0? | |
183 | subl $16,%esi // point to start of vectors while we wait for bit scan | |
184 | subl $16,%edi | |
185 | movzb (%esi,%edx),%eax // get LHS byte | |
186 | movzb (%edi,%edx),%ecx // get RHS byte | |
187 | popl %ebx | |
188 | popl %edi | |
189 | subl %ecx,%eax // compute difference (ie, return value) | |
190 | popl %esi | |
191 | ret |