]>
Commit | Line | Data |
---|---|---|
eb1cde05 A |
1 | /* |
2 | * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | ||
25 | // *************** *********** | |
26 | // * M E M C M P * and * B C M P * | |
27 | // *************** *********** | |
28 | // | |
29 | // int memcmp(const char *s1, const char *s2, size_t len); | |
30 | // int bcmp(const char *s1, const char *s2, size_t len); | |
31 | // | |
32 | // Bcmp returns (+,0,-), whereas memcmp returns the true difference | |
33 | // between the first differing bytes, but we treat them identically. | |
34 | // | |
35 | // We optimize the compare by doing it with SSE. This introduces | |
36 | // a complication: if we blindly did vector loads from both sides until | |
37 | // finding a difference, we might get a spurious page fault by | |
38 | // reading bytes past the difference. To avoid this, we never do a load | |
39 | // that crosses a page boundary. | |
40 | ||
41 | #define kShort 18 // too short for vectors (must be >16) | |
42 | ||
43 | .text | |
44 | .align 4 | |
45 | ||
46 | .globl _memcmp | |
47 | .globl _bcmp | |
48 | ||
49 | _memcmp: // int memcmp(const char *s1,const char *s2,size_t len); | |
50 | _bcmp: // int bcmp(const char *s1,const char *s2,size_t len); | |
51 | pushl %esi | |
52 | pushl %edi | |
53 | movl 20(%esp),%ecx // get length | |
54 | movl 12(%esp),%esi // get LHS ptr | |
55 | movl 16(%esp),%edi // get RHS ptr | |
56 | cmpl $(kShort),%ecx // worth accelerating? | |
57 | ja LNotShort // yes | |
58 | ||
59 | ||
60 | // Too short to bother with parallel compares. Loop over bytes. | |
61 | // %esi = LHS ptr | |
62 | // %edi = RHS ptr | |
63 | // %ecx = length (<= kShort) | |
64 | ||
65 | LShort: | |
66 | testl %ecx,%ecx // 0-length? | |
67 | jnz LShortLoop // no | |
68 | xorl %eax,%eax // return 0 | |
69 | jmp LExit | |
70 | .align 4,0x90 // align inner loops to optimize I-fetch | |
71 | LShortLoop: // loop over bytes | |
72 | movzb (%esi),%eax // get LHS byte | |
73 | movzb (%edi),%edx // get RHS byte | |
74 | incl %esi | |
75 | incl %edi | |
76 | subl %edx,%eax // compare them | |
77 | jnz LExit // done if not equal | |
78 | decl %ecx // decrement length | |
79 | jnz LShortLoop | |
80 | LExit: // return value is in %eax | |
81 | popl %edi | |
82 | popl %esi | |
83 | ret | |
84 | ||
85 | LNotEqual: // here from LLoopOverBytes with LHS in eax | |
86 | movzb (%edi),%edx // get RHS byte | |
87 | subl %edx,%eax // generate return value (nonzero) | |
88 | popl %edi | |
89 | popl %esi | |
90 | ret | |
91 | ||
92 | ||
93 | // Loop over bytes until we reach end of a page. | |
94 | // %esi = LHS ptr | |
95 | // %edi = RHS ptr | |
96 | // %ecx = length remaining after end of loop (ie, already adjusted) | |
97 | // %edx = #bytes until next page (1..15) | |
98 | ||
99 | .align 4,0x90 // align inner loops to optimize I-fetch | |
100 | LLoopOverBytes: | |
101 | movzb (%esi),%eax // get LHS byte | |
102 | inc %esi | |
103 | cmpb (%edi),%al // compare to RHS byte | |
104 | jnz LNotEqual // done if not equal | |
105 | inc %edi | |
106 | dec %edx // more to go? | |
107 | jnz LLoopOverBytes | |
108 | ||
109 | ||
110 | // Long enough to justify overhead of setting up vector compares. In order to | |
111 | // avoid spurious page faults, we loop over: | |
112 | // | |
113 | // min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4 | |
114 | // | |
115 | // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte | |
116 | // comparison until reaching the next page, then resume the vector comparison. | |
117 | // %esi = LHS ptr | |
118 | // %edi = RHS ptr | |
119 | // %ecx = length (> kShort) | |
120 | ||
121 | LNotShort: | |
122 | movl %esi,%eax // copy ptrs | |
123 | movl %edi,%edx | |
124 | andl $4095,%eax // mask down to page offsets | |
125 | andl $4095,%edx | |
126 | cmpl %eax,%edx // which is bigger? | |
127 | cmova %edx,%eax // %eax = max(LHS offset, RHS offset); | |
128 | movl $4096,%edx | |
129 | subl %eax,%edx // get #bytes to next page crossing | |
130 | cmpl %ecx,%edx // will operand run out first? | |
131 | cmova %ecx,%edx // get min(length remaining, bytes to page end) | |
132 | movl %edx,%eax | |
133 | shrl $4,%edx // get #chunks till end of operand or page | |
134 | jnz LLoopOverChunks // enter vector loop | |
135 | ||
136 | // Too near page end for vectors. | |
137 | ||
138 | subl %eax,%ecx // adjust length remaining | |
139 | movl %eax,%edx // %edx <- #bytes to page end | |
140 | cmpl $(kShort),%ecx // will there be enough after we cross page for vectors? | |
141 | ja LLoopOverBytes // yes | |
142 | addl %eax,%ecx // no, restore total length remaining | |
143 | jmp LShortLoop // compare rest byte-by-byte (%ecx != 0) | |
144 | ||
145 | ||
146 | // Loop over 16-byte chunks. | |
147 | // %esi = LHS ptr | |
148 | // %edi = RHS ptr | |
149 | // %ecx = length remaining | |
150 | // %edx = chunk count | |
151 | ||
152 | .align 4,0x90 // align inner loops to optimize I-fetch | |
153 | LLoopOverChunks: | |
154 | movdqu (%esi),%xmm0 // get LHS | |
155 | movdqu (%edi),%xmm1 // get RHS | |
156 | addl $16,%esi | |
157 | pcmpeqb %xmm1,%xmm0 // compare LHS to RHS | |
158 | addl $16,%edi | |
159 | pmovmskb %xmm0,%eax // collect comparison result bits (1 if equal) | |
160 | subl $16,%ecx // adjust length remaining | |
161 | xorl $0xFFFF,%eax // all equal? | |
162 | jne LDifferent // no, we found differing bytes | |
163 | dec %edx // more to go? | |
164 | jnz LLoopOverChunks | |
165 | ||
166 | cmpl $(kShort),%ecx // a lot more to compare? | |
167 | jbe LShort // no | |
168 | jmp LNotShort // compute distance to next page crossing etc | |
169 | ||
170 | ||
171 | // Found a difference. | |
172 | // %esi = LHS ptr, already advanced by 16 | |
173 | // %edi = RHS ptr, already advanced by 16 | |
174 | // %eax = complemented compare vector (ie, 0 == equal) | |
175 | ||
176 | LDifferent: | |
177 | bsf %eax,%edx // which byte differed? | |
178 | subl $16,%esi // point to byte 0 while we wait for bit scan | |
179 | subl $16,%edi | |
180 | movzb (%esi,%edx),%eax // get LHS byte | |
181 | movzb (%edi,%edx),%ecx // get RHS byte | |
182 | subl %ecx,%eax // compute difference (ie, return value) | |
183 | popl %edi | |
184 | popl %esi | |
185 | ret |