]> git.saurik.com Git - apple/libc.git/blob - i386/string/memcmp.s
Libc-391.5.22.tar.gz
[apple/libc.git] / i386 / string / memcmp.s
1 /*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24
25 // *************** ***********
26 // * M E M C M P * and * B C M P *
27 // *************** ***********
28 //
29 // int memcmp(const char *s1, const char *s2, size_t len);
30 // int bcmp(const char *s1, const char *s2, size_t len);
31 //
32 // bcmp need only return zero/nonzero, while memcmp returns the difference
33 // between the first differing bytes (as unsigned); both share this code.
34 //
35 // We optimize the compare by doing it with SSE. This introduces
36 // a complication: if we blindly did vector loads from both sides until
37 // finding a difference, we might get a spurious page fault by
38 // reading bytes past the difference. To avoid this, we never do a load
39 // that crosses a page boundary.
40
41 #define kShort 18 // lengths <= kShort are compared byte-by-byte, never with vectors (must be >16)
42
43 .text // assemble into the code section
44 .align 4 // align the entry point (exact boundary depends on the assembler's .align semantics)
45
46 .globl _memcmp // export both names; they label the same code below
47 .globl _bcmp
48
49 _memcmp: // int memcmp(const char *s1,const char *s2,size_t len);
50 _bcmp: // int bcmp(const char *s1,const char *s2,size_t len);
51 pushl %esi // save the two registers used as operand pointers;
52 pushl %edi // every exit path pops them again before ret
53 movl 12(%esp),%esi // %esi = s1 (LHS); args sit above 2 saved regs + return address
54 movl 16(%esp),%edi // %edi = s2 (RHS)
55 movl 20(%esp),%ecx // %ecx = len
56 cmpl $(kShort),%ecx // more than kShort bytes to compare?
57 ja LNotShort // yes (unsigned): take the vector path; else fall into LShort
58
59
60 // Byte-at-a-time compare, used when too little data remains for vectors.
61 // %esi = LHS ptr
62 // %edi = RHS ptr
63 // %ecx = bytes left to compare (may be 0 at LShort, nonzero at LShortLoop)
64
65 LShort:
66 xorl %eax,%eax // preset the "equal" result for the empty case
67 testl %ecx,%ecx // anything to compare?
68 jnz LShortLoop // yes, the loop overwrites %eax
69 jmp LExit // no, return the 0 already in %eax
70 .align 4,0x90 // align inner loops to optimize I-fetch
71 LShortLoop: // loop over bytes
72 movzbl (%esi),%eax // %eax = LHS byte, zero-extended
73 movzbl (%edi),%edx // %edx = RHS byte, zero-extended
74 incl %edi // advance both pointers
75 incl %esi
76 subl %edx,%eax // %eax = LHS - RHS, sets ZF when the bytes are equal
77 jnz LExit // mismatch: the difference is the return value
78 decl %ecx // one fewer byte to go
79 jnz LShortLoop
80 LExit: // return value is in %eax (0 if the loop ran out of bytes)
81 popl %edi
82 popl %esi
83 ret
84
85 LNotEqual: // from LLoopOverBytes: %eax = LHS byte, %edi still points at the RHS byte
86 movzbl (%edi),%edx // zero-extend the mismatching RHS byte
87 subl %edx,%eax // result = LHS - RHS, nonzero because the bytes differ
88 popl %edi // restore the registers saved at entry
89 popl %esi
90 ret
91
92
93 // Compare byte-by-byte up to the next page boundary, then fall through to LNotShort.
94 // %esi = LHS ptr
95 // %edi = RHS ptr
96 // %ecx = length remaining after end of loop (ie, already adjusted; > kShort)
97 // %edx = #bytes until next page (1..15)
98
99 .align 4,0x90 // align inner loops to optimize I-fetch
100 LLoopOverBytes:
101 movzb (%esi),%eax // get LHS byte (zero-extended, so %eax is ready for LNotEqual)
102 inc %esi
103 cmpb (%edi),%al // compare to RHS byte
104 jnz LNotEqual // done if not equal (%edi not yet advanced, still at the RHS byte)
105 inc %edi
106 dec %edx // more to go before the page boundary?
107 jnz LLoopOverBytes // yes; otherwise fall through into LNotShort
108
109
110 // Long enough to justify overhead of setting up vector compares. In order to
111 // avoid spurious page faults, we loop over:
112 //
113 // min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
114 //
115 // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte
116 // comparison until reaching the next page, then resume the vector comparison.
117 // %esi = LHS ptr
118 // %edi = RHS ptr
119 // %ecx = length (> kShort)
120
121 LNotShort:
122 movl %esi,%eax // copy ptrs
123 movl %edi,%edx
124 andl $4095,%eax // mask down to page offsets (4096-byte pages)
125 andl $4095,%edx
126 cmpl %eax,%edx // which is bigger?
127 cmova %edx,%eax // %eax = max(LHS offset, RHS offset), ie the operand nearer its page end
128 movl $4096,%edx
129 subl %eax,%edx // get #bytes to next page crossing (1..4096)
130 cmpl %ecx,%edx // will operand run out first?
131 cmova %ecx,%edx // get min(length remaining, bytes to page end)
132 movl %edx,%eax // keep the byte count in %eax; %edx becomes the chunk count
133 shrl $4,%edx // get #chunks till end of operand or page
134 jnz LLoopOverChunks // enter vector loop if there is at least one whole chunk
135
136 // Too near page end for vectors: %eax = #bytes to page end (1..15, since length > kShort).
137
138 subl %eax,%ecx // adjust length remaining
139 movl %eax,%edx // %edx <- #bytes to page end
140 cmpl $(kShort),%ecx // will there be enough after we cross page for vectors?
141 ja LLoopOverBytes // yes
142 addl %eax,%ecx // no, restore total length remaining
143 jmp LShortLoop // compare rest byte-by-byte (%ecx != 0)
144
145
146 // Loop over 16-byte chunks; none of these loads crosses a page boundary.
147 // %esi = LHS ptr
148 // %edi = RHS ptr
149 // %ecx = length remaining
150 // %edx = chunk count (nonzero on entry)
151
152 .align 4,0x90 // align inner loops to optimize I-fetch
153 LLoopOverChunks:
154 movdqu (%esi),%xmm0 // get LHS (16 bytes, unaligned load)
155 movdqu (%edi),%xmm1 // get RHS (16 bytes, unaligned load)
156 addl $16,%esi
157 pcmpeqb %xmm1,%xmm0 // compare LHS to RHS, bytewise
158 addl $16,%edi
159 pmovmskb %xmm0,%eax // collect comparison result bits (1 if equal), one bit per byte
160 subl $16,%ecx // adjust length remaining
161 xorl $0xFFFF,%eax // all equal? (complement the 16 bits: now 1 marks a differing byte)
162 jne LDifferent // no, we found differing bytes
163 dec %edx // more to go?
164 jnz LLoopOverChunks
165
166 cmpl $(kShort),%ecx // a lot more to compare?
167 jbe LShort // no (possibly nothing left; LShort handles 0)
168 jmp LNotShort // compute distance to next page crossing etc
169
170
171 // A 16-byte chunk compared unequal; return the difference of the first differing bytes.
172 // %esi = LHS ptr, already advanced by 16
173 // %edi = RHS ptr, already advanced by 16
174 // %eax = complemented compare vector (ie, 0 == equal)
175
176 LDifferent:
177 bsfl %eax,%edx // %edx = index (0..15) of the lowest set bit = first differing byte
178 leal -16(%esi),%esi // back both pointers up to byte 0 of the chunk
179 leal -16(%edi),%edi
180 movzbl (%esi,%edx),%eax // %eax = differing LHS byte, zero-extended
181 movzbl (%edi,%edx),%ecx // %ecx = differing RHS byte, zero-extended
182 subl %ecx,%eax // return value = LHS - RHS
183 popl %edi // restore the registers saved at entry
184 popl %esi
185 ret