]> git.saurik.com Git - apple/libc.git/blob - i386/string/strncmp.s
Libc-763.11.tar.gz
[apple/libc.git] / i386 / string / strncmp.s
/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
23
24
// ---------------------------------------------------------------------------
// int strncmp(const char *s1, const char *s2, size_t len);
//
// Syntax:  AT&T, 32-bit x86, preprocessed with cpp.
// Args:    all three arguments are read from the stack, above the return
//          address: s1, then s2, then len.
// Result:  %eax = 0 when the first len bytes match or both strings end
//          together; otherwise (unsigned byte of s1) - (unsigned byte of s2)
//          at the first position where they differ.
// Saved:   %esi, %edi and %ebx are pushed on entry and popped on every exit.
// Scratch: %eax, %ecx, %edx, %xmm0, %xmm1, %xmm2 and the flags.
// Needs:   cmov plus the 128-bit integer SSE forms used below.
//
// Strategy: compare 16 bytes per iteration with unaligned vector loads.
// A vector load that ran past the terminating NUL (or past the first
// difference) into an unmapped page would fault even though a byte-wise
// strncmp would never have touched that page.  The routine therefore never
// issues a vector load that straddles a page boundary of either operand:
// it counts how many whole vectors fit before the nearer page end, runs
// that many, walks byte by byte up to the boundary, and then plans again.
// ---------------------------------------------------------------------------

// Lengths up to kShort are handled entirely by the byte loop.  The value must
// stay above 16 so that, on the planning path, a span shorter than one vector
// can only be caused by a nearby page end and never by the length itself.
#define kShort          20

// Page geometry assumed by the boundary arithmetic (kPageMask = kPageSize-1).
#define kPageSize       4096
#define kPageMask       4095

// Vector geometry: bytes per compare, log2 of that, and a mask with one bit
// per byte lane as produced by pmovmskb.
#define kVecBytes       16
#define kVecShift       4
#define kVecLanes       0xFFFF

        .text
        .globl  _strncmp

        .align  4
_strncmp:
        pushl   %esi                    // preserve the registers we use
        pushl   %edi
        pushl   %ebx
        movl    16(%esp),%esi           // %esi = s1 (LHS)
        movl    20(%esp),%edi           // %edi = s2 (RHS)
        movl    24(%esp),%ecx           // %ecx = len
        cmpl    $(kShort),%ecx          // long enough to pay for vector setup?
        ja      LPlan                   // yes


// Byte-wise compare of whatever is left.
//      %esi = LHS ptr
//      %edi = RHS ptr
//      %ecx = bytes remaining (may be 0 at LBytewise, nonzero at LByteLoop)

LBytewise:
        testl   %ecx,%ecx               // nothing left to compare?
        jz      LEqual                  // then the operands are equal
        jmp     LByteLoop               // hop over the alignment padding

        .align  4,0x90                  // keep the inner loop fetch-aligned
LByteLoop:
        movzbl  (%esi),%eax             // %eax = LHS byte, zero-extended
        movzbl  (%edi),%ebx             // %ebx = RHS byte, zero-extended
        incl    %esi
        incl    %edi
        testl   %eax,%eax               // LHS string ended?
        jz      LLhsNul                 // yes, result is decided by RHS byte
        subl    %ebx,%eax               // %eax = LHS - RHS
        jnz     LDone                   // bytes differ, %eax is the answer
        decl    %ecx                    // one fewer byte to go
        jnz     LByteLoop

LEqual:
        xorl    %eax,%eax               // no difference found: return 0
LDone:                                  // %eax already holds the result
        popl    %ebx
        popl    %edi
        popl    %esi
        ret

LLhsNul:                                // %eax = 0 (LHS byte), %ebx = RHS byte
        subl    %ebx,%eax               // 0 if RHS ended too, else negative
        popl    %ebx
        popl    %edi
        popl    %esi
        ret


// Byte-wise walk up to the next page boundary, then plan vectors again.
//      %esi = LHS ptr
//      %edi = RHS ptr
//      %ecx = bytes that will remain once the boundary is reached (> kShort)
//      %edx = bytes left before the boundary (1..15)

        .align  4,0x90                  // keep the inner loop fetch-aligned
LToPageEnd:
        movzbl  (%esi),%eax             // %eax = LHS byte, zero-extended
        movzbl  (%edi),%ebx             // %ebx = RHS byte, zero-extended
        incl    %esi
        incl    %edi
        testl   %eax,%eax               // LHS string ended?
        jz      LLhsNul
        subl    %ebx,%eax               // %eax = LHS - RHS
        jnz     LDone
        decl    %edx                    // boundary reached yet?
        jnz     LToPageEnd
                                        // fall through into LPlan


// Decide how many whole vectors can be compared before either operand
// reaches the end of its page or the length runs out:
//
//      span   = min(len, kPageSize - max(LHS page offset, RHS page offset))
//      chunks = span >> kVecShift
//
//      %esi = LHS ptr
//      %edi = RHS ptr
//      %ecx = bytes remaining (> kShort)

LPlan:
        movl    %esi,%edx
        movl    %edi,%eax
        andl    $(kPageMask),%edx       // %edx = LHS offset within its page
        andl    $(kPageMask),%eax       // %eax = RHS offset within its page
        cmpl    %edx,%eax
        cmovb   %edx,%eax               // %eax = larger of the two offsets
        movl    $(kPageSize),%edx
        subl    %eax,%edx               // %edx = bytes to the nearer page end
        cmpl    %edx,%ecx
        cmovb   %ecx,%edx               // %edx = span = min(len, that distance)
        movl    %edx,%eax               // keep the byte count of the span
        shrl    $(kVecShift),%edx       // %edx = whole vectors in the span
        jnz     LVecLoop                // at least one: go vector

        // Fewer than kVecBytes bytes before the page end.
        movl    %eax,%edx               // %edx = bytes up to the boundary
        subl    %eax,%ecx               // %ecx = length left after the boundary
        cmpl    $(kShort),%ecx          // worth resuming vectors afterwards?
        ja      LToPageEnd              // yes: walk to the boundary, replan
        addl    %edx,%ecx               // no: put the full length back
        jmp     LByteLoop               // and finish byte-wise (%ecx != 0)


// Vector compare, kVecBytes bytes per iteration.
//      %esi = LHS ptr
//      %edi = RHS ptr
//      %ecx = bytes remaining
//      %edx = vectors to compare before replanning (nonzero)

        .align  4,0x90                  // keep the inner loop fetch-aligned
LVecLoop:
        movdqu  (%esi),%xmm1            // %xmm1 = 16 LHS bytes
        movdqu  (%edi),%xmm2            // %xmm2 = 16 RHS bytes
        pxor    %xmm0,%xmm0             // %xmm0 = zeros, for the NUL test
        addl    $(kVecBytes),%esi
        addl    $(kVecBytes),%edi
        pcmpeqb %xmm1,%xmm2             // lanes where LHS == RHS become 0xFF
        pcmpeqb %xmm1,%xmm0             // lanes where LHS == 0 become 0xFF
        subl    $(kVecBytes),%ecx       // account for the bytes just compared
        pmovmskb %xmm2,%eax             // %eax bit n = 1 if lane n matched
        pmovmskb %xmm0,%ebx             // %ebx bit n = 1 if LHS lane n is NUL
        xorl    $(kVecLanes),%eax       // flip: bit n = 1 if lane n differed
        orl     %ebx,%eax               // any difference or any NUL?
        jnz     LVecHit                 // yes: resolve within this vector
        decl    %edx                    // more vectors in this span?
        jnz     LVecLoop

        cmpl    $(kShort),%ecx          // enough left to plan more vectors?
        ja      LPlan                   // yes
        jmp     LBytewise               // no: finish byte-wise (may be 0 left)


// A vector contained a differing byte and/or a NUL in the LHS.
//      %esi = LHS ptr, already advanced past the vector
//      %edi = RHS ptr, already advanced past the vector
//      %eax = bit n set if byte n differed or LHS byte n was NUL

LVecHit:
        bsfl    %eax,%edx               // %edx = index of the first such byte
        subl    $(kVecBytes),%esi       // back up to the start of the vector
        subl    $(kVecBytes),%edi
        movzbl  (%esi,%edx),%eax        // %eax = LHS byte at that index
        movzbl  (%edi,%edx),%ecx        // %ecx = RHS byte at that index
        subl    %ecx,%eax               // return value = LHS - RHS
        popl    %ebx
        popl    %edi
        popl    %esi
        ret