]> git.saurik.com Git - apple/libc.git/blob - x86_64/string/strncmp.s
Libc-498.tar.gz
[apple/libc.git] / x86_64 / string / strncmp.s
1 /*
2 * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24
25 // *****************
26 // * S T R N C M P *
27 // *****************
28 //
29 // int strncmp(const char *s1, const char *s2, size_t len);
30 //
31 // We optimize the compare by doing it vector parallel. This introduces
32 // a complication: if we blindly did vector loads from both sides until
33 // finding a difference (or 0), we might get a spurious page fault by
34 // reading bytes past the difference. To avoid this, we never do a load
35 // that crosses a page boundary.
36
37 #define kShort 20 // too short for vectors (must be >16)
38
39 .text
40 .globl _strncmp
41
42 .align 4
43 _strncmp: // int strncmp(const char *s1, const char *s2, size_t len);
44 cmpq $(kShort),%rdx // worth accelerating?
45 ja LNotShort // yes
46
47
48 // Too short to bother with parallel compares. Loop over bytes.
49 // %rdi = LHS ptr
50 // %rsi = RHS ptr
51 // %edx = length (<= kShort)
52
53 LShort:
54 testl %edx,%edx // 0-length?
55 jnz LShortLoop // no
56 jmp LReturn0 // yes, return 0
57 .align 4,0x90 // align inner loops to optimize I-fetch
58 LShortLoop: // loop over bytes
59 movzb (%rdi),%eax // get LHS byte
60 movzb (%rsi),%ecx // get RHS byte
61 incq %rdi
62 incq %rsi
63 testl %eax,%eax // LHS==0 ?
64 jz LNotEqual // yes, this terminates comparison
65 cmpl %ecx,%eax // compare them (sub won't fuse, but cmp will)
66 jnz LNotEqual // done if not equal
67 decl %edx // decrement length
68 jnz LShortLoop
69 LReturn0:
70 xorl %eax,%eax // all bytes equal, so return 0
71 ret
72
73 LNotEqual: // LHS in eax, RHS in ecx
74 subl %ecx,%eax // generate return value (nonzero)
75 ret
76
77
78 // Loop over bytes until we reach end of a page.
79 // %rdi = LHS ptr
80 // %rsi = RHS ptr
81 // %rdx = length remaining after end of loop (ie, already adjusted)
82 // %r8d = #bytes until next page (1..15)
83
84 .align 4,0x90 // align inner loops to optimize I-fetch
85 LLoopOverBytes:
86 movzb (%rdi),%eax // get LHS byte
87 movzb (%rsi),%ecx // get RHS byte
88 incq %rdi
89 incq %rsi
90 testl %eax,%eax // LHS==0 ?
91 jz LNotEqual // yes, this terminates comparison
92 cmpl %ecx,%eax // compare them (sub won't fuse, but cmp will)
93 jnz LNotEqual // done if not equal
94 decl %r8d // more to go?
95 jnz LLoopOverBytes
96
97
98 // Long enough to justify overhead of setting up vector compares. In order to
99 // avoid spurious page faults, we loop over:
100 //
101 // min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
102 //
103 // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte
104 // comparison until reaching the next page, then resume the vector comparison.
105 // %rdi = LHS ptr
106 // %rsi = RHS ptr
107 // %rdx = length (> kShort)
108
109 LNotShort:
110 movl %edi,%eax // copy ptrs
111 movl %esi,%r8d
112 andl $4095,%eax // mask down to page offsets
113 andl $4095,%r8d
114 cmpl %eax,%r8d // which is bigger?
115 cmova %r8d,%eax // %eax = max(LHS offset, RHS offset);
116 movl $4096,%r8d
117 subl %eax,%r8d // get #bytes to next page crossing
118 cmpq %rdx,%r8 // will operand run out first?
119 cmova %rdx,%r8 // get min(length remaining, bytes to page end)
120 movl %r8d,%eax // copy #bytes
121 shrl $4,%r8d // get #chunks till end of operand or page
122 testl %r8d,%r8d // test %r8d for 0 without partial flag update stall
123 jnz LLoopOverChunks // enter vector loop
124
125 // Too near page end for vectors.
126
127 subq %rax,%rdx // adjust length remaining
128 movl %eax,%r8d // %r8d <- #bytes to page end
129 cmpq $(kShort),%rdx // will there be enough after we cross page for vectors?
130 ja LLoopOverBytes // yes
131 addq %rax,%rdx // no, restore total length remaining
132 jmp LShortLoop // compare rest byte-by-byte (%rdx != 0)
133
134
135 // Loop over 16-byte chunks.
136 // %rdi = LHS ptr
137 // %rsi = RHS ptr
138 // %rdx = length remaining
139 // %r8d = chunk count
140
141 .align 4,0x90 // align inner loops to optimize I-fetch
142 LLoopOverChunks:
143 movdqu (%rdi),%xmm1 // get LHS
144 movdqu (%rsi),%xmm2 // get RHS
145 pxor %xmm0,%xmm0 // get some 0s in the shadow of the loads
146 addq $16,%rdi
147 pcmpeqb %xmm1,%xmm2 // compare LHS to RHS
148 pcmpeqb %xmm1,%xmm0 // compare LHS to 0s
149 addq $16,%rsi
150 pmovmskb %xmm2,%eax // get result mask for comparison of LHS and RHS
151 pmovmskb %xmm0,%ecx // get result mask for 0 check
152 subq $16,%rdx // decrement length remaining
153 xorl $0xFFFF,%eax // complement compare mask so 1 means "not equal"
154 orl %ecx,%eax // combine the masks and check for 1-bits
155 jnz LFoundDiffOr0 // we found differing bytes or a 0-byte
156 decl %r8d // more to go?
157 jnz LLoopOverChunks // yes
158
159 cmpq $(kShort),%rdx // a lot more to compare?
160 jbe LShort // no
161 jmp LNotShort // compute distance to next page crossing etc
162
163
164 // Found a zero and/or a difference in vector compare.
165 // %rdi = LHS ptr, already advanced by 16
166 // %rsi = RHS ptr, already advanced by 16
167 // %eax = bit n set if bytes n differed or were 0
168
169 LFoundDiffOr0:
170 bsf %eax,%edx // which byte differed or was 0?
171 movzb -16(%rdi,%rdx),%eax // get LHS byte
172 movzb -16(%rsi,%rdx),%edx // get RHS byte
173 subl %edx,%eax // compute difference (ie, return value)
174 ret