]>
Commit | Line | Data |
---|---|---|
8e029c65 A |
1 | /* |
2 | * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | ||
25 | // ***************** | |
26 | // * S T R N C M P * | |
27 | // ***************** | |
28 | // | |
29 | // int strncmp(const char *s1, const char *s2, size_t len); | |
30 | // | |
31 | // We optimize the compare by doing it vector parallel. This introduces | |
32 | // a complication: if we blindly did vector loads from both sides until | |
33 | // finding a difference (or 0), we might get a spurious page fault by | |
34 | // reading bytes past the difference. To avoid this, we never do a load | |
35 | // that crosses a page boundary. | |
36 | ||
37 | #define kShort 20 // too short for vectors (must be >16) | |
38 | ||
39 | .text | |
40 | .globl _strncmp | |
41 | ||
42 | .align 4 | |
43 | _strncmp: // int strncmp(const char *s1, const char *s2, size_t len); | |
44 | cmpq $(kShort),%rdx // worth accelerating? | |
45 | ja LNotShort // yes | |
46 | ||
47 | ||
48 | // Too short to bother with parallel compares. Loop over bytes. | |
49 | // %rdi = LHS ptr | |
50 | // %rsi = RHS ptr | |
51 | // %edx = length (<= kShort) | |
52 | ||
53 | LShort: | |
54 | testl %edx,%edx // 0-length? | |
55 | jnz LShortLoop // no | |
56 | jmp LReturn0 // yes, return 0 | |
57 | .align 4,0x90 // align inner loops to optimize I-fetch | |
58 | LShortLoop: // loop over bytes | |
59 | movzb (%rdi),%eax // get LHS byte | |
60 | movzb (%rsi),%ecx // get RHS byte | |
61 | incq %rdi | |
62 | incq %rsi | |
63 | testl %eax,%eax // LHS==0 ? | |
64 | jz LNotEqual // yes, this terminates comparison | |
65 | cmpl %ecx,%eax // compare them (sub won't fuse, but cmp will) | |
66 | jnz LNotEqual // done if not equal | |
67 | decl %edx // decrement length | |
68 | jnz LShortLoop | |
69 | LReturn0: | |
70 | xorl %eax,%eax // all bytes equal, so return 0 | |
71 | ret | |
72 | ||
73 | LNotEqual: // LHS in eax, RHS in ecx | |
74 | subl %ecx,%eax // generate return value (nonzero) | |
75 | ret | |
76 | ||
77 | ||
78 | // Loop over bytes until we reach end of a page. | |
79 | // %rdi = LHS ptr | |
80 | // %rsi = RHS ptr | |
81 | // %rdx = length remaining after end of loop (ie, already adjusted) | |
82 | // %r8d = #bytes until next page (1..15) | |
83 | ||
84 | .align 4,0x90 // align inner loops to optimize I-fetch | |
85 | LLoopOverBytes: | |
86 | movzb (%rdi),%eax // get LHS byte | |
87 | movzb (%rsi),%ecx // get RHS byte | |
88 | incq %rdi | |
89 | incq %rsi | |
90 | testl %eax,%eax // LHS==0 ? | |
91 | jz LNotEqual // yes, this terminates comparison | |
92 | cmpl %ecx,%eax // compare them (sub won't fuse, but cmp will) | |
93 | jnz LNotEqual // done if not equal | |
94 | decl %r8d // more to go? | |
95 | jnz LLoopOverBytes | |
96 | ||
97 | ||
98 | // Long enough to justify overhead of setting up vector compares. In order to | |
99 | // avoid spurious page faults, we loop over: | |
100 | // | |
101 | // min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4 | |
102 | // | |
103 | // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte | |
104 | // comparison until reaching the next page, then resume the vector comparison. | |
105 | // %rdi = LHS ptr | |
106 | // %rsi = RHS ptr | |
107 | // %rdx = length (> kShort) | |
108 | ||
109 | LNotShort: | |
110 | movl %edi,%eax // copy ptrs | |
111 | movl %esi,%r8d | |
112 | andl $4095,%eax // mask down to page offsets | |
113 | andl $4095,%r8d | |
114 | cmpl %eax,%r8d // which is bigger? | |
115 | cmova %r8d,%eax // %eax = max(LHS offset, RHS offset); | |
116 | movl $4096,%r8d | |
117 | subl %eax,%r8d // get #bytes to next page crossing | |
118 | cmpq %rdx,%r8 // will operand run out first? | |
119 | cmova %rdx,%r8 // get min(length remaining, bytes to page end) | |
120 | movl %r8d,%eax // copy #bytes | |
121 | shrl $4,%r8d // get #chunks till end of operand or page | |
122 | testl %r8d,%r8d // test %r8d for 0 without partial flag update stall | |
123 | jnz LLoopOverChunks // enter vector loop | |
124 | ||
125 | // Too near page end for vectors. | |
126 | ||
127 | subq %rax,%rdx // adjust length remaining | |
128 | movl %eax,%r8d // %r8d <- #bytes to page end | |
129 | cmpq $(kShort),%rdx // will there be enough after we cross page for vectors? | |
130 | ja LLoopOverBytes // yes | |
131 | addq %rax,%rdx // no, restore total length remaining | |
132 | jmp LShortLoop // compare rest byte-by-byte (%rdx != 0) | |
133 | ||
134 | ||
135 | // Loop over 16-byte chunks. | |
136 | // %rdi = LHS ptr | |
137 | // %rsi = RHS ptr | |
138 | // %rdx = length remaining | |
139 | // %r8d = chunk count | |
140 | ||
141 | .align 4,0x90 // align inner loops to optimize I-fetch | |
142 | LLoopOverChunks: | |
143 | movdqu (%rdi),%xmm1 // get LHS | |
144 | movdqu (%rsi),%xmm2 // get RHS | |
145 | pxor %xmm0,%xmm0 // get some 0s in the shadow of the loads | |
146 | addq $16,%rdi | |
147 | pcmpeqb %xmm1,%xmm2 // compare LHS to RHS | |
148 | pcmpeqb %xmm1,%xmm0 // compare LHS to 0s | |
149 | addq $16,%rsi | |
150 | pmovmskb %xmm2,%eax // get result mask for comparison of LHS and RHS | |
151 | pmovmskb %xmm0,%ecx // get result mask for 0 check | |
152 | subq $16,%rdx // decrement length remaining | |
153 | xorl $0xFFFF,%eax // complement compare mask so 1 means "not equal" | |
154 | orl %ecx,%eax // combine the masks and check for 1-bits | |
155 | jnz LFoundDiffOr0 // we found differing bytes or a 0-byte | |
156 | decl %r8d // more to go? | |
157 | jnz LLoopOverChunks // yes | |
158 | ||
159 | cmpq $(kShort),%rdx // a lot more to compare? | |
160 | jbe LShort // no | |
161 | jmp LNotShort // compute distance to next page crossing etc | |
162 | ||
163 | ||
164 | // Found a zero and/or a difference in vector compare. | |
165 | // %rdi = LHS ptr, already advanced by 16 | |
166 | // %rsi = RHS ptr, already advanced by 16 | |
167 | // %eax = bit n set if bytes n differed or were 0 | |
168 | ||
169 | LFoundDiffOr0: | |
170 | bsf %eax,%edx // which byte differed or was 0? | |
171 | movzb -16(%rdi,%rdx),%eax // get LHS byte | |
172 | movzb -16(%rsi,%rdx),%edx // get RHS byte | |
173 | subl %edx,%eax // compute difference (ie, return value) | |
174 | ret |