]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | ||
25 | // *************** *********** | |
26 | // * M E M C M P * and * B C M P * | |
27 | // *************** *********** | |
28 | // | |
29 | // int memcmp(const char *s1, const char *s2, size_t len); | |
30 | // int bcmp(const char *s1, const char *s2, size_t len); | |
31 | // | |
32 | // Bcmp returns (+,0,-), whereas memcmp returns the true difference | |
33 | // between the first differing bytes, but we treat them identically. | |
34 | // | |
35 | // We optimize the compare by doing it with SSE. This introduces | |
36 | // a complication: if we blindly did vector loads from both sides until | |
37 | // finding a difference, we might get a spurious page fault by | |
38 | // reading bytes past the difference. To avoid this, we never do a load | |
39 | // that crosses a page boundary. | |
40 | ||
41 | #define kShort 18 // too short for vectors (must be >16) | |
42 | ||
43 | .text | |
44 | .align 4 | |
45 | ||
46 | .globl _memcmp | |
47 | .globl _bcmp | |
48 | ||
49 | _memcmp: // int memcmp(const char *s1,const char *s2,size_t len); | |
50 | _bcmp: // int bcmp(const char *s1,const char *s2,size_t len); | |
51 | cmpq $(kShort),%rdx // worth accelerating? | |
52 | ja LNotShort // yes | |
53 | ||
54 | ||
55 | // Too short to bother with parallel compares. Loop over bytes. | |
56 | // %rdi = LHS ptr | |
57 | // %rsi = RHS ptr | |
58 | // %edx = length (<= kShort) | |
59 | ||
60 | LShort: | |
61 | testl %edx,%edx // 0-length? | |
62 | jnz LShortLoop // no | |
63 | xorq %rax,%rax // return 0 | |
64 | jmp LExit | |
65 | .align 4,0x90 // align inner loops to optimize I-fetch | |
66 | LShortLoop: // loop over bytes | |
67 | movzb (%rdi),%eax // get LHS byte | |
68 | movzb (%rsi),%ecx // get RHS byte | |
69 | addq $1,%rdi | |
70 | addq $1,%rsi | |
71 | subl %ecx,%eax // compare them | |
72 | jnz LExit // done if not equal | |
73 | subq $1,%rdx // decrement length | |
74 | jnz LShortLoop | |
75 | LExit: // return value is in %eax | |
76 | ret | |
77 | ||
78 | LNotEqual: // here from LLoopOverBytes with LHS in eax | |
79 | movzb (%rsi),%ecx // get RHS byte | |
80 | subl %ecx,%eax // generate return value (nonzero) | |
81 | ret | |
82 | ||
83 | ||
84 | // Loop over bytes until we reach end of a page. | |
85 | // %rdi = LHS ptr | |
86 | // %edi = RHS ptr | |
87 | // %rdx = length remaining after end of loop (i.e., already adjusted) | |
88 | // %ecx = #bytes until next page (1..15) | |
89 | ||
90 | .align 4,0x90 // align inner loops to optimize I-fetch | |
91 | LLoopOverBytes: | |
92 | movzb (%rdi),%eax // get LHS byte | |
93 | addq $1,%rdi | |
94 | cmpb (%rsi),%al // compare to RHS byte | |
95 | jnz LNotEqual // done if not equal | |
96 | addq $1,%rsi | |
97 | subl $1,%ecx // more to go? | |
98 | jnz LLoopOverBytes | |
99 | ||
100 | ||
101 | // Long enough to justify overhead of setting up vector compares. In order to | |
102 | // avoid spurious page faults, we loop over: | |
103 | // | |
104 | // min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4 | |
105 | // | |
106 | // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte | |
107 | // comparison until reaching the next page, then resume the vector comparison. | |
108 | // %rdi = LHS ptr | |
109 | // %rsi = RHS ptr | |
110 | // %rdx = length (> kShort) | |
111 | ||
112 | LNotShort: | |
113 | movq %rdi,%rax // copy ptrs | |
114 | movq %rsi,%rcx | |
115 | andq $4095,%rax // mask down to page offsets | |
116 | andq $4095,%rcx | |
117 | cmpq %rax,%rcx // which is bigger? | |
118 | cmova %rcx,%rax // %eax = max(LHS offset, RHS offset); | |
119 | movl $4096,%ecx | |
120 | subl %eax,%ecx // get #bytes to next page crossing | |
121 | cmpq %rdx,%rcx // will operand run out first? | |
122 | cmova %edx,%ecx // get min(length remaining, bytes to page end) | |
123 | movl %ecx,%eax | |
124 | shrl $4,%ecx // get #chunks till end of operand or page | |
125 | jnz LLoopOverChunks // enter vector loop | |
126 | ||
127 | // Too near page end for vectors. | |
128 | ||
129 | subq %rax,%rdx // adjust length remaining | |
130 | movl %eax,%ecx // %ecx <- #bytes to page end | |
131 | cmpq $(kShort),%rdx // will there be enough after we cross page for vectors? | |
132 | ja LLoopOverBytes // yes | |
133 | addq %rax,%rdx // no, restore total length remaining | |
134 | jmp LShortLoop // compare rest byte-by-byte (%ecx != 0) | |
135 | ||
136 | ||
137 | // Loop over 16-byte chunks. | |
138 | // %rdi = LHS ptr | |
139 | // %rsi = RHS ptr | |
140 | // %rdx = length remaining | |
141 | // %ecx = chunk count | |
142 | ||
143 | .align 4,0x90 // align inner loops to optimize I-fetch | |
144 | LLoopOverChunks: | |
145 | movdqu (%rdi),%xmm0 // get LHS | |
146 | movdqu (%rsi),%xmm1 // get RHS | |
147 | addq $16,%rdi | |
148 | pcmpeqb %xmm1,%xmm0 // compare LHS to RHS | |
149 | addq $16,%rsi | |
150 | pmovmskb %xmm0,%eax // collect comparison result bits (1 if equal) | |
151 | subq $16,%rdx // adjust length remaining | |
152 | xorl $0xFFFF,%eax // all equal? | |
153 | jne LDifferent // no, we found differing bytes | |
154 | subl $1,%ecx // more to go? | |
155 | jnz LLoopOverChunks | |
156 | ||
157 | cmpq $(kShort),%rdx // a lot more to compare? | |
158 | jbe LShort // no | |
159 | jmp LNotShort // compute distance to next page crossing etc | |
160 | ||
161 | ||
162 | // Found a difference. | |
163 | // %rdi = LHS ptr, already advanced by 16 | |
164 | // %rsi = RHS ptr, already advanced by 16 | |
165 | // %eax = complemented compare vector (ie, 0 == equal) | |
166 | ||
167 | LDifferent: | |
168 | bsf %eax,%edx // which byte differed? | |
169 | subq $16,%rdi // point to byte 0 while we wait for bit scan | |
170 | subq $16,%rsi | |
171 | movzb (%rdi,%rdx),%eax // get LHS byte | |
172 | movzb (%rsi,%rdx),%ecx // get RHS byte | |
173 | subl %ecx,%eax // compute difference (ie, return value) | |
174 | ret |