]>
Commit | Line | Data |
---|---|---|
e9ce8d39 | 1 | /* |
eb1cde05 | 2 | * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. |
e9ce8d39 A |
3 | * |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
734aad71 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
e9ce8d39 A |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
734aad71 A |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
e9ce8d39 A |
20 | * |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
eb1cde05 A |
23 | |
24 | ||
// ---------------------------------------------------------------------
// strcmp -- SSE-accelerated string compare (32-bit x86, AT&T syntax)
// ---------------------------------------------------------------------
//
//      int strcmp(const char *s1, const char *s2);
//
// Arguments are read from the stack (s1 at 4(%esp), s2 at 8(%esp) on
// entry).  The result is returned in %eax: the zero-extended LHS byte
// minus the zero-extended RHS byte at the first position where the two
// strings differ or where the LHS string ends.  It is therefore 0
// exactly when the strings are equal.
//
// %esi and %edi are saved on entry and restored on exit.  %eax, %ecx,
// %edx, %xmm0-%xmm2 and the flags are overwritten.
//
// Strategy: compare 16 bytes at a time with SSE.  Blindly issuing vector
// loads until a difference (or a 0) turns up could touch bytes beyond
// the end of a string and take a spurious page fault, so no vector load
// is ever allowed to cross a page boundary of either operand.  Each
// planning step computes
//
//      n = LPageSize - max(LHS page offset, RHS page offset)
//
// which is the number of bytes both operands can supply before either
// one reaches its next page.  If n >= 16, n >> 4 vector compares are
// done; otherwise n single-byte compares are done.  Then we plan again.

        .set    LPageSize,   4096       // page size assumed by the planning step
        .set    LPageMask,   4095       // LPageSize - 1: isolates the offset within a page
        .set    LChunkBytes, 16         // bytes consumed per vector iteration
        .set    LChunkShift, 4          // log2(LChunkBytes)
        .set    LAllLanes,   0xFFFF     // pmovmskb mask with all 16 byte lanes set

        .text
        .globl  _strcmp

        .align  4
_strcmp:                                // int strcmp(const char *s1, const char *s2);
        pushl   %esi                    // preserve the caller's %esi and %edi;
        pushl   %edi                    //   both exit paths pop them again
        movl    12(%esp),%esi           // %esi = s1 (LHS), above 2 saved regs + return address
        movl    16(%esp),%edi           // %edi = s2 (RHS)


// Plan the next run of compares.
//      In:   %esi = LHS ptr, %edi = RHS ptr
//      Out:  %edx = chunk count (vector loop) or byte count (byte loop)

LPlanRun:
        movl    %edi,%edx
        movl    %esi,%eax
        andl    $LPageMask,%edx         // %edx = RHS offset within its page
        andl    $LPageMask,%eax         // %eax = LHS offset within its page
        cmpl    %edx,%eax
        cmovb   %edx,%eax               // %eax = max(LHS offset, RHS offset), unsigned
        movl    $LPageSize,%edx
        subl    %eax,%edx               // %edx = bytes until the nearer page crossing (1..4096)
        movl    %edx,%eax               // keep the byte count in case there is no whole chunk
        shrl    $LChunkShift,%edx       // %edx = whole 16-byte chunks before that crossing
        jne     LVectorLoop             // at least one chunk: go vector
        movl    %eax,%edx               // fewer than 16 bytes left: count bytes instead
        jmp     LByteLoop


// Byte-at-a-time compare, used for the last few bytes before a page end.
//      %esi = LHS ptr, %edi = RHS ptr, %edx = bytes to compare (nonzero)

        .align  4,0x90                  // align inner loops to optimize I-fetch
LByteLoop:
        movzbl  (%esi),%eax             // %eax = LHS byte
        incl    %esi
        movzbl  (%edi),%ecx             // %ecx = RHS byte
        incl    %edi
        testl   %eax,%eax               // did the LHS string end?
        je      LReturnDiff             // yes: result is 0 - RHS byte
        subl    %ecx,%eax               // %eax = LHS byte - RHS byte
        jne     LReturn                 // nonzero difference is already the result
        decl    %edx                    // bytes left before the page end?
        jne     LByteLoop

        jmp     LPlanRun                // page end reached: plan the next run


// Vector compare, 16 bytes per iteration.
//      %esi = LHS ptr, %edi = RHS ptr, %edx = chunks to compare (nonzero)

        .align  4,0x90                  // align inner loops to optimize I-fetch
LVectorLoop:
        movdqu  (%esi),%xmm1            // %xmm1 = 16 LHS bytes
        movdqu  (%edi),%xmm2            // %xmm2 = 16 RHS bytes
        pxor    %xmm0,%xmm0             // %xmm0 = 0, issued in the shadow of the loads
        addl    $LChunkBytes,%esi
        pcmpeqb %xmm1,%xmm2             // %xmm2 lane = 0xFF where LHS byte == RHS byte
        pcmpeqb %xmm1,%xmm0             // %xmm0 lane = 0xFF where LHS byte == 0
        addl    $LChunkBytes,%edi
        pmovmskb %xmm2,%eax             // %eax bit i = 1 if byte i matched
        pmovmskb %xmm0,%ecx             // %ecx bit i = 1 if LHS byte i was 0
        xorl    $LAllLanes,%eax         // invert the low 16 bits: 1 now means "differs"
        orl     %ecx,%eax               // bit i = 1 if byte i differs or LHS byte i is 0
        jne     LVectorHit              // something of interest in this chunk
        decl    %edx                    // chunks left before the page end?
        jne     LVectorLoop

        jmp     LPlanRun                // out of whole chunks: plan the next run


// The vector compare saw a difference and/or a 0 byte.
//      %esi, %edi = operand ptrs, already advanced past the chunk
//      %eax       = bit n set if byte n differed or the LHS byte was 0

LVectorHit:
        bsfl    %eax,%edx               // %edx = index of the first flagged byte
        subl    $LChunkBytes,%esi       // step both ptrs back to the start of the chunk
        subl    $LChunkBytes,%edi
        movzbl  (%esi,%edx),%eax        // %eax = LHS byte at that index
        movzbl  (%edi,%edx),%ecx        // %ecx = RHS byte at that index
                                        // fall through to form the result

// Shared exit.
//      LReturnDiff: %eax = LHS byte, %ecx = RHS byte
//      LReturn:     %eax = finished return value

LReturnDiff:
        subl    %ecx,%eax               // return value = LHS byte - RHS byte
LReturn:
        popl    %edi                    // restore in reverse order of the entry pushes
        popl    %esi
        ret