]>
Commit | Line | Data |
---|---|---|
8e029c65 A |
1 | /* |
2 | * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | ||
25 | // *************** | |
26 | // * S T R C M P * | |
27 | // *************** | |
28 | // | |
29 | // int strcmp(const char *s1, const char *s2); | |
30 | // | |
31 | // We optimize the compare by doing it in parallel, using SSE. This introduces | |
32 | // a complication: if we blindly did vector loads from both sides until | |
33 | // finding a difference (or 0), we might get a spurious page fault by | |
34 | // reading bytes past the difference. To avoid this, we never do a load | |
35 | // that crosses a page boundary. | |
36 | ||
37 | .text | |
38 | .globl _strcmp | |
39 | ||
40 | .align 4 | |
41 | _strcmp: // int strcmp(const char *s1,const char *s2); | |
42 | ||
43 | // In order to avoid spurious page faults, we loop over: | |
44 | // | |
45 | // min( bytes_in_LHS_page, bytes_in_RHS_page) >> 4 | |
46 | // | |
47 | // 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte | |
48 | // comparison until reaching the next page, then resume the vector comparison. | |
49 | // %rdi = LHS ptr | |
50 | // %rsi = RHS ptr | |
51 | ||
52 | LNextChunk: | |
53 | movl %edi,%eax // copy low 4 bytes of each ptr | |
54 | movl %esi,%edx | |
55 | andl $4095,%eax // mask down to page offsets | |
56 | andl $4095,%edx | |
57 | cmpl %eax,%edx // which is bigger? | |
58 | cmova %edx,%eax // %eax = max(LHS offset, RHS offset); | |
59 | movl $4096,%edx | |
60 | subl %eax,%edx // get #bytes to next page crossing | |
61 | movl %edx,%eax | |
62 | shrl $4,%edx // get #chunks till end of operand or page | |
63 | jnz LLoopOverChunks // enter vector loop | |
64 | movl %eax,%edx // no chunks... | |
65 | jmp LLoopOverBytes // ...so loop over bytes until page end | |
66 | ||
67 | ||
68 | // Loop over bytes. | |
69 | // %rdi = LHS ptr | |
70 | // %rsi = RHS ptr | |
71 | // %edx = byte count | |
72 | ||
73 | .align 4,0x90 // align inner loops to optimize I-fetch | |
74 | LLoopOverBytes: | |
75 | movzb (%rdi),%eax // get LHS byte | |
76 | movzb (%rsi),%ecx // get RHS byte | |
77 | addq $1,%rdi | |
78 | addq $1,%rsi | |
79 | testl %eax,%eax // 0? | |
80 | jz LExit0 // yes, we're done | |
81 | subl %ecx,%eax // compare them | |
82 | jnz LExit // done if not equal | |
83 | subl $1,%edx // more to go? | |
84 | jnz LLoopOverBytes | |
85 | ||
86 | jmp LNextChunk // we've come to end of page | |
87 | ||
88 | ||
89 | // Loop over 16-byte chunks. | |
90 | // %rdi = LHS ptr | |
91 | // %rsi = RHS ptr | |
92 | // %edx = chunk count | |
93 | ||
94 | .align 4,0x90 // align inner loops to optimize I-fetch | |
95 | LLoopOverChunks: | |
96 | movdqu (%rdi),%xmm1 // get LHS | |
97 | movdqu (%rsi),%xmm2 // get RHS | |
98 | pxor %xmm0,%xmm0 // get some 0s in the shadow of the loads | |
99 | addq $16,%rdi | |
100 | pcmpeqb %xmm1,%xmm2 // compare LHS to RHS | |
101 | pcmpeqb %xmm1,%xmm0 // compare LHS to 0s | |
102 | addq $16,%rsi | |
103 | pmovmskb %xmm2,%eax // get result mask for comparison of LHS and RHS | |
104 | pmovmskb %xmm0,%ecx // get result mask for 0 check | |
105 | xorl $0xFFFF,%eax // complement compare mask so 1 means "not equal" | |
106 | orl %ecx,%eax // combine the masks and check for 1-bits | |
107 | jnz LFoundDiffOr0 // we found differing bytes or a 0-byte | |
108 | subl $1,%edx // more to go? | |
109 | jnz LLoopOverChunks | |
110 | ||
111 | jmp LNextChunk // compare up to next page boundary | |
112 | ||
113 | ||
114 | // Found a zero and/or a difference in vector compare. | |
115 | // %rdi = LHS ptr, already advanced by 16 | |
116 | // %rsi = RHS ptr, already advanced by 16 | |
117 | // %eax = bit n set if bytes n differed or were 0 | |
118 | ||
119 | LFoundDiffOr0: | |
120 | bsf %eax,%edx // which byte differed or was 0? | |
121 | subq $16,%rdi // point to start of vectors while we wait for bit scan | |
122 | subq $16,%rsi | |
123 | movzb (%rdi,%rdx),%eax // get LHS byte | |
124 | movzb (%rsi,%rdx),%ecx // get RHS byte | |
125 | subl %ecx,%eax // compute difference (ie, return value) | |
126 | ret | |
127 | ||
128 | ||
129 | // Found a zero and/or difference in byte loop. | |
130 | // %eax = LHS byte | |
131 | // %ecx = RHS byte | |
132 | ||
133 | LExit0: | |
134 | subl %ecx,%eax // compute difference (ie, return value) | |
135 | LExit: // here with difference already in %eax | |
136 | ret |