]> git.saurik.com Git - apple/libc.git/blame_incremental - x86_64/string/memcmp.s
Libc-594.9.5.tar.gz
[apple/libc.git] / x86_64 / string / memcmp.s
... / ...
CommitLineData
1/*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24
25// *************** ***********
26// * M E M C M P * and * B C M P *
27// *************** ***********
28//
29// int memcmp(const char *s1, const char *s2, size_t len);
30// int bcmp(const char *s1, const char *s2, size_t len);
31//
32// Bcmp returns (+,0,-), whereas memcmp returns the true difference
33// between the first differing bytes, but we treat them identically.
34//
35// We optimize the compare by doing it with SSE. This introduces
36// a complication: if we blindly did vector loads from both sides until
37// finding a difference, we might get a spurious page fault by
38// reading bytes past the difference. To avoid this, we never do a load
39// that crosses a page boundary.
40
41#define kShort 18 // too short for vectors (must be >16)
42
43 .text
44 .align 4
45
46 .globl _memcmp
47 .globl _bcmp
48
49_memcmp: // int memcmp(const char *s1,const char *s2,size_t len);
50_bcmp: // int bcmp(const char *s1,const char *s2,size_t len);
51 cmpq $(kShort),%rdx // worth accelerating?
52 ja LNotShort // yes
53
54
55// Too short to bother with parallel compares. Loop over bytes.
56// %rdi = LHS ptr
57// %rsi = RHS ptr
58// %edx = length (<= kShort)
59
60LShort:
61 testl %edx,%edx // 0-length?
62 jnz LShortLoop // no
63 xorq %rax,%rax // return 0
64 jmp LExit
65 .align 4,0x90 // align inner loops to optimize I-fetch
66LShortLoop: // loop over bytes
67 movzb (%rdi),%eax // get LHS byte
68 movzb (%rsi),%ecx // get RHS byte
69 addq $1,%rdi
70 addq $1,%rsi
71 subl %ecx,%eax // compare them
72 jnz LExit // done if not equal
73 subq $1,%rdx // decrement length
74 jnz LShortLoop
75LExit: // return value is in %eax
76 ret
77
78LNotEqual: // here from LLoopOverBytes with LHS in eax
79 movzb (%rsi),%ecx // get RHS byte
80 subl %ecx,%eax // generate return value (nonzero)
81 ret
82
83
84// Loop over bytes until we reach end of a page.
85// %rdi = LHS ptr
86// %edi = RHS ptr
87// %rdx = length remaining after end of loop (i.e., already adjusted)
88// %ecx = #bytes until next page (1..15)
89
90 .align 4,0x90 // align inner loops to optimize I-fetch
91LLoopOverBytes:
92 movzb (%rdi),%eax // get LHS byte
93 addq $1,%rdi
94 cmpb (%rsi),%al // compare to RHS byte
95 jnz LNotEqual // done if not equal
96 addq $1,%rsi
97 subl $1,%ecx // more to go?
98 jnz LLoopOverBytes
99
100
101// Long enough to justify overhead of setting up vector compares. In order to
102// avoid spurious page faults, we loop over:
103//
104// min( length, bytes_in_LHS_page, bytes_in_RHS_page) >> 4
105//
106// 16-byte chunks. When we near a page end, we have to revert to a byte-by-byte
107// comparison until reaching the next page, then resume the vector comparison.
108// %rdi = LHS ptr
109// %rsi = RHS ptr
110// %rdx = length (> kShort)
111
112LNotShort:
113 movq %rdi,%rax // copy ptrs
114 movq %rsi,%rcx
115 andq $4095,%rax // mask down to page offsets
116 andq $4095,%rcx
117 cmpq %rax,%rcx // which is bigger?
118 cmova %rcx,%rax // %eax = max(LHS offset, RHS offset);
119 movl $4096,%ecx
120 subl %eax,%ecx // get #bytes to next page crossing
121 cmpq %rdx,%rcx // will operand run out first?
122 cmova %edx,%ecx // get min(length remaining, bytes to page end)
123 movl %ecx,%eax
124 shrl $4,%ecx // get #chunks till end of operand or page
125 jnz LLoopOverChunks // enter vector loop
126
127// Too near page end for vectors.
128
129 subq %rax,%rdx // adjust length remaining
130 movl %eax,%ecx // %ecx <- #bytes to page end
131 cmpq $(kShort),%rdx // will there be enough after we cross page for vectors?
132 ja LLoopOverBytes // yes
133 addq %rax,%rdx // no, restore total length remaining
134 jmp LShortLoop // compare rest byte-by-byte (%ecx != 0)
135
136
137// Loop over 16-byte chunks.
138// %rdi = LHS ptr
139// %rsi = RHS ptr
140// %rdx = length remaining
141// %ecx = chunk count
142
143 .align 4,0x90 // align inner loops to optimize I-fetch
144LLoopOverChunks:
145 movdqu (%rdi),%xmm0 // get LHS
146 movdqu (%rsi),%xmm1 // get RHS
147 addq $16,%rdi
148 pcmpeqb %xmm1,%xmm0 // compare LHS to RHS
149 addq $16,%rsi
150 pmovmskb %xmm0,%eax // collect comparison result bits (1 if equal)
151 subq $16,%rdx // adjust length remaining
152 xorl $0xFFFF,%eax // all equal?
153 jne LDifferent // no, we found differing bytes
154 subl $1,%ecx // more to go?
155 jnz LLoopOverChunks
156
157 cmpq $(kShort),%rdx // a lot more to compare?
158 jbe LShort // no
159 jmp LNotShort // compute distance to next page crossing etc
160
161
162// Found a difference.
163// %rdi = LHS ptr, already advanced by 16
164// %rsi = RHS ptr, already advanced by 16
165// %eax = complemented compare vector (ie, 0 == equal)
166
167LDifferent:
168 bsf %eax,%edx // which byte differed?
169 subq $16,%rdi // point to byte 0 while we wait for bit scan
170 subq $16,%rsi
171 movzb (%rdi,%rdx),%eax // get LHS byte
172 movzb (%rsi,%rdx),%ecx // get RHS byte
173 subl %ecx,%eax // compute difference (ie, return value)
174 ret