]>
Commit | Line | Data |
---|---|---|
224c7076 A |
1 | /* |
2 | * Copyright (c) 2007 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | ||
25 | // ***************** | |
26 | // * S T R L C A T * | |
27 | // ***************** | |
28 | // | |
29 | // size_t strlcat(char *dst, const char *src, size_t size); | |
30 | // | |
31 | // Using 8- or 16-byte parallel loops introduce a complication: | |
32 | // if we blindly did parallel load/stores until finding | |
33 | // a 0, we might get a spurious page fault by touching bytes past it. | |
34 | // To avoid this, we never do a load that crosses a page boundary, | |
35 | // or store unnecessary bytes. | |
36 | // | |
37 | // The word parallel test for 0s relies on the following inobvious | |
38 | // but very efficient test: | |
39 | // x = dataWord + 0xFEFEFEFF | |
40 | // y = ~dataWord & 0x80808080 | |
41 | // if (x & y) == 0 then no zero found | |
42 | // The test maps any non-zero byte to zero, and any zero byte to 0x80, | |
43 | // with one exception: 0x01 bytes preceeding the first zero are also | |
44 | // mapped to 0x80. | |
45 | // | |
46 | // On Core2 class machines, this algorithm seems to be faster than the | |
47 | // naive byte-by-byte version for operands longer than about 11 bytes. | |
48 | ||
49 | .text | |
50 | .globl _strlcat | |
51 | ||
52 | ||
53 | ||
54 | // Use SSE to find the 0-byte at current end of buffer. | |
55 | // This is just a minor variant of strlen(). | |
56 | // Initial registers: | |
57 | // %rdi = dest or buffer ptr | |
58 | // %rsi = source ptr | |
59 | // %rdx = size | |
60 | ||
61 | .align 4 | |
62 | _strlcat: // size_t *strlcat(char *dst, const char *src, size_t size); | |
63 | movl %edi,%ecx // copy buffer ptr | |
64 | movq %rdi,%r10 // save copies of buffer ptr and length | |
65 | movq %rdx,%r11 | |
66 | andq $(-16),%rdi // 16-byte align buffer ptr | |
67 | pxor %xmm0,%xmm0 // get some 0s | |
68 | andl $15,%ecx // get #bytes in dq before start of buffer | |
69 | movl $16,%r8d | |
70 | orl $(-1),%eax | |
71 | subl %ecx,%r8d // #bytes from buffer start to end of dq | |
72 | subq %r8,%rdx // does buffer end before end of dq? | |
73 | jb LShortBuf1 // yes, drop into byte-by-byte mode | |
74 | movdqa (%rdi),%xmm1 // get first aligned chunk of buffer | |
75 | addq $16,%rdi | |
76 | pcmpeqb %xmm0,%xmm1 // check for 0s | |
77 | shl %cl,%eax // create mask for the bytes of aligned dq in operand | |
78 | pmovmskb %xmm1,%ecx // collect mask of 0-bytes | |
79 | andl %eax,%ecx // mask out any 0s that occur before buffer start | |
80 | jnz 2f // found end of buffer | |
81 | 1: | |
82 | subq $16,%rdx // another dq in buffer? | |
83 | jb LShortBuf2 // no, drop into byte-by-byte mode | |
84 | movdqa (%rdi),%xmm1 // get next chunk | |
85 | addq $16,%rdi | |
86 | pcmpeqb %xmm0,%xmm1 // check for 0s | |
87 | pmovmskb %xmm1,%ecx // collect mask of 0-bytes | |
88 | testl %ecx,%ecx // any 0-bytes? | |
89 | jz 1b // no | |
90 | 2: | |
91 | bsf %ecx,%ecx // find first 1-bit (ie, first 0-byte) | |
92 | subq $16,%rdi // back up ptr into buffer | |
93 | addq $16,%rdx // recover length remaining as of start of dq | |
94 | addq %rcx,%rdi // point to 0-byte | |
95 | subq %rcx,%rdx // compute #bytes remaining in buffer | |
96 | ||
97 | ||
98 | // Copy byte-by-byte until source is 8-byte aligned. | |
99 | // %rdi = points to 1st byte available in buffer | |
100 | // %rsi = src ptr | |
101 | // %rdx = buffer length remaining (ie, starting at %rdi) | |
102 | // %r10 = original buffer ptr | |
103 | ||
104 | movl %esi,%ecx // copy source ptr | |
105 | negl %ecx | |
106 | andl $7,%ecx // how many bytes to align source ptr? | |
107 | jz LAligned // already aligned | |
108 | ||
109 | ||
110 | // Loop over bytes. | |
111 | // %rdi = dest ptr | |
112 | // %rsi = source ptr | |
113 | // %rdx = length remaining in buffer | |
114 | // %ecx = number of bytes to copy (>0, may not fit in buffer) | |
115 | // %r10 = original buffer ptr | |
116 | ||
117 | LLoopOverBytes: | |
118 | movzb (%rsi),%eax // get source byte before checking buffer length | |
119 | testq %rdx,%rdx // buffer full? | |
120 | jz L0NotFound // yes | |
121 | incq %rsi | |
122 | decq %rdx | |
123 | movb %al,(%rdi) // pack into dest | |
124 | incq %rdi | |
125 | testl %eax,%eax // 0? | |
126 | jz LDone // yes, done | |
127 | decl %ecx // more to go? | |
128 | jnz LLoopOverBytes | |
129 | ||
130 | ||
131 | // Source is aligned. Loop over quadwords until end of buffer. We | |
132 | // align the source, rather than the dest, to avoid getting spurious page faults. | |
133 | // %rdi = dest ptr (unaligned) | |
134 | // %rsi = source ptr (quadword aligned) | |
135 | // %rdx = length remaining in buffer | |
136 | // %r10 = original buffer ptr | |
137 | ||
138 | LAligned: | |
139 | movl $9,%ecx // if buffer almost exhausted, prepare to copy rest byte-by-byte | |
140 | cmpq $8,%rdx // enough for at least one quadword? | |
141 | jb LLoopOverBytes | |
142 | movq $0xFEFEFEFEFEFEFEFF,%r8 // get magic constants | |
143 | movq $0x8080808080808080,%r9 | |
144 | ||
145 | ||
146 | // Loop over quadwords. | |
147 | // %rdi = dest ptr (unaligned) | |
148 | // %rsi = source ptr (quadword aligned) | |
149 | // %rdx = length remaining in buffer (>=8) | |
150 | // %r8 = 0xFEFEFEFEFEFEFEFF | |
151 | // %r9 = 0x8080808080808080 | |
152 | // %r10 = original buffer ptr | |
153 | ||
154 | LLoopOverQuads: | |
155 | movq (%rsi),%rax // get next 8 bytes of source | |
156 | subq $8,%rdx | |
157 | addq $8,%rsi | |
158 | movq %rax,%r11 // make 2 copies of quadword | |
159 | movq %rax,%rcx | |
160 | notq %r11 // use magic word-parallel test for 0s | |
161 | addq %r8,%rcx | |
162 | andq %r9,%r11 | |
163 | testq %rcx,%r11 | |
164 | jnz L0Found // one of the bytes of %rax is a 0 | |
165 | movq %rax,(%rdi) // pack 8 bytes into destination | |
166 | addq $8,%rdi | |
167 | cmpq $8,%rdx // room in buffer for another quadword? | |
168 | jae LLoopOverQuads // yes | |
169 | ||
170 | movl %edx,%ecx // copy leftovers in byte loop | |
171 | jmp LLoopOverBytes | |
172 | ||
173 | // Found a 0-byte in the quadword of source. Store a byte at a time until the 0. | |
174 | // %rdi = dest ptr (unaligned) | |
175 | // %rax = last word of source, known to have a 0-byte | |
176 | // %r10 = original buffer ptr | |
177 | ||
178 | LNextByte: | |
179 | shrq $8,%rax // next byte | |
180 | L0Found: | |
181 | movb %al,(%rdi) // pack in next byte | |
182 | incq %rdi | |
183 | testb %al,%al // 0? | |
184 | jnz LNextByte | |
185 | ||
186 | ||
187 | // Done storing string. | |
188 | // %rdi = ptr to byte after 0-byte | |
189 | // %r10 = original buffer ptr | |
190 | ||
191 | LDone: | |
192 | subq %r10,%rdi // subtract original dest ptr to get length stored | |
193 | lea -1(%rdi),%rax // minus one for 0-byte, and move to return value | |
194 | ret | |
195 | ||
196 | // Buffer filled but 0-byte not found. We return the length of the buffer plus the length | |
197 | // of the source string. This is not optimized, as it is an error condition. | |
198 | // %edi = dest ptr (ie, 1 past end of buffer) | |
199 | // %esi = source ptr (ptr to 1st byte that does not fit) | |
200 | // %rdx = 0 (ie, length remaining in buffer) | |
201 | // %r10 = original buffer ptr | |
202 | ||
203 | L0NotFound: | |
204 | movq %rdi,%rax // buffer end... | |
205 | subq %r10,%rax // ...minus start is buffer length | |
206 | jz LScanSourceTo0 // buffer is null so cannot store a 0 | |
207 | movb %dl,-1(%rdi) // store a 0 at end of buffer to delimit string | |
208 | ||
209 | ||
210 | // Scan source to end. | |
211 | // %rsi = ptr to rest of source | |
212 | // %rax = return value so far | |
213 | ||
214 | LScanSourceTo0: | |
215 | movzb (%rsi),%ecx // get next byte of source | |
216 | incq %rsi | |
217 | incq %rax // increment length | |
218 | testl %ecx,%ecx // 0? | |
219 | jnz LScanSourceTo0 | |
220 | decq %rax // don't count the 0-byte | |
221 | ret | |
222 | ||
223 | ||
224 | // Buffer too short to reach end of even one 16-byte aligned chunk. | |
225 | // %rsi = src ptr | |
226 | // %r10 = original buffer ptr | |
227 | // %r11 = original buffer length | |
228 | ||
229 | LShortBuf1: | |
230 | movq %r10,%rdi // recover buffer ptr | |
231 | movq %r11,%rdx // recover buffer length | |
232 | jmp LShortBuf3 | |
233 | ||
234 | ||
235 | // Out of aligned dq's of buffer, 0-byte still not found. | |
236 | // %rsi = src ptr | |
237 | // %rdi = ptr to 1st buffer byte not checked for 0 | |
238 | // %rdx = length remaining - 16 | |
239 | // %r10 = original buffer ptr | |
240 | // %r11 = original buffer length | |
241 | ||
242 | LShortBuf2: | |
243 | addq $16,%rdx // recover length remaining | |
244 | LShortBuf3: | |
245 | movq %r11,%rax // in case we goto LScanSourceTo0 | |
246 | movl $17,%ecx // in case we goto LLoopOverBytes | |
247 | 1: | |
248 | testq %rdx,%rdx // no 0s in buffer at all? | |
249 | jz LScanSourceTo0 // yes, cannot store a 0 | |
250 | cmpb $0,(%rdi) // is this the 0? | |
251 | jz LLoopOverBytes // yes, append source | |
252 | incq %rdi | |
253 | decq %rdx | |
254 | jmp 1b // loop looking for 0 |