]> git.saurik.com Git - apple/libc.git/blob - x86_64/string/memset.s
Libc-825.26.tar.gz
[apple/libc.git] / x86_64 / string / memset.s
1 /*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23 #include <machine/cpu_capabilities.h>
24
25
26 /* This file contains the following functions:
27 *
28 * void *memset(void *b, int c, size_t len);
29 * void memset_pattern4(void *b, const void *c4, size_t len);
30 * void memset_pattern8(void *b, const void *c8, size_t len);
31 * void memset_pattern16(void *b, const void *c16, size_t len);
32 *
33 * Calls of memset() with c==0 are routed to the bzero() routine. Most of the
34 * others go to Lmemset_pattern (a local routine in this file), which is entered as follows:
35 * %rdi = ptr to memory to set (aligned if practical)
36 * %rdx = length (which can be short, though we bias in favor of long operands)
37 * %xmm0 = the pattern to store
38 * Return conditions:
39 * %rax, %rdi, %rsi, %rcx, %rdx, and %xmm0 all trashed
40 *
41 * NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
42 * on P4s and probably other processors.
43 */
44
45 #define kShort 255 // nonzero memset() of <= kShort bytes is done inline with scalar stores
46
47 // memset(): %rdi = b, %esi = c (only the low byte is used), %rdx = len. Returns b in %rax.
48 .text
49 .globl _memset
50 .align 2
51 _memset: // void *memset(void *b, int c, size_t len);
52 andl $0xFF,%esi // (c==0) ? (this also discards all but the low byte of c)
53 jnz LNonzero // not a bzero
54 // NOTE(review): memset must return b, so _bzero is presumably expected to leave the original ptr in %rax -- confirm against bzero.s
55 movq %rdx,%rsi // put count where bzero() expects it
56 jmp _bzero // enter _bzero (a jump, not a call: its ret goes back to our caller)
57
58
59 // Handle memset of a nonzero value.
60 // %rdi = ptr, %esi = fill byte (upper 24 bits already cleared), %rdx = length
61 LNonzero:
62 movq %rdi,%r8 // preserve the original pointer so we can return it
63 movl %esi,%eax // replicate byte in %esi into all four bytes
64 shll $8,%esi
65 orl %esi,%eax // %eax now has "c" in its low 2 bytes
66 movl %eax,%esi
67 shll $16,%esi
68 orl %esi,%eax // now %eax has "c" in all 4 bytes
69 cmpq $(kShort),%rdx // is operand too short for SSE?
70 ja LCallCommpage // no (unsigned: length > kShort)
71
72 // Nonzero memset() short enough to be handled here with scalar stores.
73 // %eax = replicated 4-byte pattern
74 // %rdi = ptr
75 // %edx = length (<= kShort)
76
77 cmpl $16,%edx // long enough to word align?
78 jge 3f // yes
79 test %edx,%edx // length==0?
80 jz 6f
81 1: // byte-at-a-time loop for lengths 1..15
82 movb %al,(%rdi) // pack in a byte
83 addq $1,%rdi
84 subl $1,%edx
85 jnz 1b
86 jmp 6f
87 2: // reached only from the alignment test at 3: below
88 movb %al,(%rdi) // pack in a byte
89 addq $1,%rdi
90 subl $1,%edx
91 3:
92 test $3,%edi // is ptr doubleword aligned?
93 jnz 2b // no
94 movl %edx,%ecx // copy length
95 shrl $2,%edx // #doublewords to store (>= 3 here: length was >= 16 and at most 3 bytes went to alignment)
96 4:
97 movl %eax,(%rdi) // store aligned doubleword
98 addq $4,%rdi
99 subl $1,%edx
100 jnz 4b
101 andl $3,%ecx // any leftover bytes?
102 jz 6f // no
103 5:
104 movb %al,(%rdi) // pack in a byte
105 addq $1,%rdi
106 subl $1,%ecx
107 jnz 5b
108 6:
109 movq %r8,%rax // get return value (ie, original ptr)
110 ret
111
112 // Nonzero memset() is long enough to hand off to Lmemset_pattern (the SSE routine below).
113 // %eax = replicated 4-byte pattern
114 // %rdi = ptr
115 // %rdx = length (> kShort)
116
117 LCallCommpage:
118 movd %eax,%xmm0 // move %eax to low 4 bytes of %xmm0
119 pshufd $(0x00),%xmm0,%xmm0 // replicate across the vector
120 movq %rdi,%rcx // copy dest ptr
121 negl %ecx
122 andl $15,%ecx // get #bytes to align ptr
123 jz 2f // skip if already aligned
124 subq %rcx,%rdx // decrement length
125 1:
126 movb %al,(%rdi) // pack in a byte
127 addq $1,%rdi
128 subl $1,%ecx
129 jnz 1b
130 2: // ptr aligned, length long enough to justify
131 call Lmemset_pattern // local SSE routine does the heavy lifting (it does not touch %r8)
132 movq %r8,%rax // get return value (ie, original ptr)
133 ret
134
135
136 // memset_pattern16: fill len bytes at b with the 16-byte pattern found at c16.
137 // %rdi = b, %rsi = c16 (no alignment required), %rdx = len
138 .globl _memset_pattern16
139 .align 2, 0x90
140 _memset_pattern16: // void memset_pattern16(void *b, const void *c16, size_t len);
141 movups (%rsi),%xmm0 // %xmm0 = all 16 pattern bytes (unaligned 128-bit load)
142 jmp LAlignPtr // shared code aligns the ptr and performs the stores
143
144
145 // memset_pattern8: fill len bytes at b with the 8-byte pattern found at c8.
146 // %rdi = b, %rsi = c8, %rdx = len
147 .globl _memset_pattern8
148 .align 2, 0x90
149 _memset_pattern8: // void memset_pattern8(void *b, const void *c8, size_t len);
150 movq (%rsi),%xmm0 // low qword of %xmm0 = pattern, high qword = 0
151 pshufd $(0x44),%xmm0,%xmm0 // duplicate the low qword into the high qword (mask == 01 00 01 00)
152 jmp LAlignPtr // shared code aligns the ptr and performs the stores
153
154 // Handle memset of a 4-byte pattern.
155 // %rdi = b, %rsi = c4, %rdx = len
156 .globl _memset_pattern4
157 .align 2, 0x90
158 _memset_pattern4: // void memset_pattern4(void *b, const void *c4, size_t len);
159 movd (%rsi),%xmm0 // load pattern into low 4 bytes
160 pshufd $(0x00),%xmm0,%xmm0 // replicate the 4 bytes across the vector
161
162
163 // Align ptr if necessary. We must rotate the pattern right for each byte we
164 // store while aligning the ptr. Since there is no rotate instruction in SSE3,
165 // we have to synthesize the rotates.
166 // %rdi = ptr
167 // %rdx = length
168 // %xmm0 = pattern
169 // Also reached by jmp from _memset_pattern16 and _memset_pattern8. Trashes %eax, %rcx, %xmm1.
170 LAlignPtr: // NB: can drop down to here!
171 cmpq $100,%rdx // long enough to bother aligning ptr?
172 movq %rdi,%rcx // copy ptr (movq leaves the flags from the cmpq intact)
173 jb LReady // not long enough (Lmemset_pattern copes with an unaligned ptr)
174 negl %ecx
175 andl $15,%ecx // get #bytes to align ptr
176 jz LReady // already aligned
177 subq %rcx,%rdx // adjust length
178 // Bits 0..3 of %cl select a 1-, 2-, 4- and 8-byte store; the pattern is rotated right by each amount stored.
179 test $1,%cl // 1-byte store required?
180 movd %xmm0,%eax // get 4 low bytes in %eax (movd does not change the flags)
181 jz 2f // no
182 movdqa %xmm0,%xmm1 // copy pattern so we can shift in both directions
183 movb %al,(%rdi) // pack in the low-order byte
184 psrldq $1,%xmm0 // shift pattern right 1 byte
185 addq $1,%rdi
186 pslldq $15,%xmm1 // shift pattern left 15 bytes
187 shrl $8,%eax // in case 2-byte store is required
188 por %xmm1,%xmm0 // complete right rotate of pattern by 1 byte
189 2:
190 test $2,%cl // 2-byte store required?
191 jz 4f // no
192 psrldq $2,%xmm0 // shift pattern down 2 bytes
193 movw %ax,(%rdi) // pack in next two bytes
194 pinsrw $7,%eax,%xmm0 // insert low word of %eax into high word of %xmm0
195 addq $2,%rdi // adjust ptr
196 4:
197 test $4,%cl // 4-byte store required?
198 jz 8f // no
199 movd %xmm0,(%rdi) // store low 4 bytes of %xmm0
200 pshufd $(0x39),%xmm0,%xmm0 // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
201 addq $4,%rdi // adjust ptr
202 8:
203 test $8,%cl // 8-byte store required?
204 jz LReady // no
205 movq %xmm0,(%rdi) // store low 8 bytes of %xmm0
206 pshufd $(0x4e),%xmm0,%xmm0 // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
207 addq $8,%rdi // adjust ptr
208
209 // Ptr is aligned if practical, we're ready for Lmemset_pattern to do the heavy lifting.
210 // The memset_pattern functions return void, so nothing remains to be done after it: tail-jump.
211 LReady:
212 jmp Lmemset_pattern // tail call: its ret returns directly to our caller
213
214
215
216 #define kLShort 63 // lengths <= kLShort skip ptr alignment and use unaligned stores only
217 #define kVeryLong (1024*1024) // by-64 byte counts >= kVeryLong use non-temporal stores
218 // Lmemset_pattern: store %rdx bytes of the repeating 16-byte pattern in %xmm0, starting at %rdi.
219 Lmemset_pattern: // uses only %rax, %rcx, %rdx, %rdi, %xmm0; no stack use; every path ends at the ret at 5:
220 cmpq $(kLShort),%rdx // long enough to bother aligning?
221 ja LNotShort // yes
222 jmp LShort // no
223
224 // Here for short operands or the end of long ones.
225 // %rdx = length (<= kLShort)
226 // %rdi = ptr (may not be aligned)
227 // %xmm0 = pattern
228
229 LUnalignedStore16:
230 movdqu %xmm0,(%rdi) // stuff in another 16 bytes
231 subl $16,%edx
232 addq $16,%rdi
233 LShort:
234 cmpl $16,%edx // room for another vector?
235 jge LUnalignedStore16 // yes
236 LLessThan16: // here at end of copy with < 16 bytes remaining
237 test $8,%dl // 8-byte store required?
238 jz 2f // no
239 movq %xmm0,(%rdi) // pack in 8 low bytes
240 psrldq $8,%xmm0 // then shift vector down 8 bytes
241 addq $8,%rdi
242 2:
243 test $4,%dl // 4-byte store required?
244 jz 3f // no
245 movd %xmm0,(%rdi) // pack in 4 low bytes
246 psrldq $4,%xmm0 // then shift vector down 4 bytes
247 addq $4,%rdi
248 3:
249 andl $3,%edx // more to go?
250 jz 5f // no
251 movd %xmm0,%eax // move remainders out into %eax
252 4: // loop on up to three bytes
253 movb %al,(%rdi) // pack in next byte
254 shrl $8,%eax // shift next byte into position
255 incq %rdi
256 dec %edx
257 jnz 4b
258 5: ret
259
260 // Long enough to justify aligning ptr. Note that we have to rotate the
261 // pattern to account for any alignment. We do this by doing two unaligned
262 // stores, and then an aligned load from the middle of the two stores.
263 // This will stall on store forwarding alignment mismatch, and the unaligned
264 // stores can be pretty slow too, but the alternatives aren't any better.
265 // Fortunately, in most cases our caller has already aligned the ptr.
266 // %rdx = length (> kLShort)
267 // %rdi = ptr (may not be aligned)
268 // %xmm0 = pattern
269
270 LNotShort:
271 movl %edi,%ecx // copy low bits of dest ptr
272 negl %ecx
273 andl $15,%ecx // mask down to #bytes to 16-byte align
274 jz LAligned // skip if already aligned
275 movdqu %xmm0,(%rdi) // store 16 unaligned bytes
276 movdqu %xmm0,16(%rdi) // and 16 more, to be sure we have an aligned chunk
277 addq %rcx,%rdi // now point to the aligned chunk
278 subq %rcx,%rdx // adjust remaining count
279 movdqa (%rdi),%xmm0 // get the rotated pattern (probably stalling)
280 addq $16,%rdi // skip past the aligned chunk
281 subq $16,%rdx
282
283 // Set up for 64-byte loops.
284 // %rdx = length remaining
285 // %rdi = ptr (aligned)
286 // %xmm0 = rotated pattern
287
288 LAligned:
289 movq %rdx,%rcx // copy length remaining
290 andl $63,%edx // mask down to residual length (0..63)
291 andq $-64,%rcx // %rcx <- #bytes we will store in by-64 loop
292 jz LNoMoreChunks // no 64-byte chunks
293 addq %rcx,%rdi // increment ptr by length to move
294 cmpq $(kVeryLong),%rcx // long enough to justify non-temporal stores?
295 jae LVeryLong // yes (unsigned compare: %rcx is a byte count)
296 negq %rcx // negate length to move
297 jmp 1f
298
299 // Loop over 64-byte chunks, storing into cache.
300 // %rdi points just past the chunks; %rcx is a negative offset that counts up to 0.
301 .align 4,0x90 // keep inner loops 16-byte aligned
302 1:
303 movdqa %xmm0,(%rdi,%rcx)
304 movdqa %xmm0,16(%rdi,%rcx)
305 movdqa %xmm0,32(%rdi,%rcx)
306 movdqa %xmm0,48(%rdi,%rcx)
307 addq $64,%rcx
308 jne 1b
309
310 jmp LNoMoreChunks
311
312 // Very long operands: use non-temporal stores to bypass cache.
313 // Same indexing as the cached loop: %rdi is past the chunks, %rcx counts up from -length to 0.
314 LVeryLong:
315 negq %rcx // negate length to move
316 jmp 1f
317
318 .align 4,0x90 // keep inner loops 16-byte aligned
319 1:
320 movntdq %xmm0,(%rdi,%rcx)
321 movntdq %xmm0,16(%rdi,%rcx)
322 movntdq %xmm0,32(%rdi,%rcx)
323 movntdq %xmm0,48(%rdi,%rcx)
324 addq $64,%rcx
325 jne 1b
326
327 sfence // required by non-temporal stores
328 jmp LNoMoreChunks
329
330 // Handle leftovers: loop by 16.
331 // %edx = length remaining (<64)
332 // %rdi = ptr (aligned)
333 // %xmm0 = rotated pattern
334
335 LLoopBy16:
336 movdqa %xmm0,(%rdi) // pack in 16 more bytes
337 subl $16,%edx // decrement count
338 addq $16,%rdi // increment ptr
339 LNoMoreChunks:
340 cmpl $16,%edx // more to go?
341 jge LLoopBy16 // yes
342 jmp LLessThan16 // handle up to 15 remaining bytes