/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>

/* This file contains the following functions:
 *
 *      void    *memset(void *b, int c, size_t len);
 *      void    memset_pattern4(void *b, const void *c4, size_t len);
 *      void    memset_pattern8(void *b, const void *c8, size_t len);
 *      void    memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Calls of memset() with c==0 are routed to the bzero() routine.  Most of the
 * others go to Lmemset_pattern (below), which is entered as follows:
 *      %rdi  = ptr to memory to set (aligned)
 *      %rdx  = length (which can be short, though we bias in favor of long operands)
 *      %xmm0 = the pattern to store
 * Return conditions:
 *      %eax, %edi, %esi, %ecx, and %edx all trashed
 *
 * NB: we avoid the "stos" family of instructions (stosl, stosb), as they are
 * very slow on P4s and probably other processors.
 */
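
// A minimal C-level usage sketch of the entry points above, for orientation
// only -- the buffer and the pattern value are illustrative, not part of this
// file:
//
//      #include <string.h>
//      #include <stdint.h>
//
//      static const uint32_t pat4 = 0x00FF00FF;        // any 4-byte pattern
//      char buf[4096];
//      memset(buf, 0x5A, sizeof(buf));                 // nonzero memset, handled below
//      memset_pattern4(buf, &pat4, sizeof(buf));       // tile buf with the 4-byte pattern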

#define kShort          255             // nonzero memset() this short is handled inline, without Lmemset_pattern


        .text
        .globl  _memset
        .align  2
_memset:                                // void *memset(void *b, int c, size_t len);
        andl    $0xFF,%esi              // (c==0) ?
        jnz     LNonzero                // not a bzero

        movq    %rdx,%rsi               // put count where bzero() expects it
        jmp     _bzero                  // enter _bzero


// Handle memset of a nonzero value.

LNonzero:
        movq    %rdi,%r8                // preserve the original pointer so we can return it
        movl    %esi,%eax               // replicate byte in %esi into all four bytes
        shll    $8,%esi
        orl     %esi,%eax
        movl    %eax,%esi
        shll    $16,%esi
        orl     %esi,%eax               // now %eax has "c" in all 4 bytes
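        // In C terms, the replication above is the standard byte-splat idiom
        // (illustrative sketch only):
        //      uint32_t rep = c & 0xFF;
        //      rep |= rep << 8;                // "c" in the low two bytes
        //      rep |= rep << 16;               // "c" in all four bytes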
        cmpq    $(kShort),%rdx          // is the operand too short for the SSE path?
        ja      LCallCommpage           // no, use Lmemset_pattern

// Nonzero memset() too short to justify the long-operand path.
//      %eax = replicated 4-byte pattern
//      %rdi = ptr
//      %edx = length (<= kShort)

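// Roughly, in C (illustrative sketch; the code below additionally special-cases
// len < 16 with a plain byte loop rather than bothering to align):
//
//      unsigned char *p = b;
//      while ((uintptr_t)p & 3) { *p++ = (unsigned char)c; len--; }    // align to 4
//      for (size_t n = len >> 2; n; n--, p += 4)
//              *(uint32_t *)p = rep;                                   // 4 bytes at a time
//      for (len &= 3; len; len--) *p++ = (unsigned char)c;             // 0..3 trailing bytes
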
        cmpl    $16,%edx                // long enough to bother doubleword aligning?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%rdi)             // store aligned doubleword
        addq    $4,%rdi
        subl    $1,%edx
        jnz     4b
        andl    $3,%ecx                 // any leftover bytes?
        jz      6f                      // no
5:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%ecx
        jnz     5b
6:
        movq    %r8,%rax                // get return value (i.e., the original ptr)
        ret

// Nonzero memset() is long enough to call Lmemset_pattern.
//      %eax = replicated 4-byte pattern
//      %rdi = ptr
//      %rdx = length (> kShort)

LCallCommpage:
        movd    %eax,%xmm0              // move %eax to low 4 bytes of %xmm0
        pshufd  $(0x00),%xmm0,%xmm0     // replicate across the vector
        movq    %rdi,%rcx               // copy dest ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
        jz      2f                      // skip if already aligned
        subq    %rcx,%rdx               // decrement length
1:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%ecx
        jnz     1b
2:                                      // ptr aligned, length long enough to justify the call
        call    Lmemset_pattern         // long-operand routine does the heavy lifting
        movq    %r8,%rax                // get return value (i.e., the original ptr)
        ret


// Handle memset of a 16-byte pattern.

        .globl  _memset_pattern16
        .align  2, 0x90
_memset_pattern16:                      // void memset_pattern16(void *b, const void *c16, size_t len);
        movdqu  (%rsi),%xmm0            // load the pattern
        jmp     LAlignPtr


// Handle memset of an 8-byte pattern.

        .globl  _memset_pattern8
        .align  2, 0x90
_memset_pattern8:                       // void memset_pattern8(void *b, const void *c8, size_t len);
        movq    (%rsi),%xmm0            // load pattern into low 8 bytes
        punpcklqdq %xmm0,%xmm0          // replicate into all 16
        jmp     LAlignPtr

// Handle memset of a 4-byte pattern.

        .globl  _memset_pattern4
        .align  2, 0x90
_memset_pattern4:                       // void memset_pattern4(void *b, const void *c4, size_t len);
        movd    (%rsi),%xmm0            // load pattern into low 4 bytes
        pshufd  $(0x00),%xmm0,%xmm0     // replicate the 4 bytes across the vector
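        // Intrinsic-level sketch of the three pattern loads/splats above, for
        // reference only (SSE2 intrinsics from <emmintrin.h>; variable names
        // are illustrative):
        //      __m128i p16 = _mm_loadu_si128((const __m128i *)c16);   // movdqu
        //      __m128i p8  = _mm_loadl_epi64((const __m128i *)c8);    // movq
        //      p8 = _mm_unpacklo_epi64(p8, p8);                       // punpcklqdq
        //      __m128i p4  = _mm_cvtsi32_si128(*(const int *)c4);     // movd
        //      p4 = _mm_shuffle_epi32(p4, 0x00);                      // pshufd $0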


// Align ptr if necessary.  We must rotate the pattern right for each byte we
// store while aligning the ptr.  Since there is no rotate instruction in SSE3,
// we have to synthesize the rotates.
//      %rdi = ptr
//      %rdx = length
//      %xmm0 = pattern

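// In SSE2 intrinsic terms, the rotate-right-by-one-byte synthesized below
// (psrldq/pslldq/por) is, for reference only:
//      __m128i lo = _mm_srli_si128(pat, 1);    // pattern shifted right 1 byte
//      __m128i hi = _mm_slli_si128(pat, 15);   // low byte moved to the top
//      pat = _mm_or_si128(lo, hi);             // complete the rotate
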
LAlignPtr:                              // NB: the _memset_pattern4 code above drops through to here
        cmpq    $100,%rdx               // long enough to bother aligning ptr?
        movq    %rdi,%rcx               // copy ptr
        jb      LReady                  // not long enough
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
        jz      LReady                  // already aligned
        subq    %rcx,%rdx               // adjust length

        test    $1,%cl                  // 1-byte store required?
        movd    %xmm0,%eax              // get 4 low bytes in %eax
        jz      2f                      // no
        movdqa  %xmm0,%xmm1             // copy pattern so we can shift in both directions
        movb    %al,(%rdi)              // pack in the low-order byte
        psrldq  $1,%xmm0                // shift pattern right 1 byte
        addq    $1,%rdi
        pslldq  $15,%xmm1               // shift pattern left 15 bytes
        shrl    $8,%eax                 // in case a 2-byte store is also required
        por     %xmm1,%xmm0             // complete right rotate of pattern by 1 byte
2:
        test    $2,%cl                  // 2-byte store required?
        jz      4f                      // no
        psrldq  $2,%xmm0                // shift pattern down 2 bytes
        movw    %ax,(%rdi)              // pack in next two bytes
        pinsrw  $7,%eax,%xmm0           // insert low word of %eax into high word of %xmm0
        addq    $2,%rdi                 // adjust ptr
4:
        test    $4,%cl                  // 4-byte store required?
        jz      8f                      // no
        movd    %xmm0,(%rdi)            // store low 4 bytes of %xmm0
        pshufd  $(0x39),%xmm0,%xmm0     // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
        addq    $4,%rdi                 // adjust ptr
8:
        test    $8,%cl                  // 8-byte store required?
        jz      LReady                  // no
        movq    %xmm0,(%rdi)            // store low 8 bytes of %xmm0
        pshufd  $(0x4e),%xmm0,%xmm0     // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
        addq    $8,%rdi                 // adjust ptr

// Ptr is aligned if practical; we're ready to call Lmemset_pattern to do the heavy lifting.

LReady:
        call    Lmemset_pattern         // long-operand routine does the heavy lifting
        ret


#define kLShort         63              // too short to bother aligning in Lmemset_pattern
#define kVeryLong       (1024*1024)     // threshold for using non-temporal stores

Lmemset_pattern:
        cmpq    $(kLShort),%rdx         // long enough to bother aligning?
        ja      LNotShort               // yes
        jmp     LShort                  // no

// Here for short operands or the end of long ones.
//      %rdx = length (<= kLShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern

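// Illustrative sketch of this tail handling with SSE2 intrinsics (reference
// only; p, len, and pat stand for the registers documented above):
//      while (len >= 16) { _mm_storeu_si128((__m128i *)p, pat); p += 16; len -= 16; }
//      if (len & 8) { _mm_storel_epi64((__m128i *)p, pat); pat = _mm_srli_si128(pat, 8); p += 8; }
//      if (len & 4) { *(uint32_t *)p = (uint32_t)_mm_cvtsi128_si32(pat); pat = _mm_srli_si128(pat, 4); p += 4; }
//      for (uint32_t w = (uint32_t)_mm_cvtsi128_si32(pat), n = len & 3; n; n--, w >>= 8)
//              *p++ = (unsigned char)w;
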
LUnalignedStore16:
        movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
        subl    $16,%edx
        addq    $16,%rdi
LShort:
        cmpl    $16,%edx                // room for another vector?
        jge     LUnalignedStore16       // yes
LLessThan16:                            // here at end of copy with < 16 bytes remaining
        test    $8,%dl                  // 8-byte store required?
        jz      2f                      // no
        movq    %xmm0,(%rdi)            // pack in 8 low bytes
        psrldq  $8,%xmm0                // then shift vector down 8 bytes
        addq    $8,%rdi
2:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm0,(%rdi)            // pack in 4 low bytes
        psrldq  $4,%xmm0                // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%edx                 // more to go?
        jz      5f                      // no
        movd    %xmm0,%eax              // move remainders out into %eax
4:                                      // loop on up to three bytes
        movb    %al,(%rdi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        incq    %rdi
        dec     %edx
        jnz     4b
5:      ret

// Long enough to justify aligning ptr.  Note that we have to rotate the
// pattern to account for any alignment.  We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// This will stall because of the store-forwarding mismatch, and the unaligned
// stores can be pretty slow too, but the alternatives aren't any better.
// Fortunately, in most cases our caller has already aligned the ptr.
//      %rdx = length (> kLShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern
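//
// Illustrative sketch of that trick with SSE2 intrinsics (reference only;
// the names are hypothetical):
//      _mm_storeu_si128((__m128i *)p, pat);            // 16 unaligned bytes
//      _mm_storeu_si128((__m128i *)(p + 16), pat);     // 16 more, covering the aligned chunk
//      char *q = (char *)(((uintptr_t)p + 15) & ~(uintptr_t)15);
//      pat = _mm_load_si128((const __m128i *)q);       // reload: pattern comes back pre-rotated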

LNotShort:
        movl    %edi,%ecx               // copy low bits of dest ptr
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes needed to 16-byte align
        jz      LAligned                // skip if already aligned
        movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
        movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
        addq    %rcx,%rdi               // now point to the aligned chunk
        subq    %rcx,%rdx               // adjust remaining count
        movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
        addq    $16,%rdi                // skip past the aligned chunk
        subq    $16,%rdx

// Set up for 64-byte loops.
//      %rdx = length remaining
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LAligned:
        movq    %rdx,%rcx               // copy length remaining
        andl    $63,%edx                // mask down to residual length (0..63)
        andq    $-64,%rcx               // %rcx <- #bytes we will store in the by-64 loops
        jz      LNoMoreChunks           // no 64-byte chunks
        addq    %rcx,%rdi               // point %rdi just past the chunked region
        cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
        jge     LVeryLong               // yes
        negq    %rcx                    // negate length to move
        jmp     1f
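
// The by-64 loops below use the negative-index idiom: %rdi points just past the
// chunked region and %rcx counts up from -(#chunk bytes) toward zero, so one
// addq both advances the index and sets the flags for the loop branch.
// Roughly, in C with SSE2 intrinsics (illustrative only; p and chunk are the
// values set up above):
//      char *end = p + chunk;
//      for (long i = -(long)chunk; i != 0; i += 64) {
//              _mm_store_si128((__m128i *)(end + i),      pat);
//              _mm_store_si128((__m128i *)(end + i + 16), pat);
//              _mm_store_si128((__m128i *)(end + i + 32), pat);
//              _mm_store_si128((__m128i *)(end + i + 48), pat);
//      }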

// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        jmp     LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.

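// Illustrative C sketch of the streaming path (reference only; assumes a
// 16-byte-aligned dst and a multiple-of-64 byte count; needs <emmintrin.h>):
//      static void stream_fill64(__m128i *dst, __m128i pat, size_t bytes) {
//              for (size_t i = 0; i < bytes / 16; i++)
//                      _mm_stream_si128(dst + i, pat);         // movntdq
//              _mm_sfence();   // streaming stores are weakly ordered; fence before returning
//      }
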
LVeryLong:
        negq    %rcx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     LNoMoreChunks

// Handle leftovers: loop by 16.
//      %edx = length remaining (<64)
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LLoopBy16:
        movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
        subl    $16,%edx                // decrement count
        addq    $16,%rdi                // increment ptr
LNoMoreChunks:
        cmpl    $16,%edx                // more to go?
        jge     LLoopBy16               // yes
        jmp     LLessThan16             // handle up to 15 remaining bytes