2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
23 #include <machine/cpu_capabilities.h>
26 /* This file contains the following functions:
28 * void *memset(void *b, int c, size_t len);
29 * void memset_pattern4(void *b, const void *c4, size_t len);
30 * void memset_pattern8(void *b, const void *c8, size_t len);
31 * void memset_pattern16(void *b, const void *c16, size_t len);
33 * Calls of memset() with c==0 are routed to the bzero() routine. Most of the
34 * others go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
35 * %edi = ptr to memory to set (aligned)
36 * %edx = length (which can be short, though we bias in favor of long operands)
37 * %xmm0 = the pattern to store
39 * On return: %eax, %edi, %esi, %ecx, and %edx are all trashed
41 * NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
42 * on P4s and probably other processors.
// Length threshold for nonzero memset(). The nonzero path compares the length
// against kShort with an unsigned "above" branch, so only lengths strictly
// greater than kShort are sent to the commpage pattern routine; lengths
// 0..kShort are stored inline.
45 #define kShort 255 // for nonzero memset(), too short for commpage
// void *memset(void *b, int c, size_t len)
//
// Arguments are taken from the stack: c is read from 8(%esp) and len from
// 12(%esp). Only the low byte of c is used (it is masked with 0xFF).
//   c == 0 : len is written over the c argument slot and control transfers
//            with a jmp (not a call) to _COMM_PAGE_BZERO.
//   c != 0 : the byte is replicated into a 4-byte pattern in %eax. Lengths
//            above kShort go to _COMM_PAGE_MEMSET_PATTERN after the pointer
//            is aligned; the rest are stored inline with byte and
//            doubleword moves.
// Both nonzero paths reload the original pointer into %eax as the return
// value.
//
// NOTE(review): the embedded original line numbers skip values throughout
// this routine, and the branch targets LNonzero and LCallCommpage are
// referenced but their labels are not visible. This listing therefore
// appears to omit lines (labels, branches, pointer updates, register
// restores, ret). Confirm against the complete source before changing any
// instruction here.
51 _memset: // void *memset(void *b, int c, size_t len);
52 movl 8(%esp),%eax // get 1-byte pattern
53 movl 12(%esp),%edx // get length
54 andl $0xFF,%eax // (c==0) ?
55 jnz LNonzero // not a bzero
// c == 0: reuse the caller's frame for bzero. The pointer argument is left
// where it is and the length replaces c in the second argument slot.
57 movl $(_COMM_PAGE_BZERO),%eax// map memset(p,0,n) into bzero(p,n)
58 movl %edx,8(%esp) // put count where bzero() expects it
// NOTE(review): GNU as expects an indirect branch operand to be written with
// a leading `*` (jmp *%eax) and warns on this spelling; verify which
// assembler this file targets.
59 jmp %eax // enter commpage
62 // Handle memset of a nonzero value.
// NOTE(review): the operand pointer is read from 12(%esp) below. Since c was
// at 8(%esp) on entry, that offset implies 8 bytes of saved registers, but
// only this one push is visible; presumably a second push is among the
// omitted lines.
65 pushl %edi // save a few nonvolatiles
// NOTE(review): as shown, only a copy and a single OR are visible for the
// byte replication; the shifts that would spread the byte are presumably
// among the omitted lines (original line numbers 69-72 are absent).
67 movl %eax,%esi // replicate byte in %al into all four bytes
68 movl 12(%esp),%edi // point to operand
73 orl %esi,%eax // now %eax has "c" in all 4 bytes
74 cmpl $(kShort),%edx // is operand too short for SSE?
75 ja LCallCommpage // no
77 // Nonzero memset() too short to call commpage.
78 // %eax = replicated 4-byte pattern
80 // %edx = length (<= kShort)
// Inline path. The visible stores are, in order: single bytes, then aligned
// doublewords (%edx >> 2 of them, with the unshifted count kept in %ecx),
// then the %ecx & 3 leftover bytes. The loop labels, pointer increments and
// branches that tie these together are not visible in this listing.
82 cmpl $16,%edx // long enough to word align?
84 test %edx,%edx // length==0?
87 movb %al,(%edi) // pack in a byte
93 movb %al,(%edi) // pack in a byte
97 test $3,%edi // is ptr doubleword aligned?
99 movl %edx,%ecx // copy length
100 shrl $2,%edx // #doublewords to store
102 movl %eax,(%edi) // store aligned doubleword
106 andl $3,%ecx // any leftover bytes?
109 movb %al,(%edi) // pack in a byte
114 movl 12(%esp),%eax // get return value (ie, original ptr)
119 // Nonzero memset() is long enough to call commpage.
120 // %eax = replicated 4-byte pattern
122 // %edx = length (> kShort)
// Build the 16-byte pattern: movd places the 4-byte pattern in the low dword
// of %xmm0 and pshufd with selector 0x00 copies that dword to all four lanes.
125 movd %eax,%xmm0 // move %eax to low 4 bytes of %xmm0
126 pshufd $(0x00),%xmm0,%xmm0 // replicate across the vector
127 movl %edi,%ecx // copy dest ptr
// NOTE(review): as shown, %ecx becomes ptr & 15, which is the misalignment
// rather than the distance to the next 16-byte boundary. A negate between
// the copy and the mask would make the existing comment accurate; original
// line 128 is absent from this listing, so check the complete source.
129 andl $15,%ecx // get #bytes to align ptr
130 jz 2f // skip if already aligned
131 subl %ecx,%edx // decrement length
// Byte stores bring %edi up to alignment; every byte of the memset pattern
// is identical, so no pattern rotation is needed here. The loop control
// around this store is not visible.
133 movb %al,(%edi) // pack in a byte
137 2: // ptr aligned, length long enough to justify
138 movl $(_COMM_PAGE_MEMSET_PATTERN),%eax
139 call %eax // call commpage to do the heavy lifting
140 movl 12(%esp),%eax // get return value (ie, original ptr)
146 // Handle memset of a 16-byte pattern.
// void memset_pattern16(void *b, const void *c16, size_t len)
// Loads len into %edx, the pattern pointer into %esi and the destination
// pointer into %edi, then reads the 16 pattern bytes into %xmm0. These are
// the registers the alignment code at LAlignPtr and the commpage pattern
// routine consume.
// NOTE(review): the arguments are read at 12/16/20(%esp). That implies 8
// bytes of saved registers above the return address, but no pushes are
// visible (original line numbers 151-152 are absent). How control reaches
// the shared alignment code after the load is also not visible here.
148 .globl _memset_pattern16
150 _memset_pattern16: // void memset_pattern16(void *b, const void *c16, size_t len);
153 movl 20(%esp),%edx // get length
154 movl 16(%esp),%esi // get ptr to 16-byte pattern
155 movl 12(%esp),%edi // point to operand
// movdqu is the unaligned 16-byte load, so c16 needs no particular alignment.
156 movdqu (%esi),%xmm0 // load the pattern
160 // Handle memset of an 8-byte pattern.
// void memset_pattern8(void *b, const void *c8, size_t len)
// Loads len into %edx, the pattern pointer into %esi and the destination
// pointer into %edi, then expands the 8-byte pattern to fill all 16 bytes of
// %xmm0.
// NOTE(review): the arguments are read at 12/16/20(%esp). That implies 8
// bytes of saved registers above the return address, but no pushes are
// visible (original line numbers 165-166 are absent). How control reaches
// the shared alignment code afterwards is also not visible here.
162 .globl _memset_pattern8
164 _memset_pattern8: // void memset_pattern8(void *b, const void *c8, size_t len);
167 movl 20(%esp),%edx // get length
168 movl 16(%esp),%esi // get ptr to 8-byte pattern
169 movl 12(%esp),%edi // point to operand
170 movq (%esi),%xmm0 // load pattern into low 8 bytes
// punpcklqdq with the same register as both operands copies the low quadword
// into the high quadword.
171 punpcklqdq %xmm0,%xmm0 // replicate into all 16
174 // Handle memset of a 4-byte pattern.
// void memset_pattern4(void *b, const void *c4, size_t len)
// Loads len into %edx, the pattern pointer into %esi and the destination
// pointer into %edi, then broadcasts the 4-byte pattern to all four dwords of
// %xmm0. The "can drop down to here" remark on LAlignPtr suggests this
// routine falls through into the alignment code that follows it.
// NOTE(review): the arguments are read at 12/16/20(%esp). That implies 8
// bytes of saved registers above the return address, but no pushes are
// visible (original line numbers 179-180 are absent).
176 .globl _memset_pattern4
178 _memset_pattern4: // void memset_pattern4(void *b, const void *c4, size_t len);
181 movl 20(%esp),%edx // get length
182 movl 16(%esp),%esi // get ptr to 4-byte pattern
183 movl 12(%esp),%edi // point to operand
184 movd (%esi),%xmm0 // load pattern into low 4 bytes
// Selector 0x00 picks source dword 0 for every destination dword.
185 pshufd $(0x00),%xmm0,%xmm0 // replicate the 4 bytes across the vector
188 // Align ptr if necessary. We must rotate the pattern right for each byte we
189 // store while aligning the ptr. Since there is no rotate instruction in SSE3,
190 // we have to synthesize the rotates.
//
// Shared tail for the memset_pattern routines. Register state on entry, as
// set up by those routines:
//   %edi  = destination pointer
//   %edx  = length in bytes
//   %xmm0 = 16-byte pattern
// For lengths of at least 100 bytes the code stores 1, 2, 4 and/or 8 leading
// bytes, selected by the individual bits of %cl, and rotates %xmm0 right by
// the same number of bytes after each store. Byte 0 of %xmm0 is therefore
// always the next pattern byte due at (%edi). It then calls
// _COMM_PAGE_MEMSET_PATTERN.
//
// NOTE(review): the embedded original line numbers skip values throughout
// this section, and the LReady label targeted by the branches below is not
// visible. The conditional branches that skip each store step, the pointer
// increment after the 1-byte store, and everything after the final call
// (register restores, ret) appear to be omitted. Confirm against the
// complete source before changing any instruction here.
195 LAlignPtr: // NB: can drop down to here!
196 cmpl $100,%edx // long enough to bother aligning ptr?
// movl does not modify flags, so the jb below still tests the cmpl above.
197 movl %edi,%ecx // copy ptr
198 jb LReady // not long enough
// NOTE(review): as shown, %ecx becomes ptr & 15, which is the misalignment
// rather than the distance to the next 16-byte boundary. A negate between
// the copy and the mask would make the existing comment accurate; original
// line 199 is absent from this listing, so check the complete source.
200 andl $15,%ecx // get #bytes to align ptr
201 jz LReady // already aligned
202 subl %ecx,%edx // adjust length
// Bit 0 of the count: one-byte step. The pattern's low dword is copied to
// %eax first so the byte (and, after the shift by 8, the following word) can
// be stored from a general register.
204 test $1,%cl // 1-byte store required?
205 movd %xmm0,%eax // get 4 low bytes in %eax
207 movdqa %xmm0,%xmm1 // copy pattern so we can shift in both directions
208 movb %al,(%edi) // pack in the low-order byte
// Rotate right by one byte: (pattern >> 1 byte) | (pattern << 15 bytes).
209 psrldq $1,%xmm0 // shift pattern right 1 byte
211 pslldq $15,%xmm1 // shift pattern left 15 bytes
212 shrl $8,%eax // in case 2-byte store is required
213 por %xmm1,%xmm0 // complete right rotate of pattern by 1 byte
// Bit 1 of the count: two-byte step. psrldq clears the top word of %xmm0 and
// pinsrw refills word 7 with the two bytes just stored, completing a right
// rotate by two bytes.
215 test $2,%cl // 2-byte store required?
217 psrldq $2,%xmm0 // shift pattern down 2 bytes
218 movw %ax,(%edi) // pack in next two bytes
219 pinsrw $7,%eax,%xmm0 // insert low word of %eax into high word of %xmm0
220 addl $2,%edi // adjust ptr
// Bit 2 of the count: four-byte step. A dword shuffle performs the rotate
// directly.
222 test $4,%cl // 4-byte store required?
224 movd %xmm0,(%edi) // store low 4 bytes of %xmm0
225 pshufd $(0x39),%xmm0,%xmm0 // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
226 addl $4,%edi // adjust ptr
// Bit 3 of the count: eight-byte step. The shuffle swaps the two quadwords.
228 test $8,%cl // 8-byte store required?
230 movq %xmm0,(%edi) // store low 8 bytes of %xmm0
231 pshufd $(0x4e),%xmm0,%xmm0 // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
232 addl $8,%edi // adjust ptr
234 // Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
// The commpage routine is entered with %edi = destination, %edx = remaining
// length and %xmm0 = pattern, per the interface described in the comment at
// the top of this file.
237 movl $(_COMM_PAGE_MEMSET_PATTERN),%eax
238 call %eax // call commpage to do the heavy lifting