/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>

/* This file contains the following functions:
 *
 *      void *memset(void *b, int c, size_t len);
 *      void memset_pattern4(void *b, const void *c4, size_t len);
 *      void memset_pattern8(void *b, const void *c8, size_t len);
 *      void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Calls of memset() with c==0 are routed to the bzero() routine.  Most of the
 * others go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
 *      %edi = ptr to memory to set (aligned)
 *      %edx = length (which can be short, though we bias in favor of long operands)
 *      %xmm0 = the pattern to store
 * Return conditions:
 *      %eax, %edi, %esi, %ecx, and %edx all trashed
 *
 * NB: we avoid the "stos" family of instructions (stosl, stosb), as they are
 * very slow on P4s and probably other processors.
 */
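
// Roughly, the dispatch below is equivalent to this C sketch (illustrative
// only, not part of the build; it only names the paths taken):
//
//      void *memset(void *b, int c, size_t len) {
//          if ((c & 0xFF) == 0)
//              /* tail-jump to _COMM_PAGE_BZERO */ ;
//          else if (len <= kShort)
//              /* inline byte/doubleword loops below */ ;
//          else
//              /* broadcast c into %xmm0, call _COMM_PAGE_MEMSET_PATTERN */ ;
//          return b;
//      }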

#define kShort  255                     // for nonzero memset(), too short for commpage


        .text
        .globl _memset
        .align 2
_memset:                                // void *memset(void *b, int c, size_t len);
        movl    8(%esp),%eax            // get 1-byte pattern
        movl    12(%esp),%edx           // get length
        andl    $0xFF,%eax              // (c==0) ?
        jnz     LNonzero                // not a bzero

        movl    $(_COMM_PAGE_BZERO),%eax // map memset(p,0,n) into bzero(p,n)
        movl    %edx,8(%esp)            // put count where bzero() expects it
        jmp     *%eax                   // enter commpage
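
// NB: that is a tail-jump, so the commpage bzero returns directly to memset's
// caller.  (This assumes the commpage bzero leaves the original ptr in %eax,
// since memset must return it; that contract lives in the commpage, not here.)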


// Handle memset of a nonzero value.

LNonzero:
        pushl   %edi                    // save a few nonvolatiles
        pushl   %esi
        movl    %eax,%esi               // replicate byte in %al into all four bytes
        movl    12(%esp),%edi           // point to operand
        shll    $8,%esi
        orl     %esi,%eax
        movl    %eax,%esi
        shll    $16,%esi
        orl     %esi,%eax               // now %eax has "c" in all 4 bytes
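                                        // (ie the shift/or pairs compute
                                        //  (c & 0xFF) * 0x01010101)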
        cmpl    $(kShort),%edx          // is operand too short for SSE?
        ja      LCallCommpage           // no

// Nonzero memset() too short to call commpage.
//      %eax = replicated 4-byte pattern
//      %edi = ptr
//      %edx = length (<= kShort)

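// The labeled loops below are, in C terms, roughly (a sketch only, with
// hypothetical names p/c/c4/len):
//      if (len < 16) {                         // too short to bother aligning
//          while (len--) *p++ = c;
//      } else {
//          while ((uintptr_t)p & 3) { *p++ = c; len--; }         // align ptr
//          for (i = len >> 2; i; i--) { *(uint32_t *)p = c4; p += 4; }
//          for (i = len & 3; i; i--) *p++ = c;                   // leftovers
//      }
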
        cmpl    $16,%edx                // long enough to word align?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%edi)             // store aligned doubleword
        addl    $4,%edi
        dec     %edx
        jnz     4b
        andl    $3,%ecx                 // any leftover bytes?
        jz      6f                      // no
5:
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %ecx
        jnz     5b
6:
        movl    12(%esp),%eax           // get return value (ie, original ptr)
        popl    %esi
        popl    %edi
        ret

// Nonzero memset() is long enough to call commpage.
//      %eax = replicated 4-byte pattern
//      %edi = ptr
//      %edx = length (> kShort)

LCallCommpage:
        movd    %eax,%xmm0              // move %eax to low 4 bytes of %xmm0
        pshufd  $(0x00),%xmm0,%xmm0     // replicate across the vector
        movl    %edi,%ecx               // copy dest ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
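                                        // ((-p) & 15 == (16 - (p & 15)) & 15,
                                        //  the bytes up to a 16-byte boundary)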
        jz      2f                      // skip if already aligned
        subl    %ecx,%edx               // decrement length
1:
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %ecx
        jnz     1b
2:                                      // ptr aligned, length long enough to justify commpage call
        movl    $(_COMM_PAGE_MEMSET_PATTERN),%eax
        call    *%eax                   // call commpage to do the heavy lifting
        movl    12(%esp),%eax           // get return value (ie, original ptr)
        popl    %esi
        popl    %edi
        ret


// Handle memset of a 16-byte pattern.

        .globl _memset_pattern16
        .align 2, 0x90
_memset_pattern16:                      // void memset_pattern16(void *b, const void *c16, size_t len);
        pushl   %edi
        pushl   %esi
        movl    20(%esp),%edx           // get length
        movl    16(%esp),%esi           // get ptr to 16-byte pattern
        movl    12(%esp),%edi           // point to operand
        movdqu  (%esi),%xmm0            // load the pattern
        jmp     LAlignPtr
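
// Example use from C (memset_pattern16() is declared in <string.h> on
// Mac OS X; buf and buflen are hypothetical):
//      static const unsigned char pat[16] =
//          { 0xDE,0xAD,0xBE,0xEF, 0xDE,0xAD,0xBE,0xEF,
//            0xDE,0xAD,0xBE,0xEF, 0xDE,0xAD,0xBE,0xEF };
//      memset_pattern16(buf, pat, buflen);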


// Handle memset of an 8-byte pattern.

        .globl _memset_pattern8
        .align 2, 0x90
_memset_pattern8:                       // void memset_pattern8(void *b, const void *c8, size_t len);
        pushl   %edi
        pushl   %esi
        movl    20(%esp),%edx           // get length
        movl    16(%esp),%esi           // get ptr to 8-byte pattern
        movl    12(%esp),%edi           // point to operand
        movq    (%esi),%xmm0            // load pattern into low 8 bytes
        punpcklqdq %xmm0,%xmm0          // replicate into all 16
        jmp     LAlignPtr

// Handle memset of a 4-byte pattern.

        .globl _memset_pattern4
        .align 2, 0x90
_memset_pattern4:                       // void memset_pattern4(void *b, const void *c4, size_t len);
        pushl   %edi
        pushl   %esi
        movl    20(%esp),%edx           // get length
        movl    16(%esp),%esi           // get ptr to 4-byte pattern
        movl    12(%esp),%edi           // point to operand
        movd    (%esi),%xmm0            // load pattern into low 4 bytes
        pshufd  $(0x00),%xmm0,%xmm0     // replicate the 4 bytes across the vector


// Align ptr if necessary.  We must rotate the pattern right for each byte we
// store while aligning the ptr.  Since there is no rotate instruction in SSE3,
// we have to synthesize the rotates.
//      %edi = ptr
//      %edx = length
//      %xmm0 = pattern

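// A right rotate of the 128-bit pattern by k bytes is synthesized below as
//      rotated = (pattern >> 8*k) | (pattern << (128 - 8*k))
// using psrldq/pslldq/por for k==1, a word insert (pinsrw) for k==2, and
// dword shuffles (pshufd) for k==4 and k==8.
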
LAlignPtr:                              // NB: can drop down to here!
        cmpl    $100,%edx               // long enough to bother aligning ptr?
        movl    %edi,%ecx               // copy ptr
        jb      LReady                  // not long enough
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
        jz      LReady                  // already aligned
        subl    %ecx,%edx               // adjust length

        test    $1,%cl                  // 1-byte store required?
        movd    %xmm0,%eax              // get 4 low bytes in %eax
        jz      2f                      // no
        movdqa  %xmm0,%xmm1             // copy pattern so we can shift in both directions
        movb    %al,(%edi)              // pack in the low-order byte
        psrldq  $1,%xmm0                // shift pattern right 1 byte
        inc     %edi
        pslldq  $15,%xmm1               // shift pattern left 15 bytes
        shrl    $8,%eax                 // in case 2-byte store is required
        por     %xmm1,%xmm0             // complete right rotate of pattern by 1 byte
2:
        test    $2,%cl                  // 2-byte store required?
        jz      4f                      // no
        psrldq  $2,%xmm0                // shift pattern down 2 bytes
        movw    %ax,(%edi)              // pack in next two bytes
        pinsrw  $7,%eax,%xmm0           // insert low word of %eax into high word of %xmm0
        addl    $2,%edi                 // adjust ptr
4:
        test    $4,%cl                  // 4-byte store required?
        jz      8f                      // no
        movd    %xmm0,(%edi)            // store low 4 bytes of %xmm0
        pshufd  $(0x39),%xmm0,%xmm0     // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
        addl    $4,%edi                 // adjust ptr
8:
        test    $8,%cl                  // 8-byte store required?
        jz      LReady                  // no
        movq    %xmm0,(%edi)            // store low 8 bytes of %xmm0
        pshufd  $(0x4e),%xmm0,%xmm0     // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
        addl    $8,%edi                 // adjust ptr

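// (A pshufd immediate encodes four source-dword indices, two bits per result
//  dword from low to high: 0x39 == 00_11_10_01 picks dwords 1,2,3,0 and
//  0x4e == 01_00_11_10 picks dwords 2,3,0,1, ie right rotates of 4 and 8 bytes.)
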
// Ptr is aligned if practical; we're ready to call the commpage to do the
// heavy lifting.

LReady:
        movl    $(_COMM_PAGE_MEMSET_PATTERN),%eax
        call    *%eax                   // call commpage to do the heavy lifting
        popl    %esi
        popl    %edi
        ret