/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>


/* This file contains the following functions:
 *
 *      void    *memset(void *b, int c, size_t len);
 *      void    memset_pattern4(void *b, const void *c4, size_t len);
 *      void    memset_pattern8(void *b, const void *c8, size_t len);
 *      void    memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Calls of memset() with c==0 are routed to the bzero() routine. Most of the
 * others go to _memset_pattern, which is entered as follows:
 *      %edi = ptr to memory to set (aligned)
 *      %edx = length (which can be short, though we bias in favor of long operands)
 *      %xmm0 = the pattern to store
 * Return conditions:
 *      %eax, %edi, %esi, %ecx, and %edx all trashed
 *
 * NB: we avoid the "stos" family of instructions (stosl, stosb), as they are
 * very slow on P4s and probably other processors.
 */
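
/* Usage sketch (illustrative only, not part of this file): from C these
 * entry points have the prototypes listed above; fill_pixels below is a
 * hypothetical helper. Note that len is a byte count, not a number of
 * pattern repetitions, and need not be a multiple of the pattern size.
 *
 *      #include <string.h>
 *      #include <stdint.h>
 *
 *      static void fill_pixels(uint32_t *buf, size_t count) {
 *              const uint32_t pixel = 0xFF00FF00;      // 4-byte pattern
 *              memset_pattern4(buf, &pixel, count * sizeof(uint32_t));
 *      }
 */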

#define kShort  255                     // for nonzero memset(), too short for commpage


        .text
        .globl _memset
        .align 2
_memset:                                // void *memset(void *b, int c, size_t len);
        movl    8(%esp),%eax            // get 1-byte pattern
        movl    12(%esp),%edx           // get length
        andl    $0xFF,%eax              // (c==0) ?
        jnz     LNonzero                // not a bzero

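// The two instructions below work because the incoming frame for
// memset(b, c, len) is [ret][b][c][len]; overwriting the "c" slot with the
// length leaves [ret][b][len], exactly the frame bzero(b, len) expects,
// so a plain jmp serves as the tail call.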
        movl    %edx,8(%esp)            // put count where bzero() expects it
        jmp     _bzero                  // enter _bzero


// Handle memset of a nonzero value.

LNonzero:
        pushl   %edi                    // save a few nonvolatiles
        pushl   %esi
        movl    %eax,%esi               // replicate byte in %al into all four bytes
        movl    12(%esp),%edi           // point to operand
        shll    $8,%esi
        orl     %esi,%eax
        movl    %eax,%esi
        shll    $16,%esi
        orl     %esi,%eax               // now %eax has "c" in all 4 bytes
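        // e.g. c==0x5A: 0x0000005A -> 0x00005A5A -> 0x5A5A5A5A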
        cmpl    $(kShort),%edx          // is operand too short for SSE?
        ja      LCallCommpage           // no

// Nonzero memset() too short to call commpage.
//      %eax = replicated 4-byte pattern
//      %edi = ptr
//      %edx = length (<= kShort)

        cmpl    $16,%edx                // long enough to doubleword align?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%edi)             // store aligned doubleword
        addl    $4,%edi
        dec     %edx
        jnz     4b
        andl    $3,%ecx                 // any leftover bytes?
        jz      6f                      // no
5:
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %ecx
        jnz     5b
6:
        movl    12(%esp),%eax           // get return value (i.e., original ptr)
        popl    %esi
        popl    %edi
        ret

// Nonzero memset() is long enough to call commpage.
//      %eax = replicated 4-byte pattern
//      %edi = ptr
//      %edx = length (> kShort)

LCallCommpage:
        movd    %eax,%xmm0              // move %eax to low 4 bytes of %xmm0
        pshufd  $(0x00),%xmm0,%xmm0     // replicate across the vector
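        // (pshufd mask 0x00 selects source dword 0 for all four result dwords)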
        movl    %edi,%ecx               // copy dest ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
        jz      2f                      // skip if already aligned
        subl    %ecx,%edx               // decrement length
1:
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %ecx
        jnz     1b
2:                                      // ptr aligned, length long enough to justify the call
        call    _memset_pattern         // call commpage to do the heavy lifting
        movl    12(%esp),%eax           // get return value (i.e., original ptr)
        popl    %esi
        popl    %edi
        ret


// Handle memset of a 16-byte pattern.

        .globl _memset_pattern16
        .align 2, 0x90
_memset_pattern16:                      // void memset_pattern16(void *b, const void *c16, size_t len);
        pushl   %edi
        pushl   %esi
        movl    20(%esp),%edx           // get length
        movl    16(%esp),%esi           // get ptr to 16-byte pattern
        movl    12(%esp),%edi           // point to operand
        movdqu  (%esi),%xmm0            // load the pattern
        jmp     LAlignPtr


// Handle memset of an 8-byte pattern.

        .globl _memset_pattern8
        .align 2, 0x90
_memset_pattern8:                       // void memset_pattern8(void *b, const void *c8, size_t len);
        pushl   %edi
        pushl   %esi
        movl    20(%esp),%edx           // get length
        movl    16(%esp),%esi           // get ptr to 8-byte pattern
        movl    12(%esp),%edi           // point to operand
        movq    (%esi),%xmm0            // load pattern into low 8 bytes
        punpcklqdq %xmm0,%xmm0          // replicate into all 16 bytes
        jmp     LAlignPtr

// Handle memset of a 4-byte pattern.

        .globl _memset_pattern4
        .align 2, 0x90
_memset_pattern4:                       // void memset_pattern4(void *b, const void *c4, size_t len);
        pushl   %edi
        pushl   %esi
        movl    20(%esp),%edx           // get length
        movl    16(%esp),%esi           // get ptr to 4-byte pattern
        movl    12(%esp),%edi           // point to operand
        movd    (%esi),%xmm0            // load pattern into low 4 bytes
        pshufd  $(0x00),%xmm0,%xmm0     // replicate the 4 bytes across the vector


// Align ptr if necessary. We must rotate the pattern right for each byte we
// store while aligning the ptr. Since there is no rotate instruction in SSE3,
// we have to synthesize the rotates.
//      %edi = ptr
//      %edx = length
//      %xmm0 = pattern

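// A C model of the rotation performed below (illustrative only; rotate_right
// is a hypothetical helper, not part of this file):
//
//      #include <string.h>
//
//      static void rotate_right(unsigned char p[16], unsigned n) {
//              unsigned char t[16];
//              for (unsigned i = 0; i < 16; i++)
//                      t[i] = p[(i + n) % 16];         // byte n becomes byte 0
//              memcpy(p, t, 16);
//      }
//
// After n pattern bytes have been stored while aligning, the pattern register
// must begin with byte n; the SSE shift/shuffle sequences below achieve
// exactly that for n == 1, 2, 4, and 8.
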
LAlignPtr:                              // NB: memset_pattern4 falls through to here!
        cmpl    $100,%edx               // long enough to bother aligning ptr?
        movl    %edi,%ecx               // copy ptr
        jb      LReady                  // not long enough
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
        jz      LReady                  // already aligned
        subl    %ecx,%edx               // adjust length

        test    $1,%cl                  // 1-byte store required?
        movd    %xmm0,%eax              // get 4 low bytes in %eax
        jz      2f                      // no
        movdqa  %xmm0,%xmm1             // copy pattern so we can shift in both directions
        movb    %al,(%edi)              // pack in the low-order byte
        psrldq  $1,%xmm0                // shift pattern right 1 byte
        inc     %edi
        pslldq  $15,%xmm1               // shift pattern left 15 bytes
        shrl    $8,%eax                 // in case a 2-byte store is required
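        // (%ax now holds original pattern bytes 1-2, i.e. bytes 0-1 of the rotated pattern)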
        por     %xmm1,%xmm0             // complete right rotate of pattern by 1 byte
2:
        test    $2,%cl                  // 2-byte store required?
        jz      4f                      // no
        psrldq  $2,%xmm0                // shift pattern down 2 bytes
        movw    %ax,(%edi)              // pack in next two bytes
        pinsrw  $7,%eax,%xmm0           // insert low word of %eax into high word of %xmm0
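        // (%ax held the two bytes just shifted out, so reinserting it in the
        //  top word completes the 2-byte right rotate)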
        addl    $2,%edi                 // adjust ptr
4:
        test    $4,%cl                  // 4-byte store required?
        jz      8f                      // no
        movd    %xmm0,(%edi)            // store low 4 bytes of %xmm0
        pshufd  $(0x39),%xmm0,%xmm0     // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
        addl    $4,%edi                 // adjust ptr
8:
        test    $8,%cl                  // 8-byte store required?
        jz      LReady                  // no
        movq    %xmm0,(%edi)            // store low 8 bytes of %xmm0
        pshufd  $(0x4e),%xmm0,%xmm0     // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
        addl    $8,%edi                 // adjust ptr

// Ptr is aligned if practical; we're ready to call the commpage to do the heavy lifting.

LReady:
        call    _memset_pattern         // call commpage to do the heavy lifting
        popl    %esi
        popl    %edi
        ret