/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>


/* This file contains the following functions:
 *
 *	void *memset(void *b, int c, size_t len);
 *	void memset_pattern4(void *b, const void *c4, size_t len);
 *	void memset_pattern8(void *b, const void *c8, size_t len);
 *	void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Calls to memset() with c==0 are routed to the bzero() routine. Most of the
 * others go to _COMM_PAGE_MEMSET_PATTERN, which is entered as follows:
 *	%rdi = ptr to memory to set (16-byte aligned)
 *	%edx = length (which can be short, though we bias in favor of long operands)
 *	%xmm0 = the pattern to store
 * Return conditions:
 *	%eax, %edi, %esi, %ecx, and %edx are all trashed
 *
 * NB: we avoid the "stos" family of instructions (stosl, stosb), as they are
 * very slow on P4s and probably on other processors as well.
 */
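
/* Illustrative only: a minimal C sketch of how these entry points are
 * typically called, assuming the standard <string.h> declarations on Darwin.
 * The buffer and pattern values below are made up for the example and are
 * not taken from this file.
 *
 *	#include <string.h>
 *	#include <stdint.h>
 *
 *	int main(void) {
 *		char buf[256];
 *
 *		memset(buf, 0, sizeof(buf));             // c==0: routed to commpage bzero
 *		memset(buf, 0x2A, sizeof(buf));          // nonzero path below
 *
 *		uint32_t p4 = 0x11223344;
 *		memset_pattern4(buf, &p4, sizeof(buf));  // repeat a 4-byte pattern
 *
 *		uint64_t p8 = 0x1122334455667788ULL;
 *		memset_pattern8(buf, &p8, sizeof(buf));  // repeat an 8-byte pattern
 *
 *		unsigned char p16[16] = {0};
 *		memset_pattern16(buf, p16, sizeof(buf)); // repeat a 16-byte pattern
 *		return 0;
 *	}
 */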

#define kShort          255             // for nonzero memset(), too short for commpage


        .text
        .globl _memset
        .align 2
_memset:                                // void *memset(void *b, int c, size_t len);
        andl    $0xFF,%esi              // (c==0) ?
        jnz     LNonzero                // not a bzero

        movq    $(_COMM_PAGE_BZERO),%rax // map memset(p,0,n) into bzero(p,n)
        movq    %rdx,%rsi               // put count where bzero() expects it
        jmp     *%rax                   // enter commpage


// Handle memset of a nonzero value.

LNonzero:
        movq    %rdi,%r8                // preserve the original pointer so we can return it
        movl    %esi,%eax               // replicate byte in %esi into all four bytes
        shll    $8,%esi
        orl     %esi,%eax
        movl    %eax,%esi
        shll    $16,%esi
        orl     %esi,%eax               // now %eax has "c" in all 4 bytes
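                                        // e.g. c==0x2A: %eax goes 0x0000002A -> 0x00002A2A -> 0x2A2A2A2A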
        cmpq    $(kShort),%rdx          // is operand too short for SSE?
        ja      LCallCommpage           // no

// Nonzero memset() too short to call commpage.
//      %eax = replicated 4-byte pattern
//      %rdi = ptr
//      %edx = length (<= kShort)

        cmpl    $16,%edx                // long enough to be worth doubleword-aligning?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%rdi)             // store aligned doubleword
        addq    $4,%rdi
        subl    $1,%edx
        jnz     4b
        andl    $3,%ecx                 // any leftover bytes?
        jz      6f                      // no
5:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%ecx
        jnz     5b
6:
        movq    %r8,%rax                // get return value (i.e., the original ptr)
        ret
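
// A rough C sketch of the short-path logic above (illustrative only; the
// function name is hypothetical, and "pat" stands for the replicated byte
// built in %eax):
//
//	#include <stdint.h>
//
//	static void memset_short(unsigned char *p, uint32_t pat, uint32_t len) {
//		if (len >= 16) {
//			while ((uintptr_t)p & 3) {       // byte stores until 4-byte aligned
//				*p++ = (unsigned char)pat;
//				len--;
//			}
//			uint32_t words = len >> 2;       // aligned doubleword stores
//			while (words--) {
//				*(uint32_t *)p = pat;
//				p += 4;
//			}
//			len &= 3;                        // leftover bytes fall through
//		}
//		while (len--)                            // short or leftover bytes
//			*p++ = (unsigned char)pat;
//	}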

// Nonzero memset() is long enough to call commpage.
//      %eax = replicated 4-byte pattern
//      %rdi = ptr
//      %rdx = length (> kShort)

LCallCommpage:
        movd    %eax,%xmm0              // move %eax to low 4 bytes of %xmm0
        pshufd  $(0x00),%xmm0,%xmm0     // replicate across the vector
        movq    %rdi,%rcx               // copy dest ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
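                                        // note: (0 - ptr) & 15 is the distance from ptr up to the
                                        // next 16-byte boundary (0 if ptr is already 16-byte aligned)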
        jz      2f                      // skip if already aligned
        subq    %rcx,%rdx               // decrement length by the alignment bytes
1:
        movb    %al,(%rdi)              // pack in a byte
        addq    $1,%rdi
        subl    $1,%ecx
        jnz     1b
2:                                      // ptr is aligned and length is long enough to justify the commpage call
        movq    $(_COMM_PAGE_MEMSET_PATTERN),%rax
        call    *%rax                   // call commpage to do the heavy lifting
        movq    %r8,%rax                // get return value (i.e., the original ptr)
        ret


// Handle memset of a 16-byte pattern.

        .globl _memset_pattern16
        .align 2, 0x90
_memset_pattern16:                      // void memset_pattern16(void *b, const void *c16, size_t len);
        movdqu  (%rsi),%xmm0            // load the pattern
        jmp     LAlignPtr


// Handle memset of an 8-byte pattern.

        .globl _memset_pattern8
        .align 2, 0x90
_memset_pattern8:                       // void memset_pattern8(void *b, const void *c8, size_t len);
        movq    (%rsi),%xmm0            // load pattern into low 8 bytes
        punpcklqdq %xmm0,%xmm0          // replicate into all 16
        jmp     LAlignPtr

// Handle memset of a 4-byte pattern.

        .globl _memset_pattern4
        .align 2, 0x90
_memset_pattern4:                       // void memset_pattern4(void *b, const void *c4, size_t len);
        movd    (%rsi),%xmm0            // load pattern into low 4 bytes
        pshufd  $(0x00),%xmm0,%xmm0     // replicate the 4 bytes across the vector
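                                        // e.g. a 4-byte pattern {0x11,0x22,0x33,0x44} in memory ends up
                                        // repeated in all four doubleword lanes of %xmm0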


// Align ptr if necessary. We must rotate the pattern right for each byte we
// store while aligning the ptr. Since there is no rotate instruction in SSE3,
// we have to synthesize the rotates.
//      %rdi = ptr
//      %rdx = length
//      %xmm0 = pattern
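// For example, if ptr is 3 bytes below a 16-byte boundary, we store the first
// 3 pattern bytes (as a 1-byte store followed by a 2-byte store) and rotate the
// pattern right by 3 in total, so that the first aligned 16-byte store begins
// with pattern byte 3 and the pattern continues seamlessly in memory.
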
LAlignPtr:                              // NB: can drop down to here!
        cmpq    $100,%rdx               // long enough to bother aligning ptr?
        movq    %rdi,%rcx               // copy ptr
        jb      LReady                  // not long enough
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align ptr
        jz      LReady                  // already aligned
        subq    %rcx,%rdx               // adjust length

        test    $1,%cl                  // 1-byte store required?
        movd    %xmm0,%eax              // get 4 low bytes in %eax
        jz      2f                      // no
        movdqa  %xmm0,%xmm1             // copy pattern so we can shift in both directions
        movb    %al,(%rdi)              // pack in the low-order byte
        psrldq  $1,%xmm0                // shift pattern right 1 byte
        addq    $1,%rdi
        pslldq  $15,%xmm1               // shift pattern left 15 bytes
        shrl    $8,%eax                 // in case 2-byte store is required
        por     %xmm1,%xmm0             // complete right rotate of pattern by 1 byte
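                                        // net effect: pattern bytes 0..15 become 1..15,0, so the
                                        // byte just stored wraps around to the top of %xmm0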
2:
        test    $2,%cl                  // 2-byte store required?
        jz      4f                      // no
        psrldq  $2,%xmm0                // shift pattern down 2 bytes
        movw    %ax,(%rdi)              // pack in next two bytes
        pinsrw  $7,%eax,%xmm0           // insert low word of %eax into high word of %xmm0
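                                        // %ax still holds what was the low word of %xmm0 (the word
                                        // just stored), so re-inserting it at word 7 completes the
                                        // 2-byte right rotation begun by the psrldq above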
        addq    $2,%rdi                 // adjust ptr
4:
        test    $4,%cl                  // 4-byte store required?
        jz      8f                      // no
        movd    %xmm0,(%rdi)            // store low 4 bytes of %xmm0
        pshufd  $(0x39),%xmm0,%xmm0     // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
        addq    $4,%rdi                 // adjust ptr
8:
        test    $8,%cl                  // 8-byte store required?
        jz      LReady                  // no
        movq    %xmm0,(%rdi)            // store low 8 bytes of %xmm0
        pshufd  $(0x4e),%xmm0,%xmm0     // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
        addq    $8,%rdi                 // adjust ptr

// Ptr is aligned if practical; we're ready to call the commpage to do the heavy lifting.

LReady:
        movq    $(_COMM_PAGE_MEMSET_PATTERN),%rax
        call    *%rax                   // call commpage to do the heavy lifting
        ret