]> git.saurik.com Git - apple/xnu.git/blob - osfmk/i386/commpage/memset_pattern_sse2.s
xnu-1486.2.11.tar.gz
[apple/xnu.git] / osfmk / i386 / commpage / memset_pattern_sse2.s
1 /*
2 * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <machine/cpu_capabilities.h>
30 #include <machine/commpage.h>
31
32 /* The common path for nonzero memset and the memset_pattern routines,
33 * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines.
34 * This is used by the following functions:
35 *
36 * void *memset(void *b, int c, size_t len); // when c!=0
37 * void memset_pattern4(void *b, const void *c4, size_t len);
38 * void memset_pattern8(void *b, const void *c8, size_t len);
39 * void memset_pattern16(void *b, const void *c16, size_t len);
40 *
41 * Note bzero() and memset() of 0 are handled separately.
42 */
43
// Tuning thresholds, in bytes:
//	kShort    - operands no longer than this take the simple unaligned path;
//	            anything longer gets its destination 16-byte aligned first.
//	kVeryLong - once the part of the operand handled in 64-byte chunks is at
//	            least this large, non-temporal stores are used instead of
//	            ordinary (cached) stores.
#define kShort		63
#define kVeryLong	(1024*1024)

// Initial entry from Libc with parameters passed in registers.  Although we
// correctly handle misaligned ptrs and short operands, they are inefficient.
// Therefore our caller should filter out short operands and exploit local
// knowledge (ie, original pattern length) to align the ptr if possible.
// When called, we expect:
//	%edi = ptr to memory to set (not necessarily aligned)
//	%edx = length in bytes, treated as unsigned (may be short or even 0)
//	%xmm0 = the 16-byte pattern to store
// Return conditions:
//	%eax, %edi, %esi, %ecx, and %edx all trashed
//	%xmm0 is trashed as well (it is rotated and shifted while storing)

COMMPAGE_FUNCTION_START(memset_pattern_sse2, 32, 5)
	cmpl	$(kShort),%edx		// long enough to bother aligning?
	ja	LNotShort		// yes (unsigned: the length is a byte count)
	jmp	LShort			// no

// Here for short operands or the end of long ones.
//	%edx = length
//	%edi = ptr (may not be aligned)
//	%xmm0 = pattern

LUnalignedStore16:
	movdqu	%xmm0,(%edi)		// stuff in another 16 bytes
	subl	$16,%edx
	addl	$16,%edi
LShort:
	cmpl	$16,%edx		// room for another vector?
	jae	LUnalignedStore16	// yes (%edx is 0..63 here, so unsigned == signed)

// Fewer than 16 bytes remain.  Bits 3, 2 and 1..0 of the count select an
// 8-byte, a 4-byte and up to three 1-byte stores.  After each partial store
// the pattern is shifted down so its next unused byte sits in the low lane.

LLessThan16:				// here at end of copy with < 16 bytes remaining
	testb	$8,%dl			// 8-byte store required?
	jz	2f			// no
	movq	%xmm0,(%edi)		// pack in 8 low bytes
	psrldq	$8,%xmm0		// then shift vector down 8 bytes
	addl	$8,%edi
2:
	testb	$4,%dl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm0,(%edi)		// pack in 4 low bytes
	psrldq	$4,%xmm0		// then shift vector down 4 bytes
	addl	$4,%edi
3:
	andl	$3,%edx			// more to go?  (%edx = 0..3 trailing bytes)
	jz	5f			// no
	movd	%xmm0,%eax		// move remainders out into %eax
4:					// loop on up to three bytes
	movb	%al,(%edi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	incl	%edi
	decl	%edx
	jnz	4b
5:	ret

// Long enough to justify aligning ptr.  Note that we have to rotate the
// pattern to account for any alignment.  We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// This will stall on store forwarding alignment mismatch, and the unaligned
// stores can be pretty slow too, but the alternatives aren't any better.
// Fortunately, in most cases our caller has already aligned the ptr.
//	%edx = length (> kShort)
//	%edi = ptr (may not be aligned)
//	%xmm0 = pattern
//
// The two unaligned stores write 32 bytes and the aligned chunk ends at most
// 31 bytes past the original ptr; both stay in bounds because the length is
// at least kShort+1 = 64 bytes on this path.

LNotShort:
	movl	%edi,%ecx		// copy dest ptr
	negl	%ecx
	andl	$15,%ecx		// mask down to #bytes to 16-byte align
	jz	LAligned		// skip if already aligned
	movdqu	%xmm0,(%edi)		// store 16 unaligned bytes
	movdqu	%xmm0,16(%edi)		// and 16 more, to be sure we have an aligned chunk
	addl	%ecx,%edi		// now point to the aligned chunk
	subl	%ecx,%edx		// adjust remaining count
	movdqa	(%edi),%xmm0		// get the rotated pattern (probably stalling)
	addl	$16,%edi		// skip past the aligned chunk (already stored above)
	subl	$16,%edx

// Set up for 64-byte loops.
//	%edx = length remaining
//	%edi = ptr (aligned)
//	%xmm0 = rotated pattern
//
// Both loops below advance %edi past the chunked region up front and then run
// a negative index in %ecx up toward zero, so a single addl both steps the
// index and sets the flags for the loop branch.

LAligned:
	movl	%edx,%ecx		// copy length remaining
	andl	$63,%edx		// mask down to residual length (0..63)
	andl	$-64,%ecx		// %ecx <- #bytes we will store in by-64 loop
	jz	LNoMoreChunks		// no 64-byte chunks
	addl	%ecx,%edi		// increment ptr by length to move
	cmpl	$(kVeryLong),%ecx	// long enough to justify non-temporal stores?
	jae	LVeryLong		// yes.  Must be an unsigned test: a signed one
					// sees counts >= 2GB as negative and sends the
					// very largest operands through the cached loop.
	negl	%ecx			// negate length to move
	jmp	1f			// jump over the alignment padding

// Loop over 64-byte chunks, storing into cache.

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movdqa	%xmm0,(%edi,%ecx)
	movdqa	%xmm0,16(%edi,%ecx)
	movdqa	%xmm0,32(%edi,%ecx)
	movdqa	%xmm0,48(%edi,%ecx)
	addl	$64,%ecx
	jne	1b			// until the negative index reaches zero

	jmp	LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
	negl	%ecx			// negate length to move
	jmp	1f			// jump over the alignment padding

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movntdq	%xmm0,(%edi,%ecx)
	movntdq	%xmm0,16(%edi,%ecx)
	movntdq	%xmm0,32(%edi,%ecx)
	movntdq	%xmm0,48(%edi,%ecx)
	addl	$64,%ecx
	jne	1b			// until the negative index reaches zero

	sfence				// required by non-temporal stores
	jmp	LNoMoreChunks

// Handle leftovers: loop by 16.
//	%edx = length remaining (<64)
//	%edi = ptr (aligned)
//	%xmm0 = rotated pattern

LLoopBy16:
	movdqa	%xmm0,(%edi)		// pack in 16 more bytes
	subl	$16,%edx		// decrement count
	addl	$16,%edi		// increment ptr
LNoMoreChunks:
	cmpl	$16,%edx		// more to go?
	jae	LLoopBy16		// yes (%edx is 0..63 here, so unsigned == signed)
	jmp	LLessThan16		// handle up to 15 remaining bytes

// NOTE(review): presumably this publishes the routine at the
// _COMM_PAGE_MEMSET_PATTERN slot for CPUs reporting kHasSSE2 -- confirm the
// argument meanings against the macro definition in <machine/commpage.h>.
COMMPAGE_DESCRIPTOR(memset_pattern_sse2,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)