]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/i386/commpage/memset_pattern_sse3.s
xnu-792.25.20.tar.gz
[apple/xnu.git] / osfmk / i386 / commpage / memset_pattern_sse3.s
... / ...
CommitLineData
1/*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23#include <machine/cpu_capabilities.h>
24#include <machine/commpage.h>
25
26/* The common path for nonzero memset and the memset_pattern routines,
27 * tuned for Pentium-M class processors with SSE3 and 64-byte cache lines.
28 * This is used by the following functions:
29 *
30 * void *memset(void *b, int c, size_t len); // when c!=0
31 * void memset_pattern4(void *b, const void *c4, size_t len);
32 * void memset_pattern8(void *b, const void *c8, size_t len);
33 * void memset_pattern16(void *b, const void *c16, size_t len);
34 *
35 * Note bzero() and memset() of 0 are handled separately.
36 */
37
#define kShort		63		// lengths above this take the align-first path
#define kVeryLong	(1024*1024)	// chunk-length threshold for non-temporal stores

// Entry point, reached from Libc with its arguments already in registers.
// Misaligned pointers and short lengths are handled correctly but slowly,
// so the caller is expected to filter out short operands itself and to use
// what it knows (ie, the original pattern length) to pre-align the pointer
// whenever it can.
//
// On entry:
//	%edi  = address of the memory to fill (any alignment)
//	%edx  = byte count (may be short, may be 0)
//	%xmm0 = 16-byte pattern to store
// On return:
//	%eax, %edi, %esi, %ecx, and %edx must all be treated as trashed.
//	%xmm0 is modified as well (it gets rotated and shifted along the way).

	.text
	.align	5, 0x90			// 2^5-byte boundary, padded with nops
Lmemset_pattern_sse3:
	cmpl	$(kShort),%edx		// is the operand too short to bother aligning?
	jbe	LShort			// yes: just use unaligned stores
	jmp	LNotShort		// no: align the ptr, then go 64 bytes at a time
58
// Here for short operands or the end of long ones.
//	%edx  = length (unsigned byte count)
//	%edi  = ptr (may not be aligned)
//	%xmm0 = pattern, with byte 0 being the next byte to store
//
// Whole 16-byte vectors are stored first with unaligned stores.  The final
// 0..15 bytes are then peeled off as an 8-byte, a 4-byte and up to three
// 1-byte stores, shifting the consumed bytes out of the pattern each time so
// that its low bytes always line up with the current %edi.

LUnalignedStore16:
	movdqu	%xmm0,(%edi)		// stuff in another 16 bytes
	subl	$16,%edx
	addl	$16,%edi
LShort:
	cmpl	$16,%edx		// room for another vector?
	jae	LUnalignedStore16	// yes (unsigned test: %edx is a byte count)
LLessThan16:				// here at end of fill with < 16 bytes remaining
	testb	$8,%dl			// 8-byte store required?
	jz	2f			// no
	movq	%xmm0,(%edi)		// pack in 8 low bytes
	psrldq	$8,%xmm0		// then shift vector down 8 bytes
	addl	$8,%edi
2:
	testb	$4,%dl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm0,(%edi)		// pack in 4 low bytes
	psrldq	$4,%xmm0		// then shift vector down 4 bytes
	addl	$4,%edi
3:
	andl	$3,%edx			// more to go? (only bits 0-1 of the count are left)
	jz	5f			// no
	movd	%xmm0,%eax		// move remainders out into %eax
4:					// loop on up to three bytes
	movb	%al,(%edi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	incl	%edi
	decl	%edx
	jnz	4b
5:	ret
94
// Operand is long enough that aligning the destination pays off.
//	%edx  = length (> kShort)
//	%edi  = ptr (may not be aligned)
//	%xmm0 = pattern
//
// Moving the ptr forward to a 16-byte boundary means the pattern has to be
// rotated by the same number of bytes.  Rather than rotate it in a register,
// we write the pattern to memory twice, back to back and unaligned, and then
// read the 16 bytes that start on the aligned boundary inside that 32-byte
// window.  The load will stall on the store-forwarding alignment mismatch,
// and the unaligned stores are not cheap either, but the alternatives are no
// better -- and in most cases the caller has already aligned the ptr, so
// this code is skipped entirely.
//
// Falls through to LAligned with %edi aligned and already advanced past the
// first aligned vector, and %edx reduced to match.

LNotShort:
	xorl	%ecx,%ecx
	subl	%edi,%ecx		// %ecx = -ptr
	andl	$15,%ecx		// %ecx = #bytes up to the next 16-byte boundary
	jz	LAligned		// zero: ptr is already aligned
	movdqu	%xmm0,(%edi)		// first unaligned copy of the pattern
	movdqu	%xmm0,16(%edi)		// second copy, so an aligned vector is fully covered
	movdqa	(%edi,%ecx),%xmm0	// reload from the boundary: the rotated pattern
	leal	16(%edi,%ecx),%edi	// ptr = aligned address + 16 (that vector is done)
	subl	%ecx,%edx		// count -= bytes skipped to reach the boundary
	subl	$16,%edx		// count -= the aligned vector already in place
117
// Set up for 64-byte loops.
//	%edx  = length remaining (unsigned byte count)
//	%edi  = ptr (16-byte aligned)
//	%xmm0 = rotated pattern
//
// The length is split into a multiple of 64 (%ecx), handled by one of the
// two loops below, and a 0..63 byte residual (%edx), handled at
// LNoMoreChunks.  Both loops run %ecx from -chunkbytes up to 0 and address
// memory as (%edi,%ecx), so %edi is first advanced to the END of the chunked
// region and is already pointing at the residual when the loop exits.

LAligned:
	movl	%edx,%ecx		// copy length remaining
	andl	$63,%edx		// mask down to residual length (0..63)
	andl	$-64,%ecx		// %ecx <- #bytes we will fill in by-64 loop
	jz	LNoMoreChunks		// no 64-byte chunks
	addl	%ecx,%edi		// increment ptr by length to fill
	cmpl	$(kVeryLong),%ecx	// long enough to justify non-temporal stores?
	jae	LVeryLong		// yes (unsigned, so lengths >= 2GB qualify too)
	negl	%ecx			// negate length to fill
	jmp	1f			// jump over any alignment padding

// Loop over 64-byte chunks, storing into cache.

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movdqa	%xmm0,(%edi,%ecx)
	movdqa	%xmm0,16(%edi,%ecx)
	movdqa	%xmm0,32(%edi,%ecx)
	movdqa	%xmm0,48(%edi,%ecx)
	addl	$64,%ecx		// step toward 0; ZF set when the last chunk is done
	jne	1b

	jmp	LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
	negl	%ecx			// negate length to fill
	jmp	1f			// jump over any alignment padding

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movntdq	%xmm0,(%edi,%ecx)
	movntdq	%xmm0,16(%edi,%ecx)
	movntdq	%xmm0,32(%edi,%ecx)
	movntdq	%xmm0,48(%edi,%ecx)
	addl	$64,%ecx		// step toward 0; ZF set when the last chunk is done
	jne	1b

	sfence				// required by non-temporal stores
	jmp	LNoMoreChunks
164
// Handle leftovers: loop by 16.
//	%edx  = length remaining (<64, unsigned byte count)
//	%edi  = ptr (aligned)
//	%xmm0 = rotated pattern
//
// Entered at LNoMoreChunks.  Stores whole aligned vectors while at least 16
// bytes remain, then hands the final 0..15 bytes to LLessThan16, which also
// returns to the caller.

LLoopBy16:
	movdqa	%xmm0,(%edi)		// pack in 16 more bytes
	subl	$16,%edx		// decrement count
	addl	$16,%edi		// increment ptr
LNoMoreChunks:
	cmpl	$16,%edx		// more to go?
	jae	LLoopBy16		// yes (unsigned test: %edx is a byte count)
	jmp	LLessThan16		// handle up to 15 remaining bytes

// Commpage registration for this routine; the macro comes from the headers
// included at the top of the file.
// NOTE(review): the routine is named "_sse3" but the descriptor asks only for
// kHasSSE2, and the vector instructions used above (movdqu, movdqa, movntdq,
// movq, movd, psrldq) are all SSE2 -- confirm that the name, rather than the
// capability bit, is the part that is out of date.

	COMMPAGE_DESCRIPTOR(memset_pattern_sse3,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)