apple/xnu: osfmk/i386/commpage/memset_pattern_sse3_64.s
/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/* The common path for nonzero memset and the memset_pattern routines,
 * tuned for Pentium-M class processors with SSE3 and 64-byte cache lines.
 * This is the 64-bit version. It is used by the following functions:
 *
 *      void *memset(void *b, int c, size_t len);                   // when c!=0
 *      void memset_pattern4(void *b, const void *c4, size_t len);
 *      void memset_pattern8(void *b, const void *c8, size_t len);
 *      void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Note bzero() and memset() of 0 are handled separately.
 */
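
/* For reference, a minimal sketch of how a client might call one of the
 * pattern routines from C. This is illustrative only: memset_pattern4() is
 * declared in <string.h> on Darwin, but the buffer and pattern names below
 * are hypothetical.
 *
 *      #include <string.h>
 *      #include <stdint.h>
 *
 *      uint32_t pixel = 0xFF336699;            // hypothetical 4-byte pattern
 *      uint32_t framebuffer[1024];             // hypothetical destination buffer
 *      memset_pattern4(framebuffer, &pixel, sizeof(framebuffer));
 *      // framebuffer now holds the 4-byte value repeated 1024 times
 */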

#define kShort          63
#define kVeryLong       (1024*1024)
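
/* Roughly, the dispatch that these two constants drive (a C-level sketch only;
 * the helper names are made up, and the real code below also peels off an
 * unaligned head chunk before measuring the 64-byte region):
 *
 *      if (len <= kShort)              // <= 63 bytes: simple unaligned stores
 *              store_tail(ptr, len);
 *      else if (chunk >= kVeryLong)    // >= 1MB of aligned chunks: non-temporal stores
 *              stream_by_64(ptr, chunk);
 *      else                            // otherwise: ordinary cached 64-byte loop
 *              store_by_64(ptr, chunk);
 */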

// Initial entry from Libc with parameters passed in registers. Although we
// correctly handle misaligned ptrs and short operands, they are inefficient.
// Therefore our caller should filter out short operands and exploit local
// knowledge (ie, original pattern length) to align the ptr if possible.
// When called, we expect:
//      %rdi = ptr to memory to set (not necessarily aligned)
//      %rdx = length (may be short or even 0)
//      %xmm0 = the pattern to store
// Return conditions:
//      %rax, %rdi, %rsi, %rcx, and %rdx all trashed
//      we preserve %r8, %r9, %r10, and %r11
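
/* One plausible way a caller could build the 16-byte %xmm0 argument from a
 * 4-byte seed before dispatching here. This is an assumption for illustration,
 * not the actual Libc stub; it uses the SSE2 intrinsic _mm_set1_epi32 from
 * <emmintrin.h>:
 *
 *      #include <emmintrin.h>
 *
 *      // Broadcast the 4-byte pattern *c4 into all four 32-bit lanes,
 *      // producing the 16-byte pattern this routine expects in %xmm0.
 *      __m128i pattern = _mm_set1_epi32(*(const int *)c4);
 */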

        .text
        .align  5, 0x90
        .code64
Lmemset_pattern_sse3_64:
        cmpq    $(kShort),%rdx          // long enough to bother aligning?
        ja      LNotShort               // yes
        jmp     LShort                  // no

// Here for short operands or the end of long ones.
//      %rdx = length (<= kShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern
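
/* The short path below is equivalent to the following C sketch. It is
 * illustrative only: store16/store8/store4 are made-up helpers standing in
 * for the movdqu/movq/movd stores, and "pat" stands for %xmm0, shifted down
 * as bytes are consumed just as the psrldq instructions do:
 *
 *      while (len >= 16) { store16(p, pat); p += 16; len -= 16; }
 *      if (len & 8) { store8(p, pat); pat >>= 64; p += 8; }
 *      if (len & 4) { store4(p, pat); pat >>= 32; p += 4; }
 *      for (len &= 3; len != 0; len--) { *p++ = (uint8_t)pat; pat >>= 8; }
 */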

LUnalignedStore16:
        movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
        subl    $16,%edx
        addq    $16,%rdi
LShort:
        cmpl    $16,%edx                // room for another vector?
        jge     LUnalignedStore16       // yes
LLessThan16:                            // here at end of copy with < 16 bytes remaining
        test    $8,%dl                  // 8-byte store required?
        jz      2f                      // no
        movq    %xmm0,(%rdi)            // pack in 8 low bytes
        psrldq  $8,%xmm0                // then shift vector down 8 bytes
        addq    $8,%rdi
2:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm0,(%rdi)            // pack in 4 low bytes
        psrldq  $4,%xmm0                // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%edx                 // more to go?
        jz      5f                      // no
        movd    %xmm0,%eax              // move remainders out into %eax
4:                                      // loop on up to three bytes
        movb    %al,(%rdi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        incq    %rdi
        dec     %edx
        jnz     4b
5:      ret

// Long enough to justify aligning ptr. Note that we have to rotate the
// pattern to account for any alignment. We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// This will stall on store forwarding alignment mismatch, and the unaligned
// stores can be pretty slow too, but the alternatives aren't any better.
// Fortunately, in most cases our caller has already aligned the ptr.
//      %rdx = length (> kShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern
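
/* The net effect of the two movdqu stores plus the aligned movdqa reload is a
 * byte rotation of the pattern. A C sketch of that rotation (illustrative
 * only; the code below gets it for free from the overlapping stores):
 *
 *      unsigned k = (-(uintptr_t)p) & 15;      // bytes needed to 16-byte align p
 *      uint8_t rot[16];
 *      for (unsigned i = 0; i < 16; i++)
 *              rot[i] = pat[(i + k) & 15];     // pattern rotated left by k bytes
 *      // storing rot[] repeatedly at the aligned address p+k continues the
 *      // byte sequence the unaligned pattern would have produced starting at p
 */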

LNotShort:
        movl    %edi,%ecx               // copy low bits of dest ptr
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LAligned                // skip if already aligned
        movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
        movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
        addq    %rcx,%rdi               // now point to the aligned chunk
        subq    %rcx,%rdx               // adjust remaining count
        movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
        addq    $16,%rdi                // skip past the aligned chunk
        subq    $16,%rdx

// Set up for 64-byte loops.
//      %rdx = length remaining
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LAligned:
        movq    %rdx,%rcx               // copy length remaining
        andl    $63,%edx                // mask down to residual length (0..63)
        andq    $-64,%rcx               // %rcx <- #bytes we will store in by-64 loop
        jz      LNoMoreChunks           // no 64-byte chunks
        addq    %rcx,%rdi               // increment ptr by length to move
        cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
        jge     LVeryLong               // yes
        negq    %rcx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.
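// The loop uses the usual negative-index idiom: %rdi was advanced to the end
// of the 64-byte region, and %rcx counts up from -length to zero, so one addq
// serves as both induction step and loop test. A rough C equivalent
// (illustrative; store64 is a made-up helper for the four movdqa stores):
//
//      uint8_t *end = p + chunk;               // chunk is a multiple of 64
//      for (intptr_t i = -(intptr_t)chunk; i != 0; i += 64)
//              store64(end + i, pattern);      // writes one 64-byte cache line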

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        jmp     LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.
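// For comparison, the same streaming-store idea expressed with SSE2 intrinsics
// (a sketch under assumed names; dst, pat, and n16 are hypothetical):
//
//      #include <emmintrin.h>
//
//      void stream_fill(__m128i *dst, __m128i pat, size_t n16)   // n16 = # of 16-byte blocks
//      {
//              for (size_t i = 0; i < n16; i++)
//                      _mm_stream_si128(&dst[i], pat);  // non-temporal: bypasses the cache
//              _mm_sfence();           // order the streamed stores, as sfence does below
//      }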

LVeryLong:
        negq    %rcx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     LNoMoreChunks

// Handle leftovers: loop by 16.
//      %edx = length remaining (<64)
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LLoopBy16:
        movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
        subl    $16,%edx                // decrement count
        addq    $16,%rdi                // increment ptr
LNoMoreChunks:
        cmpl    $16,%edx                // more to go?
        jge     LLoopBy16               // yes
        jmp     LLessThan16             // handle up to 15 remaining bytes

        COMMPAGE_DESCRIPTOR(memset_pattern_sse3_64,_COMM_PAGE_MEMSET_PATTERN,kHasSSE3,0)