/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/* The common path for nonzero memset and the memset_pattern routines,
 * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines.
 * This is the 64-bit version. It is used by the following functions:
 *
 *      void *memset(void *b, int c, size_t len);               // when c!=0
 *      void memset_pattern4(void *b, const void *c4, size_t len);
 *      void memset_pattern8(void *b, const void *c8, size_t len);
 *      void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Note bzero() and memset() of 0 are handled separately.
 */

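/* For orientation, the pattern routines behave roughly like the C sketch
 * below.  This is illustrative only: the helper name is hypothetical and the
 * byte-at-a-time loop is not how this file works; the code below does the
 * same fill 16 bytes at a time with SSE2 stores.
 *
 *      static void memset_pattern4_sketch(void *b, const void *c4, size_t len)
 *      {
 *              unsigned char *dst = b;
 *              const unsigned char *pat = c4;
 *              for (size_t i = 0; i < len; i++)
 *                      dst[i] = pat[i % 4];    // pattern repeats every 4 bytes,
 *                                              // truncated at the buffer's end
 *      }
 */
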
#define kShort          63              // operands this short or shorter skip the alignment code
#define kVeryLong       (1024*1024)     // at this length or more, use non-temporal stores

// Initial entry from Libc with parameters passed in registers.  Although we
// correctly handle misaligned ptrs and short operands, they are inefficient.
// Therefore our caller should filter out short operands and exploit local
// knowledge (ie, original pattern length) to align the ptr if possible.
// When called, we expect:
//      %rdi  = ptr to memory to set (not necessarily aligned)
//      %rdx  = length (may be short or even 0)
//      %xmm0 = the pattern to store
// Return conditions:
//      %rax, %rdi, %rsi, %rcx, and %rdx all trashed
//      we preserve %r8, %r9, %r10, and %r11
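// For illustration: since this code stores %xmm0 verbatim, 16 bytes at a
// time, a memset_pattern4() caller is assumed to arrive with its 4-byte
// pattern p0 p1 p2 p3 already replicated through the register, i.e.
//      %xmm0 = p0 p1 p2 p3 p0 p1 p2 p3 p0 p1 p2 p3 p0 p1 p2 p3
// and a memset_pattern8() caller with its 8-byte pattern repeated twice.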

        .text
        .align  5, 0x90
        .code64
Lmemset_pattern_sse2_64:
        cmpq    $(kShort),%rdx          // long enough to bother aligning?
        ja      LNotShort               // yes
        jmp     LShort                  // no

// Here for short operands or the end of long ones.
//      %rdx  = length (<= kShort)
//      %rdi  = ptr (may not be aligned)
//      %xmm0 = pattern
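// The tail stores below always take the low bytes of %xmm0 and then shift
// the vector right (psrldq), so the next pattern bytes move into the low end
// and the stored byte sequence stays contiguous however the remaining length
// breaks down into 8-, 4-, and 1-byte pieces.
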
LUnalignedStore16:
        movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
        subl    $16,%edx
        addq    $16,%rdi
LShort:
        cmpl    $16,%edx                // room for another vector?
        jge     LUnalignedStore16       // yes
LLessThan16:                            // here at end of copy with < 16 bytes remaining
        test    $8,%dl                  // 8-byte store required?
        jz      2f                      // no
        movq    %xmm0,(%rdi)            // pack in 8 low bytes
        psrldq  $8,%xmm0                // then shift vector down 8 bytes
        addq    $8,%rdi
2:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm0,(%rdi)            // pack in 4 low bytes
        psrldq  $4,%xmm0                // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%edx                 // more to go?
        jz      5f                      // no
        movd    %xmm0,%eax              // move remainders out into %eax
4:                                      // loop on up to three bytes
        movb    %al,(%rdi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        incq    %rdi
        dec     %edx
        jnz     4b
5:      ret

// Long enough to justify aligning ptr.  Note that we have to rotate the
// pattern to account for any alignment.  We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// This will stall on store forwarding alignment mismatch, and the unaligned
// stores can be pretty slow too, but the alternatives aren't any better.
// Fortunately, in most cases our caller has already aligned the ptr.
//      %rdx  = length (> kShort)
//      %rdi  = ptr (may not be aligned)
//      %xmm0 = pattern
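// Worked example (for illustration): if %rdi is 3 bytes past a 16-byte
// boundary and %xmm0 holds pattern bytes p0..p15, then %rcx becomes 13, the
// two movdqu stores put p0 at (%rdi), and the movdqa load from the first
// aligned address (13(%rdi)) picks up p13 p14 p15 p0 ... p12, i.e. the
// pattern rotated so that subsequent aligned 16-byte stores continue the
// byte sequence without a seam.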

LNotShort:
        movl    %edi,%ecx               // copy low bits of dest ptr
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LAligned                // skip if already aligned
        movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
        movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
        addq    %rcx,%rdi               // now point to the aligned chunk
        subq    %rcx,%rdx               // adjust remaining count
        movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
        addq    $16,%rdi                // skip past the aligned chunk
        subq    $16,%rdx

// Set up for 64-byte loops.
//      %rdx  = length remaining
//      %rdi  = ptr (aligned)
//      %xmm0 = rotated pattern
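// Both 64-byte loops below run %rcx from minus the number of chunk bytes up
// to zero, indexing each store as (%rdi,%rcx) with %rdi already advanced past
// the chunks, so "addq $64,%rcx / jne" serves as both the induction step and
// the loop-termination test.
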
LAligned:
        movq    %rdx,%rcx               // copy length remaining
        andl    $63,%edx                // mask down to residual length (0..63)
        andq    $-64,%rcx               // %rcx <- #bytes we will store in by-64 loops
        jz      LNoMoreChunks           // no 64-byte chunks
        addq    %rcx,%rdi               // increment ptr by length to move
        cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
        jge     LVeryLong               // yes
        negq    %rcx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        jmp     LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.
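// Non-temporal stores (movntdq) write around the caches instead of
// allocating lines for data that will not be re-read soon, so a fill of
// kVeryLong bytes or more does not evict the working set; because such
// stores are weakly ordered, the sfence below is needed before we return.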

LVeryLong:
        negq    %rcx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     LNoMoreChunks

// Handle leftovers: loop by 16.
//      %edx  = length remaining (<64)
//      %rdi  = ptr (aligned)
//      %xmm0 = rotated pattern

LLoopBy16:
        movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
        subl    $16,%edx                // decrement count
        addq    $16,%rdi                // increment ptr
LNoMoreChunks:
        cmpl    $16,%edx                // more to go?
        jge     LLoopBy16               // yes
        jmp     LLessThan16             // handle up to 15 remaining bytes

        COMMPAGE_DESCRIPTOR(memset_pattern_sse2_64,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)