/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem.  This is the 64-bit version.
 * AT&T syntax, System V AMD64 register usage.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80                      // too short to bother with SSE (must be >=80)


// void bcopy(const void *src, void *dst, size_t len);
// Entry:  rdi = src, rsi = dst, rdx = len.  bcopy returns void, so no
// return value is staged; the args are swapped below into memcpy order.

        .text
        .code64
        .align  5, 0x90
Lbcopy_sse42_64:                        // void bcopy(const void *src, void *dst, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // xchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland          // unsigned: also catches dest < source with overlap
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
// (the commpage jump table hard-codes that entry offset, so the .align
// and the size of the bcopy stub above must not change)
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
3:                                      // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon

// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
                                        // NOTE(review): 32-bit subl zeroes the upper half of
                                        // rdx; consistent with the "<2GB" invariant documented
                                        // below, but confirm lengths >=4GB are never passed here
1:                                      // loop copying 1..15 bytes
        movb    (%rsi),%al
        inc     %rsi
        movb    %al,(%rdi)
        inc     %rdi
        dec     %ecx
        jnz     1b


// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        negq    %rcx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // source also aligned?
        jnz     LUnalignedLoop
        jmp     LAlignedLoop

// Forward loop for aligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0       // rcx is negative; loads walk forward toward 0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LAlignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%rsi,%rcx),%xmm0       // unaligned loads are cheap on Nehalem
        movdqu  16(%rsi,%rcx),%xmm1
        movdqu  32(%rsi,%rcx),%xmm2
        movdqu  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)       // dest was 16-byte aligned above, so stores may be movdqa
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LUnalignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done

// Reverse moves.  These are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:                                      // loop copying quadwords backward
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // leftover bytes (0..7)?
        jz      5f
4:                                      // loop copying bytes backward
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        jz      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.
//      rcx = length to move in 64-byte chunks; counts down to 0
//      rdx = residual length (0..63) for LReverseShort

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)    // dest was aligned above, so stores may be movdqa
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

// Register this routine in the commpage: selected at boot when SSE4.2 is present.
COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0)