/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/* The common path for nonzero memset and the memset_pattern routines,
 * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines.
 * This is the 64-bit version.  It is used by the following functions:
 *
 *      void *memset(void *b, int c, size_t len);               // when c!=0
 *      void memset_pattern4(void *b, const void *c4, size_t len);
 *      void memset_pattern8(void *b, const void *c8, size_t len);
 *      void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Note bzero() and memset() of 0 are handled separately.
 */
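
// For example (an illustrative sketch, not shown in the notes above), the
// memset_pattern4 entry point tiles a 4-byte value across a caller's buffer:
//
//      uint32_t pixel = 0xFF00FF00;            // hypothetical 4-byte pattern
//      memset_pattern4(buf, &pixel, len);      // buf, len: caller's buffer and byte count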

#define kShort          63
#define kVeryLong       (1024*1024)
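
// Operands of kShort (63) bytes or fewer are handled entirely by the
// store-by-16/8/4/1 path at LShort below; operands of kVeryLong (1 MB) or
// more are long enough to justify bypassing the caches with non-temporal stores.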

// Initial entry from Libc with parameters passed in registers.  Although we
// correctly handle misaligned ptrs and short operands, they are inefficient.
// Therefore our caller should filter out short operands and exploit local
// knowledge (ie, original pattern length) to align the ptr if possible.
// When called, we expect:
//      %rdi = ptr to memory to set (not necessarily aligned)
//      %rdx = length (may be short or even 0)
//      %xmm0 = the pattern to store
// Return conditions:
//      %rax, %rdi, %rsi, %rcx, and %rdx all trashed
//      we preserve %r8, %r9, %r10, and %r11
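//
// For example (a sketch only, not part of this commpage routine), a caller
// implementing memset(b, c, len) with c != 0 might broadcast the fill byte
// into %xmm0 before branching here:
//
//      movzbl  %sil,%eax                // zero-extend the fill byte
//      imull   $0x01010101,%eax,%eax    // replicate it into all 4 byte lanes
//      movd    %eax,%xmm0
//      pshufd  $0,%xmm0,%xmm0           // broadcast the dword across %xmm0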

        .text
        .align  5, 0x90
        .code64
Lmemset_pattern_sse2_64:
        cmpq    $(kShort),%rdx          // long enough to bother aligning?
        ja      LNotShort               // yes
        jmp     LShort                  // no

// Here for short operands or the end of long ones.
//      %rdx = length (<= kShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern

LUnalignedStore16:
        movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
        subl    $16,%edx
        addq    $16,%rdi
LShort:
        cmpl    $16,%edx                // room for another vector?
        jge     LUnalignedStore16       // yes
LLessThan16:                            // here at end of copy with < 16 bytes remaining
        test    $8,%dl                  // 8-byte store required?
        jz      2f                      // no
        movq    %xmm0,(%rdi)            // pack in 8 low bytes
        psrldq  $8,%xmm0                // then shift vector down 8 bytes
        addq    $8,%rdi
2:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm0,(%rdi)            // pack in 4 low bytes
        psrldq  $4,%xmm0                // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%edx                 // more to go?
        jz      5f                      // no
        movd    %xmm0,%eax              // move remainders out into %eax
4:                                      // loop on up to three bytes
        movb    %al,(%rdi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        incq    %rdi
        dec     %edx
        jnz     4b
5:      ret

// Long enough to justify aligning ptr.  Note that we have to rotate the
// pattern to account for any alignment.  We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// This will stall on store forwarding alignment mismatch, and the unaligned
// stores can be pretty slow too, but the alternatives aren't any better.
// Fortunately, in most cases our caller has already aligned the ptr.
//      %rdx = length (> kShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern
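//
// For example (an illustrative walk-through, not from the original notes):
// with the 4-byte pattern "ABCD" and a ptr one byte past a 16-byte boundary,
// the two unaligned stores lay down "ABCDABCD..." starting at the ptr, and
// the aligned movdqa below reloads 16 bytes at the next 16-byte boundary,
// picking up the pattern rotated to "DABCDABCDABCDABC", which is exactly the
// form the aligned stores in the by-64 loop must use to continue the tiling.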

LNotShort:
        movl    %edi,%ecx               // copy low bits of dest ptr
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LAligned                // skip if already aligned
        movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
        movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
        addq    %rcx,%rdi               // now point to the aligned chunk
        subq    %rcx,%rdx               // adjust remaining count
        movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
        addq    $16,%rdi                // skip past the aligned chunk
        subq    $16,%rdx

// Set up for 64-byte loops.
//      %rdx = length remaining
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LAligned:
        movq    %rdx,%rcx               // copy length remaining
        andl    $63,%edx                // mask down to residual length (0..63)
        andq    $-64,%rcx               // %rcx <- #bytes we will store in by-64 loop
        jz      LNoMoreChunks           // no 64-byte chunks
        addq    %rcx,%rdi               // increment ptr by length to move
        cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
        jge     LVeryLong               // yes
        negq    %rcx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.
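// The loop below runs %rcx from minus the chunked byte count up to zero,
// addressing with (%rdi,%rcx); the single addq both advances the index and
// sets the flags tested by jne, so no separate compare is needed.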

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        jmp     LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.
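// (Rationale, in brief: for fills of kVeryLong bytes or more the data is
// unlikely to be reused soon, so movntdq writes around the caches instead of
// displacing their contents; the sfence afterward ensures the weakly ordered
// stores become globally visible before any later stores.)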

LVeryLong:
        negq    %rcx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     LNoMoreChunks

// Handle leftovers: loop by 16.
//      %edx = length remaining (<64)
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LLoopBy16:
        movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
        subl    $16,%edx                // decrement count
        addq    $16,%rdi                // increment ptr
LNoMoreChunks:
        cmpl    $16,%edx                // more to go?
        jge     LLoopBy16               // yes
        jmp     LLessThan16             // handle up to 15 remaining bytes

        COMMPAGE_DESCRIPTOR(memset_pattern_sse2_64,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)