/* osfmk/i386/commpage/bzero_sse3.s (xnu-792.13.8) */
/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE3
 * and 64-byte cache lines.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %eax before returning.
 */
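
// A rough C-level picture of the contract this entry point satisfies (a
// sketch only; it assumes the libc stubs for both routines can jump here):
//
//      void  bzero(void *b, size_t len);           // no return value
//      void *memset(void *b, int c, size_t len);   // must return b
//
// Since memset(p,0,n) can arrive here as well, the epilogue reloads the
// original pointer into %eax so a memset-style caller still gets it back.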

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (1024*1024)
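
// Rough shape of the length-based dispatch below (a sketch, not literal code):
//
//      if (len <= kShort)              store bytes, then aligned 4-byte words
//      else if (chunk < kVeryLong)     cached 16-byte stores (movdqa), 64 bytes/loop
//      else                            non-temporal stores (movntdq), then sfence
//
// where "chunk" is the 64-byte-aligned portion left after the pointer has
// been advanced to a 16-byte boundary.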


        .text
        .align  5, 0x90
Lbzero_sse3:                            // void bzero(void *b, size_t len);
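// On entry (i386 cdecl): 4(%esp) = b, 8(%esp) = len.  After the frame is set
// up below, those arguments sit at 8(%ebp) and 12(%ebp) respectively.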
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %edi
        movl    8(%ebp),%edi            // get ptr
        movl    12(%ebp),%edx           // get length

        xorl    %eax,%eax               // set fill data to 0
        cmpl    $(kShort),%edx          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %edx = length
//      %edi = ptr
//      %eax = zero

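// In rough C terms this short path does (a sketch, not literal code):
//
//      if (len < 16) {                         // not worth aligning
//          while (len--) *p++ = 0;
//      } else {
//          while ((uintptr_t)p & 3) { *p++ = 0; len--; }        // align to 4
//          for (i = len >> 2; i; i--) { *(uint32_t *)p = 0; p += 4; }
//          for (len &= 3; len; len--) *p++ = 0;                 // 0..3 tail bytes
//      }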
Lshort:
        cmpl    $16,%edx                // long enough to bother doubleword aligning?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%edi)             // zero an aligned doubleword
        addl    $4,%edi
        dec     %edx
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %ecx
        jnz     5b
6:
        movl    8(%ebp),%eax            // get return value in case this was a call of memset()
        popl    %edi
        popl    %ebp
        ret


// We will be using SSE, so align ptr.

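// The number of bytes needed to reach the next 16-byte boundary is
// (-(uintptr_t)ptr) & 15, which is what the negl/andl pair below computes.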
LNotShort:
        movl    %edi,%ecx
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
0:                                      // loop storing bytes to align the ptr
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %ecx
        jnz     0b

// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero

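// The inner loops below advance %edi past the whole 64-byte-aligned region up
// front and run %ecx from minus that length up toward zero, so a single addl
// both steps the index and sets the flags that terminate the loop.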
LDestAligned:
        movl    %edx,%ecx
        andl    $63,%edx                // mask down to residual length (0..63)
        andl    $-64,%ecx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addl    %ecx,%edi               // increment ptr by length to move
        cmpl    $(kVeryLong),%ecx       // long enough to justify non-temporal stores?
        jae     LVeryLong               // yes
        negl    %ecx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.
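// (movdqa requires a 16-byte-aligned destination, which the alignment loop
// above guarantees.)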

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%edi,%ecx)
        movdqa  %xmm0,16(%edi,%ecx)
        movdqa  %xmm0,32(%edi,%ecx)
        movdqa  %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        jmp     Lshort

// Very long operands: use non-temporal stores to bypass cache.

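// Filling a buffer of kVeryLong (1 MB) or more through the cache would mostly
// just evict useful data, so movntdq writes around it; the sfence at the end
// makes these weakly-ordered stores globally visible before we return.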
LVeryLong:
        negl    %ecx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%edi,%ecx)
        movntdq %xmm0,16(%edi,%ecx)
        movntdq %xmm0,32(%edi,%ecx)
        movntdq %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     Lshort


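// Register this routine with the commpage: the kernel copies it to
// _COMM_PAGE_BZERO on processors whose capability bits include kHasSSE2.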
        COMMPAGE_DESCRIPTOR(bzero_sse3,_COMM_PAGE_BZERO,kHasSSE2,0)