osfmk/i386/commpage/bzero_sse2_64.s
/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE2
 * and 64-byte cache lines. This is the 64-bit version.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %rax before returning.
 */

#define kShort		80		// too short to bother with SSE (must be >=80)
#define kVeryLong	(1024*1024)
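
// For reference, a hedged C-level sketch of the strategy implemented below.
// This is illustrative only (bzero_model is a hypothetical name, not part of
// xnu); the assembly that follows is the actual implementation:
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      static void *bzero_model(void *b, size_t len)
//      {
//              unsigned char *p = (unsigned char *)b;
//
//              if (len > kShort) {
//                      while ((uintptr_t)p & 15) {     // byte stores until 16-byte aligned
//                              *p++ = 0;
//                              len--;
//                      }
//                      size_t chunk = len & ~(size_t)63;  // whole 64-byte chunks, zeroed
//                      len &= 63;                         // with movdqa, or with movntdq
//                      for (size_t i = 0; i < chunk; i++) // + sfence when chunk >= kVeryLong
//                              p[i] = 0;
//                      p += chunk;
//              }
//              while (len--)           // short operands and residual bytes
//                      *p++ = 0;
//              return b;               // original ptr, as memset() returns it
//      }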


        .text
        .code64
        .align  5, 0x90                         // 32-byte align the entry point, padding with nops
Lbzero_sse2_64:                                 // void bzero(void *b, size_t len);
        pushq   %rbp                            // set up a frame for backtraces
        movq    %rsp,%rbp
        xorl    %eax,%eax                       // set fill data to 0
        movq    %rdi,%r11                       // save original ptr as return value
        cmpq    $(kShort),%rsi                  // long enough for SSE?
        jg      LNotShort                       // yes

// Here for short operands or the end of long ones.
//      %esi = length (<= kShort)
//      %rdi = ptr
//      %eax = zero

Lshort:
        cmpl    $16,%esi                        // long enough to be worth doubleword aligning?
        jge     3f                              // yes
        test    %esi,%esi                       // length==0?
        jz      6f
1:
        movb    %al,(%rdi)                      // zero a byte
        incq    %rdi
        decl    %esi
        jnz     1b
        jmp     6f
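// 2..3: store single bytes until the ptr is doubleword aligned; 3: is the
// entry point and branches back to 2: while the ptr is still misaligned.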
2:
        movb    %al,(%rdi)                      // zero a byte
        incq    %rdi
        decl    %esi
3:
        testl   $3,%edi                         // is ptr doubleword aligned?
        jnz     2b                              // no
        movl    %esi,%ecx                       // copy length
        shrl    $2,%esi                         // #doublewords to store
4:
        movl    %eax,(%rdi)                     // zero an aligned doubleword
        addq    $4,%rdi
        decl    %esi
        jnz     4b
        andl    $3,%ecx                         // mask down to #bytes at end (0..3)
        jz      6f                              // none
5:
        movb    %al,(%rdi)                      // zero a byte
        incq    %rdi
        decl    %ecx
        jnz     5b
6:
        movq    %r11,%rax                       // set return value in case this was a call of memset()
        popq    %rbp
        ret


// We will be using SSE, so align ptr.
//      %rsi = length (> kShort)
//      %rdi = ptr
//      %eax = zero

LNotShort:
        movl    %edi,%ecx                       // get #bytes to 16-byte align ptr
        negl    %ecx
        andl    $15,%ecx
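        // note: (-ptr) & 15 == number of bytes (0..15) up to the next 16-byte boundary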
        jz      LDestAligned                    // already aligned
        subq    %rcx,%rsi                       // decrement length
0:                                              // loop storing bytes to align the ptr
        movb    %al,(%rdi)                      // pack in a byte
        incq    %rdi
        decl    %ecx
        jnz     0b

// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %rsi = length (> (kShort-15))
//      %rdi = ptr (aligned)
//      %eax = zero

LDestAligned:
        movq    %rsi,%rcx
        andl    $63,%esi                        // mask down to residual length (0..63)
        andq    $-64,%rcx                       // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0                     // zero an SSE register
        addq    %rcx,%rdi                       // increment ptr by length to move
        cmpq    $(kVeryLong),%rcx               // long enough to justify non-temporal stores?
        jae     LVeryLong                       // yes
        negq    %rcx                            // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.
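// %rcx starts at -(#chunk bytes) and counts up toward zero, so the stores at
// (%rdi,%rcx) walk forward through the buffer and the addq at the bottom both
// advances the index and terminates the loop when it reaches zero.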

        .align  4,0x90                          // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        jmp     Lshort                          // handle residual bytes (0..63) and return

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
        negq    %rcx                            // negate length to move
        jmp     1f

        .align  4,0x90                          // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

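// movntdq stores are weakly ordered; the sfence below orders them ahead of
// any subsequent stores.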
        sfence                                  // required by non-temporal stores
        jmp     Lshort

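// Register this routine with the commpage: it backs the _COMM_PAGE_BZERO slot
// and is selected on processors that advertise SSE2 (kHasSSE2); the final 0
// lists no disqualifying capability bits.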
        COMMPAGE_DESCRIPTOR(bzero_sse2_64,_COMM_PAGE_BZERO,kHasSSE2,0)