]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/i386/commpage/bzero_sse3.s
xnu-792.10.96.tar.gz
[apple/xnu.git] / osfmk / i386 / commpage / bzero_sse3.s
... / ...
CommitLineData
1/*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23#include <machine/cpu_capabilities.h>
24#include <machine/commpage.h>
25
26/*
27 * Bzero, tuned for Pentium-M class processors with SSE3
28 * and 64-byte cache lines.
29 *
30 * This routine is also used for memset(p,0,n), which is a common case
31 * since gcc sometimes silently maps bzero() into memset(). As a result,
32 * we always load the original ptr into %eax before returning.
33 */
34
 35#define kShort 80 // too short to bother with SSE (must be >=80)
 36#define kVeryLong (1024*1024)
 37
 38
 39 .text
 40 .align 5, 0x90
//-----------------------------------------------------------------------
// void bzero(void *b, size_t len) -- commpage variant for SSE2-capable CPUs
// ABI:    i386 cdecl; args on the stack: 8(%ebp)=ptr, 12(%ebp)=length
// Out:    %eax = original ptr (so the same code can back memset(p,0,n))
// Saves:  %edi (pushed/popped); %ebp used as frame pointer for backtraces
// Clobb:  %eax, %ecx, %edx, %xmm0, flags
// Strategy: <=kShort bytes -> byte/doubleword stores; otherwise 16-byte
// align the ptr and zero 64 bytes per iteration with movdqa; at
// >=kVeryLong use movntdq non-temporal stores to bypass the cache.
//-----------------------------------------------------------------------
 41Lbzero_sse3: // void bzero(void *b, size_t len);
 42 pushl %ebp // set up a frame for backtraces
 43 movl %esp,%ebp
 44 pushl %edi
 45 movl 8(%ebp),%edi // get ptr
 46 movl 12(%ebp),%edx // get length
 47
 48 xorl %eax,%eax // set fill data to 0
 49 cmpl $(kShort),%edx // long enough for SSE?
 50 jg LNotShort // yes
 51
 52// Here for short operands or the end of long ones.
 53// %edx = length
 54// %edi = ptr
 55// %eax = zero
 56
 57Lshort:
 58 cmpl $16,%edx // long enough to justify doubleword-aligning the ptr?
 59 jge 3f // yes
 60 test %edx,%edx // length==0?
 61 jz 6f
// Very short (1..15 bytes): plain byte loop, then exit.
 621:
 63 movb %al,(%edi) // zero a byte
 64 inc %edi
 65 dec %edx
 66 jnz 1b
 67 jmp 6f
// Store bytes one at a time until ptr is 4-byte aligned (entered from 3f).
 682:
 69 movb %al,(%edi) // zero a byte
 70 inc %edi
 71 dec %edx
 723:
 73 test $3,%edi // is ptr doubleword aligned?
 74 jnz 2b // no
 75 movl %edx,%ecx // copy length
 76 shrl $2,%edx // #doublewords to store
// Aligned doubleword loop (edx >= 3 here, since length was >= 13 at 3f).
 774:
 78 movl %eax,(%edi) // zero an aligned doubleword
 79 addl $4,%edi
 80 dec %edx
 81 jnz 4b
 82 andl $3,%ecx // mask down to #bytes at end (0..3)
 83 jz 6f // none
// Trailing 1..3 bytes.
 845:
 85 movb %al,(%edi) // zero a byte
 86 inc %edi
 87 dec %ecx
 88 jnz 5b
// Common exit: reload original ptr as the return value.
 896:
 90 movl 8(%ebp),%eax // get return value in case this was a call of memset()
 91 popl %edi
 92 popl %ebp
 93 ret
 94
 95
 96// We will be using SSE, so align ptr.
 97
 98LNotShort:
 99 movl %edi,%ecx
 100 negl %ecx
 101 andl $15,%ecx // mask down to #bytes to 16-byte align
 102 jz LDestAligned // already aligned
 103 subl %ecx,%edx // decrement length
 1040: // loop storing bytes to align the ptr
 105 movb %al,(%edi) // pack in a byte
 106 inc %edi
 107 dec %ecx
 108 jnz 0b
 109
 110// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
 111// %edx = length
 112// %edi = ptr
 113// %eax = zero
 114
 115LDestAligned:
 116 movl %edx,%ecx
 117 andl $63,%edx // mask down to residual length (0..63)
 118 andl $-64,%ecx // get #bytes we will zero in this loop
// NB: ecx >= 64 is guaranteed here because kShort(80) - 15 alignment
// bytes still leaves > 64, so the chunk loops below always terminate.
 119 pxor %xmm0,%xmm0 // zero an SSE register
 120 addl %ecx,%edi // increment ptr by length to move
 121 cmpl $(kVeryLong),%ecx // long enough to justify non-temporal stores?
 122 jae LVeryLong // yes
 123 negl %ecx // negate length to move
 124 jmp 1f
 125
 126// Loop over 64-byte chunks, storing into cache.
// edi = end of chunked region; ecx = negative offset, rises to 0.
 127
 128 .align 4,0x90 // keep inner loops 16-byte aligned
 1291:
 130 movdqa %xmm0,(%edi,%ecx) // store 64 zero bytes per iteration...
 131 movdqa %xmm0,16(%edi,%ecx)
 132 movdqa %xmm0,32(%edi,%ecx)
 133 movdqa %xmm0,48(%edi,%ecx) // ...using aligned 16-byte stores
 134 addl $64,%ecx
 135 jne 1b
 136
 137 jmp Lshort // zero the 0..63 residual bytes in %edx
 138
 139// Very long operands: use non-temporal stores to bypass cache.
 140
 141LVeryLong:
 142 negl %ecx // negate length to move
 143 jmp 1f
 144
 145 .align 4,0x90 // keep inner loops 16-byte aligned
 1461:
 147 movntdq %xmm0,(%edi,%ecx) // 64 zero bytes per iteration,
 148 movntdq %xmm0,16(%edi,%ecx) // written around the cache
 149 movntdq %xmm0,32(%edi,%ecx)
 150 movntdq %xmm0,48(%edi,%ecx)
 151 addl $64,%ecx
 152 jne 1b
 153
 154 sfence // required by non-temporal stores
 155 jmp Lshort // zero the 0..63 residual bytes in %edx
 156
 157
// Register this code for the commpage slot _COMM_PAGE_BZERO, selected
// when the CPU advertises kHasSSE2 (macro defined in commpage.h --
// presumably it emits the routine's length/capability record; confirm there).
 158 COMMPAGE_DESCRIPTOR(bzero_sse3,_COMM_PAGE_BZERO,kHasSSE2,0)