// osfmk/i386/commpage/bzero_sse42.s (xnu-1228.15.4)
/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, i.e. Nehalem.
 * We don't actually execute any SSE4.2 instructions; the SSE4.2 feature bit
 * simply identifies Nehalem-class processors.
 *
 * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %eax before returning.
 */
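
/*
 * Rough C sketch of the control flow below (illustrative only; not part of
 * the build, and the helper name is made up):
 *
 *	void *bzero_sketch(void *b, size_t n)	// memset(b,0,n) semantics
 *	{
 *		unsigned char *p = b;
 *		if (n > kShort) {
 *			while ((uintptr_t)p & 15) {	// store bytes until 16-byte aligned
 *				*p++ = 0; n--;		// (the asm also uses dword stores here)
 *			}
 *			for ( ; n >= 64; n -= 64, p += 64)	// 64-byte MOVDQA chunks
 *				__builtin_memset(p, 0, 64);
 *		}
 *		while (n--)				// short operands and residual bytes
 *			*p++ = 0;
 *		return b;				// original ptr, kept in %eax
 *	}
 */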

#define kShort	80			// too short to bother with SSE (must be >=80)
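					// (rationale, as far as can be inferred: the long-operand
					// path may spend up to 15 bytes aligning the pointer, and
					// its 64-byte loop always runs at least once, so it needs
					// at least 79 bytes to be safe)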


	.text
	.align	5, 0x90
Lbzero_sse42:				// void bzero(void *b, size_t len);
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%edi			// preserve caller's %edi, which we use as the ptr
	movl	8(%ebp),%edi		// get ptr
	movl	12(%ebp),%edx		// get length

	xorl	%eax,%eax		// set fill data to 0
	cmpl	$(kShort),%edx		// long enough for SSE?
	jg	LNotShort		// yes

// Here for short operands or the end of long ones.
//      %edx = length
//      %edi = ptr
//      %eax = zero

Lshort:
	cmpl	$12,%edx		// long enough to word align?
	jge	3f			// yes
	test	%edx,%edx		// length==0?
	jz	6f			// yes, done
1:
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%edx			// more to do?
	jnz	1b			// yes, loop
	jmp	6f			// done
2:
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%edx
3:
	test	$3,%edi			// is ptr doubleword aligned?
	jnz	2b			// no
	movl	%edx,%ecx		// copy length
	shrl	$2,%edx			// #doublewords to store
4:
	movl	%eax,(%edi)		// zero an aligned doubleword
	addl	$4,%edi
	dec	%edx			// more doublewords?
	jnz	4b			// yes, loop
	andl	$3,%ecx			// mask down to #bytes at end (0..3)
	jz	6f			// none
5:
	movb	%al,(%edi)		// zero a byte
	inc	%edi
	dec	%ecx			// more trailing bytes?
	jnz	5b			// yes, loop
6:
	movl	8(%ebp),%eax		// get return value in case this was a call of memset()
	popl	%edi
	popl	%ebp
	ret
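
/*
 * Illustrative callers (hypothetical, not part of this file): per the header
 * comment, both calls below can end up in this routine, and the memset()
 * return value relies on %eax holding the original pointer.
 *
 *	char buf[256];
 *	bzero(buf, sizeof(buf));
 *	void *p = memset(buf, 0, sizeof(buf));	// p == buf
 */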


// We will be using SSE, so align ptr.
//      %edx = length
//      %edi = ptr
//      %eax = zero

LNotShort:
	testl	$3,%edi			// 4-byte aligned?
	jz	2f			// yes
	movb	%al,(%edi)		// zero another byte
	incl	%edi
	decl	%edx
	jmp	LNotShort
1:					// zero doublewords until 16-byte aligned
	movl	%eax,(%edi)
	addl	$4,%edi
	subl	$4,%edx
2:
	testl	$15,%edi		// 16-byte aligned?
	jnz	1b			// no
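
/*
 * Worked example (illustrative): for ptr = 0x1003 the code above stores
 * 1 byte (ptr -> 0x1004), then 3 doublewords (ptr -> 0x1010), spending
 * 13 bytes; at most 3 + 12 = 15 bytes are ever used for alignment.
 */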


// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero

LDestAligned:
	movl	%edx,%ecx		// copy length
	andl	$63,%edx		// mask down to residual length (0..63)
	andl	$-64,%ecx		// get #bytes we will zero in this loop
	pxor	%xmm0,%xmm0		// zero an SSE register
	addl	%ecx,%edi		// increment ptr by length to move
	negl	%ecx			// negate length to move
	jmp	1f			// enter the store loop
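
/*
 * Worked example of the negative-index trick above (illustrative): to zero
 * 128 bytes, %ecx = 128, %edi is advanced 128 bytes past the start, and
 * %ecx is negated to -128.  The first pass of the loop below then stores at
 * offsets 0..48 of the original pointer, the second at 64..112, and %ecx
 * reaches 0, ending the loop without a separate counter register.
 */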

// Loop over 64-byte chunks, storing into cache.

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movdqa	%xmm0,(%edi,%ecx)
	movdqa	%xmm0,16(%edi,%ecx)
	movdqa	%xmm0,32(%edi,%ecx)
	movdqa	%xmm0,48(%edi,%ecx)
	addl	$64,%ecx		// advance to next 64-byte chunk
	jne	1b			// loop until %ecx reaches 0

	jmp	Lshort			// zero the 0..63 residual bytes left in %edx, then return


	// Publish this variant at the _COMM_PAGE_BZERO slot for CPUs that
	// advertise SSE4.2 (no capability bits are required to be clear).
	COMMPAGE_DESCRIPTOR(bzero_sse42,_COMM_PAGE_BZERO,kHasSSE4_2,0)