/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, i.e. Nehalem.
 * We don't actually use SSE4.2, but rather use it to identify Nehalem.
 *
 * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %eax before returning.
 */
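
/*
 * For reference, a rough C-level sketch of the behavior implemented below
 * (an illustration only, not part of the commpage build; the function name
 * is made up for the sketch):
 *
 *      void *commpage_bzero(void *b, size_t len)
 *      {
 *          memset(b, 0, len);  // zero len bytes starting at b
 *          return b;           // original ptr, kept in %eax so that a
 *      }                       // redirected memset(p,0,n) call still gets p back
 */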

#define kShort  80              // too short to bother with SSE (must be >=80)


        .text
        .align  5, 0x90
Lbzero_sse42:                           // void bzero(void *b, size_t len);
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %edi
        movl    8(%ebp),%edi            // get ptr
        movl    12(%ebp),%edx           // get length

        xorl    %eax,%eax               // set fill data to 0
        cmpl    $(kShort),%edx          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %edx = length
//      %edi = ptr
//      %eax = zero

Lshort:
        cmpl    $12,%edx                // long enough to word align?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%edi)             // zero an aligned doubleword
        addl    $4,%edi
        dec     %edx
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %ecx
        jnz     5b
6:
        movl    8(%ebp),%eax            // get return value in case this was a call of memset()
        popl    %edi
        popl    %ebp
        ret
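
/*
 * C-level sketch of the Lshort path above (an illustration only): for very
 * short operands we just store bytes; otherwise we store bytes until the ptr
 * is 4-byte aligned, then aligned doublewords, then 0..3 trailing bytes.
 *
 *      if (len < 12) {                         // too short to bother aligning
 *          while (len--) *p++ = 0;
 *      } else {
 *          while ((uintptr_t)p & 3) {          // bytes until doubleword aligned
 *              *p++ = 0; len--;
 *          }
 *          for (size_t n = len >> 2; n; n--) { // aligned doubleword stores
 *              *(uint32_t *)p = 0; p += 4;
 *          }
 *          for (len &= 3; len; len--)          // 0..3 trailing bytes
 *              *p++ = 0;
 *      }
 */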


// We will be using SSE, so align ptr.
//      %edx = length
//      %edi = ptr
//      %eax = zero

LNotShort:
        testl   $3,%edi                 // 4-byte aligned?
        jz      2f                      // yes
        movb    %al,(%edi)              // zero another byte
        incl    %edi
        decl    %edx
        jmp     LNotShort
1:                                      // zero doublewords until 16-byte aligned
        movl    %eax,(%edi)
        addl    $4,%edi
        subl    $4,%edx
2:
        testl   $15,%edi                // 16-byte aligned?
        jnz     1b                      // no
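
/*
 * C-level sketch of the alignment step above (an illustration only): byte
 * stores until the ptr is 4-byte aligned, then doubleword stores until it
 * is 16-byte aligned, reducing the remaining length as we go.
 *
 *      while ((uintptr_t)p & 3)  { *p++ = 0; len--; }
 *      while ((uintptr_t)p & 15) { *(uint32_t *)p = 0; p += 4; len -= 4; }
 */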


// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero

LDestAligned:
        movl    %edx,%ecx
        andl    $63,%edx                // mask down to residual length (0..63)
        andl    $-64,%ecx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addl    %ecx,%edi               // increment ptr by length to move
        negl    %ecx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%edi,%ecx)
        movdqa  %xmm0,16(%edi,%ecx)
        movdqa  %xmm0,32(%edi,%ecx)
        movdqa  %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        jmp     Lshort
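
/*
 * C-level sketch of the 64-byte loop above (an illustration only), written
 * with the SSE2 intrinsics that correspond to PXOR/MOVDQA. It uses the same
 * negative-index trick: the ptr is advanced past the chunked region first,
 * and a negated byte count walks back up to zero.
 *
 *      #include <emmintrin.h>
 *
 *      __m128i zero  = _mm_setzero_si128();
 *      size_t  chunk = len & ~(size_t)63;      // whole 64-byte chunks
 *      char   *end   = p + chunk;
 *      for (long i = -(long)chunk; i != 0; i += 64) {
 *          _mm_store_si128((__m128i *)(end + i     ), zero);
 *          _mm_store_si128((__m128i *)(end + i + 16), zero);
 *          _mm_store_si128((__m128i *)(end + i + 32), zero);
 *          _mm_store_si128((__m128i *)(end + i + 48), zero);
 *      }
 *      len &= 63;                              // residual handled by the Lshort path
 */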


        COMMPAGE_DESCRIPTOR(bzero_sse42,_COMM_PAGE_BZERO,kHasSSE4_2,0)