/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE3
 * and 64-byte cache lines.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %eax before returning.
 */

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (1024*1024)
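
/*
 * Illustrative sketch (not part of the build): the control flow below,
 * expressed as C, assuming the kShort/kVeryLong thresholds above.  The
 * helper names are hypothetical; the real paths are the labeled assembly
 * sections that follow.
 *
 *      void *bzero_sketch(void *p, size_t n)
 *      {
 *              unsigned char *d = p;
 *              if (n > kShort) {
 *                      while ((uintptr_t)d & 15) {     // byte-fill up to a 16-byte boundary
 *                              *d++ = 0;
 *                              n--;
 *                      }
 *                      size_t chunks = n & ~(size_t)63;        // whole 64-byte chunks
 *                      if (chunks >= kVeryLong)
 *                              zero_chunks_nontemporal(d, chunks);     // movntdq loop + sfence
 *                      else
 *                              zero_chunks_cached(d, chunks);          // movdqa loop
 *                      d += chunks;
 *                      n &= 63;
 *              }
 *              while (n--)                             // short operands and residual bytes
 *                      *d++ = 0;
 *              return p;       // memset() callers want the original ptr back
 *      }
 */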


        .text
        .align  5, 0x90
Lbzero_sse3:                            // void bzero(void *b, size_t len);
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %edi
        movl    8(%ebp),%edi            // get ptr
        movl    12(%ebp),%edx           // get length

        xorl    %eax,%eax               // set fill data to 0
        cmpl    $(kShort),%edx          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %edx = length
//      %edi = ptr
//      %eax = zero
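//
// If there are at least 16 bytes, we byte-fill up to a 4-byte boundary,
// store aligned doublewords, then finish the 0..3 trailing bytes;
// otherwise we simply store bytes one at a time.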

Lshort:
        cmpl    $16,%edx                // long enough to word align?
        jge     3f                      // yes
        test    %edx,%edx               // length==0?
        jz      6f
1:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
        jnz     1b
        jmp     6f
2:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %edx
3:
        test    $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %edx,%ecx               // copy length
        shrl    $2,%edx                 // #doublewords to store
4:
        movl    %eax,(%edi)             // zero an aligned doubleword
        addl    $4,%edi
        dec     %edx
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%edi)              // zero a byte
        inc     %edi
        dec     %ecx
        jnz     5b
6:
        movl    8(%ebp),%eax            // get return value in case this was a call of memset()
        popl    %edi
        popl    %ebp
        ret


// We will be using SSE, so align ptr.
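// The byte count needed to reach the next 16-byte boundary is (-ptr) & 15,
// which is what the negl/andl pair below computes (0 if already aligned).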

LNotShort:
        movl    %edi,%ecx
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
0:                                      // loop storing bytes to align the ptr
        movb    %al,(%edi)              // pack in a byte
        inc     %edi
        dec     %ecx
        jnz     0b

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//      %edx = length
//      %edi = ptr
//      %eax = zero

LDestAligned:
        movl    %edx,%ecx
        andl    $63,%edx                // mask down to residual length (0..63)
        andl    $-64,%ecx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addl    %ecx,%edi               // increment ptr by length to move
        cmpl    $(kVeryLong),%ecx       // long enough to justify non-temporal stores?
        jae     LVeryLong               // yes
        negl    %ecx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.
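// %ecx was negated above, so it counts up from -(chunk bytes) to zero;
// (%edi,%ecx) therefore walks the buffer forward, and a single addl/jne
// per iteration both advances the index and tests for completion.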

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%edi,%ecx)
        movdqa  %xmm0,16(%edi,%ecx)
        movdqa  %xmm0,32(%edi,%ecx)
        movdqa  %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        jmp     Lshort

// Very long operands: use non-temporal stores to bypass cache.
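// Operands of at least kVeryLong bytes would displace most of the cache if
// stored normally, so movntdq writes around the caches instead.  Because
// non-temporal stores are weakly ordered, the sfence after the loop is
// needed to order them with respect to later stores before we return.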

LVeryLong:
        negl    %ecx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%edi,%ecx)
        movntdq %xmm0,16(%edi,%ecx)
        movntdq %xmm0,32(%edi,%ecx)
        movntdq %xmm0,48(%edi,%ecx)
        addl    $64,%ecx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     Lshort


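// The descriptor below publishes this routine for the _COMM_PAGE_BZERO slot
// on processors reporting kHasSSE2; the stores used above (pxor, movdqa,
// movntdq) only require SSE2, despite the _sse3 suffix in the name.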
        COMMPAGE_DESCRIPTOR(bzero_sse3,_COMM_PAGE_BZERO,kHasSSE2,0)