/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for Pentium-M class processors with SSE2
 * and 64-byte cache lines.  This is the 64-bit version.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %rax before returning.
 */
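/*
 * For reference, the caller-visible prototypes this entry point serves
 * (a sketch for the reader; they are declared elsewhere, not in this file):
 *
 *	void	 bzero(void *b, size_t len);		// no return value
 *	void	*memset(void *b, int c, size_t len);	// returns b
 *
 * Because a bzero() call may really have been memset(p,0,n), the original
 * ptr saved in %r11 is copied back into %rax on every return path.
 */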

#define kShort		80		// too short to bother with SSE (must be >=80)
#define kVeryLong	(1024*1024)

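// Size dispatch, as implemented below:
//	len <= kShort		- byte and doubleword stores, no SSE
//	len >  kShort		- 16-byte align ptr, then 64-byte movdqa chunks
//	chunk >= kVeryLong	- non-temporal movntdq stores, then sfence
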
// void	bzero(void *b, size_t len);

COMMPAGE_FUNCTION_START(bzero_sse2_64, 64, 5)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	xorl	%eax,%eax		// set fill data to 0
	movq	%rdi,%r11		// save original ptr as return value
	cmpq	$(kShort),%rsi		// long enough for SSE?
	jg	LNotShort		// yes

// Here for short operands or the end of long ones.
//      %esi = length (<= kShort)
//      %rdi = ptr
//      %eax = zero

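// The short path stores single bytes until the ptr is doubleword aligned,
// then zeroes aligned doublewords, and finishes with the 0..3 trailing bytes.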
Lshort:
	cmpl	$16,%esi		// long enough to word align?
	jge	3f			// yes
	test	%esi,%esi		// length==0?
	jz	6f
1:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%esi
	jnz	1b
	jmp	6f
2:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%esi
3:
	testl	$3,%edi			// is ptr doubleword aligned?
	jnz	2b			// no
	movl	%esi,%ecx		// copy length
	shrl	$2,%esi			// #doublewords to store
4:
	movl	%eax,(%rdi)		// zero an aligned doubleword
	addq	$4,%rdi
	decl	%esi
	jnz	4b
	andl	$3,%ecx			// mask down to #bytes at end (0..3)
	jz	6f			// none
5:
	movb	%al,(%rdi)		// zero a byte
	incq	%rdi
	decl	%ecx
	jnz	5b
6:
	movq	%r11,%rax		// set return value in case this was a call of memset()
	popq	%rbp
	ret


// We will be using SSE, so align ptr.
//      %rsi = length (> kShort)
//      %rdi = ptr
//      %eax = zero
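//
// At most 15 byte stores are needed here; afterwards %rdi is 16-byte
// aligned, so the movdqa stores in the loop below cannot fault.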

LNotShort:
	movl	%edi,%ecx		// get #bytes to 16-byte align ptr
	negl	%ecx
	andl	$15,%ecx
	jz	LDestAligned		// already aligned
	subq	%rcx,%rsi		// decrement length
0:					// loop storing bytes to align the ptr
	movb	%al,(%rdi)		// pack in a byte
	incq	%rdi
	decl	%ecx
	jnz	0b

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//      %rsi = length (> (kShort-15))
//      %rdi = ptr (aligned)
//      %eax = zero
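//
// Because kShort >= 80, at least 64 bytes remain even after alignment,
// so the chunk count computed below (%rcx) is always >= 64.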

LDestAligned:
	movq	%rsi,%rcx
	andl	$63,%esi		// mask down to residual length (0..63)
	andq	$-64,%rcx		// get #bytes we will zero in this loop
	pxor	%xmm0,%xmm0		// zero an SSE register
	addq	%rcx,%rdi		// increment ptr by length to move
	cmpq	$(kVeryLong),%rcx	// long enough to justify non-temporal stores?
	jae	LVeryLong		// yes
	negq	%rcx			// negate length to move
	jmp	1f

// Loop over 64-byte chunks, storing into cache.
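// %rdi was advanced past the chunked region and %rcx holds the negated
// chunk byte count, so (%rdi,%rcx) starts at the first chunk and the
// addq below walks %rcx up to zero, ending the loop with no extra cmp.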

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm0,16(%rdi,%rcx)
	movdqa	%xmm0,32(%rdi,%rcx)
	movdqa	%xmm0,48(%rdi,%rcx)
	addq	$64,%rcx
	jne	1b

	jmp	Lshort

// Very long operands: use non-temporal stores to bypass cache.
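// Operands of kVeryLong (1 MB) or more are unlikely to fit in cache, so
// movntdq writes around the cache instead of displacing data that is in use.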

LVeryLong:
	negq	%rcx			// negate length to move
	jmp	1f

	.align	4,0x90			// keep inner loops 16-byte aligned
1:
	movntdq	%xmm0,(%rdi,%rcx)
	movntdq	%xmm0,16(%rdi,%rcx)
	movntdq	%xmm0,32(%rdi,%rcx)
	movntdq	%xmm0,48(%rdi,%rcx)
	addq	$64,%rcx
	jne	1b

	sfence				// required by non-temporal stores
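					// (movntdq stores are weakly ordered;
					//  the sfence makes them globally
					//  visible before we return)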
	jmp	Lshort

COMMPAGE_DESCRIPTOR(bzero_sse2_64,_COMM_PAGE_BZERO,kHasSSE2,kHasSSE4_2)