]> git.saurik.com Git - apple/libc.git/blame_incremental - i386/string/bzero_sse2.s
Libc-763.11.tar.gz
[apple/libc.git] / i386 / string / bzero_sse2.s
... / ...
CommitLineData
1/*
2 * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <machine/cpu_capabilities.h>
30#include <platfunc.h>
31
/*
 * Bzero, tuned for Pentium-M class processors with SSE2
 * and 64-byte cache lines.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset().  As a result,
 * we always load the original ptr into %eax before returning.
 *
 * Syntax:	AT&T.  Target: i386, arguments passed on the stack.
 *
 * In:		4(%esp) = b	(destination pointer)
 *		8(%esp) = len	(size_t, so every length compare is UNSIGNED)
 * Out:		%eax    = b
 * Clobbers:	%ecx, %edx, %xmm0, flags
 * Preserves:	%ebp, %edi (saved and restored below)
 *
 * Strategy:
 *   len <= kShort	byte/doubleword stores only (Lshort)
 *   len >  kShort	byte stores up to a 16-byte boundary, then 64-byte
 *			chunks with movdqa (or movntdq once the chunked part
 *			reaches kVeryLong), then Lshort for the 0..63 residue.
 */

// kShort must be >= 80: the SSE paths first peel off up to 15 bytes to reach
// 16-byte alignment and then run a store loop that tests its counter only at
// the bottom, so at least one full 64-byte chunk must remain at that point.
#define kShort		80		// too short to bother with SSE (must be >=80)

// Chunked length (multiple of 64) at or above which the non-temporal
// store loop is used instead of the cached one.
#define kVeryLong	(1024*1024)

// void bzero(void *b, size_t len);

// NOTE(review): the trailing macro arguments (32, 5) are interpreted by
// platfunc.h, which is not visible here -- confirm their meaning there.
PLATFUNC_FUNCTION_START(bzero, sse2, 32, 5)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%edi			// %edi is used as the running store pointer
	movl	8(%ebp),%edi		// get ptr
	movl	12(%ebp),%edx		// get length

	xorl	%eax,%eax		// set fill data to 0
	cmpl	$(kShort),%edx		// long enough for SSE?
	ja	LNotShort		// yes (unsigned: a len >= 2^31 must not look negative)

// Here for short operands or the end of long ones.
//	%edx = length (0..kShort)
//	%edi = ptr
//	%eax = zero

Lshort:
	cmpl	$16,%edx		// long enough to word align?
	jae	LAlignDoubleword	// yes
	testl	%edx,%edx		// length==0?
	jz	LDone

LByteLoop:				// 1..15 bytes: just store them one at a time
	movb	%al,(%edi)		// zero a byte
	incl	%edi
	decl	%edx
	jnz	LByteLoop
	jmp	LDone

LAlignByte:				// at most 3 iterations
	movb	%al,(%edi)		// zero a byte
	incl	%edi
	decl	%edx
LAlignDoubleword:
	testl	$3,%edi			// is ptr doubleword aligned?
	jnz	LAlignByte		// no
	// At least 13 bytes remain here (>=16 on entry, <=3 used for alignment),
	// so the doubleword count below is never zero.
	movl	%edx,%ecx		// copy length
	shrl	$2,%edx			// #doublewords to store

LDoublewordLoop:
	movl	%eax,(%edi)		// zero an aligned doubleword
	addl	$4,%edi
	decl	%edx
	jnz	LDoublewordLoop
	andl	$3,%ecx			// mask down to #bytes at end (0..3)
	jz	LDone			// none

LTailByteLoop:
	movb	%al,(%edi)		// zero a byte
	incl	%edi
	decl	%ecx
	jnz	LTailByteLoop

LDone:
	movl	8(%ebp),%eax		// get return value in case this was a call of memset()
	popl	%edi
	popl	%ebp
	ret


// We will be using SSE, so align ptr.
//	%edx = length (> kShort)
//	%edi = ptr
//	%eax = zero

LNotShort:
	movl	%edi,%ecx
	negl	%ecx
	andl	$15,%ecx		// mask down to #bytes to 16-byte align
	jz	LDestAligned		// already aligned
	subl	%ecx,%edx		// decrement length

LAlign16Loop:				// loop storing bytes to align the ptr
	movb	%al,(%edi)		// pack in a byte
	incl	%edi
	decl	%ecx
	jnz	LAlign16Loop

// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
//	%edx = length (>= 64, see kShort)
//	%edi = ptr
//	%eax = zero
//
// Both store loops index with a negative offset that counts up to zero:
// %edi is advanced past the chunked region and %ecx = -(chunked bytes).

LDestAligned:
	movl	%edx,%ecx
	andl	$63,%edx		// mask down to residual length (0..63)
	andl	$-64,%ecx		// get #bytes we will zero in this loop
	pxor	%xmm0,%xmm0		// zero an SSE register
	addl	%ecx,%edi		// increment ptr by length to move
	cmpl	$(kVeryLong),%ecx	// long enough to justify non-temporal stores?
	jae	LVeryLong		// yes
	negl	%ecx			// negate length to move
	jmp	LCachedLoop		// skip over the alignment padding

// Loop over 64-byte chunks, storing into cache.

	.align	4,0x90			// keep inner loops 16-byte aligned
LCachedLoop:
	movdqa	%xmm0,(%edi,%ecx)
	movdqa	%xmm0,16(%edi,%ecx)
	movdqa	%xmm0,32(%edi,%ecx)
	movdqa	%xmm0,48(%edi,%ecx)
	addl	$64,%ecx
	jne	LCachedLoop

	jmp	Lshort			// finish the 0..63 residual bytes

// Very long operands: use non-temporal stores to bypass cache.

LVeryLong:
	negl	%ecx			// negate length to move
	jmp	LNonTemporalLoop	// skip over the alignment padding

	.align	4,0x90			// keep inner loops 16-byte aligned
LNonTemporalLoop:
	movntdq	%xmm0,(%edi,%ecx)
	movntdq	%xmm0,16(%edi,%ecx)
	movntdq	%xmm0,32(%edi,%ecx)
	movntdq	%xmm0,48(%edi,%ecx)
	addl	$64,%ecx
	jne	LNonTemporalLoop

	sfence				// required by non-temporal stores
	jmp	Lshort			// finish the 0..63 residual bytes

// NOTE(review): presumably kHasSSE2 is the capability this variant requires
// and kHasSSE4_2 the one that excludes it -- confirm against platfunc.h.
PLATFUNC_DESCRIPTOR(bzero,sse2,kHasSSE2,kHasSSE4_2)