/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, i.e. Nehalem.
 * We don't actually use SSE4.2, but rather use it to identify Nehalem.
 * This is the 64-bit version.
 *
 * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS.
 *
 * This routine is also used for memset(p,0,n), which is a common case
 * since gcc sometimes silently maps bzero() into memset(). As a result,
 * we always load the original ptr into %rax before returning.
 */
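
/*
 * In C terms the single entry point below serves both callers (a sketch of
 * the contract, not the exact libc prototypes):
 *
 *      void  bzero(void *b, size_t len);          // return value ignored
 *      void *memset(void *b, int c, size_t len);  // c is always 0 here; must return b
 *
 * Because memset() must return its first argument, %rdi is saved in %r11 on
 * entry and copied into %rax on every exit path.
 */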

#define kShort          80              // too short to bother with SSE (must be >=80)
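/*
 * Why the floor matters (a sketch of the reasoning, inferred from the code
 * below): the SSE path may spend up to 15 bytes aligning the pointer to a
 * 16-byte boundary and then assumes at least one full 64-byte chunk remains,
 * so operands at or below the threshold go to the simple scalar path instead.
 */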


        .text
        .code64
        .align  5, 0x90
Lbzero_sse42_64:                        // void bzero(void *b, size_t len);
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        xorl    %eax,%eax               // set fill data to 0
        movq    %rdi,%r11               // save original ptr as return value
        cmpq    $(kShort),%rsi          // long enough for SSE?
        jg      LNotShort               // yes

// Here for short operands or the end of long ones.
//      %esi = length (<= kShort)
//      %rdi = ptr
//      %eax = zero

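// The short path works in three stages: zero single bytes until the pointer
// is 4-byte aligned, store aligned doublewords, then finish the last 0..3
// bytes. Operands shorter than 12 bytes are not worth aligning and are done
// entirely bytewise.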
Lshort:
        cmpl    $12,%esi                // long enough to word align?
        jge     3f                      // yes
        test    %esi,%esi               // length==0?
        jz      6f
1:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %esi
        jnz     1b
        jmp     6f
2:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %esi
3:
        testl   $3,%edi                 // is ptr doubleword aligned?
        jnz     2b                      // no
        movl    %esi,%ecx               // copy length
        shrl    $2,%esi                 // #doublewords to store
4:
        movl    %eax,(%rdi)             // zero an aligned doubleword
        addq    $4,%rdi
        decl    %esi
        jnz     4b
        andl    $3,%ecx                 // mask down to #bytes at end (0..3)
        jz      6f                      // none
5:
        movb    %al,(%rdi)              // zero a byte
        incq    %rdi
        decl    %ecx
        jnz     5b
6:
        movq    %r11,%rax               // set return value in case this was a call of memset()
        popq    %rbp
        ret


// We will be using SSE, so align ptr.
//      %rsi = length (> kShort)
//      %rdi = ptr
//      %eax = zero

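// Alignment happens in two phases: single bytes until the pointer is 4-byte
// aligned (at most 3 of them), then doublewords until it is 16-byte aligned
// (at most 12 more), so no more than 15 bytes are consumed before the
// 64-byte loop.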
LNotShort:
        testl   $3,%edi                 // 4-byte aligned?
        jz      2f                      // yes
        movb    %al,(%rdi)              // zero another byte
        incq    %rdi
        decq    %rsi
        jmp     LNotShort
1:                                      // zero doublewords until 16-byte aligned
        movl    %eax,(%rdi)
        addq    $4,%rdi
        subq    $4,%rsi
2:
        testl   $15,%edi                // 16-byte aligned?
        jnz     1b                      // no

// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks.
//      %rsi = length (> (kShort-15))
//      %rdi = ptr (aligned)
//      %eax = zero

LDestAligned:
        movq    %rsi,%rcx
        andl    $63,%esi                // mask down to residual length (0..63)
        andq    $-64,%rcx               // get #bytes we will zero in this loop
        pxor    %xmm0,%xmm0             // zero an SSE register
        addq    %rcx,%rdi               // increment ptr by length to move
        negq    %rcx                    // negate length to move
        jmp     1f

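// The stores below address memory as (%rdi,%rcx): %rdi has already been
// advanced to the end of the 64-byte-aligned region and %rcx holds the
// negative byte count, so a single ADDQ/JNE pair both steps the index and
// tests for the end of the region.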
// Loop over 64-byte chunks, storing into cache.

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

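// Finish the 0..63 residual bytes (left in %esi) with the scalar code at
// Lshort, which also sets up the return value.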
        jmp     Lshort


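// Register this variant in the commpage, keyed on SSE4.2 support. As noted
// in the header comment, SSE4.2 is only used as a CPU-family marker for
// Nehalem; the instructions actually used (PXOR, MOVDQA) are plain SSE2.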
        COMMPAGE_DESCRIPTOR(bzero_sse42_64,_COMM_PAGE_BZERO,kHasSSE4_2,0)