]> git.saurik.com Git - apple/libc.git/blob - i386/string/strncpy.s
Libc-997.1.1.tar.gz
[apple/libc.git] / i386 / string / strncpy.s
1 /*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 #include <machine/cpu_capabilities.h>
25
26
27 // *****************
28 // * S T R N C P Y *
29 // *****************
30 //
31 // char *strncpy(const char *dst, const char *src, size_t n);
32 //
33 // We optimize the move by doing it vector parallel. This introduces
34 // a complication: if we blindly did vector load/stores until finding
35 // a 0, we might get a spurious page fault by touching bytes past it.
36 // To avoid this, we never do a load that crosses a page boundary,
37 // and never store a byte we don't have to.
38 //
39 // We align the destination, because unaligned vector stores are slow.
40 //
41 // Recall that strncpy() zero fills the remainder of the dest buffer,
42 * and does not terminate the string if its length is greater than or
43 // equal to n.
44
45 #define kShort 31 // too short to bother with vector loop
46
47 .text
48 .globl _strncpy
49
50 .align 4
// Register roles throughout the routine:
//   %edi = dest ptr        %esi = source ptr
//   %ecx = buffer bytes remaining
//   %edx = scratch (align count / chunk count / NUL index)
//   %eax = scratch (byte temp / zero-mask)
// i386 ABI note: %edi/%esi are callee-saved, hence the two pushes below;
// after them the stack args dst/src/n sit at 12/16/20(%esp).
51 _strncpy: // char *strncpy(const char *dst, const char *src, size_t n);
52 pushl %edi
53 pushl %esi
54 movl 12(%esp),%edi // get dest ptr
55 movl 16(%esp),%esi // get source ptr
56 movl 20(%esp),%ecx // get length
57 movl %edi,%edx // copy dest ptr
58 negl %edx // -dst: low 4 bits = (16 - dst) mod 16
59 andl $15,%edx // how many bytes to align dest ptr?
60 jnz LCheckShortCopy // align destination first
61
62
63 // In order to avoid spurious page faults, we loop until nearing the source page
64 // end. Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
65 // then resume the vector loop.
66 // %esi = source ptr (unaligned)
67 // %edi = dest ptr (aligned)
68 // %ecx = buffer length remaining
69
70 LNextChunk: // NB: can drop down to here
71 movl %esi,%eax // copy source ptr
72 movl $4096,%edx // page size; loads never cross a 4 KB page
73 andl $4095,%eax // get offset into source page
74 subl %eax,%edx // get #bytes remaining in source page
75 cmpl %ecx,%edx // will buffer run out before the page end?
76 cmova %ecx,%edx // get min(length remaining, bytes to page end)
77 shrl $4,%edx // get #chunks till end of page
78 jnz LLoopOverChunks // enter vector loop
79
80 // We can't use the chunk loop yet. Check for short and empty buffers, then use byte loop.
81
82 LCrossPage: // if buffer is large enough, cross source page
83 movl $16,%edx // move 16 bytes to cross page but keep dest aligned
84 LCheckShortCopy: // we propose to copy %edx bytes in byte loop
85 cmpl $(kShort),%ecx // much left?
86 ja LLoopOverBytes // yes, loop over bytes then more chunks
87 movl %ecx,%edx // no, use the byte loop for everything
88 testl %ecx,%ecx // have we filled buffer?
89 jnz LLoopOverBytes // no
90 jmp LDone
91
92
93 // Loop over bytes.
94 // %esi = source ptr
95 // %edi = dest ptr
96 // %ecx = buffer length remaining
97 // %edx = count of bytes to loop over (<= buffer length)
98
99 .align 4,0x90 // align inner loops to optimize I-fetch
100 LLoopOverBytes:
101 movzb (%esi),%eax // get source byte (zero-extended, so testl below works)
102 inc %esi
103 dec %ecx // decrement length
104 movb %al,(%edi) // pack into dest
105 inc %edi
106 testl %eax,%eax // 0?
107 jz LZeroBuffer // yes, we're done copying string
108 dec %edx // more to go?
109 jnz LLoopOverBytes
110
111 testl %ecx,%ecx // at end of buffer?
112 jnz LNextChunk // no, xfer chunks
113 jmp LDone // yes
114
115
116 // Loop over 16-byte chunks.
117 // %esi = source ptr (unaligned)
118 // %edi = dest ptr (aligned)
119 // %ecx = buffer length remaining
120 // %edx = chunk count
121
122 .align 4,0x90 // align inner loops to optimize I-fetch
123 LLoopOverChunks:
124 movdqu (%esi),%xmm1 // get source (unaligned load, never crosses page here)
125 pxor %xmm0,%xmm0 // get some 0s
126 addl $16,%esi
127 pcmpeqb %xmm1,%xmm0 // compare source to 0s (0xFF per matching byte)
128 pmovmskb %xmm0,%eax // get result mask for 0 check
129 testl %eax,%eax // any 0s?
130 jnz LFound0 // yes, exit loop
131 movdqa %xmm1,(%edi) // no 0s so do aligned store into destination
132 addl $16,%edi
133 subl $16,%ecx // decrement length remaining
134 dec %edx // more to go?
135 jnz LLoopOverChunks
136
137 jmp LCrossPage // cross page but keep dest aligned
138
139
140 // Found a zero in the vector. Figure out where it is, and store the bytes
141 // up to it. It is possible that we should check to be sure (%ecx >= 16), and
142 // just do an aligned store of %xmm1 if so. But if we did, we'd be doing byte
143 // stores into the same double quadword in bzero(), which might hit a hazard.
144 // Experimentation needed.
145 // %edi = dest ptr (aligned)
146 // %eax = result mask
147 // %ecx = buffer length remaining
148 // %xmm1 = source vector
149
// Store decomposition: %edx = NUL index (0..15) = 8*bit3 + 4*bit2 + (0..3),
// so we emit an optional 8-byte store, an optional 4-byte store, then up to
// three single bytes — never touching the NUL or anything past it.
150 LFound0:
151 bsf %eax,%edx // find first 0 (%edx = NUL byte index, 0..15)
152 subl %edx,%ecx // decrement remaining buffer length
153 test $8,%dl // 8-byte store required?
154 jz 4f // no
155 movq %xmm1,(%edi) // pack in 8 low bytes
156 psrldq $8,%xmm1 // then shift vector down 8 bytes
157 addl $8,%edi
158 4:
159 test $4,%dl // 4-byte store required?
160 jz 3f // no
161 movd %xmm1,(%edi) // pack in 4 low bytes
162 psrldq $4,%xmm1 // then shift vector down 4 bytes
163 addl $4,%edi
164 3:
165 andl $3,%edx // more to go?
166 jz LZeroBuffer // no
167 movd %xmm1,%eax // move remainders out of vector into %eax
168 1: // loop on up to three bytes
169 movb %al,(%edi) // pack in next byte
170 shrl $8,%eax // shift next byte into position
171 inc %edi
172 dec %edx
173 jnz 1b
174
175 // We've copied the string. Now zero the rest of the buffer, using commpage bzero().
176 // %edi = dest ptr
177 // %ecx = buffer length remaining
178
179 LZeroBuffer:
180 // The stack currently is aligned to 4 mod 16 (it was 0 mod 16 at the time of
181 // the call, and the return address, edi, and esi have been pushed). It needs
182 // to aligned 0 mod 16 when we call bzero, so we subtract 20 from esp (not 4
183 // because we need to have 8 bytes for the arguments to bzero).
184 subl $20,%esp
185 movl %ecx,4(%esp) // remaining buffer size
186 movl %edi, (%esp) // pointer to first unstored byte
187 call _bzero
188 addl $20,%esp
189
190 LDone:
191 movl 12(%esp),%eax // original dest ptr is return value (args still at +8: esi/edi pushed)
192 popl %esi
193 popl %edi
194 ret