]> git.saurik.com Git - apple/libc.git/blame - x86_64/string/strncpy.s
Libc-997.1.1.tar.gz
[apple/libc.git] / x86_64 / string / strncpy.s
CommitLineData
8e029c65
A
1/*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24#include <machine/cpu_capabilities.h>
25
26
27// *****************
28// * S T R N C P Y *
29// *****************
30//
// char *strncpy(char *dst, const char *src, size_t n);
32//
33// We optimize the move by doing it vector parallel. This introduces
34// a complication: if we blindly did vector load/stores until finding
35// a 0, we might get a spurious page fault by touching bytes past it.
36// To avoid this, we never do a load that crosses a page boundary,
37// and never store a byte we don't have to.
38//
39// We align the destination, because unaligned vector stores are slow.
40//
41// Recall that strncpy() zero fills the remainder of the dest buffer,
42// and does not terminate the string if its length is greater than or
43// equal to n.
44
45#define kShort 31 // too short to bother with vector loop
46
47 .text
48 .globl _strncpy
49
50 .align 4
_strncpy:       // char *strncpy(char *dst, const char *src, size_t n);
        // Entry: %rdi = dst, %rsi = src, %rdx = n.
        // %r8 holds the original dst for the rest of the routine (return value).
        movl    %edi,%ecx               // low 32 bits of dst suffice for the alignment math
        movq    %rdi,%r8                // remember dst so it can be returned at LDone
        negl    %ecx
        andl    $15,%ecx                // %ecx = (-dst) & 15 = bytes needed to 16-byte align dst
        jnz     LCheckShortCopy         // misaligned: byte-copy %ecx bytes first
                                        // aligned: fall through to LNextChunk
57
58
59// In order to avoid spurious page faults, we loop until nearing the source page
60// end. Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
61// then resume the vector loop.
62// %rsi = source ptr (unaligned)
63// %rdi = dest ptr (aligned)
64// %rdx = buffer length remaining
65
LNextChunk:     // reached by fall-through from entry and by jump from the byte loop
        // Compute how many whole 16-byte chunks can be loaded without leaving
        // either the current 4096-byte source page or the destination buffer.
        movl    $4096,%ecx
        movl    %esi,%eax               // low 32 bits of the source ptr
        andl    $4095,%eax              // %eax = offset of src within its page
        subl    %eax,%ecx               // %rcx = bytes left in the source page (1..4096)
        cmpq    %rcx,%rdx               // is the remaining buffer shorter than that?
        cmovb   %rdx,%rcx               // %rcx = min(buffer remaining, bytes to page end)
        shrl    $4,%ecx                 // convert bytes to 16-byte chunk count
        jnz     LLoopOverChunks         // at least one full chunk: use the vector loop
                                        // otherwise fall through to LCrossPage
75
76// We can't use the chunk loop yet. Check for short and empty buffers, then use byte loop.
77
LCrossPage:     // no full chunk available: step 16 bytes bytewise so dest stays aligned
        movl    $16,%ecx
LCheckShortCopy:                        // on entry %ecx = proposed byte-loop count
        cmpq    $(kShort),%rdx          // more than kShort bytes of buffer left?
        ja      LLoopOverBytes          // yes: copy %ecx bytes, then go back to chunks
        movl    %edx,%ecx               // no: let the byte loop handle the whole remainder
        testl   %ecx,%ecx               // anything left at all?
        jz      LDone                   // buffer already full
        jmp     LLoopOverBytes
87
88
89// Loop over bytes.
90// %rsi = source ptr
91// %rdi = dest ptr
92// %rdx = buffer length remaining
93// %rcx = count of bytes to loop over (<= buffer length)
94
95 .align 4,0x90 // align inner loops to optimize I-fetch
LLoopOverBytes:
        // Copy one byte per iteration, for at most %rcx iterations.
        // Leaves early (to LZeroBuffer) right after storing a 0 byte.
        movzbl  (%rsi),%eax             // fetch next source byte, zero-extended
        movb    %al,(%rdi)              // store it into the destination
        addq    $1,%rsi                 // advance source
        addq    $1,%rdi                 // advance destination
        subq    $1,%rdx                 // one less byte of buffer remaining
        testb   %al,%al                 // was that the terminator?
        jz      LZeroBuffer             // yes: string copied, zero-fill what is left
        subq    $1,%rcx                 // byte-loop budget exhausted?
        jnz     LLoopOverBytes          // no: keep copying

        testq   %rdx,%rdx               // budget spent; is the buffer full too?
        jz      LDone                   // yes: nothing more to do
        jmp     LNextChunk              // no: try the vector loop again
110
111
112// Loop over 16-byte chunks.
113// %rsi = source ptr (unaligned)
114// %rdi = dest ptr (aligned)
115// %rdx = buffer length remaining
116// %ecx = chunk count
117
118 .align 4,0x90 // align inner loops to optimize I-fetch
LLoopOverChunks:
        // Copy %ecx chunks of 16 bytes; bail out to LFound0 as soon as a
        // chunk contains a 0 byte (that chunk is NOT stored here).
        pxor    %xmm0,%xmm0             // %xmm0 = sixteen 0 bytes
        movdqu  (%rsi),%xmm1            // unaligned load of the next source chunk
        addq    $16,%rsi
        pcmpeqb %xmm1,%xmm0             // 0xFF in every lane where the source byte is 0
        pmovmskb %xmm0,%eax             // %eax bit i set <=> source byte i is 0
        testl   %eax,%eax
        jnz     LFound0                 // terminator inside this chunk
        movdqa  %xmm1,(%rdi)            // no 0s: aligned store of the whole chunk
        subq    $16,%rdx                // 16 fewer bytes of buffer remaining
        addq    $16,%rdi
        subl    $1,%ecx                 // chunks left?
        jnz     LLoopOverChunks

        jmp     LCrossPage              // out of chunks: continue bytewise, dest stays aligned
134
135
136// Found a zero in the vector. Figure out where it is, and store the bytes
137// up to it. It is possible that we should check to be sure (%rdx >= 16), and
138// just do an aligned store of %xmm1 if so. But if we did, we'd be doing byte
139// stores into the same double quadword in bzero(), which might hit a hazard.
140// Experimentation needed.
141// %rdi = dest ptr (aligned)
142// %eax = result mask
143// %rdx = buffer length remaining
144// %xmm1 = source vector
145
LFound0:
        // %eax = mask of 0 bytes in %xmm1 (nonzero). Store only the bytes that
        // precede the first 0, decomposing that count into 8 + 4 + (0..3) bytes.
        bsfl    %eax,%ecx               // %ecx = index of first 0 = bytes to store (0..15)
        subq    %rcx,%rdx               // those bytes come out of the remaining length
        testb   $8,%cl                  // count has the 8 bit?
        jz      LFound0Try4
        movq    %xmm1,(%rdi)            // store low 8 bytes
        addq    $8,%rdi
        psrldq  $8,%xmm1                // discard the bytes just stored
LFound0Try4:
        testb   $4,%cl                  // count has the 4 bit?
        jz      LFound0Tail
        movd    %xmm1,(%rdi)            // store low 4 bytes
        addq    $4,%rdi
        psrldq  $4,%xmm1                // discard the bytes just stored
LFound0Tail:
        andl    $3,%ecx                 // 0..3 bytes still to store
        jz      LZeroBuffer             // none: go zero-fill
        movd    %xmm1,%eax              // pull the remaining bytes into a GPR
LFound0TailByte:                        // at most three iterations
        movb    %al,(%rdi)              // store the lowest pending byte
        addq    $1,%rdi
        shrl    $8,%eax                 // bring the next byte down into %al
        subl    $1,%ecx
        jnz     LFound0TailByte
                                        // falls through to LZeroBuffer
170
// We've copied the string. Now zero the rest of the buffer by calling bzero().
//      %rdi = dest ptr (first byte to clear; already bzero's 1st argument)
//      %rdx = buffer length remaining
//      %r8  = original dest ptr (our return value)

LZeroBuffer:
        movq    %rdx,%rsi               // remaining buffer size (2nd argument)
        // %r8 is a caller-saved register, so the callee is free to overwrite it.
        // Saving it with a push also provides the 8-byte adjustment that makes
        // %rsp 16-byte aligned at the call.
        pushq   %r8                     // save return value + align stack to 16B
        call    _bzero
        popq    %r8                     // recover return value + restore stack

LDone:
        movq    %r8,%rax                // original dest ptr is return value
        ret