2 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 * This file implements strcpy( ) for the x86_64 architecture.
33 /*****************************************************************************
35 *****************************************************************************/
42 .macro ClearFrameAndReturn
47 /*****************************************************************************
49 *****************************************************************************/
54 // char *strcpy(char * restrict d, const char * restrict s);
56 // copies the string s to d, and returns d. We look for NUL bytes using
57 // pcmpeqb on 16-byte aligned blocks. Although this may read past the
58 // end of the string, because all access is aligned, it will never
59 // read past the end of the string across a page boundary, or even
60 // across a cacheline.
64 // Load the 16-byte block containing the first byte of the string, and
65 // compare each byte to zero. If any NUL bytes are present in this
66 // block, the corresponding *bit* in eax will be set to 1.
72 // The 16 bytes that we checked for NUL included some bytes preceding
73 // the start of the string, if s is not 16-byte aligned. We create a
74 // mask based on the alignment of s which covers only those bits
75 // corresponding to bytes that do not precede s, and check for NULs
76 // only in those bits. If we find one, the string is too small to use
77 // a vector copy, so jump to a dedicated small-buffer implementation.
84 // Check the next 16-byte block for NUL. If none are found, that guarantees
85 // that the string is at least 16 bytes long, which means that we can use a
86 // single unaligned vector copy to handle any edging at the start of the
87 // string. If instead a NUL is found, fall into the byte-by-byte copy loop.
88 movdqa 16(%rsi), %xmm1
95 /*****************************************************************************
96 * GPR copy implementation *
97 *****************************************************************************/
99 // There is at least one NUL in the 32 aligned bytes containing the start
100 // of the string being copied. We assemble a bitmap for those 32 bytes from
101 // eax and edx, then shift it right by cl to throw out any bits preceding
102 // the start of the string. We can then identify the position of the
103 // first NUL byte using BSF.
109 // Restore the original source pointer, and copy the destination pointer
110 // to rax so that it is returned on exit.
117 // At this point we simply need to copy rdx + 1 bytes from rsi to rdi. If
118 // the length is >= 8, start by doing a word-by-word copy; otherwise, use
119 // a byte-by-byte copy loop.
120 sub $7, %rdx // 7 instead of 8 to account for NUL
122 0: mov (%rsi,%rdx),%rcx
123 mov %rcx, (%rdi,%rdx)
128 2: movzb -1(%rsi,%rdx),%rcx
129 movb %cl, -1(%rdi,%rdx)
132 3: ClearFrameAndReturn
135 /*****************************************************************************
136 * SSE copy implementation *
137 *****************************************************************************/
140 // Begin by doing a single unaligned vector copy for edging. We no longer
141 // have the original source pointer, but we can reconstruct it as rsi + rcx.
142 movdqu (%rsi,%rcx),%xmm0
144 // Next copy the original destination pointer to rax so that it is returned
145 // on exit, and adjust the destination pointer to correspond to rsi.
149 // Main copy loop: store the 16 bytes loaded in the previous iteration of the
150 // loop, as they are already known to not contain a NUL. Then load the next
151 // 16 bytes and check for NUL.
152 0: movdqu %xmm1, 16(%rdi,%rcx)
154 movdqa 16(%rsi,%rcx),%xmm1
161 // Cleanup: at least one of the bytes in the last 16 that were loaded was
162 // NUL. The corresponding bits of dx are set, and all other bits are zero.
163 // Thus, we can use BSF to find the position of the first NUL. Once we have
164 // this information, we use an unaligned copy that runs precisely up to this
165 // position to handle edging.
168 movdqu 1(%rsi,%rcx),%xmm0 // offset is 1 so that we copy the trailing
169 movdqu %xmm0, 1(%rdi,%rcx) // NUL byte as well.