]> git.saurik.com Git - apple/libc.git/blob - x86_64/string/strcpy.s
Libc-1244.1.7.tar.gz
[apple/libc.git] / x86_64 / string / strcpy.s
1 /*
2 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 * This file implements strcpy( ) for the x86_64 architecture.
29 */
30
31 .globl _strcpy
32
33 /*****************************************************************************
34 * Macros *
35 *****************************************************************************/
36
//  EstablishFrame: standard frame-pointer prologue.  Saves the caller's rbp
//  on the stack and points rbp at the new frame.  Must be paired with
//  ClearFrameAndReturn on every exit path.
.macro EstablishFrame
    pushq   %rbp
    movq    %rsp,   %rbp
.endm
41
//  ClearFrameAndReturn: epilogue matching EstablishFrame.  Restores the
//  caller's rbp from the top of the stack and returns.  rsp must point at
//  the saved rbp when this is expanded.
.macro ClearFrameAndReturn
    popq    %rbp
    retq
.endm
46
47 /*****************************************************************************
48 * Entrypoint *
49 *****************************************************************************/
50
.text
.align 5
_strcpy:
//  char *strcpy(char * restrict d, const char * restrict s);
//
//  In:       rdi = d (destination), rsi = s (NUL-terminated source)
//  Out:      rax = d
//  Clobbers: rcx, rdx, rsi, rdi, xmm0, xmm1, flags; the short-string path
//            additionally calls _memcpy, which may clobber any other
//            caller-saved register.
//
//  Copies the string s to d, and returns d.  We look for NUL bytes using
//  pcmpeqb on 16-byte aligned blocks.  Although this may read past the
//  end of the string, every load that can do so is aligned, so it will
//  never read past the end of the string across a page boundary, or even
//  across a cacheline.
    EstablishFrame
    mov     %rsi,   %rcx            // keep s; its low 4 bits are needed below

//  Load the 16-byte block containing the first byte of the string, and
//  compare each byte to zero.  If any NUL bytes are present in this
//  block, the corresponding *bit* in eax will be set to 1.
    and     $-16,   %rsi            // rsi = s rounded down to 16 bytes
    pxor    %xmm0,  %xmm0
    pcmpeqb (%rsi), %xmm0
    pmovmskb %xmm0, %eax

//  The 16 bytes that we checked for NUL included some bytes preceding
//  the start of the string, if s is not 16-byte aligned.  We create a
//  mask based on the alignment of s which covers only those bits
//  corresponding to bytes that do not precede s, and check for NULs
//  only in those bits.  If we find one, the string is too small to use
//  a vector copy, so jump to the dedicated small-buffer implementation.
    and     $0xf,   %rcx            // rcx = s & 15 (offset of s in its block)
    or      $-1,    %rdx            // rdx = all ones
    shl     %cl,    %rdx            // clear the bits for bytes before s
    and     %edx,   %eax
    jnz     L_strcpyGPR

//  Check the next 16-byte block for NUL.  If none are found, that guarantees
//  that the string is at least 16 bytes long, which means that we can use a
//  single unaligned vector copy to handle any edging at the start of the
//  string.  If instead a NUL is found, fall into the short-string path.
//  xmm1 keeps the loaded block; the SSE loop stores it on its first pass.
    movdqa  16(%rsi), %xmm1
    pxor    %xmm0,  %xmm0
    pcmpeqb %xmm1,  %xmm0
    pmovmskb %xmm0, %edx
    test    %edx,   %edx
    jz      L_strcpySSE

/*****************************************************************************
 *  GPR copy implementation                                                  *
 *****************************************************************************/

//  There is at least one NUL in the 32 aligned bytes containing the start
//  of the string being copied.  We assemble a bitmap for those 32 bytes from
//  eax and edx (eax is zero on this fall-through path, because the jnz above
//  was not taken), then shift it right by cl to throw out any bits preceding
//  the start of the string.  We can then identify the position of the
//  first NUL byte using BSF.
    shl     $16,    %edx
    or      %edx,   %eax
L_strcpyGPR:
    shr     %cl,    %eax
    bsf     %eax,   %edx            // edx = index of first NUL = strlen(s)
//  Restore the original source pointer and hand the copy of strlen(s) + 1
//  bytes (the string plus its NUL) to memcpy.  memcpy returns its
//  destination argument in rax, which is exactly the d we must return, so
//  rax does not need to be set up here.
    add     %rcx,   %rsi            // rsi = s
    add     $1,     %rdx            // rdx = byte count including the NUL
    call    _memcpy
    ClearFrameAndReturn
//  (An earlier inline word-by-word / byte-by-byte copy loop for this path
//  was superseded by the call to _memcpy above.)

/*****************************************************************************
 *  SSE copy implementation                                                  *
 *****************************************************************************/

L_strcpySSE:
//  Begin by doing a single unaligned vector copy for edging.  We no longer
//  have the original source pointer, but we can reconstruct it as rsi + rcx.
    movdqu  (%rsi,%rcx), %xmm0
    movdqu  %xmm0,  (%rdi)
//  Next copy the original destination pointer to rax so that it is returned
//  on exit, and adjust the destination pointer to correspond to rsi.
    mov     %rdi,   %rax
    sub     %rcx,   %rdi            // rdi + k now pairs with rsi + k
    xor     %ecx,   %ecx            // rcx = 0: running block offset
//  Main copy loop: store the 16 bytes loaded in the previous iteration of the
//  loop, as they are already known to not contain a NUL.  Then load the next
//  16 bytes and check for NUL.
0:  movdqu  %xmm1,  16(%rdi,%rcx)
    add     $16,    %rcx
    movdqa  16(%rsi,%rcx), %xmm1
    pxor    %xmm0,  %xmm0
    pcmpeqb %xmm1,  %xmm0
    pmovmskb %xmm0, %edx
    test    %edx,   %edx
    jz      0b

//  Cleanup: at least one of the bytes in the last 16 that were loaded was
//  NUL.  The corresponding bits of edx are set, and all other bits are zero.
//  Thus, we can use BSF to find the position of the first NUL.  Once we have
//  this information, we use an unaligned copy that runs precisely up to this
//  position to handle edging.
    bsf     %edx,   %edx
    add     %rdx,   %rcx            // rsi + 16 + rcx now addresses the NUL
    movdqu  1(%rsi,%rcx), %xmm0     // offset is 1 so that we copy the trailing
    movdqu  %xmm0,  1(%rdi,%rcx)    // NUL byte as well.
    ClearFrameAndReturn
171