git.saurik.com Git - apple/libc.git/blob - x86_64/string/strcpy.s
commit d667b05f173a5224b5bd85e739ecf35e7b79c84f
/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C P Y *
// ***************
//
// char *strcpy(char *dst, const char *src);
//
// We optimize the copy by moving 16 bytes at a time with vector
// instructions. This introduces a complication: if we blindly did
// vector loads and stores until finding a 0, we might get a spurious
// page fault by touching bytes past it. To avoid this, we never do a
// load that crosses a page boundary, and never store a byte we don't
// have to.
//
// We align the destination, because unaligned vector stores are slow.
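//
// For reference, a rough C sketch of the same strategy (illustrative
// only, not part of the build; it assumes the 4096-byte pages and
// 16-byte vectors hard-coded below, needs <stdint.h> and <string.h>,
// and uses memchr/memcpy to stand in for the pcmpeqb/movdqa work):
//
//     char *strcpy(char *dst, const char *src) {
//         char *const ret = dst;
//         while ((uintptr_t)dst & 15)          // byte loop to align dest
//             if ((*dst++ = *src++) == '\0')
//                 return ret;
//         for (;;) {
//             // whole 16-byte chunks left before src's page ends
//             size_t chunks = (4096 - ((uintptr_t)src & 4095)) >> 4;
//             while (chunks--) {
//                 if (memchr(src, '\0', 16)) { // 0 in this chunk:
//                     while ((*dst++ = *src++) != '\0')
//                         ;                    // copy through the 0
//                     return ret;
//                 }
//                 memcpy(dst, src, 16);        // aligned 16-byte store
//                 dst += 16; src += 16;
//             }
//             for (int i = 0; i < 16; i++)     // cross the page bytewise
//                 if ((*dst++ = *src++) == '\0')
//                     return ret;
//         }
//     }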

        .text
        .globl  _strcpy

        .align  4
_strcpy:                                // char *strcpy(char *dst, const char *src);
        movq    %rdi,%rcx               // preserve dest ptr so we can return it
        movl    %edi,%edx               // copy low 4 bytes of dest ptr
        negl    %edx
        andl    $15,%edx                // how many bytes to align dest ptr?
        jnz     LLoopOverBytes          // not aligned, so copy bytes until it is
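// e.g. a dest ptr ending in 0xd gives %edx = (-0xd) & 15 = 3, so three
// bytes are copied to reach the next 16-byte boundary; %edx == 0 means
// the destination is already aligned and we fall through to LNextChunk.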


// In order to avoid spurious page faults, we loop until nearing the end of
// the source page. Then we revert to a byte-by-byte loop for 16 bytes until
// the page is crossed, then resume the vector loop.
// %rsi = source ptr (unaligned)
// %rdi = dest ptr (aligned)
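//
// e.g. with a source offset of 0xff9 into its page, 4096 - 0xff9 = 7
// bytes remain and 7 >> 4 = 0 chunks, so we cross the page bytewise;
// at offset 0xf00, 256 bytes remain, giving 16 full vector chunks first.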

LNextChunk:
        movl    %esi,%eax               // copy low 4 bytes of source ptr
        movl    $4096,%edx
        andl    $4095,%eax              // get offset into source page
        subl    %eax,%edx               // get #bytes remaining in source page
        shrl    $4,%edx                 // get #chunks till end of page
        jnz     LLoopOverChunks         // enter vector loop
        movl    $16,%edx                // move 16 bytes to cross page but keep dest aligned
        jmp     LLoopOverBytes


// Loop over bytes.
// %rsi = source ptr
// %rdi = dest ptr
// %edx = byte count

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverBytes:
        movzb   (%rsi),%eax             // get source byte
        addq    $1,%rsi
        movb    %al,(%rdi)              // pack into dest
        addq    $1,%rdi
        testl   %eax,%eax               // 0?
        jz      LDone                   // yes, we're done
        subl    $1,%edx                 // more to go?
        jnz     LLoopOverBytes

        jmp     LNextChunk              // we've reached the end of the page


// Loop over 16-byte chunks.
// %rsi = source ptr (unaligned)
// %rdi = dest ptr (aligned)
// %edx = chunk count
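//
// pcmpeqb sets each byte of %xmm0 to 0xFF where the corresponding source
// byte is 0, and pmovmskb gathers those high bits into a 16-bit mask in
// %eax: e.g. a 0 at byte index 3 yields %eax == 0x0008, and bsf in
// LFound0 below recovers the index 3.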

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverChunks:
        movdqu  (%rsi),%xmm1            // get source
        pxor    %xmm0,%xmm0             // get some 0s
        addq    $16,%rsi
        pcmpeqb %xmm1,%xmm0             // compare source to 0s
        pmovmskb %xmm0,%eax             // get result mask for 0 check
        testl   %eax,%eax               // any 0s?
        jnz     LFound0                 // yes, exit loop
        movdqa  %xmm1,(%rdi)            // no 0s so do aligned store into destination
        addq    $16,%rdi
        subl    $1,%edx                 // more to go?
        jnz     LLoopOverChunks

        movl    $16,%edx                // move 16 bytes
        jmp     LLoopOverBytes          // cross page but keep dest aligned


// Found a zero in the vector. Figure out where it is, and store the bytes
// up to and including it.
// %rdi = dest ptr (aligned)
// %eax = result mask
// %xmm1 = source vector
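//
// %edx = (index of first 0) + 1 is decomposed into 16/8/4-byte stores
// plus up to three single bytes: e.g. a 0 at index 9 gives %edx = 10 =
// 8 + 2, so one 8-byte store, a shift down by 8, then two byte stores;
// %edx = 16 (0 in the last byte) stores the whole vector at once.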

LFound0:
        bsf     %eax,%edx               // find first 0
        addl    $1,%edx                 // we need to store the 0 too
        test    $16,%dl                 // was the 0 the last byte?
        jz      8f                      // no
        movdqa  %xmm1,(%rdi)            // yes, store entire vector
        jmp     LDone
8:
        test    $8,%dl                  // 8-byte store required?
        jz      4f                      // no
        movq    %xmm1,(%rdi)            // pack in 8 low bytes
        psrldq  $8,%xmm1                // then shift vector down 8 bytes
        addq    $8,%rdi
4:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm1,(%rdi)            // pack in 4 low bytes
        psrldq  $4,%xmm1                // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%edx                 // more to go?
        jz      LDone                   // no
        movd    %xmm1,%eax              // move remainders out of vector into %eax
1:                                      // loop on up to three bytes
        movb    %al,(%rdi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        addq    $1,%rdi
        dec     %edx
        jnz     1b

LDone:
        movq    %rcx,%rax               // original dest ptr is return value
        ret