/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C P Y *
// ***************
//
// char *strcpy(char *dst, const char *src);
//
// We optimize the copy by moving 16 bytes at a time with vector instructions.
// This introduces a complication: if we blindly did vector loads and stores
// until finding a 0, we might take a spurious page fault by touching bytes
// past it.  To avoid this, we never do a load that crosses a page boundary,
// and we never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.
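//
// As a rough C sketch of the same idea (illustrative only: the sketch, its
// name, and the SSE2 intrinsics are assumptions for exposition, not part of
// this file):
//
//      #include <emmintrin.h>                  // SSE2 intrinsics
//      #include <stdint.h>
//      #include <stddef.h>
//
//      char *strcpy_sketch(char *dst, const char *src) {
//          char *d = dst;
//          size_t n = (0 - (uintptr_t)d) & 15; // bytes needed to align dst
//          for (;;) {
//              while (n--)                     // byte-at-a-time loop
//                  if ((*d++ = *src++) == 0)
//                      return dst;
//              // whole 16-byte chunks left before the source page ends
//              size_t chunks = (4096 - ((uintptr_t)src & 4095)) >> 4;
//              while (chunks--) {              // vector loop
//                  __m128i v = _mm_loadu_si128((const __m128i *)src);
//                  int mask = _mm_movemask_epi8(
//                      _mm_cmpeq_epi8(v, _mm_setzero_si128()));
//                  if (mask) {                 // v contains the terminating 0
//                      int k = __builtin_ctz(mask);
//                      for (int i = 0; i <= k; i++)
//                          d[i] = src[i];      // store up to and including the 0
//                      return dst;
//                  }
//                  _mm_store_si128((__m128i *)d, v);   // aligned store
//                  src += 16; d += 16;
//              }
//              n = 16;                         // cross the page byte by byte
//          }
//      }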

        .text
        .globl _strcpy

        .align  4
_strcpy:                                // char *strcpy(char *dst, const char *src);
        pushl   %edi
        movl    8(%esp),%edi            // get dest ptr
        movl    12(%esp),%ecx           // get source ptr
        movl    %edi,%edx               // copy dest ptr
        negl    %edx
        andl    $15,%edx                // how many bytes to align dest ptr?
        jnz     LLoopOverBytes          // not aligned, so copy bytes until it is


// In order to avoid spurious page faults, we loop over 16-byte chunks only
// while a whole chunk remains within the source page.  Near the page end we
// revert to a byte-by-byte loop for 16 bytes until the page is crossed, then
// resume the vector loop.
// %ecx = source ptr (unaligned)
// %edi = dest ptr (aligned)
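//
// For example, if %ecx = 0x1FF5 then the offset into the page is 0xFF5 = 4085,
// leaving 4096 - 4085 = 11 bytes in the page and 11>>4 = 0 whole chunks, so we
// cross the boundary with the 16-byte byte loop and then recompute.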

LNextChunk:
        movl    %ecx,%eax               // copy source ptr
        movl    $4096,%edx
        andl    $4095,%eax              // get offset into source page
        subl    %eax,%edx               // get #bytes remaining in source page
        shrl    $4,%edx                 // get #chunks till end of page
        jnz     LLoopOverChunks         // enter vector loop
        movl    $16,%edx                // move 16 bytes to cross page but keep dest aligned
        jmp     LLoopOverBytes


// Loop over bytes.
// %ecx = source ptr
// %edi = dest ptr
// %edx = byte count
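//
// Roughly equivalent C (an illustrative sketch, not part of the original):
//      do { if ((*dst++ = *src++) == 0) goto done; } while (--count);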

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverBytes:
        movzb   (%ecx),%eax             // get source byte
        inc     %ecx
        movb    %al,(%edi)              // pack into dest
        inc     %edi
        testl   %eax,%eax               // 0?
        jz      LDone                   // yes, we're done
        dec     %edx                    // more to go?
        jnz     LLoopOverBytes

        jmp     LNextChunk              // we've come to end of page


// Loop over 16-byte chunks.
// %ecx = source ptr (unaligned)
// %edi = dest ptr (aligned)
// %edx = chunk count
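//
// In SSE2 intrinsics terms, each iteration does roughly the following
// (an illustrative sketch, not part of the original):
//      __m128i v = _mm_loadu_si128((const __m128i *)src);      // movdqu
//      int mask = _mm_movemask_epi8(
//          _mm_cmpeq_epi8(v, _mm_setzero_si128()));            // pcmpeqb + pmovmskb
//      if (mask) goto found0;                  // some byte of v is 0
//      _mm_store_si128((__m128i *)dst, v);     // movdqa (dst is 16-byte aligned)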

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverChunks:
        movdqu  (%ecx),%xmm1            // get source
        pxor    %xmm0,%xmm0             // get some 0s
        addl    $16,%ecx
        pcmpeqb %xmm1,%xmm0             // compare source to 0s
        pmovmskb %xmm0,%eax             // get result mask for 0 check
        testl   %eax,%eax               // any 0s?
        jnz     LFound0                 // yes, exit loop
        movdqa  %xmm1,(%edi)            // no 0s so do aligned store into destination
        addl    $16,%edi
        dec     %edx                    // more to go?
        jnz     LLoopOverChunks

        movl    $16,%edx                // move 16 bytes
        jmp     LLoopOverBytes          // cross page but keep dest aligned


// Found a zero in the vector.  Figure out where it is, and store the bytes
// up to and including it.
// %edi = dest ptr (aligned)
// %eax = result mask
// %xmm1 = source vector
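//
// For example, if the first 0 is at byte index 10 then %edx = 11 = 0b01011:
// bit 4 is clear (no whole-vector store), bit 3 is set (one 8-byte store),
// bit 2 is clear (no 4-byte store), and 11 & 3 = 3 trailing bytes come out of
// %eax, for 8 + 3 = 11 bytes stored in all, including the terminating 0.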

LFound0:
        bsf     %eax,%edx               // find first 0
        inc     %edx                    // we need to store the 0 too
        test    $16,%dl                 // was 0 last byte?
        jz      8f                      // no
        movdqa  %xmm1,(%edi)            // yes, store entire vector
        jmp     LDone
8:
        test    $8,%dl                  // 8-byte store required?
        jz      4f                      // no
        movq    %xmm1,(%edi)            // pack in 8 low bytes
        psrldq  $8,%xmm1                // then shift vector down 8 bytes
        addl    $8,%edi
4:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm1,(%edi)            // pack in 4 low bytes
        psrldq  $4,%xmm1                // then shift vector down 4 bytes
        addl    $4,%edi
3:
        andl    $3,%edx                 // more to go?
        jz      LDone                   // no
        movd    %xmm1,%eax              // move remainders out of vector into %eax
1:                                      // loop on up to three bytes
        movb    %al,(%edi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        inc     %edi
        dec     %edx
        jnz     1b

LDone:
        movl    8(%esp),%eax            // original dest ptr is return value
        popl    %edi
        ret