/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C P Y *
// ***************
//
// char *strcpy(char *dst, const char *src);
//
// We optimize the move by doing it 16 bytes at a time with vector
// instructions. This introduces a complication: if we blindly did vector
// load/stores until finding a 0, we might get a spurious page fault by
// touching bytes past it. To avoid this, we never do a load that crosses
// a page boundary, and never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.

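// The same strategy as a hedged C sketch (illustrative only, not part of
// the original file; strcpy_sketch, the SSE2 intrinsics, __builtin_ctz
// (a GCC/Clang builtin), and memcpy standing in for the tail stores are
// all expository assumptions):
//
//      #include <emmintrin.h>                  // SSE2 intrinsics
//      #include <stdint.h>
//      #include <string.h>
//
//      char *strcpy_sketch(char *dst, const char *src) {
//          char *const ret = dst;
//          size_t n = (-(uintptr_t)dst) & 15;  // bytes until dst is 16-aligned
//          for (;;) {
//              while (n) {                     // byte loop: align dst, or
//                  n--;                        //  step across a page boundary
//                  if ((*dst++ = *src++) == '\0')
//                      return ret;
//              }
//              // whole 16-byte chunks left before src crosses a page
//              size_t chunks = (4096 - ((uintptr_t)src & 4095)) / 16;
//              while (chunks) {
//                  chunks--;
//                  __m128i v = _mm_loadu_si128((const __m128i *)src);
//                  int mask = _mm_movemask_epi8(
//                      _mm_cmpeq_epi8(v, _mm_setzero_si128()));
//                  if (mask) {                 // chunk contains a 0 byte
//                      size_t len = (size_t)__builtin_ctz(mask) + 1;
//                      memcpy(dst, src, len);  // stands in for the tail
//                      return ret;             //  stores done at LFound0
//                  }
//                  _mm_store_si128((__m128i *)dst, v); // aligned store
//                  src += 16;
//                  dst += 16;
//              }
//              n = 16;                         // cross the page byte by byte
//          }
//      }
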
        .text
        .globl _strcpy

        .align  4
_strcpy:                                // char *strcpy(char *dst, const char *src);
        pushl   %edi
        movl    8(%esp),%edi            // get dest ptr
        movl    12(%esp),%ecx           // get source ptr
        movl    %edi,%edx               // copy dest ptr
        negl    %edx
        andl    $15,%edx                // how many bytes to align dest ptr?
        jnz     LLoopOverBytes          // not aligned, so copy bytes until it is


// In order to avoid spurious page faults, we loop until nearing the source page
// end. Then we revert to a byte-by-byte loop for 16 bytes until the page is
// crossed, then resume the vector loop.
//      %ecx = source ptr (unaligned)
//      %edi = dest ptr (aligned)

LNextChunk:
        movl    %ecx,%eax               // copy source ptr
        movl    $4096,%edx
        andl    $4095,%eax              // get offset into source page
        subl    %eax,%edx               // get #bytes remaining in source page
        shrl    $4,%edx                 // get #chunks till end of page
        jnz     LLoopOverChunks         // enter vector loop
        movl    $16,%edx                // move 16 bytes to cross page but keep dest aligned
        jmp     LLoopOverBytes
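
// Worked example (illustrative, not in the original): if %ecx = 0x1FF5, the
// page offset is 0xFF5 = 4085, so 4096-4085 = 11 bytes remain in the source
// page and 11>>4 = 0 chunks. We then fall through, set %edx = 16, and copy
// 16 bytes one at a time; that carries %ecx to 0x2005 in the next page,
// while %edi advances by 16 and so stays 16-byte aligned.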


// Loop over bytes.
//      %ecx = source ptr
//      %edi = dest ptr
//      %edx = byte count

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverBytes:
        movzb   (%ecx),%eax             // get source byte
        inc     %ecx
        movb    %al,(%edi)              // pack into dest
        inc     %edi
        testl   %eax,%eax               // 0?
        jz      LDone                   // yes, we're done
        dec     %edx                    // more to go?
        jnz     LLoopOverBytes

        jmp     LNextChunk              // we've come to end of page


// Loop over 16-byte chunks.
//      %ecx = source ptr (unaligned)
//      %edi = dest ptr (aligned)
//      %edx = chunk count
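//
// As an illustrative example (not in the original): if the 16 source bytes
// are "abc\0" followed by garbage, pcmpeqb sets only byte 3 of %xmm0 to
// 0xFF, pmovmskb packs that into %eax = 0x0008, and the testl comes up
// nonzero, so we exit to LFound0 with the terminator's position encoded
// in the mask.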

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverChunks:
        movdqu  (%ecx),%xmm1            // get source
        pxor    %xmm0,%xmm0             // get some 0s
        addl    $16,%ecx
        pcmpeqb %xmm1,%xmm0             // compare source to 0s
        pmovmskb %xmm0,%eax             // get result mask for 0 check
        testl   %eax,%eax               // any 0s?
        jnz     LFound0                 // yes, exit loop
        movdqa  %xmm1,(%edi)            // no 0s so do aligned store into destination
        addl    $16,%edi
        dec     %edx                    // more to go?
        jnz     LLoopOverChunks

        movl    $16,%edx                // move 16 bytes
        jmp     LLoopOverBytes          // cross page but keep dest aligned


// Found a zero in the vector. Figure out where it is, and store the bytes
// up to and including it.
//      %edi = dest ptr (aligned)
//      %eax = result mask
//      %xmm1 = source vector
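//
// Worked example (illustrative, continuing the "abc\0" case from above):
// the mask is 0x0008, bsf yields 3, and the inc makes %edx = 4, so only
// the 4-byte branch fires: a single movd stores "abc\0", then andl $3
// leaves 0 and the trailing byte loop is skipped.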

LFound0:
        bsf     %eax,%edx               // find first 0
        inc     %edx                    // we need to store the 0 too
        test    $16,%dl                 // was 0 last byte?
        jz      8f                      // no
        movdqa  %xmm1,(%edi)            // yes, store entire vector
        jmp     LDone
8:
        test    $8,%dl                  // 8-byte store required?
        jz      4f                      // no
        movq    %xmm1,(%edi)            // pack in 8 low bytes
        psrldq  $8,%xmm1                // then shift vector down 8 bytes
        addl    $8,%edi
4:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm1,(%edi)            // pack in 4 low bytes
        psrldq  $4,%xmm1                // then shift vector down 4 bytes
        addl    $4,%edi
3:
        andl    $3,%edx                 // more to go?
        jz      LDone                   // no
        movd    %xmm1,%eax              // move remaining bytes out of vector into %eax
1:                                      // loop on up to three bytes
        movb    %al,(%edi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        inc     %edi
        dec     %edx
        jnz     1b

LDone:
        movl    8(%esp),%eax            // original dest ptr is return value
        popl    %edi
        ret