]>
Commit | Line | Data |
---|---|---|
eb1cde05 A |
1 | /* |
2 | * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | #include <machine/cpu_capabilities.h> | |
25 | ||
26 | ||
27 | // ***************** | |
28 | // * S T R N C P Y * | |
29 | // ***************** | |
30 | // | |
31 | // char *strncpy(char *dst, const char *src, size_t n); | |
32 | // | |
33 | // We optimize the move by doing it vector parallel. This introduces | |
34 | // a complication: if we blindly did vector load/stores until finding | |
35 | // a 0, we might get a spurious page fault by touching bytes past it. | |
36 | // To avoid this, we never do a load that crosses a page boundary, | |
37 | // and never store a byte we don't have to. | |
38 | // | |
39 | // We align the destination, because unaligned vector stores are slow. | |
40 | // | |
41 | // Recall that strncpy() zero fills the remainder of the dest buffer, | |
42 | // and does not terminate the string if its length is greater than or | |
43 | // equal to n. | |
44 | ||
45 | #define kShort 31 // when this many bytes or fewer remain at LCheckShortCopy, finish in the byte loop | |
46 | ||
47 | .text | |
48 | .globl _strncpy | |
49 | ||
50 | .align 4 | |
51 | _strncpy: // char *strncpy(char *dst, const char *src, size_t n); | |
52 | pushl %edi | |
53 | pushl %esi | |
54 | movl 12(%esp),%edi // get dest ptr (arg offsets skip the 2 saved regs + return address) | |
55 | movl 16(%esp),%esi // get source ptr | |
56 | movl 20(%esp),%ecx // get length n | |
57 | movl %edi,%edx // copy dest ptr | |
58 | negl %edx | |
59 | andl $15,%edx // (-dst) & 15 = how many bytes to 16-byte align dest ptr | |
60 | jnz LCheckShortCopy // dest unaligned: byte-copy %edx bytes (or the whole buffer if short) first | |
61 | ||
62 | ||
63 | // In order to avoid spurious page faults, the vector loop only runs over whole 16-byte | |
64 | // chunks that fit in both the current source page and the remaining buffer. Then we revert | |
65 | // to a byte-by-byte loop for 16 bytes until the page is crossed, then resume the vector loop. | |
66 | // %esi = source ptr (unaligned) | |
67 | // %edi = dest ptr (aligned) | |
68 | // %ecx = buffer length remaining | |
69 | ||
70 | LNextChunk: // NB: entry code drops down to here when dest is already aligned | |
71 | movl %esi,%eax // copy source ptr | |
72 | movl $4096,%edx | |
73 | andl $4095,%eax // get offset into source page | |
74 | subl %eax,%edx // get #bytes remaining in source page (1..4096) | |
75 | cmpl %ecx,%edx // will buffer run out before the page end? | |
76 | cmova %ecx,%edx // get min(length remaining, bytes to page end), unsigned | |
77 | shrl $4,%edx // get #whole 16-byte chunks till end of page or buffer | |
78 | jnz LLoopOverChunks // at least one chunk: enter vector loop | |
79 | ||
80 | // We can't use the chunk loop yet. Check for short and empty buffers, then use byte loop. | |
81 | ||
82 | LCrossPage: // if buffer is large enough, cross source page | |
83 | movl $16,%edx // move 16 bytes to cross page but keep dest aligned | |
84 | LCheckShortCopy: // we propose to copy %edx bytes in byte loop | |
85 | cmpl $(kShort),%ecx // more than kShort bytes left? (unsigned) | |
86 | ja LLoopOverBytes // yes, loop over %edx bytes then more chunks | |
87 | movl %ecx,%edx // no, use the byte loop for everything that remains | |
88 | testl %ecx,%ecx // have we filled buffer (or was n zero)? | |
89 | jnz LLoopOverBytes // no | |
90 | jmp LDone | |
91 | ||
92 | ||
93 | // Loop over bytes, stopping early if the terminating 0 is copied. | |
94 | // %esi = source ptr | |
95 | // %edi = dest ptr | |
96 | // %ecx = buffer length remaining | |
97 | // %edx = count of bytes to loop over (nonzero and <= buffer length) | |
98 | ||
99 | .align 4,0x90 // align inner loops to optimize I-fetch (0x90 = NOP fill) | |
100 | LLoopOverBytes: | |
101 | movzb (%esi),%eax // get source byte, zero-extended | |
102 | inc %esi | |
103 | dec %ecx // decrement length (the byte is counted even if it is the 0) | |
104 | movb %al,(%edi) // pack into dest | |
105 | inc %edi | |
106 | testl %eax,%eax // was it the terminating 0? | |
107 | jz LZeroBuffer // yes, we're done copying string; zero-fill the remaining %ecx bytes | |
108 | dec %edx // more to go? | |
109 | jnz LLoopOverBytes | |
110 | ||
111 | testl %ecx,%ecx // at end of buffer? | |
112 | jnz LNextChunk // no, xfer chunks | |
113 | jmp LDone // yes, buffer is full (no terminator is stored) | |
114 | ||
115 | ||
116 | // Loop over 16-byte chunks. | |
117 | // %esi = source ptr (unaligned) | |
118 | // %edi = dest ptr (aligned) | |
119 | // %ecx = buffer length remaining (at least 16 * chunk count) | |
120 | // %edx = chunk count (nonzero) | |
121 | ||
122 | .align 4,0x90 // align inner loops to optimize I-fetch (0x90 = NOP fill) | |
123 | LLoopOverChunks: | |
124 | movdqu (%esi),%xmm1 // get 16 source bytes (unaligned load) | |
125 | pxor %xmm0,%xmm0 // get some 0s | |
126 | addl $16,%esi | |
127 | pcmpeqb %xmm1,%xmm0 // compare source to 0s: 0xFF in each byte lane holding a 0 | |
128 | pmovmskb %xmm0,%eax // get result mask for 0 check: bit i set if source byte i is 0 | |
129 | testl %eax,%eax // any 0s? | |
130 | jnz LFound0 // yes, exit loop (nothing from this chunk has been stored yet) | |
131 | movdqa %xmm1,(%edi) // no 0s so do aligned store into destination | |
132 | addl $16,%edi | |
133 | subl $16,%ecx // decrement length remaining | |
134 | dec %edx // more to go? | |
135 | jnz LLoopOverChunks | |
136 | ||
137 | jmp LCrossPage // out of chunks: byte-copy across the page end (dest stays aligned) or finish a short tail | |
138 | ||
139 | ||
140 | // Found a zero in the vector. Figure out where it is, and store the bytes | |
141 | // up to (but not including) it; the 0 itself is written by the zero fill at LZeroBuffer. | |
142 | // It is possible that we should check to be sure (%ecx >= 16), and just do an aligned store | |
143 | // of %xmm1 if so. But if we did, we'd be doing byte stores into the same double quadword | |
144 | // in bzero(), which might hit a hazard. Experimentation needed. | |
145 | // %edi = dest ptr (aligned) | |
146 | // %eax = result mask (nonzero) | |
147 | // %ecx = buffer length remaining | |
148 | // %xmm1 = source vector | |
149 | ||
150 | LFound0: | |
151 | bsf %eax,%edx // find first 0: %edx = its byte index (0..15) = #bytes that precede it | |
152 | subl %edx,%ecx // decrement remaining buffer length by the bytes stored below (not the 0) | |
153 | test $8,%dl // 8-byte store required? | |
154 | jz 4f // no | |
155 | movq %xmm1,(%edi) // pack in 8 low bytes | |
156 | psrldq $8,%xmm1 // then shift vector down 8 bytes | |
157 | addl $8,%edi | |
158 | 4: | |
159 | test $4,%dl // 4-byte store required? | |
160 | jz 3f // no | |
161 | movd %xmm1,(%edi) // pack in 4 low bytes | |
162 | psrldq $4,%xmm1 // then shift vector down 4 bytes | |
163 | addl $4,%edi | |
164 | 3: | |
165 | andl $3,%edx // 1 to 3 bytes still to go? | |
166 | jz LZeroBuffer // no | |
167 | movd %xmm1,%eax // move remainders out of vector into %eax | |
168 | 1: // loop on up to three bytes | |
169 | movb %al,(%edi) // pack in next byte | |
170 | shrl $8,%eax // shift next byte into position | |
171 | inc %edi | |
172 | dec %edx | |
173 | jnz 1b | |
174 | ||
175 | // We've copied the string. Now zero the rest of the buffer, using commpage bzero(). | |
176 | // %edi = dest ptr (first byte to zero) | |
177 | // %ecx = buffer length remaining (may be 0 when the 0 byte was the last one the byte loop stored) | |
178 | ||
179 | LZeroBuffer: | |
180 | pushl %ecx // bzero length: remaining buffer size | |
181 | pushl %edi // bzero pointer: ptr to 1st unstored byte | |
182 | movl $(_COMM_PAGE_BZERO),%eax | |
8e029c65 | 183 | call *%eax |
eb1cde05 A |
184 | addl $8,%esp // pop off the two bzero arguments |
185 | ||
186 | LDone: | |
187 | movl 12(%esp),%eax // original dest ptr is return value (reloaded from the stack argument) | |
188 | popl %esi | |
189 | popl %edi | |
190 | ret |