]>
Commit | Line | Data |
---|---|---|
8e029c65 A |
1 | /* |
2 | * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
23 | ||
24 | #include <machine/cpu_capabilities.h> | |
25 | ||
26 | ||
27 | // ***************** | |
28 | // * S T R N C P Y * | |
29 | // ***************** | |
30 | // | |
31 | // char *strncpy(char *dst, const char *src, size_t n); | |
32 | // | |
33 | // We optimize the move by doing it vector parallel. This introduces | |
34 | // a complication: if we blindly did vector load/stores until finding | |
35 | // a 0, we might get a spurious page fault by touching bytes past it. | |
36 | // To avoid this, we never do a load that crosses a page boundary, | |
37 | // and never store a byte we don't have to. | |
38 | // | |
39 | // We align the destination, because unaligned vector stores are slow. | |
40 | // | |
41 | // Recall that strncpy() zero fills the remainder of the dest buffer, | |
42 | // and does not terminate the string if its length is greater than or | |
43 | // equal to n. | |
44 | ||
#define	kShort		31		// too short to bother with vector loop
#define	kPageSize	4096		// vector loads must never cross a multiple of this

	.text
	.globl	_strncpy

//-----------------------------------------------------------------------
// char *strncpy(char *dst, const char *src, size_t n);
//
// Syntax: AT&T.  ABI: System V AMD64 (Mach-O symbol naming).
// In:     %rdi = dst, %rsi = src, %rdx = n
// Out:    %rax = dst (the original destination pointer)
// Clobb:  %rcx, %rdx, %rsi, %rdi, %r8, %xmm0, %xmm1, flags, plus
//         whatever the bzero routine at _COMM_PAGE_BZERO clobbers.
// Stack:  one 8-byte push around the bzero call (see LZeroBuffer).
//
// Copies bytes from src to dst until a 0 byte has been copied or n bytes
// have been written.  If a 0 byte is found before n bytes are written,
// the rest of the n-byte buffer is zero filled.  If no 0 byte is found
// in the first n source bytes, dst is not terminated.
//
// Strategy: bring dst to 16-byte alignment with a byte loop, then move
// 16-byte chunks (unaligned load, aligned store) up to the end of the
// current source page, cross the page boundary with 16 single-byte
// moves (which keeps dst aligned), and resume the chunk loop.
//-----------------------------------------------------------------------

	.align	4
_strncpy:
	movq	%rdi,%r8		// preserve destination pointer so we can return it
	movl	%edi,%ecx		// copy low 4 bytes of dest ptr
	negl	%ecx
	andl	$15,%ecx		// %ecx = #bytes needed to 16-byte align dest ptr
	jnz	LCheckShortCopy		// align destination first


// In order to avoid spurious page faults, we loop until nearing the source page
// end.  Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed,
// then resume the vector loop.
//	%rsi = source ptr (unaligned)
//	%rdi = dest ptr (aligned)
//	%rdx = buffer length remaining

LNextChunk:				// NB: can drop down to here
	movl	%esi,%eax		// copy the low 4 bytes of the source ptr
	movl	$(kPageSize),%ecx
	andl	$(kPageSize-1),%eax	// get offset into source page
	subl	%eax,%ecx		// get #bytes remaining in source page
	cmpq	%rdx,%rcx		// will buffer run out before the page end?
	cmova	%rdx,%rcx		// get min(length remaining, bytes to page end)
	shrl	$4,%ecx			// get #chunks till end of page (value <= kPageSize, so 32-bit shift is safe)
	jnz	LLoopOverChunks		// enter vector loop

// We can't use the chunk loop yet.  Check for short and empty buffers, then use byte loop.

LCrossPage:				// if buffer is large enough, cross source page
	movl	$16,%ecx		// move 16 bytes to cross page but keep dest aligned
LCheckShortCopy:			// we propose to copy %ecx bytes in byte loop
	cmpq	$(kShort),%rdx		// much left?
	ja	LLoopOverBytes		// yes, loop over bytes then more chunks
	movl	%edx,%ecx		// no, use the byte loop for everything (%rdx <= kShort here)
	testl	%edx,%edx		// have we filled buffer?
	jnz	LLoopOverBytes		// no
	jmp	LDone


// Loop over bytes.
//	%rsi = source ptr
//	%rdi = dest ptr
//	%rdx = buffer length remaining
//	%rcx = count of bytes to loop over (<= buffer length, nonzero)

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverBytes:
	movzb	(%rsi),%eax		// get source byte
	addq	$1,%rsi
	subq	$1,%rdx			// decrement length
	movb	%al,(%rdi)		// pack into dest
	addq	$1,%rdi
	testl	%eax,%eax		// 0?
	jz	LZeroBuffer		// yes, we're done copying string (terminator already stored)
	subq	$1,%rcx			// more to go?
	jnz	LLoopOverBytes

	testq	%rdx,%rdx		// at end of buffer?
	jnz	LNextChunk		// no, xfer chunks
	jmp	LDone			// yes


// Loop over 16-byte chunks.
//	%rsi = source ptr (unaligned)
//	%rdi = dest ptr (aligned)
//	%rdx = buffer length remaining (>= 16 * chunk count)
//	%ecx = chunk count

	.align	4,0x90			// align inner loops to optimize I-fetch
LLoopOverChunks:
	movdqu	(%rsi),%xmm1		// get source
	pxor	%xmm0,%xmm0		// get some 0s
	addq	$16,%rsi
	pcmpeqb	%xmm1,%xmm0		// compare source to 0s
	pmovmskb %xmm0,%eax		// get result mask for 0 check
	testl	%eax,%eax		// any 0s?
	jnz	LFound0			// yes, exit loop
	movdqa	%xmm1,(%rdi)		// no 0s so do aligned store into destination
	addq	$16,%rdi
	subq	$16,%rdx		// decrement length remaining
	subl	$1,%ecx			// more to go?
	jnz	LLoopOverChunks

	jmp	LCrossPage		// cross page but keep dest aligned


// Found a zero in the vector.  Figure out where it is, and store the bytes
// up to it.  The 0 byte itself is not stored here; it is written by the
// zero fill below, since %rdx still counts it.  It is possible that we
// should just do an aligned store of %xmm1 instead (the chunk count
// guarantees at least 16 bytes of buffer remain).  But if we did, we'd be
// doing byte stores into the same double quadword in bzero(), which might
// hit a hazard.  Experimentation needed.
//	%rdi = dest ptr (aligned)
//	%eax = result mask
//	%rdx = buffer length remaining
//	%xmm1 = source vector

LFound0:
	bsf	%eax,%ecx		// find first 0 (%ecx = #bytes preceding it, 0..15)
	subq	%rcx,%rdx		// decrement remaining buffer length
	test	$8,%cl			// 8-byte store required?
	jz	4f			// no
	movq	%xmm1,(%rdi)		// pack in 8 low bytes
	psrldq	$8,%xmm1		// then shift vector down 8 bytes
	addq	$8,%rdi
4:
	test	$4,%cl			// 4-byte store required?
	jz	3f			// no
	movd	%xmm1,(%rdi)		// pack in 4 low bytes
	psrldq	$4,%xmm1		// then shift vector down 4 bytes
	addq	$4,%rdi
3:
	andl	$3,%ecx			// more to go?
	jz	LZeroBuffer		// no
	movd	%xmm1,%eax		// move remainders out of vector into %eax
1:					// loop on up to three bytes
	movb	%al,(%rdi)		// pack in next byte
	shrl	$8,%eax			// shift next byte into position
	addq	$1,%rdi
	subl	$1,%ecx
	jnz	1b

// We've copied the string.  Now zero the rest of the buffer, using commpage bzero().
//	%rdi = dest ptr (1st argument)
//	%rdx = buffer length remaining (may be 0)
//
// %r8 holds our return value but is caller-saved, so it must not be assumed
// to survive the call.  Pushing it also moves %rsp from 8 (mod 16), its
// value on entry, to 0 (mod 16) as the ABI requires at a call site.

LZeroBuffer:
	movq	%rdx,%rsi		// remaining buffer size (2nd argument)
	pushq	%r8			// save return value and 16-byte align the stack
	movq	$(_COMM_PAGE_BZERO),%rax
	call	*%rax
	popq	%r8			// recover original dest ptr

LDone:
	movq	%r8,%rax		// original dest ptr is return value
	ret