/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C P Y *
// ***************
//
// char *strcpy(char *dst, const char *src);
//
// We optimize the move by doing it vector parallel. This introduces
// a complication: if we blindly did vector load/stores until finding
// a 0, we might get a spurious page fault by touching bytes past it.
// To avoid this, we never do a load that crosses a page boundary,
// and never store a byte we don't have to.
//
// We align the destination, because unaligned vector stores are slow.
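//
// Roughly equivalent C, as an illustrative sketch only (it assumes SSE2
// intrinsics and 4 KB pages, finishes bytewise where the assembly below
// decomposes the final vector store, and is not the code that is built;
// the name strcpy_sketch is hypothetical):
//
//      #include <emmintrin.h>
//      #include <stdint.h>
//
//      char *strcpy_sketch(char *dst, const char *src) {
//          char *ret = dst;
//          while ((uintptr_t)dst & 15)                 /* align dest first */
//              if ((*dst++ = *src++) == 0) return ret;
//          for (;;) {
//              /* only do full 16-byte loads that stay inside the source page */
//              size_t chunks = (4096 - ((uintptr_t)src & 4095)) / 16;
//              while (chunks--) {
//                  __m128i v = _mm_loadu_si128((const __m128i *)src);
//                  int mask = _mm_movemask_epi8(
//                      _mm_cmpeq_epi8(v, _mm_setzero_si128()));
//                  if (mask) {                         /* 0 byte in this chunk */
//                      while ((*dst++ = *src++) != 0)
//                          ;
//                      return ret;
//                  }
//                  _mm_store_si128((__m128i *)dst, v); /* dest is 16-aligned */
//                  src += 16;
//                  dst += 16;
//              }
//              for (int i = 0; i < 16; i++)            /* cross the page bytewise */
//                  if ((*dst++ = *src++) == 0) return ret;
//          }
//      }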
38 | ||
39 | .text | |
40 | .globl _strcpy | |
41 | ||
42 | .align 4 | |
43 | _strcpy: // char *strcpy(const char *dst, const char *src); | |
44 | movq %rdi,%rcx // preserve dest ptr so we can return it | |
45 | movl %edi,%edx // copy low 4 bytes of dest ptr | |
46 | negl %edx | |
47 | andl $15,%edx // how many bytes to align dest ptr? | |
48 | jnz LLoopOverBytes // not aligned, so go do so | |
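        // Example: if dst ends in 0x9, then (-0x9 & 15) == 7, so we copy
        // 7 bytes one at a time to reach the next 16-byte boundary.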
49 | ||
50 | ||
51 | // In order to avoid spurious page faults, we loop until nearing the source page | |
52 | // end. Then we revert to a byte-by-byte loop for 16 bytes until the page is crossed, | |
53 | // then resume the vector loop. | |
54 | // %rsi = source ptr (unaligned) | |
55 | // %rdi = dest ptr (aligned) | |
56 | ||
LNextChunk:
        movl    %esi,%eax           // copy low 4 bytes of source ptr
        movl    $4096,%edx
        andl    $4095,%eax          // get offset into source page
        subl    %eax,%edx           // get #bytes remaining in source page
        shrl    $4,%edx             // get #chunks till end of page
        jnz     LLoopOverChunks     // enter vector loop
        movl    $16,%edx            // move 16 bytes to cross page but keep dest aligned
        jmp     LLoopOverBytes
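        // Example: if src is at page offset 0xFE5, then 4096 - 0xFE5 == 27
        // bytes remain in the page, ie one full 16-byte chunk; the last
        // 11 bytes (plus 5 from the next page) are crossed bytewise.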
66 | ||
67 | ||
68 | // Loop over bytes. | |
69 | // %rsi = source ptr | |
70 | // %rdi = dest ptr | |
71 | // %edx = byte count | |
72 | ||
73 | .align 4,0x90 // align inner loops to optimize I-fetch | |
74 | LLoopOverBytes: | |
75 | movzb (%rsi),%eax // get source byte | |
76 | addq $1,%rsi | |
77 | movb %al,(%rdi) // pack into dest | |
78 | addq $1,%rdi | |
79 | testl %eax,%eax // 0? | |
80 | jz LDone // yes, we're done | |
81 | subl $1,%edx // more to go? | |
82 | jnz LLoopOverBytes | |
83 | ||
84 | jmp LNextChunk // we've come to end of page | |
85 | ||
86 | ||
87 | // Loop over 16-byte chunks. | |
88 | // %rsi = source ptr (unaligned) | |
89 | // %rdi = dest ptr (aligned) | |
90 | // %edx = chunk count | |
91 | ||
92 | .align 4,0x90 // align inner loops to optimize I-fetch | |
93 | LLoopOverChunks: | |
94 | movdqu (%rsi),%xmm1 // get source | |
95 | pxor %xmm0,%xmm0 // get some 0s | |
96 | addq $16,%rsi | |
97 | pcmpeqb %xmm1,%xmm0 // compare source to 0s | |
        pmovmskb %xmm0,%eax         // get result mask (bit n set iff source byte n was 0)
        testl   %eax,%eax           // any 0s?
        jnz     LFound0             // yes, exit loop
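        // Example: if byte 5 of the chunk is the first 0, %eax has bit 5
        // set (0x20), and the bsf at LFound0 recovers the index 5.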
        movdqa  %xmm1,(%rdi)        // no 0s so do aligned store into destination
        addq    $16,%rdi
        subl    $1,%edx             // more to go?
        jnz     LLoopOverChunks

        movl    $16,%edx            // move 16 bytes
        jmp     LLoopOverBytes      // cross page but keep dest aligned
108 | ||
109 | ||
110 | // Found a zero in the vector. Figure out where it is, and store the bytes | |
111 | // up to it. | |
112 | // %rdi = dest ptr (aligned) | |
113 | // %eax = result mask | |
114 | // %xmm1 = source vector | |
115 | ||
116 | LFound0: | |
117 | bsf %eax,%edx // find first 0 | |
118 | addl $1,%edx // we need to store the 0 too | |
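        // %edx is now 1..16: 16 means the 0 was the last byte, so the whole
        // vector is stored; otherwise the count decomposes into 8-, 4-, and
        // 1-byte stores (eg, a count of 13 becomes 8 + 4 + 1).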
        test    $16,%dl             // was 0 the last byte (ie, count == 16)?
        jz      8f                  // no
        movdqa  %xmm1,(%rdi)        // yes, store entire vector
        jmp     LDone
8:
        test    $8,%dl              // 8-byte store required?
        jz      4f                  // no
        movq    %xmm1,(%rdi)        // pack in 8 low bytes
        psrldq  $8,%xmm1            // then shift vector down 8 bytes
        addq    $8,%rdi
4:
        test    $4,%dl              // 4-byte store required?
        jz      3f                  // no
        movd    %xmm1,(%rdi)        // pack in 4 low bytes
        psrldq  $4,%xmm1            // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%edx             // more to go?
        jz      LDone               // no
        movd    %xmm1,%eax          // move remainders out of vector into %eax
1:                                  // loop on up to three bytes
        movb    %al,(%rdi)          // pack in next byte
        shrl    $8,%eax             // shift next byte into position
        addq    $1,%rdi
        dec     %edx
        jnz     1b
145 | ||
146 | LDone: | |
147 | movq %rcx,%rax // original dest ptr is return value | |
148 | ret |