]> git.saurik.com Git - apple/libc.git/blob - ppc/string/strlcat.s
41f001406503fc7b362035908f1797483087d14a
[apple/libc.git] / ppc / string / strlcat.s
1 /*
2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 #define ASSEMBLER
24 #include <mach/ppc/asm.h>
25 #undef ASSEMBLER
26
27 // *****************
28 // * S T R L C A T *
29 // *****************
30 //
31 // size_t strlcat(char *dst, const char *src, size_t count);
32 //
33 // We optimize the move by doing it word parallel. This introduces
34 // a complication: if we blindly did word load/stores until finding
35 // a 0, we might get a spurious page fault by touching bytes past it.
36 // We are allowed to touch the "count" bytes starting at "dst", but
37 // when appending the "src", we must not do a "lwz" that crosses a page
38 // boundary, or store past "count".
39 //
40 // The test for 0s relies on the following non-obvious but very efficient
41 // word-parallel test:
42 // x = dataWord + 0xFEFEFEFF
43 // y = ~dataWord & 0x80808080
44 // if (x & y) == 0 then no zero found
45 // The test maps any non-zero byte to zero, and any zero byte to 0x80,
46 // with one exception: 0x01 bytes preceding the first zero are also
47 // mapped to 0x80.
48 //
49 // Note that "count" is the total buffer length, including the length
50 // of the "dst" string. This is different from strncat().
51
52 .text
53 .globl EXT(strlcat)
54 // Entry: r3 = dst, r4 = src, r5 = count (total size of the dst buffer). Sets up the scan of dst for its 0-byte.
55 .align 5
56 LEXT(strlcat)
57 srwi. r0,r5,2 // r0 <- count/4 = #whole words to scan in dst (cr0 set for the beq below)
58 dcbtst 0,r3 // touch in dst
59 lis r6,hi16(0xFEFEFEFF) // start to load magic constants (upper halves first)
60 lis r7,hi16(0x80808080)
61 dcbt 0,r4 // touch in source
62 ori r6,r6,lo16(0xFEFEFEFF) // r6 <- 0xFEFEFEFF
63 ori r7,r7,lo16(0x80808080) // r7 <- 0x80808080
64 mr r9,r3 // use r9 for dest ptr (r3 remembers dst start)
65 beq-- L0bytes // buffer length <4: no whole words, scan dst bytewise
66 mtctr r0 // set up loop: ctr <- #words to scan
67 b L0words // enter word loop
68
69 // Scan the dest buffer a word at a time looking for the 0-byte that ends "dst".
70 // r3 = original start of buffer
71 // r4 = source ptr (unaligned)
72 // r5 = original buffer size
73 // r6 = 0xFEFEFEFF
74 // r7 = 0x80808080
75 // r9 = dest ptr (unaligned)
76 // ctr = #words remaining in buffer
77
78 .align 5 // align inner loops for speed
79 L0words:
80 lwz r8,0(r9) // r8 <- next dest word
81 addi r9,r9,4 // advance dest ptr past this word
82 add r10,r8,r6 // r10 <- word + 0xFEFEFEFF
83 andc r12,r7,r8 // r12 <- ~word & 0x80808080
84 and. r11,r10,r12 // r11 <- nonzero iff word has a 0-byte
85 bdnzt eq,L0words // decrement ctr; loop until 0 found or buffer end
86
87 beq-- L0bytes // no 0 in any whole word: go check the leftover bytes
88
89 slwi r0,r8,7 // move 0x01 bits (false hits) into 0x80 position
90 subi r9,r9,4 // back up r9 to the start of the word
91 andc r11,r11,r0 // mask out false hits
92 cntlzw r0,r11 // find 0 byte (r0 = 0, 8, 16, or 24)
93 srwi r0,r0,3 // now r0 = 0, 1, 2, or 3 (byte offset of the 0 within the word)
94 add r9,r9,r0 // now r9 points to the 0-byte in dest
95 b L0found // start to append source
96
97 // Scan the last (count & 3) bytes of the dest buffer for a 0, one byte at a time.
98 // r3 = original start of buffer
99 // r4 = source ptr (unaligned)
100 // r5 = original buffer size
101 // r6 = 0xFEFEFEFF
102 // r7 = 0x80808080
103 // r9 = dest ptr (unaligned)
104
105 L0bytes:
106 andi. r0,r5,3 // get #bytes remaining in buffer (count mod 4)
107 mtctr r0 // set up byte loop
108 beq-- L0notfound // no bytes left to scan, so 0 not found in buffer (error)
109 L0byteloop:
110 lbz r8,0(r9) // r8 <- next dest byte
111 addi r9,r9,1 // advance dest ptr
112 cmpwi r8,0 // 0 ?
113 bdnzf eq,L0byteloop // decrement ctr; loop until 0 found or buffer end
114
115 bne-- L0notfound // last byte examined was not 0, so 0 not found (error)
116 subi r9,r9,1 // back up, so r9 points to the 0 (then fall into L0found)
117
118 // End of dest found, so we can start appending source. First, align the source,
119 // in order to avoid spurious page faults. Also converts r5 from buffer size to space remaining.
120 // r3 = original start of buffer
121 // r4 = original source ptr (unaligned)
122 // r5 = original buffer size
123 // r6 = 0xFEFEFEFF
124 // r7 = 0x80808080
125 // r9 = ptr to 0-byte in dest (unaligned)
126
127 L0found:
128 andi. r0,r4,3 // is source aligned? (r0 <- src & 3, cr0 used by the beq below)
129 add r5,r5,r3 // get ptr to end of buffer
130 sub r5,r5,r9 // get #bytes remaining in buffer, counting the 0 (r5>0)
131 beq Laligned // skip if source already word aligned
132 subfic r0,r0,4 // not aligned, get #bytes to align r4 (r0 <- 4 - r0)
133 b Lbyteloop1 // r5!=0, so skip check
134
135 // Copy min(r0,r5) bytes from source to dest, stopping early once a 0-byte has been copied.
136 // r0 = #bytes we propose to copy (NOTE: must be >0)
137 // r4 = source ptr (unaligned)
138 // r5 = length remaining in buffer (may be 0)
139 // r6 = 0xFEFEFEFF
140 // r7 = 0x80808080
141 // r9 = dest ptr (unaligned)
142
143 Lbyteloop:
144 cmpwi r5,0 // buffer empty? (note: r5 is unsigned, only equality with 0 is tested)
145 beq-- Loverrun // buffer filled before end of source reached
146 Lbyteloop1: // entry when we know r5!=0
147 lbz r8,0(r4) // r8 <- next source byte
148 subic. r0,r0,1 // decrement count of bytes to move (cr0 tested by the final bne)
149 addi r4,r4,1 // advance source ptr
150 subi r5,r5,1 // decrement buffer length remaining
151 stb r8,0(r9) // pack into dest
152 cmpwi cr1,r8,0 // 0-byte? (uses cr1 so cr0 from subic. is preserved)
153 addi r9,r9,1 // advance dest ptr
154 beq cr1,L0stored // byte was 0, so done
155 bne Lbyteloop // r0!=0, source not yet aligned; else fall into Laligned
156
157 // Source is word aligned. Set up to loop over words until 0-byte found or end
158 // of buffer; if the buffer has room for less than one word, copy bytewise instead.
159 // r3 = original start of buffer
160 // r4 = source ptr (word aligned)
161 // r5 = length remaining in buffer
162 // r6 = 0xFEFEFEFF
163 // r7 = 0x80808080
164 // r9 = dest ptr (unaligned)
165
166 Laligned:
167 srwi. r8,r5,2 // get #words in buffer (r8 <- r5/4)
168 addi r0,r5,1 // if no words... (r0 <- r5+1, so r5 reaches 0 before r0 does)
169 beq-- Lbyteloop // ...copy to end of buffer
170 mtctr r8 // set up word loop count
171 rlwinm r5,r5,0,0x3 // mask buffer length down to leftover bytes
172 b LwordloopEnter // enter the loop at the load; no word is pending a store yet
173
174 // Inner loop: move a word at a time from source to dest, until one of two conditions:
175 // - a zero byte is found
176 // - end of buffer
177 // At this point, registers are as follows:
178 // r3 = original start of buffer
179 // r4 = source ptr (word aligned)
180 // r5 = bytes leftover in buffer (0..3)
181 // r6 = 0xFEFEFEFF
182 // r7 = 0x80808080
183 // r9 = dest ptr (unaligned)
184 // ctr = whole words left in buffer
185
186 .align 5 // align inner loop, which is 8 words long
187 Lwordloop:
188 stw r8,0(r9) // pack word into destination (word loaded on the previous pass, no 0 in it)
189 addi r9,r9,4 // advance dest ptr
190 LwordloopEnter:
191 lwz r8,0(r4) // r8 <- next 4 source bytes
192 addi r4,r4,4 // advance source ptr
193 add r10,r8,r6 // r10 <- word + 0xFEFEFEFF
194 andc r12,r7,r8 // r12 <- ~word & 0x80808080
195 and. r11,r10,r12 // r11 <- nonzero iff word has a 0-byte
196 bdnzt eq,Lwordloop // loop if ctr!=0 and cr0_eq
197
198 beq-- Lleftovers // skip if no 0-byte found, copy leftovers (r8 is still unstored)
199
200 // Found a 0-byte. Store last word up to and including the 0, a byte at a time, most significant byte first.
201 // r3 = original start of buffer
202 // r8 = last word, known to have a 0-byte
203 // r9 = dest ptr (on exit: one past 0)
204
205 Lstorelastbytes:
206 srwi. r0,r8,24 // right justify next byte and test for 0
207 slwi r8,r8,8 // shift next byte into position
208 stb r0,0(r9) // pack into dest
209 addi r9,r9,1 // advance dest ptr
210 bne Lstorelastbytes // loop until 0 stored (cr0 still from srwi.); then fall into L0stored
211
212 // Append op successful, 0 stored into buffer. Return total length of the string now in the buffer.
213 // r3 = original start of buffer
214 // r9 = dest ptr (one past 0)
215
216 L0stored:
217 sub r3,r9,r3 // get (length+1) of string in buffer
218 subi r3,r3,1 // return length (don't count the 0)
219 blr // return, result in r3
220
221 // 0-byte not found in aligned source words. The buffer has room for up to 3 more
222 // bytes, hopefully the source's 0-byte is among the next ones.
223 // r4 = source ptr (word aligned)
224 // r5 = leftover bytes in buffer (0..3)
225 // r6 = 0xFEFEFEFF
226 // r7 = 0x80808080
227 // r8 = last full word of source
228 // r9 = dest ptr (unaligned)
229
230 Lleftovers:
231 stw r8,0(r9) // store last word
232 addi r9,r9,4 // advance dest ptr past it
233 addi r0,r5,1 // make sure r5 terminates byte loop (not r0)
234 b Lbyteloop // copy remaining bytes, checking r5 first
235
236 // Buffer filled during append without finding the end of source. Overwrite the
237 // last byte in buffer with a 0, and compute how long the concatenated string would
238 // have been, if the buffer had been large enough.
239 // r3 = original start of buffer
240 // r4 = source ptr (1st byte not copied into buffer)
241 // r9 = dest ptr (one past end of buffer)
242
243 Loverrun:
244 sub. r3,r9,r3 // compute #bytes stored in buffer (r3 becomes the running length)
245 li r0,0 // get a 0
246 beq-- Lskip // buffer was 0-length, so there is no byte to overwrite
247 stb r0,-1(r9) // jam in delimiting 0 over the last byte; then fall into Lskip
248
249 // Buffer full, check to see how much longer source is. We don't optimize this,
250 // since overruns are an error. On entry r3 = length so far, r4 = next source byte to count.
251
252 Lskip:
253 lbz r8,0(r4) // get next source byte
254 addi r4,r4,1 // advance source ptr
255 addi r3,r3,1 // increment length of "ideal" string
256 cmpwi r8,0 // 0?
257 bne Lskip // keep counting until the source's 0-byte has been read
258
259 subi r3,r3,1 // don't count 0 in length
260 blr // return length of string we "wanted" to create
261
262 // 0 not found in buffer (append not yet begun.) We don't store a delimiting 0,
263 // but do compute how long the concatenated string would have been, assuming the length
264 // of "dst" is the length of the buffer.
265 // r3 = original start of buffer
266 // r4 = original source ptr
267 // r9 = dest ptr (one past end of buffer)
268
269 L0notfound:
270 sub r3,r9,r3 // compute #bytes in buffer (used as the length of "dst")
271 b Lskip // add strlen(source) to r3
272