ppc/string/strlcat.s

   1 /*
   2  * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
   7  *
   8  * This file contains Original Code and/or Modifications of Original Code
   9  * as defined in and that are subject to the Apple Public Source License
  10  * Version 2.0 (the 'License'). You may not use this file except in
  11  * compliance with the License. Please obtain a copy of the License at
  12  * http://www.opensource.apple.com/apsl/ and read it before using this
  13  * file.
  14  *
  15  * The Original Code and all software distributed under the License are
  16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  20  * Please see the License for the specific language governing rights and
  21  * limitations under the License.
  22  *
  23  * @APPLE_LICENSE_HEADER_END@
  24  */
  25 #define ASSEMBLER
  26 #include <mach/ppc/asm.h>
  27 #undef  ASSEMBLER
  28
  29 // *****************
  30 // * S T R L C A T *
  31 // *****************
  32 //
  33 // size_t       strlcat(char *dst, const char *src, size_t count);
  34 //
  35 // We optimize the move by doing it word parallel.  This introduces
  36 // a complication: if we blindly did word load/stores until finding
  37 // a 0, we might get a spurious page fault by touching bytes past it.
  38 // We are allowed to touch the "count" bytes starting at "dst", but
  39 // when appending the "src", we must not do a "lwz" that crosses a page
  40 // boundary, or store past "count".
  41 //
  42 // The test for 0s relies on the following non-obvious but very efficient
  43 // word-parallel test:
  44 //              x =  dataWord + 0xFEFEFEFF
  45 //              y = ~dataWord & 0x80808080
  46 //              if (x & y) == 0 then no zero found
  47 // The test maps any non-zero byte to zero, and any zero byte to 0x80,
  48 // with one exception: 0x01 bytes immediately preceding a zero byte are
  49 // also mapped to 0x80.
  50 //
  51 // Note that "count" is the total buffer length, including the length
  52 // of the "dst" string.  This is different than strncat().
  53
  54         .text
  55         .globl EXT(strlcat)
  56 // strlcat: r3 = dst, r4 = src, r5 = count (total buffer size); appends src to dst and returns the untruncated length in r3.
  57         .align  5
  58 LEXT(strlcat)
  59         srwi.   r0,r5,2                         // r0 = #whole words in buffer to scan; cr0_eq if none
  60         dcbtst  0,r3                            // touch in dst
  61         lis             r6,hi16(0xFEFEFEFF)     // start to load magic constants
  62         lis             r7,hi16(0x80808080)
  63         dcbt    0,r4                            // touch in source
  64         ori             r6,r6,lo16(0xFEFEFEFF)
  65         ori             r7,r7,lo16(0x80808080)
  66         mr              r9,r3                           // use r9 for dest ptr (r3 remembers dst start)
  67         beq--   L0bytes                         // buffer length <4: no whole words, scan dst bytewise
  68         mtctr   r0                                      // ctr = #words to scan
  69         b               L0words                         // enter word loop
  70
  71 // Loop over words looking for 0.
  72 //              r3 = original start of buffer
  73 //              r4 = source ptr (unaligned)
  74 //              r5 = original buffer size
  75 //              r6 = 0xFEFEFEFF
  76 //              r7 = 0x80808080
  77 //              r9 = dest ptr (unaligned)
  78 //     ctr = #words remaining in buffer
  79
  80         .align  5                                       // align inner loops for speed
  81 L0words:
  82         lwz             r8,0(r9)                        // r8 <- next dest word
  83         addi    r9,r9,4
  84         add             r10,r8,r6                       // r10 <-  word + 0xFEFEFEFF
  85         andc    r12,r7,r8                       // r12 <- ~word & 0x80808080
  86         and.    r11,r10,r12                     // r11 <- nonzero iff word has a 0-byte
  87         bdnzt   eq,L0words                      // loop while ctr!=0 and cr0_eq (no 0 found yet)
  88
  89         beq--   L0bytes                         // no 0 in any whole word: check the 0..3 leftover bytes
  90
  91         slwi    r0,r8,7                         // move 0x01 bits (false hits) into 0x80 position
  92         subi    r9,r9,4                         // back up r9 to the start of the word
  93         andc    r11,r11,r0                      // mask out false hits
  94         cntlzw  r0,r11                          // find 0 byte (r0 = 0, 8, 16, or 24)
  95         srwi    r0,r0,3                         // now r0 = 0, 1, 2, or 3
  96         add             r9,r9,r0                        // now r9 points to the 0-byte in dest
  97         b               L0found                         // start to append source
  98
  99 // Loop over bytes looking for 0.
 100 //              r3 = original start of buffer
 101 //              r4 = source ptr (unaligned)
 102 //              r5 = original buffer size
 103 //              r6 = 0xFEFEFEFF
 104 //              r7 = 0x80808080
 105 //              r9 = dest ptr (unaligned)
 106
 107 L0bytes:
 108         andi.   r0,r5,3                         // get #leftover bytes in buffer (buffer size mod 4)
 109         mtctr   r0                                      // set up byte loop
 110         beq--   L0notfound                      // no leftover bytes, so 0 not found in buffer (error)
 111 L0byteloop:
 112         lbz             r8,0(r9)                        // r8 <- next dest byte
 113         addi    r9,r9,1
 114         cmpwi   r8,0                            // 0 ?
 115         bdnzf   eq,L0byteloop           // loop until 0 found or buffer end
 116
 117         bne--   L0notfound                      // skip if 0 not found (error)
 118         subi    r9,r9,1                         // back up, so r9 points to the 0
 119
 120 // End of dest found, so we can start appending source.  First, align the source,
 121 // in order to avoid spurious page faults.
 122 //              r3 = original start of buffer
 123 //              r4 = original source ptr (unaligned)
 124 //              r5 = original buffer size
 125 //              r6 = 0xFEFEFEFF
 126 //              r7 = 0x80808080
 127 //              r9 = ptr to 0-byte in dest (unaligned)
 128
 129 L0found:
 130         andi.   r0,r4,3                         // is source aligned? (cr0 is tested by the beq below)
 131         add             r5,r5,r3                        // get ptr to end of buffer
 132         sub             r5,r5,r9                        // get #bytes remaining in buffer, counting the 0 (r5>0)
 133         beq             Laligned                        // skip if source already word aligned
 134         subfic  r0,r0,4                         // not aligned, get #bytes to align r4
 135         b               Lbyteloop1                      // r5!=0, so skip check
 136
 137 // Copy min(r0,r5) bytes, until 0-byte.
 138 //              r0 = #bytes we propose to copy (NOTE: must be >0)
 139 //              r4 = source ptr (unaligned)
 140 //              r5 = length remaining in buffer (may be 0)
 141 //              r6 = 0xFEFEFEFF
 142 //              r7 = 0x80808080
 143 //              r9 = dest ptr (unaligned)
 144
 145 Lbyteloop:
 146         cmpwi   r5,0                            // no room left in buffer? (r5 is unsigned: only equality with 0 is tested)
 147         beq--   Loverrun                        // buffer filled before end of source reached
 148 Lbyteloop1:                                                     // entry when we know r5!=0
 149         lbz             r8,0(r4)                        // r8 <- next source byte
 150         subic.  r0,r0,1                         // decrement count of bytes to move (sets cr0 for the bne below)
 151         addi    r4,r4,1
 152         subi    r5,r5,1                         // decrement buffer length remaining
 153         stb             r8,0(r9)                        // pack into dest
 154         cmpwi   cr1,r8,0                        // 0-byte? (uses cr1 so cr0 from subic. is preserved)
 155         addi    r9,r9,1
 156         beq             cr1,L0stored            // byte was 0, so done
 157         bne             Lbyteloop                       // r0!=0, source not yet aligned
 158
 159 // Source is word aligned.  Loop over words until 0-byte found or end
 160 // of buffer.
 161 //              r3 = original start of buffer
 162 //              r4 = source ptr (word aligned)
 163 //              r5 = length remaining in buffer
 164 //              r6 = 0xFEFEFEFF
 165 //              r7 = 0x80808080
 166 //              r9 = dest ptr (unaligned)
 167
 168 Laligned:
 169         srwi.   r8,r5,2                         // get #whole words remaining in buffer
 170         addi    r0,r5,1                         // if no words, r0 = r5+1 so r5 (not r0) ends the byte loop...
 171         beq--   Lbyteloop                       // ...copy bytewise to end of buffer
 172         mtctr   r8                                      // set up word loop count
 173         rlwinm  r5,r5,0,0x3                     // mask buffer length down to leftover bytes
 174         b               LwordloopEnter
 175
 176 // Inner loop: move a word at a time, until one of two conditions:
 177 //              - a zero byte is found
 178 //              - end of buffer
 179 // At this point, registers are as follows:
 180 //              r3 = original start of buffer
 181 //              r4 = source ptr (word aligned)
 182 //              r5 = bytes leftover in buffer (0..3)
 183 //              r6 = 0xFEFEFEFF
 184 //              r7 = 0x80808080
 185 //              r9 = dest ptr (unaligned)
 186 //     ctr = whole words left in buffer
 187
 188         .align  5                                       // align inner loop, which is 8 instructions long
 189 Lwordloop:
 190         stw             r8,0(r9)                        // pack word into destination
 191         addi    r9,r9,4
 192 LwordloopEnter:
 193         lwz             r8,0(r4)                        // r8 <- next 4 source bytes
 194         addi    r4,r4,4
 195         add             r10,r8,r6                       // r10 <-  word + 0xFEFEFEFF
 196         andc    r12,r7,r8                       // r12 <- ~word & 0x80808080
 197         and.    r11,r10,r12                     // r11 <- nonzero iff word has a 0-byte
 198         bdnzt   eq,Lwordloop            // loop if ctr!=0 and cr0_eq
 199
 200         beq--   Lleftovers                      // skip if no 0-byte found, copy leftovers
 201
 202 // Found a 0-byte.  Store last word up to and including the 0, a byte at a time.
 203 //              r3 = original start of buffer
 204 //              r8 = last word, known to have a 0-byte
 205 //              r9 = dest ptr (where the first byte of r8 is to be stored)
 206
 207 Lstorelastbytes:
 208         srwi.   r0,r8,24                        // right justify next byte and test for 0
 209         slwi    r8,r8,8                         // shift next byte into position
 210         stb             r0,0(r9)                        // pack into dest
 211         addi    r9,r9,1
 212         bne             Lstorelastbytes         // loop until 0 stored
 213
 214 // Append op successful, 0 stored into buffer.  Return total length.
 215 //              r3 = original start of buffer
 216 //              r9 = dest ptr (one past 0)
 217
 218 L0stored:
 219         sub             r3,r9,r3                        // get (length+1) of string in buffer
 220         subi    r3,r3,1                         // return length
 221         blr
 222
 223 // 0-byte not found in aligned source words.  The buffer has room for up to 3 more
 224 // bytes; hopefully the source's 0-byte is among the next bytes copied.
 225 //              r4 = source ptr (word aligned)
 226 //              r5 = leftover bytes in buffer (0..3)
 227 //              r6 = 0xFEFEFEFF
 228 //              r7 = 0x80808080
 229 //              r8 = last full word of source
 230 //              r9 = dest ptr (unaligned)
 231
 232 Lleftovers:
 233         stw             r8,0(r9)                        // store last word
 234         addi    r9,r9,4
 235         addi    r0,r5,1                         // make sure r5 terminates byte loop (not r0)
 236         b               Lbyteloop
 237
 238 // Buffer filled during append without finding the end of source.  Overwrite the
 239 // last byte in buffer with a 0, and compute how long the concatenated string would
 240 // have been, if the buffer had been large enough.
 241 //              r3 = original start of buffer
 242 //              r4 = source ptr (1st byte not copied into buffer)
 243 //              r9 = dest ptr (one past end of buffer)
 244
 245 Loverrun:
 246         sub.    r3,r9,r3                        // compute #bytes stored in buffer
 247         li              r0,0                            // get a 0
 248         beq--   Lskip                           // buffer was 0-length
 249         stb             r0,-1(r9)                       // jam in delimiting 0
 250
 251 // Buffer full, check to see how much longer source is.  We don't optimize this,
 252 // since overruns are an error.
 253
 254 Lskip:
 255         lbz             r8,0(r4)                        // get next source byte
 256         addi    r4,r4,1
 257         addi    r3,r3,1                         // increment length of "ideal" string
 258         cmpwi   r8,0                            // 0?
 259         bne             Lskip
 260
 261         subi    r3,r3,1                         // don't count 0 in length
 262         blr                                                     // return length of string we "wanted" to create
 263
 264 // 0 not found in buffer (append not yet begun.)  We don't store a delimiting 0,
 265 // but do compute how long the concatenated string would have been, assuming the length
 266 // of "dst" is the length of the buffer.
 267 //              r3 = original start of buffer
 268 //              r4 = original source ptr
 269 //              r9 = dest ptr (one past end of buffer)
 270
 271 L0notfound:
 272         sub             r3,r9,r3                        // compute #bytes in buffer
 273         b               Lskip                           // add strlen(source) to r3
 274