ppc/string/strlcat.s

   1 /*
   2  * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 #define ASSEMBLER
  24 #include <mach/ppc/asm.h>
  25 #undef  ASSEMBLER
  26
  27 // *****************
  28 // * S T R L C A T *
  29 // *****************
  30 //
  31 // size_t       strlcat(char *dst, const char *src, size_t count);
  32 //
  33 // We optimize the move by doing it word parallel.  This introduces
  34 // a complication: if we blindly did word load/stores until finding
  35 // a 0, we might get a spurious page fault by touching bytes past it.
  36 // We are allowed to touch the "count" bytes starting at "dst", but
  37 // when appending the "src", we must not do a "lwz" that crosses a page
  38 // boundary, or store past "count".
  39 //
  40 // The test for 0s relies on the following non-obvious but very efficient
  41 // word-parallel test:
  42 //              x =  dataWord + 0xFEFEFEFF
  43 //              y = ~dataWord & 0x80808080
  44 //              if (x & y) == 0 then no zero found
  45 // The test maps any non-zero byte to zero, and any zero byte to 0x80,
  46 // with one exception: 0x01 bytes immediately preceding a zero byte are
  47 // also mapped to 0x80 (a false hit that only occurs when a real 0 is present).
  48 //
  49 // Note that "count" is the total buffer length, including the length
  50 // of the "dst" string.  This is different than strncat().
  51
  52         .text
  53         .globl EXT(strlcat)
  54
     // strlcat: append "src" to the 0-terminated string in "dst", never writing
     // past the "count" bytes of the dst buffer.
     // Entry:
     //              r3 = dst   (start of destination buffer)
     //              r4 = src   (string to append)
     //              r5 = count (total size of the dst buffer, in bytes)
     // Exit:
     //              r3 = length of the concatenated string, or, if the buffer was
     //                   too small, the length it would have had.  If no 0 is found
     //                   in the first "count" bytes of dst, nothing is stored and
     //                   count + strlen(src) is returned.
     // Registers written: r0, r3-r12, cr0, cr1, ctr.  No stack accesses are made.
  55         .align  5
  56 LEXT(strlcat)
  57         srwi.   r0,r5,2                         // get #words to scan (cr0 tested by beq-- below)
  58         dcbtst  0,r3                            // touch in dst
  59         lis             r6,hi16(0xFEFEFEFF)     // start to load magic constants
  60         lis             r7,hi16(0x80808080)
  61         dcbt    0,r4                            // touch in source
  62         ori             r6,r6,lo16(0xFEFEFEFF)  // r6 <- 0xFEFEFEFF
  63         ori             r7,r7,lo16(0x80808080)  // r7 <- 0x80808080
  64         mr              r9,r3                           // use r9 for dest ptr (r3 remembers dst start)
  65         beq--   L0bytes                         // buffer length <4, so no whole words to scan
  66         mtctr   r0                                      // set up loop
  67         b               L0words                         // enter word loop
  68
  69 // Loop over words of the dst buffer looking for the 0 that ends "dst".
  70 //              r3 = original start of buffer
  71 //              r4 = source ptr (unaligned)
  72 //              r5 = original buffer size
  73 //              r6 = 0xFEFEFEFF
  74 //              r7 = 0x80808080
  75 //              r9 = dest ptr (unaligned)
  76 //     ctr = #words remaining in buffer
  77
  78         .align  5                                       // align inner loops for speed
  79 L0words:
  80         lwz             r8,0(r9)                        // r8 <- next dest word
  81         addi    r9,r9,4                         // r9 now points past the word just loaded
  82         add             r10,r8,r6                       // r10 <-  word + 0xFEFEFEFF
  83         andc    r12,r7,r8                       // r12 <- ~word & 0x80808080
  84         and.    r11,r10,r12                     // r11 <- nonzero iff word has a 0-byte
  85         bdnzt   eq,L0words                      // --ctr; loop while ctr!=0 and no 0 found (cr0_eq)
  86
  87         beq--   L0bytes                         // no 0 in any whole word, so scan the leftover bytes
  88
  89         slwi    r0,r8,7                         // move 0x01 bits (false hits) into 0x80 position
  90         subi    r9,r9,4                         // back up r9 to the start of the word
  91         andc    r11,r11,r0                      // mask out false hits
  92         cntlzw  r0,r11                          // find first 0 byte (r0 = 0, 8, 16, or 24)
  93         srwi    r0,r0,3                         // now r0 = 0, 1, 2, or 3 (byte offset, high-order byte first)
  94         add             r9,r9,r0                        // now r9 points to the 0-byte in dest
  95         b               L0found                         // start to append source
  96
  97 // Loop over the last (count & 3) bytes of the dst buffer looking for 0.
  98 //              r3 = original start of buffer
  99 //              r4 = source ptr (unaligned)
 100 //              r5 = original buffer size
 101 //              r6 = 0xFEFEFEFF
 102 //              r7 = 0x80808080
 103 //              r9 = dest ptr (unaligned)
 104
 105 L0bytes:
 106         andi.   r0,r5,3                         // get #bytes remaining in buffer
 107         mtctr   r0                                      // set up byte loop
 108         beq--   L0notfound                      // no leftover bytes to scan, so no 0 in buffer (error)
 109 L0byteloop:
 110         lbz             r8,0(r9)                        // r8 <- next dest byte
 111         addi    r9,r9,1
 112         cmpwi   r8,0                            // 0 ?
 113         bdnzf   eq,L0byteloop           // --ctr; loop while ctr!=0 and byte was not 0
 114
 115         bne--   L0notfound                      // skip if 0 not found (error)
 116         subi    r9,r9,1                         // back up, so r9 points to the 0
 117
 118 // End of dest found, so we can start appending source.  First, align the source,
 119 // in order to avoid spurious page faults.
 120 //              r3 = original start of buffer
 121 //              r4 = original source ptr (unaligned)
 122 //              r5 = original buffer size
 123 //              r6 = 0xFEFEFEFF
 124 //              r7 = 0x80808080
 125 //              r9 = ptr to 0-byte in dest (unaligned)
 126
 127 L0found:
 128         andi.   r0,r4,3                         // is source aligned? (cr0 tested by beq below)
 129         add             r5,r5,r3                        // get ptr to end of buffer
 130         sub             r5,r5,r9                        // get #bytes remaining in buffer, counting the 0 (r5>0)
 131         beq             Laligned                        // skip if source already word aligned
 132         subfic  r0,r0,4                         // not aligned, get #bytes to align r4 (1..3)
 133         b               Lbyteloop1                      // r5!=0, so skip check
 134
 135 // Copy min(r0,r5) bytes, until 0-byte.  Used both to align the source and to
 136 // fill the last few bytes of the buffer (callers then pass r0 = r5+1).
 137 //              r0 = #bytes we propose to copy (NOTE: must be >0)
 138 //              r4 = source ptr (unaligned)
 139 //              r5 = length remaining in buffer (may be 0)
 140 //              r6 = 0xFEFEFEFF
 141 //              r7 = 0x80808080
 142 //              r9 = dest ptr (unaligned)
 143
 144 Lbyteloop:
 145         cmpwi   r5,0                            // buffer full? (r5 is unsigned; only equality with 0 is tested)
 146         beq--   Loverrun                        // buffer filled before end of source reached
 147 Lbyteloop1:                                                     // entry when we know r5!=0
 148         lbz             r8,0(r4)                        // r8 <- next source byte
 149         subic.  r0,r0,1                         // decrement count of bytes to move (cr0 tested by bne below)
 150         addi    r4,r4,1
 151         subi    r5,r5,1                         // decrement buffer length remaining
 152         stb             r8,0(r9)                        // pack into dest
 153         cmpwi   cr1,r8,0                        // 0-byte? (uses cr1 so cr0 from subic. is preserved)
 154         addi    r9,r9,1
 155         beq             cr1,L0stored            // byte was 0, so done
 156         bne             Lbyteloop                       // r0!=0, keep copying bytes; else fall into Laligned
 157
 158 // Source is word aligned.  Loop over words until 0-byte found or end
 159 // of buffer.
 160 //              r3 = original start of buffer
 161 //              r4 = source ptr (word aligned)
 162 //              r5 = length remaining in buffer
 163 //              r6 = 0xFEFEFEFF
 164 //              r7 = 0x80808080
 165 //              r9 = dest ptr (unaligned)
 166
 167 Laligned:
 168         srwi.   r8,r5,2                         // get #words in buffer
 169         addi    r0,r5,1                         // if no words, set r0>r5 so that r5 ends the byte loop...
 170         beq--   Lbyteloop                       // ...copy to end of buffer
 171         mtctr   r8                                      // set up word loop count
 172         rlwinm  r5,r5,0,0x3                     // mask buffer length down to leftover bytes
 173         b               LwordloopEnter
 174
 175 // Inner loop: move a word at a time, until one of two conditions:
 176 //              - a zero byte is found
 177 //              - end of buffer
 178 // Each word is tested before it is stored; the store of a word happens at the
 179 // top of the following iteration.
 180 // At this point, registers are as follows:
 181 //              r3 = original start of buffer
 182 //              r4 = source ptr (word aligned)
 183 //              r5 = bytes leftover in buffer (0..3)
 184 //              r6 = 0xFEFEFEFF
 185 //              r7 = 0x80808080
 186 //              r9 = dest ptr (unaligned)
 187 //     ctr = whole words left in buffer
 188
 189         .align  5                                       // align inner loop, which is 8 words long
 190 Lwordloop:
 191         stw             r8,0(r9)                        // pack word into destination
 192         addi    r9,r9,4
 193 LwordloopEnter:
 194         lwz             r8,0(r4)                        // r8 <- next 4 source bytes
 195         addi    r4,r4,4
 196         add             r10,r8,r6                       // r10 <-  word + 0xFEFEFEFF
 197         andc    r12,r7,r8                       // r12 <- ~word & 0x80808080
 198         and.    r11,r10,r12                     // r11 <- nonzero iff word has a 0-byte
 199         bdnzt   eq,Lwordloop            // loop if ctr!=0 and cr0_eq
 200
 201         beq--   Lleftovers                      // skip if no 0-byte found, copy leftovers
 202
 203 // Found a 0-byte.  Store last word up to and including the 0, a byte at a time.
 204 //              r3 = original start of buffer
 205 //              r8 = last word, known to have a 0-byte (not yet stored)
 206 //              r9 = dest ptr (where the first byte of r8 goes)
 207
 208 Lstorelastbytes:
 209         srwi.   r0,r8,24                        // right justify next byte and test for 0
 210         slwi    r8,r8,8                         // shift next byte into position
 211         stb             r0,0(r9)                        // pack into dest
 212         addi    r9,r9,1
 213         bne             Lstorelastbytes         // loop until 0 stored (cr0 still from srwi.)
 214
 215 // Append op successful, 0 stored into buffer.  Return total length.
 216 //              r3 = original start of buffer
 217 //              r9 = dest ptr (one past 0)
 218
 219 L0stored:
 220         sub             r3,r9,r3                        // get (length+1) of string in buffer
 221         subi    r3,r3,1                         // return length
 222         blr
 223
 224 // 0-byte not found in aligned source words.  The buffer has room for up to 3
 225 // more bytes, hopefully the source's 0-byte is among the next ones.
 226 //              r4 = source ptr (word aligned)
 227 //              r5 = leftover bytes in buffer (0..3)
 228 //              r6 = 0xFEFEFEFF
 229 //              r7 = 0x80808080
 230 //              r8 = last full word loaded from source (not yet stored)
 231 //              r9 = dest ptr (unaligned)
 232
 233 Lleftovers:
 234         stw             r8,0(r9)                        // store last word
 235         addi    r9,r9,4
 236         addi    r0,r5,1                         // make sure r5 terminates byte loop (not r0)
 237         b               Lbyteloop
 238
 239 // Buffer filled during append without finding the end of source.  Overwrite the
 240 // last byte in buffer with a 0, and compute how long the concatenated string would
 241 // have been, if the buffer had been large enough.
 242 //              r3 = original start of buffer
 243 //              r4 = source ptr (1st byte not copied into buffer)
 244 //              r9 = dest ptr (one past end of buffer)
 245
 246 Loverrun:
 247         sub.    r3,r9,r3                        // compute #bytes stored in buffer
 248         li              r0,0                            // get a 0
 249         beq--   Lskip                           // buffer was 0-length, so there is no byte to overwrite
 250         stb             r0,-1(r9)                       // jam in delimiting 0
 251
 252 // Buffer full, check to see how much longer source is.  We don't optimize this,
 253 // since overruns are an error.
 254 //              r3 = length accumulated so far
 255 //              r4 = ptr to next source byte to count
 256
 257 Lskip:
 258         lbz             r8,0(r4)                        // get next source byte
 259         addi    r4,r4,1
 260         addi    r3,r3,1                         // increment length of "ideal" string
 261         cmpwi   r8,0                            // 0?
 262         bne             Lskip
 263
 264         subi    r3,r3,1                         // don't count 0 in length
 265         blr                                                     // return length of string we "wanted" to create
 266
 267 // 0 not found in buffer (append not yet begun.)  We don't store a delimiting 0,
 268 // but do compute how long the concatenated string would have been, assuming the length
 269 // of "dst" is the length of the buffer.
 270 //              r3 = original start of buffer
 271 //              r4 = original source ptr
 272 //              r9 = dest ptr (one past end of buffer)
 273
 274 L0notfound:
 275         sub             r3,r9,r3                        // compute #bytes in buffer
 276         b               Lskip                           // add strlen(source) to r3
 272