ppc/string/strlcat.s

   1 /*
   2  * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 #define ASSEMBLER
  24 #include <mach/ppc/asm.h>
  25 #undef  ASSEMBLER
  26
  27 #define __APPLE_API_PRIVATE
  28 #include <machine/cpu_capabilities.h>
  29 #undef  __APPLE_API_PRIVATE
  30
  31 /* We use mode-independent "g" opcodes such as "srgi".  These expand
  32  * into word operations when targeting __ppc__, and into doubleword
  33  * operations when targeting __ppc64__.
  34  */
  35 #include <architecture/ppc/mode_independent_asm.h>
  36
  37
  38 // *****************
  39 // * S T R L C A T *
  40 // *****************
  41 //
  42 // size_t       strlcat(char *dst, const char *src, size_t count);
  43 //
  44 // We optimize the move by doing it word parallel.  This introduces
  45 // a complication: if we blindly did word load/stores until finding
  46 // a 0, we might get a spurious page fault by touching bytes past it.
  47 // We are allowed to touch the "count" bytes starting at "dst", but
  48 // when appending the "src", we must not do a "lwz" that crosses a page
  49 // boundary, or store past "count".
  50 //
   51 // The test for 0s relies on the following non-obvious but very efficient
   52 // word-parallel test:
  53 //              x =  dataWord + 0xFEFEFEFF
  54 //              y = ~dataWord & 0x80808080
  55 //              if (x & y) == 0 then no zero found
   56 // The test maps any non-zero byte to zero, and any zero byte to 0x80,
   57 // with one exception: 0x01 bytes preceding the first zero are also
   58 // mapped to 0x80.
  59 //
  60 // Note that "count" is the total buffer length, including the length
  61 // of the "dst" string.  This is different than strncat().
  62 //
  63 // In 64-bit mode, this algorithm is doubleword parallel.
  64
   65         .text
   66         .globl EXT(strlcat)
   67
      // Entry point.  Register use on entry, per the prototype and the register
      // maps further down in this file:
      //              r3 = dst (start of buffer)
      //              r4 = src
      //              r5 = count (total buffer size)
      // Sets up the zero-byte-test constants in r6/r7 and the scan pointer in r9,
      // then starts looking for the 0 that terminates "dst".
   68         .align  5
   69 LEXT(strlcat)                       // size_t strlcat(char *dst, const char *src, size_t count);
   70         srgi.   r0,r5,LOG2_GPR_BYTES// get #words or doublewords to scan (cr0 is tested by the beq-- below)
   71 #if defined(__ppc__)
   72         lis             r6,hi16(0xFEFEFEFF)     // start to generate 32-bit magic constants
   73         lis             r7,hi16(0x80808080)
   74         ori             r6,r6,lo16(0xFEFEFEFF)
   75         ori             r7,r7,lo16(0x80808080)
   76 #else
   77         ld              r6,_COMM_PAGE_MAGIC_FE(0)       // get 0xFEFEFEFE FEFEFEFF from commpage
   78         ld              r7,_COMM_PAGE_MAGIC_80(0)       // get 0x80808080 80808080 from commpage
   79 #endif
   80         mr              r9,r3                           // use r9 for dest ptr (r3 remembers dst start)
   81         beq--   L0bytes                         // buffer holds no full word or doubleword, scan it bytewise
   82         mtctr   r0                                      // set up loop (ctr = #words or doublewords)
   83         b               L0words                         // enter word loop
  84
   85 // Loop over words (or doublewords) of the dest buffer looking for a 0-byte.
   86 //              r3 = original start of buffer
   87 //              r4 = source ptr (unaligned)
   88 //              r5 = original buffer size
   89 //              r6 = 0xFEFEFEFF
   90 //              r7 = 0x80808080
   91 //              r9 = dest ptr (unaligned)
   92 //     ctr = #words or doublewords remaining in buffer
   93
   94         .align  5                                       // align inner loops for speed
   95 L0words:
   96         lg              r8,0(r9)                        // r8 <- next dest word or doubleword
   97         addi    r9,r9,GPR_BYTES
   98         add             r10,r8,r6                       // r10 <-  word + 0xFEFEFEFF
   99         andc    r12,r7,r8                       // r12 <- ~word & 0x80808080
  100         and.    r11,r10,r12                     // r11 <- nonzero iff word has a 0-byte
  101         bdnzt   eq,L0words                      // loop while ctr!=0 and no 0 found (cr0_eq set)
  102
  103         beq--   L0bytes                         // no 0 in any full word, go check the leftover bytes
  104
      // The last word loaded contains a 0-byte.  r11 has 0x80 set in each byte
      // position that is 0, plus possibly in 0x01 bytes (false hits), which are
      // removed before locating the first flagged byte.
  105         slgi    r0,r8,7                         // move 0x01 bits (false hits) into 0x80 position
  106         subi    r9,r9,GPR_BYTES     // back up r9 to the start of the word
  107         andc    r11,r11,r0                      // mask out false hits
  108         cntlzg  r0,r11                          // find 0 byte (r0 = 0, 8, 16, or 24; up to 56 for a doubleword)
  109         srwi    r0,r0,3                         // now r0 = byte index of the 0 (0..3, or 0..7 for a doubleword)
  110         add             r9,r9,r0                        // now r9 points to the 0-byte in dest
  111         b               L0found                         // start to append source
 112
  113 // Loop over the leftover bytes of the dest buffer (those not covered by the
      // word loop) looking for 0.
  114 //              r3 = original start of buffer
  115 //              r4 = source ptr (unaligned)
  116 //              r5 = original buffer size
  117 //              r6 = 0xFEFEFEFF
  118 //              r7 = 0x80808080
  119 //              r9 = dest ptr (unaligned)
  120
  121 L0bytes:
  122         andi.   r0,r5,GPR_BYTES-1   // get #leftover bytes in buffer: count & (GPR_BYTES-1)
  123         mtctr   r0                                      // set up byte loop
  124         beq--   L0notfound                      // no leftover bytes, so 0 not found in buffer (error)
  125 L0byteloop:
  126         lbz             r8,0(r9)                        // r8 <- next dest byte
  127         addi    r9,r9,1
  128         cmpwi   r8,0                            // 0 ?
  129         bdnzf   eq,L0byteloop           // loop while ctr!=0 and byte was not 0
  130
  131         bne--   L0notfound                      // skip if 0 not found (error)
  132         subi    r9,r9,1                         // back up, so r9 points to the 0
      // falls through into L0found
 133
  134 // End of dest found, so we can start appending source.  First, align the source,
  135 // in order to avoid spurious page faults.
  136 //              r3 = original start of buffer
  137 //              r4 = original source ptr (unaligned)
  138 //              r5 = original buffer size
  139 //              r6 = 0xFEFEFEFF
  140 //              r7 = 0x80808080
  141 //              r9 = ptr to 0-byte in dest (unaligned)
  142
  143 L0found:
  144         andi.   r0,r4,GPR_BYTES-1   // is source aligned? (cr0 is tested by the beq below)
  145         add             r5,r5,r3                        // get ptr to end of buffer
  146         sub             r5,r5,r9                        // get #bytes remaining in buffer, counting the 0 (r5>0)
  147         beq             Laligned                        // skip if source already word aligned
  148         subfic  r0,r0,GPR_BYTES     // not aligned, get #bytes to align r4 (GPR_BYTES - misalignment)
  149         b               Lbyteloop1                      // r5!=0, so skip check
 150
  151 // Copy min(r0,r5) bytes, until 0-byte.
  152 //              r0 = #bytes we propose to copy (NOTE: must be >0)
  153 //              r4 = source ptr (unaligned)
  154 //              r5 = length remaining in buffer (may be 0)
  155 //              r6 = 0xFEFEFEFF
  156 //              r7 = 0x80808080
  157 //              r9 = dest ptr (unaligned)
      // Inside the loop cr0 (from subic.) says whether r0 has reached 0, and cr1
      // says whether the byte just copied was the terminating 0.
  158
  159 Lbyteloop:
  160         cmpgi   r5,0                            // buffer empty? (note: length is unsigned)
  161         beq--   Loverrun                        // buffer filled before end of source reached
  162 Lbyteloop1:                                                     // entry when we know r5!=0
  163         lbz             r8,0(r4)                        // r8 <- next source byte
  164         subic.  r0,r0,1                         // decrement count of bytes to move
  165         addi    r4,r4,1
  166         subi    r5,r5,1                         // decrement buffer length remaining
  167         stb             r8,0(r9)                        // pack into dest
  168         cmpwi   cr1,r8,0                        // 0-byte?
  169         addi    r9,r9,1
  170         beq             cr1,L0stored            // byte was 0, so done
  171         bne             Lbyteloop                       // r0!=0, source not yet aligned
      // r0 reached 0: falls through into Laligned
 172
  173 // Source is aligned.  Loop over words or doublewords until 0-byte found or end
  174 // of buffer.
  175 //              r3 = original start of buffer
  176 //              r4 = source ptr (aligned)
  177 //              r5 = length remaining in buffer
  178 //              r6 = 0xFEFEFEFF
  179 //              r7 = 0x80808080
  180 //              r9 = dest ptr (unaligned)
  181
  182 Laligned:
  183         srgi.   r8,r5,LOG2_GPR_BYTES// get #words or doublewords in buffer
  184         addi    r0,r5,1                         // if no words: r0 = r5+1, so r5 (not r0) ends the byte loop
  185         beq--   Lbyteloop                       // ...copy to end of buffer
  186         mtctr   r8                                      // set up loop count
  187         rlwinm  r5,r5,0,GPR_BYTES-1 // mask buffer length down to leftover bytes
  188         b               LwordloopEnter                  // enter loop at the load, nothing to store yet
 189
  190 // Inner loop: move a word or doubleword at a time, until one of two conditions:
  191 //              - a zero byte is found
  192 //              - end of buffer
      // The store is at the top of the loop, so a word is only written to dest
      // after it has been tested and found to contain no 0-byte.
  193 // At this point, registers are as follows:
  194 //              r3 = original start of buffer
  195 //              r4 = source ptr (aligned)
  196 //              r5 = bytes leftover in buffer (0..GPR_BYTES-1)
  197 //              r6 = 0xFEFEFEFF
  198 //              r7 = 0x80808080
  199 //              r9 = dest ptr (unaligned)
  200 //     ctr = loop count
  201
  202         .align  5                                       // align inner loop, which is 8 words long
  203 Lwordloop:
  204         stg             r8,0(r9)                        // pack word into destination
  205         addi    r9,r9,GPR_BYTES
  206 LwordloopEnter:
  207         lg              r8,0(r4)                        // r8 <- next 4 or 8 source bytes
  208         addi    r4,r4,GPR_BYTES
  209         add             r10,r8,r6                       // r10 <-  word + 0xFEFEFEFF
  210         andc    r12,r7,r8                       // r12 <- ~word & 0x80808080
  211         and.    r11,r10,r12                     // r11 <- nonzero iff word has a 0-byte
  212         bdnzt   eq,Lwordloop            // loop if ctr!=0 and cr0_eq
  213
  214         beq--   Lleftovers                      // skip if no 0-byte found, copy leftovers
      // 0-byte found in r8: falls through into Lstorelastbytes
 215
  216 // Found a 0-byte.  Store last word up to and including the 0, a byte at a time,
      // taking the most-significant byte of r8 first.
  217 //              r3 = original start of buffer
  218 //              r8 = last word, known to have a 0-byte
  219 //              r9 = dest ptr (where the next byte is to be stored)
  220
  221 Lstorelastbytes:
  222         srgi.   r0,r8,GPR_BYTES*8-8 // right justify next byte and test for 0 (cr0 is tested by the bne below)
  223         slgi    r8,r8,8                         // shift next byte into position
  224         stb             r0,0(r9)                        // pack into dest
  225         addi    r9,r9,1
  226         bne             Lstorelastbytes         // loop until 0 stored
      // 0 has been stored: falls through into L0stored
 227
  228 // Append op successful, 0 stored into buffer.  Return total length.
  229 //              r3 = original start of buffer
  230 //              r9 = dest ptr (one past 0)
  231
  232 L0stored:
  233         sub             r3,r9,r3                        // get (length+1) of string in buffer
  234         subi    r3,r3,1                         // return length (the 0 is not counted)
  235         blr
 236
  237 // 0-byte not found in aligned source words.  There are up to GPR_BYTES-1 leftover
  238 // bytes of room in the buffer, hopefully the source's 0-byte fits within them.
  239 //              r4 = source ptr (aligned)
  240 //              r5 = leftover bytes in buffer (0..GPR_BYTES-1)
  241 //              r6 = 0xFEFEFEFF
  242 //              r7 = 0x80808080
  243 //              r8 = last full word or doubleword of source
  244 //              r9 = dest ptr (unaligned)
  245
  246 Lleftovers:
  247         stg             r8,0(r9)                        // store last word (loop exited before its store)
  248         addi    r9,r9,GPR_BYTES
  249         addi    r0,r5,1                         // make sure r5 terminates byte loop (not r0)
  250         b               Lbyteloop                       // copy bytes until 0 stored or buffer full
 251
  252 // Buffer filled during append without finding the end of source.  Overwrite the
  253 // last byte in buffer with a 0, and compute how long the concatenated string would
  254 // have been, if the buffer had been large enough.
  255 //              r3 = original start of buffer
  256 //              r4 = source ptr (1st byte not copied into buffer)
  257 //              r9 = dest ptr (one past end of buffer)
      // The byte that gets overwritten by the 0 stays counted in r3, so the length
      // returned still covers every source byte.
  258
  259 Loverrun:
  260         sub.    r3,r9,r3                        // compute #bytes stored in buffer
  261         li              r0,0                            // get a 0
  262         beq--   Lskip                           // buffer was 0-length, nothing to terminate
  263         stb             r0,-1(r9)                       // jam in delimiting 0
  264
  265 // Buffer full, check to see how much longer source is.  We don't optimize this,
  266 // since overruns are an error.
      //              r3 = length accumulated so far
      //              r4 = ptr to next source byte to examine
  267
  268 Lskip:
  269         lbz             r8,0(r4)                        // get next source byte
  270         addi    r4,r4,1
  271         addi    r3,r3,1                         // increment length of "ideal" string
  272         cmpwi   r8,0                            // 0?
  273         bne             Lskip
  274
  275         subi    r3,r3,1                         // don't count 0 in length
  276         blr                                                     // return length of string we "wanted" to create
 277
  278 // 0 not found in buffer (append not yet begun.)  We don't store a delimiting 0,
  279 // but do compute how long the concatenated string would have been, assuming the length
  280 // of "dst" is the length of the buffer.
  281 //              r3 = original start of buffer
  282 //              r4 = original source ptr
  283 //              r9 = dest ptr (one past end of buffer)
  284
  285 L0notfound:
  286         sub             r3,r9,r3                        // compute #bytes in buffer
  287         b               Lskip                           // add strlen(source) to r3, then return it
 288