i386/string/strlcpy.s

   1 /*
   2  * Copyright (c) 2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24
  25 // *****************
  26 // * S T R L C P Y *
  27 // *****************
  28 //
  29 // size_t  strlcpy(char *dst, const char *src, size_t size);
  30 //
  31 // We optimize the move by doing it word parallel.  This introduces
  32 // a complication: if we blindly did word load/stores until finding
  33 // a 0, we might get a spurious page fault by touching bytes past it.
  34 // To avoid this, we never do a load that crosses a page boundary,
  35 // or store unnecessary bytes.
  36 //
  37 // The test for 0s relies on the following inobvious but very efficient
  38 // word-parallel test:
  39 //              x =  dataWord + 0xFEFEFEFF
  40 //              y = ~dataWord & 0x80808080
  41 //              if (x & y) == 0 then no zero found
  42 // The test maps any non-zero byte to zero, and any zero byte to 0x80,
  43 // with one exception: 0x01 bytes preceeding the first zero are also
  44 // mapped to 0x80.
  45 //
  46 // On Core2 class machines, this word-parallel implementation seems to
  47 // be slightly faster than using SSE up to about 100 bytes.
  48 // It is faster than the naive byte-by-byte implementation for
  49 // operands longer than about 8 bytes.
  50
  51         .text
  52         .globl _strlcpy
  53
  54         .align  4
  55 _strlcpy:                               // size_t *strlcpy(char *dst, const char *src, size_t size);
  56         pushl   %edi
  57         pushl   %esi
  58         pushl   %ebx
  59         movl    16(%esp),%edi           // get dest ptr
  60         movl    20(%esp),%esi           // get source ptr
  61         movl    24(%esp),%ecx           // get length of buffer
  62         movl    %esi,%edx               // copy source ptr
  63         negl    %edx
  64         andl    $3,%edx                 // how many bytes to align source ptr?
  65         jz      LAligned                // already aligned
  66
  67
  68 // Loop over bytes.
  69 //      %edi = dest ptr
  70 //      %esi = source ptr
  71 //      %ecx = length remaining in buffer
  72 //      %edx = number of bytes to copy (>0, may not fit in buffer)
  73
  74 LLoopOverBytes:
  75         movzb   (%esi),%eax             // get source byte before checking buffer length
  76         testl   %ecx,%ecx               // buffer full?
  77         jz      L0NotFound              // yes
  78         inc     %esi
  79         dec     %ecx
  80         movb    %al,(%edi)              // pack into dest
  81         inc     %edi
  82         testl   %eax,%eax               // 0?
  83         jz      LDone                   // yes, done
  84         dec     %edx                    // more to go?
  85         jnz     LLoopOverBytes
  86
  87
  88 // Source is aligned.  Loop over words until end of buffer.  We
  89 // align the source, rather than the dest, to avoid getting spurious page faults.
  90 //      %edi = dest ptr (unaligned)
  91 //      %esi = source ptr (word aligned)
  92 //      %ecx = length remaining in buffer
  93
  94 LAligned:
  95         movl    $5,%edx                 // if buffer almost exhausted, prepare to copy rest byte-by-byte
  96         cmpl    $4,%ecx                 // enough for at least one word?
  97         jb      LLoopOverBytes
  98
  99
 100 // Loop over words.
 101 //      %edi = dest ptr (unaligned)
 102 //      %esi = source ptr (word aligned)
 103 //      %ecx = length remaining in buffer (>=4)
 104
 105 LLoopOverWords:
 106         movl    (%esi),%eax             // get next 4 bytes of source
 107         subl    $4,%ecx
 108         addl    $4,%esi
 109         movl    %eax,%edx               // make 2 copies of word
 110         movl    %eax,%ebx
 111         notl    %edx                    // use magic word-parallel test for 0s
 112         addl    $0xFEFEFEFF,%ebx
 113         andl    $0x80808080,%edx
 114         testl   %ebx,%edx
 115         jnz     L0Found                 // one of the bytes of %eax is a 0
 116         movl    %eax,(%edi)             // pack 4 bytes into destination
 117         addl    $4,%edi
 118         cmpl    $4,%ecx                 // room in buffer for another word?
 119         jae     LLoopOverWords          // yes
 120
 121         movl    %ecx,%edx               // copy leftovers in byte loop
 122         jmp     LLoopOverBytes
 123
 124 // Found a 0-byte in the word of source.  Store a byte at a time until the 0.
 125 //      %edi = dest ptr (unaligned)
 126 //      %eax = last word of source, known to have a 0-byte
 127
 128 LNextByte:
 129         shrl    $8,%eax                 // next byte
 130 L0Found:
 131         movb    %al,(%edi)              // pack in next byte
 132         incl    %edi
 133         testb   %al,%al                 // 0?
 134         jnz     LNextByte
 135
 136 // Done storing string.
 137 //      %edi = ptr to byte after 0-byte
 138
 139 LDone:
 140         subl    16(%esp),%edi           // subtract original dest ptr to get length stored
 141         decl    %edi                    // don't count the 0-byte
 142         movl    %edi,%eax               // copy to return value
 143 LExit:
 144         popl    %ebx
 145         popl    %esi
 146         popl    %edi
 147         ret
 148
 149 // Buffer filled but 0-byte not found.  We return the length of the source string.
 150 // This is not optimized, as it is an error condition.
 151 //      %edi = dest ptr (ie, 1 past end of buffer)
 152 //      %esi = source ptr (ptr to 1st byte that does not fit)
 153
 154 L0NotFound:
 155         movl    24(%esp),%eax           // reload buffer length
 156         testl   %eax,%eax               // null?
 157         jz      1f                      // yes, cannot store a 0
 158         xorl    %edx,%edx               // get a 0
 159         movb    %dl,-1(%edi)            // store a 0 at end of buffer to delimit string
 160 1:
 161         movzb   (%esi),%edx             // get next byte of source
 162         incl    %esi
 163         incl    %eax
 164         testl   %edx,%edx               // 0?
 165         jnz     1b
 166         decl    %eax                    // don't count the 0-byte
 167         jmp     LExit