git.saurik.com Git - apple/libc.git/blob - x86_64/string/strcpy.s
commit d667b05f173a5224b5bd85e739ecf35e7b79c84f
/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */


// ***************
// * S T R C P Y *
// ***************
//
// char *strcpy(char *dst, const char *src);
//
// We optimize the copy by moving 16 bytes at a time with vector
// instructions. This introduces a complication: if we blindly did
// vector loads and stores until finding a 0, we might get a spurious
// page fault by touching bytes past it. To avoid this, we never do a
// load that crosses a page boundary, and never store a byte we don't
// have to.
//
// We align the destination, because unaligned vector stores are slow.
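//
// For reference, a rough C sketch of the same strategy (illustrative
// only, not part of the build; it assumes the 4096-byte pages and
// 16-byte vectors hard-coded below, needs <stdint.h> and <string.h>,
// and uses memchr/memcpy to stand in for the pcmpeqb/movdqa work):
//
//     char *strcpy(char *dst, const char *src) {
//         char *const ret = dst;
//         while ((uintptr_t)dst & 15)          // byte loop to align dest
//             if ((*dst++ = *src++) == '\0')
//                 return ret;
//         for (;;) {
//             // whole 16-byte chunks left before src's page ends
//             size_t chunks = (4096 - ((uintptr_t)src & 4095)) >> 4;
//             while (chunks--) {
//                 if (memchr(src, '\0', 16)) { // 0 in this chunk:
//                     while ((*dst++ = *src++) != '\0')
//                         ;                    // copy through the 0
//                     return ret;
//                 }
//                 memcpy(dst, src, 16);        // aligned 16-byte store
//                 dst += 16; src += 16;
//             }
//             for (int i = 0; i < 16; i++)     // cross the page bytewise
//                 if ((*dst++ = *src++) == '\0')
//                     return ret;
//         }
//     }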

        .text
        .globl  _strcpy

        .align  4
_strcpy:                                // char *strcpy(char *dst, const char *src);
        movq    %rdi,%rcx               // preserve dest ptr so we can return it
        movl    %edi,%edx               // copy low 4 bytes of dest ptr
        negl    %edx
        andl    $15,%edx                // how many bytes to align dest ptr?
        jnz     LLoopOverBytes          // not aligned, so copy bytes until it is
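// e.g. a dest ptr ending in 0xd gives %edx = (-0xd) & 15 = 3, so three
// bytes are copied to reach the next 16-byte boundary; %edx == 0 means
// the destination is already aligned and we fall through to LNextChunk.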


// In order to avoid spurious page faults, we loop until nearing the end of
// the source page. Then we revert to a byte-by-byte loop for 16 bytes until
// the page is crossed, then resume the vector loop.
// %rsi = source ptr (unaligned)
// %rdi = dest ptr (aligned)
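//
// e.g. with a source offset of 0xff9 into its page, 4096 - 0xff9 = 7
// bytes remain and 7 >> 4 = 0 chunks, so we cross the page bytewise;
// at offset 0xf00, 256 bytes remain, giving 16 full vector chunks first.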

LNextChunk:
        movl    %esi,%eax               // copy low 4 bytes of source ptr
        movl    $4096,%edx
        andl    $4095,%eax              // get offset into source page
        subl    %eax,%edx               // get #bytes remaining in source page
        shrl    $4,%edx                 // get #chunks till end of page
        jnz     LLoopOverChunks         // enter vector loop
        movl    $16,%edx                // move 16 bytes to cross page but keep dest aligned
        jmp     LLoopOverBytes


// Loop over bytes.
// %rsi = source ptr
// %rdi = dest ptr
// %edx = byte count

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverBytes:
        movzb   (%rsi),%eax             // get source byte
        addq    $1,%rsi
        movb    %al,(%rdi)              // pack into dest
        addq    $1,%rdi
        testl   %eax,%eax               // 0?
        jz      LDone                   // yes, we're done
        subl    $1,%edx                 // more to go?
        jnz     LLoopOverBytes

        jmp     LNextChunk              // we've reached the end of the page


// Loop over 16-byte chunks.
// %rsi = source ptr (unaligned)
// %rdi = dest ptr (aligned)
// %edx = chunk count
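//
// pcmpeqb sets each byte of %xmm0 to 0xFF where the corresponding source
// byte is 0, and pmovmskb gathers those high bits into a 16-bit mask in
// %eax: e.g. a 0 at byte index 3 yields %eax == 0x0008, and bsf in
// LFound0 below recovers the index 3.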

        .align  4,0x90                  // align inner loops to optimize I-fetch
LLoopOverChunks:
        movdqu  (%rsi),%xmm1            // get source
        pxor    %xmm0,%xmm0             // get some 0s
        addq    $16,%rsi
        pcmpeqb %xmm1,%xmm0             // compare source to 0s
        pmovmskb %xmm0,%eax             // get result mask for 0 check
        testl   %eax,%eax               // any 0s?
        jnz     LFound0                 // yes, exit loop
        movdqa  %xmm1,(%rdi)            // no 0s so do aligned store into destination
        addq    $16,%rdi
        subl    $1,%edx                 // more to go?
        jnz     LLoopOverChunks

        movl    $16,%edx                // move 16 bytes
        jmp     LLoopOverBytes          // cross page but keep dest aligned


// Found a zero in the vector. Figure out where it is, and store the bytes
// up to and including it.
// %rdi = dest ptr (aligned)
// %eax = result mask
// %xmm1 = source vector
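//
// %edx = (index of first 0) + 1 is decomposed into 16/8/4-byte stores
// plus up to three single bytes: e.g. a 0 at index 9 gives %edx = 10 =
// 8 + 2, so one 8-byte store, a shift down by 8, then two byte stores;
// %edx = 16 (0 in the last byte) stores the whole vector at once.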

LFound0:
        bsf     %eax,%edx               // find first 0
        addl    $1,%edx                 // we need to store the 0 too
        test    $16,%dl                 // was the 0 the last byte?
        jz      8f                      // no
        movdqa  %xmm1,(%rdi)            // yes, store entire vector
        jmp     LDone
8:
        test    $8,%dl                  // 8-byte store required?
        jz      4f                      // no
        movq    %xmm1,(%rdi)            // pack in 8 low bytes
        psrldq  $8,%xmm1                // then shift vector down 8 bytes
        addq    $8,%rdi
4:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm1,(%rdi)            // pack in 4 low bytes
        psrldq  $4,%xmm1                // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%edx                 // more to go?
        jz      LDone                   // no
        movd    %xmm1,%eax              // move remainders out of vector into %eax
1:                                      // loop on up to three bytes
        movb    %al,(%rdi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        addq    $1,%rdi
        dec     %edx
        jnz     1b

LDone:
        movq    %rcx,%rax               // original dest ptr is return value
        ret