/*
 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*
 * Strlen, for processors with SSE3.
 *
 * Note that all memory references must be aligned, in order to avoid spurious
 * page faults. Thus we have to load the aligned 16-byte chunk containing the
 * first byte of the operand, then mask out false 0s that may occur before the
 * first byte.
 *
 * We favor the fall-through (ie, short operand) path.
 */

//-----------------------------------------------------------------------
// size_t strlen(const char *s)
//
// Syntax:   AT&T, Mach-O naming (leading underscore, L-prefixed local labels)
// ABI:      System V AMD64
// In:       %rdi = s (pointer to a 0-terminated byte string)
// Out:      %rax = number of bytes preceding the first 0-byte
// Clobbers: %rcx, %rdx, %rdi, %xmm0, %xmm1, flags
// Stack:    none (leaf routine)
//
// Every load is a 16-byte access to a 16-byte-aligned address, so a load
// never straddles a page boundary.  The first load therefore may cover
// bytes that lie before the operand; any 0-bytes found there are masked
// off before the result is examined.
//
// The short-string case is the fall-through path.
//-----------------------------------------------------------------------

        .text
        .globl  _strlen
        .p2align 4, 0x90
_strlen:                                // size_t strlen(char *b);
        pxor    %xmm0,%xmm0             // %xmm0 = 16 zero bytes to compare against
        movq    %rdi,%rcx               // copy ptr; low 4 bits give the misalignment
        movq    %rdi,%rdx               // keep original ptr for the final subtraction
        andq    $(-16),%rdi             // round ptr down to its 16-byte block
        orl     $(-1),%eax              // all-ones seed for the "inside operand" mask
        pcmpeqb (%rdi),%xmm0            // lane = 0xFF wherever the block holds a 0-byte
        andl    $15,%ecx                // # bytes of the block that precede the operand
        shl     %cl,%eax                // clear the mask bits for those leading bytes
        pmovmskb %xmm0,%ecx             // one bit per byte: 1 = 0-byte
        andl    %eax,%ecx               // discard 0-bytes located before the operand
        jz      LEnterLoop              // no terminator in first block: scan by 16s

// Terminator located.
//      %rdi = aligned address of the 16-byte block holding the terminating 0-byte
//      %rdx = original string pointer
//      %ecx = compare bit vector (non-zero)

LFoundIt:
        bsf     %ecx,%eax               // index of first 1-bit = offset of the 0-byte
        addq    %rdi,%rax               // absolute address of the 0-byte
        subq    %rdx,%rax               // minus start of string = length
        ret

// Scan subsequent aligned 16-byte blocks.
//      On entry %rdi = address of the block that was just examined.
//      Inside the loop %rdi is post-incremented, so on exit it points one
//      block past the block that contains the terminator.

LEnterLoop:
        pxor    %xmm0,%xmm0             // re-zero: %xmm0 now holds the compare result
        addq    $16,%rdi                // step to the next block
LLoop:
        movdqa  (%rdi),%xmm1            // load next aligned block
        addq    $16,%rdi                // post-increment; without this the loop never advances
        pcmpeqb %xmm0,%xmm1             // lane = 0xFF wherever the block holds a 0-byte
        pmovmskb %xmm1,%ecx             // one bit per byte: 1 = 0-byte
        test    %ecx,%ecx               // any 0-bytes?
        jz      LLoop                   // none: keep scanning

        subq    $16,%rdi                // undo the post-increment: %rdi = block with the 0-byte
        jmp     LFoundIt                // shared tail computes the length and returns