]> git.saurik.com Git - apple/libc.git/blob - x86_64/string/strlen.s
Libc-594.9.5.tar.gz
[apple/libc.git] / x86_64 / string / strlen.s
1 /*
2 * Copyright (c) 2005-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23 /*
24 * Strlen, for processors with SSE3 (only SSE2 instructions are actually used).
25 *
26 * Note that all memory references must be aligned, in order to avoid spurious
27 * page faults. Thus we have to load the aligned 16-byte chunk containing the
28 * first byte of the operand, then mask out false 0s that may occur before the
29 * first byte.
30 *
31 * We favor the fall-through (i.e., short operand) path.
32 */
33
34 .text
35 .globl _strlen
36 .align 4, 0x90
//
// size_t strlen(const char *b)
//
// In:       %rdi = b, pointer to a 0-terminated byte string (any alignment)
// Out:      %rax = number of bytes before the first 0-byte
// Clobbers: %rcx, %rdx, %rdi, %xmm0, %xmm1, flags
// Leaf routine: makes no calls and does not touch the stack.
// Every memory read is a 16-byte load from a 16-byte-aligned address.
//
37 _strlen: // size_t strlen(char *b);
38 pxor %xmm0,%xmm0 // %xmm0 = 16 zero bytes to compare against
39 movl %edi,%ecx // copy low half of ptr (only its low 4 bits are used below)
40 movq %rdi,%rdx // %rdx = original ptr, kept for the final length computation
41 andq $(-16),%rdi // round ptr down to a 16-byte boundary
42 orl $(-1),%eax // %eax = 0xFFFFFFFF whatever it held before
43 pcmpeqb (%rdi),%xmm0 // check whole aligned dq for 0s: each 0-byte becomes 0xFF in %xmm0
44 andl $15,%ecx // get #bytes in aligned dq before operand
45 shl %cl,%eax // clear that many low bits: mask for the bytes of aligned dq in operand
46 pmovmskb %xmm0,%ecx // collect mask of 0-bytes (bit i set iff byte i of the dq is 0)
47 andl %eax,%ecx // mask out any 0s that occur before 1st byte; ZF set if none are left
48 jz LEnterLoop // no 0-bytes in operand part (i.e., no 1-bits left), so enter by-16 loop
49
50 // We've found a 0-byte. Reached by fall-through (first block) or from the loop.
51 // %rdi = aligned address of 16-byte block containing the terminating 0-byte
52 // %ecx = compare bit vector (nonzero on both paths); %rdx = original ptr
53
54 LFoundIt:
55 bsf %ecx,%eax // find first 1-bit (i.e., offset of first 0-byte in block); 32-bit write zeroes upper %rax
56 subq %rdx,%rdi // get length to start of 16-byte block while we wait (negative if block starts before ptr)
57 addq %rdi,%rax // add bytes in 16-byte block: %rax = length
58 ret
59
60 // Loop over aligned 16-byte blocks:
61 // on entry at LEnterLoop, %rdi = address of the block that was just checked
62
63 LEnterLoop:
64 pxor %xmm0,%xmm0 // get some 0-bytes again: %xmm0 was overwritten by the first compare
65 addq $16,%rdi // advance ptr to the next aligned block
66 LLoop:
67 movdqa (%rdi),%xmm1 // get next chunk (aligned 16-byte load)
68 addq $16,%rdi // advance ptr past the chunk just loaded
69 pcmpeqb %xmm0,%xmm1 // check for 0s
70 pmovmskb %xmm1,%ecx // collect mask of 0-bytes
71 test %ecx,%ecx // any 0-bytes?
72 jz LLoop // no 0-bytes, so get next dq
73
74 subq $16,%rdi // back up ptr so %rdi = address of the block holding the 0-byte
75 jmp LFoundIt