]>
Commit | Line | Data |
---|---|---|
8e029c65 | 1 | /* |
224c7076 | 2 | * Copyright (c) 2005-2007 Apple Inc. All rights reserved. |
8e029c65 A |
3 | * |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * The contents of this file constitute Original Code as defined in and | |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
11 | * | |
12 | * This Original Code and all software distributed under the License are | |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the | |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
19 | * | |
20 | * @APPLE_LICENSE_HEADER_END@ | |
21 | */ | |
22 | ||
23 | /* | |
24 | * Strlen, for processors with SSE3. | |
25 | * | |
26 | * Note that all memory references must be aligned, in order to avoid spurious | |
27 | * page faults. Thus we have to load the aligned 16-byte chunk containing the | |
28 | * first byte of the operand, then mask out false 0s that may occur before the | |
29 | * first byte. | |
30 | * | |
31 | * We favor the fall-through (ie, short operand) path. | |
32 | */ | |
33 | ||
// -----------------------------------------------------------------------
// size_t strlen(char *b)
//
//   In:       %rdi = b, address of the first byte of the string
//   Out:      %rax = number of bytes that precede the terminating 0-byte
//   Clobbers: %rcx, %rdx, %rdi, %xmm0, %xmm1, flags
//
// Memory is read only as whole, 16-byte-aligned blocks.  The first block
// can therefore contain bytes located before b; 0-bytes found among those
// are masked off before the result is inspected.  The short-string case
// is the straight-line (fall-through) path.
// -----------------------------------------------------------------------
        .text
        .globl  _strlen
        .align  4, 0x90                 // pad up to the entry point with 0x90 (NOP)
_strlen:
        movq    %rdi, %rdx              // %rdx = b, kept for the final length computation
        movl    %edi, %ecx              // low bits of b suffice to find its offset in the block
        pxor    %xmm0, %xmm0            // %xmm0 = 16 zero bytes to compare against
        andq    $-16, %rdi              // %rdi = aligned 16-byte block holding b[0]
        andl    $15, %ecx               // %ecx = number of block bytes that precede b[0]
        pcmpeqb (%rdi), %xmm0           // byte i of %xmm0 = 0xFF where block byte i is 0
        orl     $-1, %eax               // %eax = all ones ...
        shll    %cl, %eax               // ... then clear one low bit per byte preceding b[0]
        pmovmskb %xmm0, %ecx            // %ecx bit i = 1 iff block byte i is 0
        andl    %eax, %ecx              // discard 0-bytes located before b[0]
        jz      LScanSetup              // nothing left in this block: scan the following ones

// Terminator located.
//   %rdi = aligned address of the 16-byte block that contains it
//   %ecx = nonzero mask, one bit per 0-byte in that block
//   %rdx = b

LFirstNul:
        bsfl    %ecx, %eax              // %eax = offset of the first 0-byte within the block
                                        // (32-bit write also clears the upper half of %rax)
        subq    %rdx, %rdi              // %rdi = block address - b (negative offset is fine)
        addq    %rdi, %rax              // length = (block - b) + offset inside the block
        ret

// No terminator in the first block: walk the following aligned blocks.
//   %rdi = address of the block examined most recently

LScanSetup:
        pxor    %xmm0, %xmm0            // the first compare overwrote %xmm0; zero it again
LScanNext:
        addq    $16, %rdi               // step to the next 16-byte block
        movdqa  (%rdi), %xmm1           // aligned load of that block
        pcmpeqb %xmm0, %xmm1            // 0xFF in every byte position that holds 0
        pmovmskb %xmm1, %ecx            // one mask bit per byte
        testl   %ecx, %ecx              // any 0-byte in this block?
        jz      LScanNext               // no: keep scanning

        jmp     LFirstNul               // yes: %rdi already addresses the block holding it