]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * The contents of this file constitute Original Code as defined in and | |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
11 | * | |
12 | * This Original Code and all software distributed under the License are | |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the | |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
19 | * | |
20 | * @APPLE_LICENSE_HEADER_END@ | |
21 | */ | |
22 | ||
23 | /* | |
24 | * Strlen, for processors with SSE3. | |
25 | * | |
26 | * Note that all memory references must be aligned, in order to avoid spurious | |
27 | * page faults. Thus we have to load the aligned 16-byte chunk containing the | |
28 | * first byte of the operand, then mask out false 0s that may occur before the | |
29 | * first byte. | |
30 | * | |
31 | * We favor the fall-through (ie, short operand) path. | |
32 | */ | |
33 | ||
34 | .text | |
35 | .globl _strlen | |
36 | .align 4, 0x90 | |
37 | _strlen: // size_t strlen(char *b); in: %rdi = b, out: %rax = length; uses %rcx, %rdx, %xmm0, %xmm1 | |
38 | pxor %xmm0,%xmm0 // zero %xmm0: 16 zero bytes to compare against | |
39 | movq %rdi,%rcx // copy ptr (reduced to its low 4 bits below) | |
40 | movq %rdi,%rdx // make another copy: original ptr, kept for the final length subtraction | |
41 | andq $(-16),%rdi // 16-byte align ptr (round down to the enclosing aligned dq) | |
42 | orl $(-1),%eax | |
43 | pcmpeqb (%rdi),%xmm0 // check whole aligned dq (16 bytes) for 0s; %xmm0 now holds the compare result | |
44 | andl $15,%ecx // %ecx = ptr & 15 = #bytes in aligned dq before operand | |
45 | shl %cl,%eax // shift the all-ones %eax (set by orl above) left by that count: mask for the bytes of aligned dq in operand | |
46 | pmovmskb %xmm0,%ecx // collect mask of 0-bytes: one bit per byte of the dq | |
47 | andl %eax,%ecx // mask out any 0s that occur before 1st byte of operand | |
48 | jz LEnterLoop // no 0-bytes (ie, no 1-bits left), so enter by-16 loop | |
49 | ||
50 | // We've found a 0-byte. | |
51 | // %rdi = aligned address of 16-byte block containing the terminating 0-byte | |
52 | // %ecx = compare bit vector (nonzero here); %rdx = original string ptr | |
53 | ||
54 | LFoundIt: | |
55 | bsf %ecx,%eax // find first 1-bit (ie, offset of first 0-byte in block); 32-bit write zero-extends into %rax | |
56 | movq %rdx,%rcx // recover ptr to 1st byte in string | |
57 | addq %rdi,%rax // block address + offset = address of the 0-byte | |
58 | subq %rcx,%rax // subtract address of 1st byte to get string length | |
59 | ret | |
60 | ||
61 | // Loop over aligned 16-byte blocks: | |
62 | // %rdi = address of previous (already checked) block | |
63 | ||
64 | LEnterLoop: | |
65 | pxor %xmm0,%xmm0 // re-zero %xmm0: the pcmpeqb above overwrote it with the compare result | |
66 | addq $16,%rdi // advance ptr to the first unchecked block | |
67 | LLoop: | |
68 | movdqa (%rdi),%xmm1 // get next chunk (aligned load: %rdi stays a multiple of 16) | |
69 | addq $16,%rdi | |
70 | pcmpeqb %xmm0,%xmm1 // check for 0s: compare each byte of the chunk with 0 | |
71 | pmovmskb %xmm1,%ecx // collect mask of 0-bytes | |
72 | test %ecx,%ecx // any 0-bytes? | |
73 | jz LLoop // no 0-bytes, so get next dq | |
74 | ||
75 | subq $16,%rdi // back up ptr: the loop already advanced %rdi past the block holding the 0-byte | |
76 | jmp LFoundIt |