]>
Commit | Line | Data |
---|---|---|
1c79356b A |
1 | /* |
2 | * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
e5568f75 A |
6 | * The contents of this file constitute Original Code as defined in and |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
1c79356b | 11 | * |
e5568f75 A |
12 | * This Original Code and all software distributed under the License are |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
1c79356b A |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
e5568f75 A |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
1c79356b A |
19 | * |
20 | * @APPLE_LICENSE_HEADER_END@ | |
21 | */ | |
22 | #define STANDALONE 0 | |
23 | ||
24 | #if STANDALONE | |
25 | #include "asm.h" | |
26 | #include "assym.h" | |
27 | #include "proc_reg.h" /* For CACHE_LINE_SIZE */ | |
28 | ||
29 | #else | |
30 | ||
31 | #include <mach/ppc/asm.h> | |
32 | #if 0 | |
33 | /* #include <assym.h> */ | |
34 | #include <ppc/proc_reg.h> /* For CACHE_LINE_SIZE */ | |
35 | #endif 0 | |
36 | #endif | |
37 | ||
38 | /* | |
39 | * Reg 3 - Pointer to data | |
40 | * Reg 4 - Length of data | |
41 | * Reg 5 - Accumulated sum value | |
42 | * Reg 6 - Starting on odd boundary flag (relative to byte 0 of the checksummed data) | |
43 | */ | |
44 | ||
ENTRY(xsum_assym, TAG_NO_FRAME_USED)

/*
 * xsum_assym - 16-bit one's-complement sum of a byte range.
 *
 *	In:	r3  = pointer to data
 *		r4  = length of data in bytes (treated as unsigned)
 *		r5  = previously accumulated sum value
 *		r6  = "starting on odd boundary" flag; non-zero requests a
 *		      byte swap of this buffer's sum before it is combined
 *	Out:	r3  = r5 combined with the sum of the data, folded to 16 bits
 *	Uses:	r4-r12, cr0, XER[CA]; no stack frame, no memory writes
 *
 * Register roles inside the routine:
 *	r8  = running 32-bit sum		r11 = swap-bytes-at-end flag
 *	r12 = saved incoming sum		r5-r7 = load scratch
 *	r7/r10 = step sizes, r9 = scratch / prefetch offset
 *
 * INVARIANT: XER[CA] carries the end-around carry from one adde to the
 * next.  Between the addic below and Ldone, only instructions that leave
 * CA alone may be used (cmpi/cmpli, add, subf, addi, and., andi., li,
 * slwi, loads, branches).  Do not introduce addic/addc/subfc/sraw here.
 */

		mr		r11, r6				; r11 = swap flag as passed in
		li		r8, 0				; sum = 0
		li		r10, 0x1f			; mask: offset within a 32-byte line
		li		r7, 1
		addic	r7, r7, 0			; 1 + 0 cannot carry, so this clears XER[CA]
		mr		r12, r5				; keep the passed-in sum for the very end

/*
 * Sum bytes before the first 32-byte (cache line) boundary
 */

		cmpi	cr0,0,r4,0			; nothing to sum?
		beq		Ldone

		and.	r9, r3, r10
		beq		Laligned32			; already 32-byte aligned

		andi.	r9, r3, 0x3
		beq		Laligned4			; 4-byte aligned

		andi.	r9, r3, 0x1
		beq		Laligned2			; 2-byte aligned

		; Odd start address: take the first byte as the low half of a
		; 16-bit word and byte-swap the folded result at the end.
		; NOTE(review): this overwrites the incoming r6 flag instead of
		; toggling it - confirm with the caller whether an odd address
		; together with a set flag is expected to cancel the swap.
		li		r11, 1
		lbz		r8, 0(r3)
		add		r3, r3, r7			; r7 == 1 here
		subf.	r4, r7, r4
		beq		Ldone

Laligned2:
		cmpli	cr0,0,r4,2			; fewer than two bytes left - wrap up
		blt		Lleftovers
		andi.	r9, r3, 0x3			; already on a 4-byte boundary?
		beq		Laligned4
		lhz		r5, 0(r3)			; add one halfword to reach 4-byte alignment
		adde	r8, r8, r5
		li		r7, 2
		add		r3, r3, r7
		subf.	r4, r7, r4
		beq		Ldone

/*
 * Add longwords up to the 32-byte boundary
 */

Laligned4:
		li		r7, 4
Lloop4:
		cmpli	cr0,0,r4,4
		blt		Lleftovers
		and.	r9, r3, r10			; r10 is still 0x1f
		beq		Laligned32
		lwz		r5, 0(r3)
		adde	r8, r8, r5
		add		r3, r3, r7
		subf.	r4, r7, r4
		bne		Lloop4
		b		Ldone

/*
 * We're aligned on a 32-byte boundary now - add 8 longwords per pass
 * until the remaining length is less than 32.
 */
Laligned32:
		cmpli	cr0,0,r4,32			; full-width test: lengths >= 64KB count too
		blt		Lleftovers

Lmainloop:
		li		r9, 64				; prefetch offset: two lines ahead
		li		r10, 32				; bytes per pass / one line ahead
		cmpli	cr0,0,r4,64
		blt		Lnopretouch
		dcbt	r3, r10				; touch one cache line ahead
Lnopretouch:
		lwz		r5, 0(r3)			; first word of the line is loaded ahead of Lloop

/*
 * Main loop.  Loads are interleaved with the adde chain so that loads
 * and adds can overlap; the word at offset 0 is always preloaded in r5.
 */

Lloop:
		cmpli	cr0,0,r4,96
		blt		Lnotouch
		dcbt	r3, r9				; touch two cache lines ahead
Lnotouch:
		adde	r8, r8, r5			; word 0
		lwz		r5, 4(r3)
		lwz		r6, 8(r3)
		lwz		r7, 12(r3)
		adde	r8, r8, r5			; word 1
		lwz		r5, 16(r3)
		adde	r8, r8, r6			; word 2
		lwz		r6, 20(r3)
		adde	r8, r8, r7			; word 3
		lwz		r7, 24(r3)
		adde	r8, r8, r5			; word 4
		lwz		r5, 28(r3)
		add		r3, r3, r10
		adde	r8, r8, r6			; word 5
		adde	r8, r8, r7			; word 6
		adde	r8, r8, r5			; word 7
		subf	r4, r10, r4
		; The exit test must look at the whole length register.  Testing
		; only bits 5..15 would leave the loop early whenever those bits
		; happen to be zero while 64KB or more is still pending.
		cmpli	cr0,0,r4,32
		blt		Lleftovers
		lwz		r5, 0(r3)
		b		Lloop

/*
 * Handle whatever bytes are left.  Every path into here arrives with
 * fewer than 32 bytes remaining (possibly zero).
 */

Lleftovers:
		cmpli	cr0,0,r4,4
		blt		Ltwoormore
		li		r10, 4

Lfourloop:
		lwz		r5, 0(r3)
		adde	r8, r8, r5
		add		r3, r3, r10
		subf	r4, r10, r4
		cmpli	cr0,0,r4,4
		bge		Lfourloop

Ltwoormore:
		cmpli	cr0,0,r4,2
		blt		Loneleft
		lhz		r5, 0(r3)
		adde	r8, r8, r5
		addi	r3, r3, 2
		subi	r4, r4, 2

Loneleft:
		cmpi	cr0,0,r4,0
		beq		Ldone
		lbz		r5, 0(r3)
		slwi	r5, r5, 8			; trailing byte is the high half of its 16-bit word
		adde	r8, r8, r5

/*
 * Wrap the longword around, adding the two 16-bit portions
 * to each other along with any previous and subsequent carries.
 */
Ldone:
		addze	r8, r8				; add the pending carry
		addze	r8, r8				; and again, in case that add carried out

		srwi	r6, r8, 16			; r6 = high 16 bits
		clrlwi	r8, r8, 16			; r8 = low 16 bits
		add		r8, r8, r6			; at most 0x1fffe

		srwi	r6, r8, 16			; fold once more for the possible 17th bit
		clrlwi	r8, r8, 16
		add		r3, r8, r6			; r3 = 16-bit sum of this buffer

		cmpi	cr0,0,r11,0			; byte swap requested?
		beq		Ldontswap

/*
 * The sum was accumulated one byte out of phase, so exchange the two
 * bytes of the 16-bit result.
 */
		slwi	r8, r3, 8			; low byte -> high byte
		clrlwi	r8, r8, 16			; drop what was shifted past bit 15
		srwi	r3, r3, 8			; high byte -> low byte
		or		r3, r8, r3

Ldontswap:
		addc	r3, r3, r12			; add in the passed-in checksum ...
		addze	r3, r3				; ... keeping a carry out of bit 31 (end-around)

		srwi	r6, r3, 16			; fold the high 16 bits into the low 16
		clrlwi	r3, r3, 16
		add		r3, r3, r6

		srwi	r6, r3, 16			; and once more in case that carried
		clrlwi	r3, r3, 16
		add		r3, r3, r6
		blr
248 | ||
249 |