/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#define kShort  11
#define cr1_gt  5               // bit 1 of cr1

/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 * r3 - Pointer to data
 * r4 - Length of data
 * r5 - Accumulated sum value
 * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set on the low 32 bits of the sum).
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it
 * impractical for now.
 */
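
/*
 * For reference, a minimal C sketch of the same algorithm (a hypothetical
 * helper, not part of the kernel sources; byte order and the odd-address
 * swap handled below are ignored): accumulate the 16-bit words into a wider
 * register, then fold the carries back down to 16 bits.
 *
 *	unsigned short
 *	ref_cksum(const unsigned short *p, int len, unsigned short xsum)
 *	{
 *	    unsigned long sum = xsum;       // wider than 16 bits, to collect carries
 *	    while (len > 1) {               // sum the data as 16-bit words
 *	        sum += *p++;
 *	        len -= 2;
 *	    }
 *	    if (len)                        // odd trailing byte fills the high lane
 *	        sum += (unsigned long)(*(const unsigned char *)p) << 8;
 *	    while (sum >> 16)               // fold until the sum fits in 16 bits
 *	        sum = (sum & 0xFFFF) + (sum >> 16);
 *	    return (unsigned short)sum;
 *	}
 */
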
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit
        .text
        .align  5
_xsum_assym:
        cmplwi  cr0,r4,kShort   ; too short to word align?
        rlwinm  r2,r3,0,0x3     ; get byte offset in word
        dcbt    0,r3            ; touch in 1st cache line
        cmpwi   cr6,r2,0        ; is address word aligned?
        ble     cr0,Lshort      ; skip if too short to bother aligning

        subfic  r0,r2,4         ; get #bytes in partial word
        cmplwi  cr1,r6,0        ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0         ; turn off carry
        beq     cr6,Laligned    ; skip if already word aligned (r2==0 if aligned)

; Partial word at start: zero filled on left, it becomes initial checksum.

        rlwinm  r3,r3,0,0,29    ; word align address
        mtcrf   0x01,r2         ; move byte offset to cr7
        lwz     r6,0(r3)        ; get partial word
        li      r7,-1           ; start of mask for partial fill
        slwi    r8,r2,3         ; multiply byte offset by 8
        sub     r4,r4,r0        ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8        ; get mask for bytes to keep in partial word
        addi    r3,r3,4         ; point to next word of input
        and     r2,r6,r7        ; zero fill on left
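; For example (hypothetical values, for illustration): a byte offset of 3 gives
; r8 = 24, so the mask is 0xFFFFFFFF >> 24 = 0x000000FF and only the single
; wanted byte of the big-endian word survives the "and" above.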
81 | ||
82 | ; Address is now word aligned. Prepare for inner loop over 32-byte chunks. | |
83 | ; r2 = initial checksum | |
84 | ; r3 = word aligned address | |
85 | ; r4 = length remaining | |
86 | ; r5 = accumulated sum parameter | |
87 | ; carry = off | |
88 | ; cr1_gt = "starting on odd address" flag | |
89 | ||
90 | Laligned: | |
91 | srwi. r0,r4,5 ; get count of 32-byte chunks | |
92 | mtcrf 0x02,r4 ; move residual length to cr6 and cr7 | |
93 | mtcrf 0x01,r4 | |
94 | beq cr0,Lleftovers ; no chunks | |
95 | ||
96 | mtctr r0 ; set up loop count | |
97 | li r4,32 ; offset to next chunk | |
98 | _xsum_nop_if_32bit: | |
99 | b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine) | |
100 | dcbt r4,r3 ; touch in 2nd cache line | |
101 | li r0,96 ; get touch offset | |
102 | b LInnerLoop32 ; enter 32-bit loop | |
103 | ||
104 | ; Inner loop for 32-bit machines. | |
105 | ||
106 | .align 4 | |
107 | LInnerLoop32: | |
108 | lwz r4,0(r3) | |
109 | lwz r6,4(r3) | |
110 | lwz r7,8(r3) | |
111 | lwz r8,12(r3) | |
112 | adde r2,r2,r4 | |
113 | lwz r9,16(r3) | |
114 | adde r2,r2,r6 | |
115 | lwz r10,20(r3) | |
116 | adde r2,r2,r7 | |
117 | lwz r11,24(r3) | |
118 | adde r2,r2,r8 | |
119 | lwz r12,28(r3) | |
120 | adde r2,r2,r9 | |
121 | dcbt r3,r0 | |
122 | adde r2,r2,r10 | |
123 | addi r3,r3,32 | |
124 | adde r2,r2,r11 | |
125 | adde r2,r2,r12 | |
126 | bdnz+ LInnerLoop32 | |
127 | ||
128 | ; Handle leftover bytes. | |
129 | ; r2 = checksum so far | |
130 | ; r3 = word aligned address | |
131 | ; r5 = accumulated sum parameter | |
132 | ; carry = live | |
133 | ; cr1_gt = "starting on odd address" flag | |
134 | ; cr6,cr7 = residual length | |
135 | ||
136 | Lleftovers: | |
137 | bf 27,Lleftover8 ; test 0x10 bit of residual length | |
138 | lwz r4,0(r3) | |
139 | lwz r6,4(r3) | |
140 | lwz r7,8(r3) | |
141 | lwz r8,12(r3) | |
142 | addi r3,r3,16 | |
143 | adde r2,r2,r4 | |
144 | adde r2,r2,r6 | |
145 | adde r2,r2,r7 | |
146 | adde r2,r2,r8 | |
147 | Lleftover8: | |
148 | bf 28,Lleftover4 | |
149 | lwz r4,0(r3) | |
150 | lwz r6,4(r3) | |
151 | addi r3,r3,8 | |
152 | adde r2,r2,r4 | |
153 | adde r2,r2,r6 | |
154 | Lleftover4: | |
155 | bf 29,Lleftover2 | |
156 | lwz r4,0(r3) | |
157 | addi r3,r3,4 | |
158 | adde r2,r2,r4 | |
159 | Lleftover2: | |
160 | bf 30,Lleftover1 | |
161 | lhz r4,0(r3) | |
162 | addi r3,r3,2 | |
163 | adde r2,r2,r4 | |
164 | Lleftover1: | |
165 | bf 31,Lwrapup | |
166 | lbz r4,0(r3) | |
167 | slwi r4,r4,8 ; shift last byte into proper lane | |
168 | adde r2,r2,r4 | |
169 | ||
170 | ; All data bytes checksummed. Wrap up. | |
171 | ; r2 = checksum so far (word parallel) | |
172 | ; r5 = accumulated sum parameter | |
173 | ; carry = live | |
174 | ; cr1_gt = "starting on odd address" flag | |
175 | ||
176 | Lwrapup: | |
177 | addze r2,r2 ; add in last carry | |
178 | addze r2,r2 ; in case the "addze" carries | |
179 | Lwrapupx: ; here from short-operand case, with xer(ca) undefined | |
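; (Folding a 32-bit sum once yields at most 0xFFFF + 0xFFFF = 0x1FFFE, so the
; second fold below always brings the result down to 16 bits.)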
        srwi    r6,r2,16        ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF  ; lower half
        add     r2,r6,r7        ; add them together
        srwi    r6,r2,16        ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
        bf      cr1_gt,Lswapped ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00  ; bottom to top
        or      r2,r6,r7        ; rejoin

; Finally, add in checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5        ; add passed-in checksum
        srwi    r6,r2,16        ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF  ; lower half
        add     r2,r6,r7        ; add them together
        srwi    r6,r2,16        ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7        ; steer result into r3
        blr

; Handle short operands. Do a halfword at a time.
; r3 = address
; r4 = length (<= kShort)
; r5 = accumulated sum parameter
; r6 = "starting on odd byte" flag

Lshort:
        cmpwi   cr6,r4,2        ; at least two bytes?
        andi.   r0,r4,1         ; odd length?
        li      r2,0            ; initialize checksum
        cmplwi  cr1,r6,0        ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2     ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4        ; two more bytes (after we decrement)?
        lhz     r7,0(r3)
        subi    r4,r4,2
        addi    r3,r3,2
        add     r2,r2,r7        ; note no need for "adde"
        bge     cr6,Lshort1     ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx        ; no byte at end, proceed to checkout with carry undefined
        lbz     r7,0(r3)
        slwi    r7,r7,8         ; shift last byte into proper lane
        add     r2,r2,r7
        b       Lwrapupx

; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
; four parallel 32-bit accumulators, which carry into the upper half naturally so we
; do not have to use "adde", which serializes on the carry bit. Note that we cannot
; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
; r2 = checksum so far (ie, the zero-filled partial first word)
; r3 = word aligned address
; r5 = accumulated sum parameter
; ctr = number of 32-byte chunks of input
; carry = unused in this code
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

L64BitPath:
        stw     r13,-4(r1)      ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128          ; to touch next line
        li      r13,0           ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0           ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)        ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2          ; just copy incoming partial word into one of the accumulators
        li      r15,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r16,0
        li      r0,256          ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)      ; load last word of previous chunk
        addi    r3,r3,32        ; skip past the chunk
        bdnz++  LInnerLoop64    ; enter loop if another chunk to go

        b       LAddLastChunk   ; only one chunk

; Inner loop for 64-bit processors. This loop is scheduled for the 970.
; It is pipelined (loads are one iteration ahead of adds), and unrolled.
; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.

        .align  5
LInnerLoop64:                   ; 64 bytes/iteration
        add     r13,r13,r4      ; cycle 1
        add     r14,r14,r6
        dcbt    r3,r0           ; touch in 2 lines ahead
        lwz     r4,0(r3)

        add     r15,r15,r7      ; cycle 2, etc
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        add     r16,r16,r8

        lwz     r8,12(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,16(r3)

        add     r15,r15,r11
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        add     r16,r16,r12
        bdz--   LEarlyExit      ; early exit if no more chunks

        lwz     r12,28(r3)
        add     r13,r13,r4
        add     r14,r14,r6
        lwz     r4,32(r3)

        add     r15,r15,r7
        lwz     r6,36(r3)
        lwz     r7,40(r3)
        add     r16,r16,r8

        lwz     r8,44(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,48(r3)

        add     r15,r15,r11
        lwz     r10,52(r3)
        lwz     r11,56(r3)
        add     r16,r16,r12

        nop                     ; position last load in 2nd dispatch slot
        lwz     r12,60(r3)
        addi    r3,r3,64
        bdnz++  LInnerLoop64

        b       LAddLastChunk

; Add in the last 32-byte chunk, and any leftover bytes.
; r3 = word aligned address of next byte of data
; r5 = accumulated sum parameter
; r13-r16 = the four accumulators
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

LEarlyExit:                     ; here from middle of inner loop
        lwz     r12,28(r3)      ; load last word of last chunk
        addi    r3,r3,32
LAddLastChunk:                  ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4      ; add in last chunk
        add     r14,r14,r6      ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12

; Handle leftover bytes, if any.

        bf      27,Lleft1       ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        add     r13,r13,r4
        add     r14,r14,r6
        add     r15,r15,r7
        add     r16,r16,r8
Lleft1:
        bf      28,Lleft2
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        add     r13,r13,r4
        add     r14,r14,r6
Lleft2:
        bf      29,Lleft3
        lwz     r4,0(r3)
        addi    r3,r3,4
        add     r14,r14,r4
Lleft3:
        bf      30,Lleft4
        lhz     r4,0(r3)
        addi    r3,r3,2
        add     r15,r15,r4
Lleft4:
        bf      31,Lleft5
        lbz     r4,0(r3)
        slwi    r4,r4,8         ; shift last byte into proper lane
        add     r16,r16,r4

; All data bytes have been checksummed. Now we must add together the four
; accumulators and restore the regs from the red zone.
; r3 = word aligned address of next byte of data
; r5 = accumulated sum parameter
; r13-r16 = the four accumulators
; carry = not used so far
; cr1_gt = "starting on odd address" flag

Lleft5:
        add     r8,r13,r14      ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)      ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9        ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32        ; get upper half of 64-bit sum
        addc    r2,r7,r8        ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup         ; merge r2, r5, and carry into a 16-bit checksum