/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
91447636 A |
22 | |
23 | #define kShort 11 | |
24 | #define cr1_gt 5 // bit 1 of cr1 | |
1c79356b A |
25 | |
/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 *	r3 - Pointer to data
 *	r4 - Length of data
 *	r5 - Accumulated sum value
 *	r6 - "Starting on odd address" flag (relative to byte 0 of the checksumed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even.  Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers.  1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC.  Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits.  On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit.  On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64-bits into a 16-bit sum at the end.  We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set on the low 32-bits of the sum.)
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably make it impractical
 * for now.
 */
52 | .globl _xsum_assym | |
53 | .globl _xsum_nop_if_32bit | |
54 | .text | |
55 | .align 5 | |
56 | _xsum_assym: | |
57 | cmplwi cr0,r4,kShort ; too short to word align? | |
58 | rlwinm r2,r3,0,0x3 ; get byte offset in word | |
59 | dcbt 0,r3 ; touch in 1st cache line | |
60 | cmpwi cr6,r2,0 ; is address word aligned? | |
61 | ble cr0,Lshort ; skip if too short to bother aligning | |
62 | ||
63 | subfic r0,r2,4 ; get #bytes in partial word | |
64 | cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set | |
65 | addic r0,r0,0 ; turn off carry | |
66 | beq cr6,Laligned ; skip if already word aligned (r2==0 if aligned) | |
67 | ||
68 | ; Partial word at start: zero filled on left, it becomes initial checksum. | |
69 | ||
70 | rlwinm r3,r3,0,0,29 ; word align address | |
71 | mtcrf 0x01,r2 ; move byte offset to cr7 | |
72 | lwz r6,0(r3) ; get partial word | |
73 | li r7,-1 ; start of mask for partial fill | |
74 | slwi r8,r2,3 ; multiply byte offset by 8 | |
75 | sub r4,r4,r0 ; adjust length for bytes in partial word | |
76 | crxor cr1_gt,31,cr1_gt; set flag if byte-lane swap will be necessary | |
77 | srw r7,r7,r8 ; get mask for bytes to keep in partial word | |
78 | addi r3,r3,4 ; point to next word of input | |
79 | and r2,r6,r7 ; zero fill on left | |
80 | ||
81 | ; Address is now word aligned. Prepare for inner loop over 32-byte chunks. | |
82 | ; r2 = initial checksum | |
83 | ; r3 = word aligned address | |
84 | ; r4 = length remaining | |
85 | ; r5 = accumulated sum parameter | |
86 | ; carry = off | |
87 | ; cr1_gt = "starting on odd address" flag | |
88 | ||
89 | Laligned: | |
90 | srwi. r0,r4,5 ; get count of 32-byte chunks | |
91 | mtcrf 0x02,r4 ; move residual length to cr6 and cr7 | |
92 | mtcrf 0x01,r4 | |
93 | beq cr0,Lleftovers ; no chunks | |
94 | ||
95 | mtctr r0 ; set up loop count | |
96 | li r4,32 ; offset to next chunk | |
97 | _xsum_nop_if_32bit: | |
98 | b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine) | |
99 | dcbt r4,r3 ; touch in 2nd cache line | |
100 | li r0,96 ; get touch offset | |
101 | b LInnerLoop32 ; enter 32-bit loop | |
102 | ||
103 | ; Inner loop for 32-bit machines. | |
104 | ||
105 | .align 4 | |
106 | LInnerLoop32: | |
107 | lwz r4,0(r3) | |
108 | lwz r6,4(r3) | |
109 | lwz r7,8(r3) | |
110 | lwz r8,12(r3) | |
111 | adde r2,r2,r4 | |
112 | lwz r9,16(r3) | |
113 | adde r2,r2,r6 | |
114 | lwz r10,20(r3) | |
115 | adde r2,r2,r7 | |
116 | lwz r11,24(r3) | |
117 | adde r2,r2,r8 | |
118 | lwz r12,28(r3) | |
119 | adde r2,r2,r9 | |
120 | dcbt r3,r0 | |
121 | adde r2,r2,r10 | |
122 | addi r3,r3,32 | |
123 | adde r2,r2,r11 | |
124 | adde r2,r2,r12 | |
125 | bdnz+ LInnerLoop32 | |
126 | ||
127 | ; Handle leftover bytes. | |
128 | ; r2 = checksum so far | |
129 | ; r3 = word aligned address | |
130 | ; r5 = accumulated sum parameter | |
131 | ; carry = live | |
132 | ; cr1_gt = "starting on odd address" flag | |
133 | ; cr6,cr7 = residual length | |
134 | ||
135 | Lleftovers: | |
136 | bf 27,Lleftover8 ; test 0x10 bit of residual length | |
137 | lwz r4,0(r3) | |
138 | lwz r6,4(r3) | |
139 | lwz r7,8(r3) | |
140 | lwz r8,12(r3) | |
141 | addi r3,r3,16 | |
142 | adde r2,r2,r4 | |
143 | adde r2,r2,r6 | |
144 | adde r2,r2,r7 | |
145 | adde r2,r2,r8 | |
146 | Lleftover8: | |
147 | bf 28,Lleftover4 | |
148 | lwz r4,0(r3) | |
149 | lwz r6,4(r3) | |
150 | addi r3,r3,8 | |
151 | adde r2,r2,r4 | |
152 | adde r2,r2,r6 | |
153 | Lleftover4: | |
154 | bf 29,Lleftover2 | |
155 | lwz r4,0(r3) | |
156 | addi r3,r3,4 | |
157 | adde r2,r2,r4 | |
158 | Lleftover2: | |
159 | bf 30,Lleftover1 | |
160 | lhz r4,0(r3) | |
161 | addi r3,r3,2 | |
162 | adde r2,r2,r4 | |
163 | Lleftover1: | |
164 | bf 31,Lwrapup | |
165 | lbz r4,0(r3) | |
166 | slwi r4,r4,8 ; shift last byte into proper lane | |
167 | adde r2,r2,r4 | |
168 | ||
169 | ; All data bytes checksummed. Wrap up. | |
170 | ; r2 = checksum so far (word parallel) | |
171 | ; r5 = accumulated sum parameter | |
172 | ; carry = live | |
173 | ; cr1_gt = "starting on odd address" flag | |
174 | ||
175 | Lwrapup: | |
176 | addze r2,r2 ; add in last carry | |
177 | addze r2,r2 ; in case the "addze" carries | |
178 | Lwrapupx: ; here from short-operand case, with xer(ca) undefined | |
179 | srwi r6,r2,16 ; top half of 32-bit checksum | |
180 | rlwinm r7,r2,0,0xFFFF ; lower half | |
181 | add r2,r6,r7 ; add them together | |
182 | srwi r6,r2,16 ; then do it again, in case first carried | |
183 | rlwinm r7,r2,0,0xFFFF | |
184 | add r2,r6,r7 | |
185 | bf cr1_gt,Lswapped ; test "starting on odd address" flag | |
186 | ||
187 | ; The checksum began on an odd address, so swap bytes. | |
188 | ||
189 | rlwinm r6,r2,24,0x00FF ; move top byte to bottom | |
190 | rlwinm r7,r2,8,0xFF00 ; bottom to top | |
191 | or r2,r6,r7 ; rejoin | |
192 | ||
193 | ; Finally, add in checksum passed in as a parameter. | |
194 | ||
195 | Lswapped: | |
196 | add r2,r2,r5 ; add passed-in checksum | |
197 | srwi r6,r2,16 ; top half of 32-bit checksum | |
198 | rlwinm r7,r2,0,0xFFFF ; lower half | |
199 | add r2,r6,r7 ; add them together | |
200 | srwi r6,r2,16 ; then do it again, in case first carried | |
201 | rlwinm r7,r2,0,0xFFFF | |
202 | add r3,r6,r7 ; steer result into r3 | |
203 | blr | |
204 | ||
205 | ; Handle short operands. Do a halfword at a time. | |
206 | ; r3 = address | |
207 | ; r4 = length (<= kShort) | |
208 | ; r5 = accumulated sum parameter | |
209 | ; r6 = "starting on odd byte" flag | |
210 | ||
211 | Lshort: | |
212 | cmpwi cr6,r4,2 ; at least two bytes? | |
213 | andi. r0,r4,1 ; odd length? | |
214 | li r2,0 ; initialize checksum | |
215 | cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set | |
216 | blt cr6,Lshort2 ; fewer than two bytes, so skip | |
217 | Lshort1: | |
218 | cmpwi cr6,r4,4 ; two more bytes (after we decrement)? | |
219 | lhz r7,0(r3) | |
220 | subi r4,r4,2 | |
221 | addi r3,r3,2 | |
222 | add r2,r2,r7 ; note no need for "adde" | |
223 | bge cr6,Lshort1 ; loop for 2 more bytes | |
224 | Lshort2: | |
225 | beq Lwrapupx ; no byte at end, proceed to checkout with carry undefined | |
226 | lbz r7,0(r3) | |
227 | slwi r7,r7,8 ; shift last byte into proper lane | |
228 | add r2,r2,r7 | |
229 | b Lwrapupx | |
230 | ||
231 | ; Handle 64-bit machine. The major improvement over the 32-bit path is that we use | |
232 | ; four parallel 32-bit accumulators, which carry into the upper half naturally so we | |
233 | ; do not have to use "adde", which serializes on the carry bit. Note that we cannot | |
234 | ; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly. | |
235 | ; r2 = checksum so far (ie, the zero-filled partial first word) | |
236 | ; r3 = word aligned address | |
237 | ; r5 = accumulated sum parameter | |
238 | ; ctr = number of 32-byte chunks of input | |
239 | ; carry = unused in this code | |
240 | ; cr1_gt = "starting on odd address" flag | |
241 | ; cr6,cr7 = residual length | |
242 | ||
243 | L64BitPath: | |
244 | stw r13,-4(r1) ; save a few nonvolatile regs in red zone so we can use them | |
245 | stw r14,-8(r1) | |
246 | stw r15,-12(r1) | |
247 | stw r16,-16(r1) | |
248 | li r0,128 ; to touch next line | |
249 | li r13,0 ; r13-r15 are the accumulators, so initialize them | |
250 | dcbt r3,r0 ; touch in next cache line, and keep loads away from the above stores | |
251 | lwz r4,0(r3) ; start pipeline by loading first 32 bytes into r4, r6-r12 | |
252 | lwz r6,4(r3) | |
253 | lwz r7,8(r3) | |
254 | mr r14,r2 ; just copy incoming partial word into one of the accumulators | |
255 | li r15,0 | |
256 | lwz r8,12(r3) | |
257 | lwz r9,16(r3) | |
258 | li r16,0 | |
259 | li r0,256 ; get touch offset | |
260 | lwz r10,20(r3) | |
261 | lwz r11,24(r3) | |
262 | lwz r12,28(r3) ; load last word of previous chunk | |
263 | addi r3,r3,32 ; skip past the chunk | |
264 | bdnz++ LInnerLoop64 ; enter loop if another chunk to go | |
265 | ||
266 | b LAddLastChunk ; only one chunk | |
267 | ||
268 | ; Inner loop for 64-bit processors. This loop is scheduled for the 970. | |
269 | ; It is pipelined (loads are one iteration ahead of adds), and unrolled. | |
270 | ; It should take 9-10 cycles per iteration, which consumes 64 bytes of input. | |
271 | ||
272 | .align 5 | |
273 | LInnerLoop64: ; 64 bytes/iteration | |
274 | add r13,r13,r4 ; cycle 1 | |
275 | add r14,r14,r6 | |
276 | dcbt r3,r0 ; touch in 2 lines ahead | |
277 | lwz r4,0(r3) | |
278 | ||
279 | add r15,r15,r7 ; cycle 2, etc | |
280 | lwz r6,4(r3) | |
281 | lwz r7,8(r3) | |
282 | add r16,r16,r8 | |
283 | ||
284 | lwz r8,12(r3) | |
285 | add r13,r13,r9 | |
286 | add r14,r14,r10 | |
287 | lwz r9,16(r3) | |
288 | ||
289 | add r15,r15,r11 | |
290 | lwz r10,20(r3) | |
291 | lwz r11,24(r3) | |
292 | add r16,r16,r12 | |
293 | bdz-- LEarlyExit ; early exit if no more chunks | |
294 | ||
295 | lwz r12,28(r3) | |
296 | add r13,r13,r4 | |
297 | add r14,r14,r6 | |
298 | lwz r4,32(r3) | |
299 | ||
300 | add r15,r15,r7 | |
301 | lwz r6,36(r3) | |
302 | lwz r7,40(r3) | |
303 | add r16,r16,r8 | |
304 | ||
305 | lwz r8,44(r3) | |
306 | add r13,r13,r9 | |
307 | add r14,r14,r10 | |
308 | lwz r9,48(r3) | |
309 | ||
310 | add r15,r15,r11 | |
311 | lwz r10,52(r3) | |
312 | lwz r11,56(r3) | |
313 | add r16,r16,r12 | |
314 | ||
315 | nop ; position last load in 2nd dispatch slot | |
316 | lwz r12,60(r3) | |
317 | addi r3,r3,64 | |
318 | bdnz++ LInnerLoop64 | |
319 | ||
320 | b LAddLastChunk | |
321 | ||
322 | ; Add in the last 32-byte chunk, and any leftover bytes. | |
323 | ; r3 = word aligned address of next byte of data | |
324 | ; r5 = accumulated sum parameter | |
325 | ; r13-r16 = the four accumulators | |
326 | ; cr1_gt = "starting on odd address" flag | |
327 | ; cr6,cr7 = residual length | |
328 | ||
329 | LEarlyExit: ; here from middle of inner loop | |
330 | lwz r12,28(r3) ; load last word of last chunk | |
331 | addi r3,r3,32 | |
332 | LAddLastChunk: ; last 32-byte chunk of input is in r4,r6-r12 | |
333 | add r13,r13,r4 ; add in last chunk | |
334 | add r14,r14,r6 ; these are 64-bit adds | |
335 | add r15,r15,r7 | |
336 | add r16,r16,r8 | |
337 | add r13,r13,r9 | |
338 | add r14,r14,r10 | |
339 | add r15,r15,r11 | |
340 | add r16,r16,r12 | |
341 | ||
342 | ; Handle leftover bytes, if any. | |
343 | ||
344 | bf 27,Lleft1 ; test 0x10 bit of residual length | |
345 | lwz r4,0(r3) | |
346 | lwz r6,4(r3) | |
347 | lwz r7,8(r3) | |
348 | lwz r8,12(r3) | |
349 | addi r3,r3,16 | |
350 | add r13,r13,r4 | |
351 | add r14,r14,r6 | |
352 | add r15,r15,r7 | |
353 | add r16,r16,r8 | |
354 | Lleft1: | |
355 | bf 28,Lleft2 | |
356 | lwz r4,0(r3) | |
357 | lwz r6,4(r3) | |
358 | addi r3,r3,8 | |
359 | add r13,r13,r4 | |
360 | add r14,r14,r6 | |
361 | Lleft2: | |
362 | bf 29,Lleft3 | |
363 | lwz r4,0(r3) | |
364 | addi r3,r3,4 | |
365 | add r14,r14,r4 | |
366 | Lleft3: | |
367 | bf 30,Lleft4 | |
368 | lhz r4,0(r3) | |
369 | addi r3,r3,2 | |
370 | add r15,r15,r4 | |
371 | Lleft4: | |
372 | bf 31,Lleft5 | |
373 | lbz r4,0(r3) | |
374 | slwi r4,r4,8 ; shift last byte into proper lane | |
375 | add r16,r16,r4 | |
376 | ||
377 | ; All data bytes have been checksummed. Now we must add together the four | |
378 | ; accumulators and restore the regs from the red zone. | |
379 | ; r3 = word aligned address of next byte of data | |
380 | ; r5 = accumulated sum parameter | |
381 | ; r13-r16 = the four accumulators | |
382 | ; carry = not used so far | |
383 | ; cr1_gt = "starting on odd address" flag | |
384 | ||
385 | Lleft5: | |
386 | add r8,r13,r14 ; add the four accumulators together | |
387 | add r9,r15,r16 | |
388 | lwz r13,-4(r1) ; start to restore nonvolatiles from red zone | |
389 | lwz r14,-8(r1) | |
390 | add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators | |
391 | lwz r15,-12(r1) | |
392 | lwz r16,-16(r1) | |
393 | srdi r7,r8,32 ; get upper half of 64-bit sum | |
394 | addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry) | |
395 | b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum |