/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#define kShort  11
#define cr1_gt  5                   // bit 1 of cr1

/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 *      r3 - Pointer to data
 *      r4 - Length of data
 *      r5 - Accumulated sum value
 *      r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set on the low 32 bits of the sum).
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it
 * impractical for now.
 */
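/*
 * For reference, a portable C sketch of the same computation (an illustration
 * added here, not part of the original source; the name "xsum_ref" and its
 * types are hypothetical, and it ignores the parallel-accumulation tricks the
 * assembly uses for speed):
 *
 *      static unsigned short
 *      xsum_ref(const unsigned char *p, int len, unsigned short xsum, int odd)
 *      {
 *          unsigned long sum = 0;
 *          int i;
 *
 *          for (i = 0; i + 1 < len; i += 2)        // big-endian 16-bit words
 *              sum += ((unsigned long)p[i] << 8) | p[i + 1];
 *          if (len & 1)                            // lone last byte fills the high lane
 *              sum += (unsigned long)p[len - 1] << 8;
 *          while (sum >> 16)                       // fold carries back in (1s-complement)
 *              sum = (sum & 0xFFFF) + (sum >> 16);
 *          if (odd)                                // started on odd address: swap byte lanes
 *              sum = ((sum >> 8) & 0xFF) | ((sum & 0xFF) << 8);
 *          sum += xsum;                            // add caller's accumulated sum
 *          while (sum >> 16)                       // and fold once more
 *              sum = (sum & 0xFFFF) + (sum >> 16);
 *          return (unsigned short)sum;
 *      }
 */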
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit
        .text
        .align  5
_xsum_assym:
        cmplwi  cr0,r4,kShort       ; too short to word align?
        rlwinm  r2,r3,0,0x3         ; get byte offset in word
        dcbt    0,r3                ; touch in 1st cache line
        cmpwi   cr6,r2,0            ; is address word aligned?
        ble     cr0,Lshort          ; skip if too short to bother aligning

        subfic  r0,r2,4             ; get #bytes in partial word
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0             ; turn off carry
        beq     cr6,Laligned        ; skip if already word aligned (r2==0 if aligned)

; Partial word at start: zero filled on left, it becomes initial checksum.

        rlwinm  r3,r3,0,0,29        ; word align address
        mtcrf   0x01,r2             ; move byte offset to cr7
        lwz     r6,0(r3)            ; get partial word
        li      r7,-1               ; start of mask for partial fill
        slwi    r8,r2,3             ; multiply byte offset by 8
        sub     r4,r4,r0            ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt    ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8            ; get mask for bytes to keep in partial word
        addi    r3,r3,4             ; point to next word of input
        and     r2,r6,r7            ; zero fill on left

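; In C terms, the partial first word is masked on the left, and the byte-lane
; swap flag is flipped when the byte offset is odd (a sketch for illustration
; only; the variable names are hypothetical):
;
;       unsigned offset = addr & 3;                     // byte offset in word
;       unsigned mask   = 0xFFFFFFFFu >> (offset * 8);  // keep right-hand bytes
;       unsigned sum    = *(const uint32_t *)(addr & ~3u) & mask;
;       odd ^= offset & 1;                              // odd offset swaps 16-bit lanes
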
; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
;   r2 = initial checksum
;   r3 = word aligned address
;   r4 = length remaining
;   r5 = accumulated sum parameter
;   carry = off
;   cr1_gt = "starting on odd address" flag

Laligned:
        srwi.   r0,r4,5             ; get count of 32-byte chunks
        mtcrf   0x02,r4             ; move residual length to cr6 and cr7
        mtcrf   0x01,r4
        beq     cr0,Lleftovers      ; no chunks

        mtctr   r0                  ; set up loop count
        li      r4,32               ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath          ; use the 64-bit path (patched to nop on 32-bit machine)
        dcbt    r4,r3               ; touch in 2nd cache line
        li      r0,96               ; get touch offset
        b       LInnerLoop32        ; enter 32-bit loop

; Inner loop for 32-bit machines.

        .align  4
LInnerLoop32:
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        adde    r2,r2,r4
        lwz     r9,16(r3)
        adde    r2,r2,r6
        lwz     r10,20(r3)
        adde    r2,r2,r7
        lwz     r11,24(r3)
        adde    r2,r2,r8
        lwz     r12,28(r3)
        adde    r2,r2,r9
        dcbt    r3,r0
        adde    r2,r2,r10
        addi    r3,r3,32
        adde    r2,r2,r11
        adde    r2,r2,r12
        bdnz+   LInnerLoop32

; Handle leftover bytes.
;   r2 = checksum so far
;   r3 = word aligned address
;   r5 = accumulated sum parameter
;   carry = live
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length

Lleftovers:
        bf      27,Lleftover8       ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        adde    r2,r2,r4
        adde    r2,r2,r6
        adde    r2,r2,r7
        adde    r2,r2,r8
Lleftover8:
        bf      28,Lleftover4
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        adde    r2,r2,r4
        adde    r2,r2,r6
Lleftover4:
        bf      29,Lleftover2
        lwz     r4,0(r3)
        addi    r3,r3,4
        adde    r2,r2,r4
Lleftover2:
        bf      30,Lleftover1
        lhz     r4,0(r3)
        addi    r3,r3,2
        adde    r2,r2,r4
Lleftover1:
        bf      31,Lwrapup
        lbz     r4,0(r3)
        slwi    r4,r4,8             ; shift last byte into proper lane
        adde    r2,r2,r4

; All data bytes checksummed. Wrap up.
;   r2 = checksum so far (word parallel)
;   r5 = accumulated sum parameter
;   carry = live
;   cr1_gt = "starting on odd address" flag

Lwrapup:
        addze   r2,r2               ; add in last carry
        addze   r2,r2               ; in case the "addze" carries
Lwrapupx:                           ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
        bf      cr1_gt,Lswapped     ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF     ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00      ; bottom to top
        or      r2,r6,r7            ; rejoin

; Finally, add in checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5            ; add passed-in checksum
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7            ; steer result into r3
        blr

; Handle short operands. Do a halfword at a time.
;   r3 = address
;   r4 = length (<= kShort)
;   r5 = accumulated sum parameter
;   r6 = "starting on odd byte" flag

Lshort:
        cmpwi   cr6,r4,2            ; at least two bytes?
        andi.   r0,r4,1             ; odd length?
        li      r2,0                ; initialize checksum
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2         ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4            ; two more bytes (after we decrement)?
        lhz     r7,0(r3)
        subi    r4,r4,2
        addi    r3,r3,2
        add     r2,r2,r7            ; no need for "adde": so few 16-bit adds cannot carry out of 32 bits
        bge     cr6,Lshort1         ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx            ; no byte at end, proceed to wrap-up with carry undefined
        lbz     r7,0(r3)
        slwi    r7,r7,8             ; shift last byte into proper lane
        add     r2,r2,r7
        b       Lwrapupx

; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
; four parallel 32-bit accumulators, which carry into the upper half naturally so we
; do not have to use "adde", which serializes on the carry bit. Note that we cannot
; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
;   r2 = checksum so far (ie, the zero-filled partial first word)
;   r3 = word aligned address
;   r5 = accumulated sum parameter
;   ctr = number of 32-byte chunks of input
;   carry = unused in this code
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length
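/*
 * The four-accumulator scheme, sketched in C (an illustration added here, not
 * part of the original source; "sum_chunks64" and its types are hypothetical,
 * assuming 64-bit accumulators wide enough that 32-bit adds never lose carries):
 *
 *      #include <stdint.h>
 *
 *      static uint64_t
 *      sum_chunks64(const uint32_t *p, unsigned nchunks)
 *      {
 *          uint64_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;    // four independent dependency chains
 *
 *          while (nchunks--) {                         // 32 bytes per chunk
 *              a0 += p[0]; a1 += p[1]; a2 += p[2]; a3 += p[3];
 *              a0 += p[4]; a1 += p[5]; a2 += p[6]; a3 += p[7];
 *              p += 8;
 *          }
 *          uint64_t s = a0 + a1 + a2 + a3;             // carries accumulate in the high half
 *          while (s >> 32)                             // fold the 64-bit sum into 32 bits
 *              s = (s >> 32) + (s & 0xFFFFFFFFu);
 *          return s;                                   // Lwrapup folds this to 16 bits
 *      }
 */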

L64BitPath:
        stw     r13,-4(r1)          ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128              ; to touch next line
        li      r13,0               ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0               ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)            ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2              ; just copy incoming partial word into one of the accumulators
        li      r15,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r16,0
        li      r0,256              ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)          ; load last word of previous chunk
        addi    r3,r3,32            ; skip past the chunk
        bdnz++  LInnerLoop64        ; enter loop if another chunk to go

        b       LAddLastChunk       ; only one chunk

; Inner loop for 64-bit processors. This loop is scheduled for the 970.
; It is pipelined (loads are one iteration ahead of adds), and unrolled.
; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.

        .align  5
LInnerLoop64:                       ; 64 bytes/iteration
        add     r13,r13,r4          ; cycle 1
        add     r14,r14,r6
        dcbt    r3,r0               ; touch in 2 lines ahead
        lwz     r4,0(r3)

        add     r15,r15,r7          ; cycle 2, etc
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        add     r16,r16,r8

        lwz     r8,12(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,16(r3)

        add     r15,r15,r11
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        add     r16,r16,r12
        bdz--   LEarlyExit          ; early exit if no more chunks

        lwz     r12,28(r3)
        add     r13,r13,r4
        add     r14,r14,r6
        lwz     r4,32(r3)

        add     r15,r15,r7
        lwz     r6,36(r3)
        lwz     r7,40(r3)
        add     r16,r16,r8

        lwz     r8,44(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,48(r3)

        add     r15,r15,r11
        lwz     r10,52(r3)
        lwz     r11,56(r3)
        add     r16,r16,r12

        nop                         ; position last load in 2nd dispatch slot
        lwz     r12,60(r3)
        addi    r3,r3,64
        bdnz++  LInnerLoop64

        b       LAddLastChunk

; Add in the last 32-byte chunk, and any leftover bytes.
;   r3 = word aligned address of next byte of data
;   r5 = accumulated sum parameter
;   r13-r16 = the four accumulators
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length

LEarlyExit:                         ; here from middle of inner loop
        lwz     r12,28(r3)          ; load last word of last chunk
        addi    r3,r3,32
LAddLastChunk:                      ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4          ; add in last chunk
        add     r14,r14,r6          ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12

; Handle leftover bytes, if any.

        bf      27,Lleft1           ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        add     r13,r13,r4
        add     r14,r14,r6
        add     r15,r15,r7
        add     r16,r16,r8
Lleft1:
        bf      28,Lleft2
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        add     r13,r13,r4
        add     r14,r14,r6
Lleft2:
        bf      29,Lleft3
        lwz     r4,0(r3)
        addi    r3,r3,4
        add     r14,r14,r4
Lleft3:
        bf      30,Lleft4
        lhz     r4,0(r3)
        addi    r3,r3,2
        add     r15,r15,r4
Lleft4:
        bf      31,Lleft5
        lbz     r4,0(r3)
        slwi    r4,r4,8             ; shift last byte into proper lane
        add     r16,r16,r4

; All data bytes have been checksummed. Now we must add together the four
; accumulators and restore the regs from the red zone.
;   r3 = word aligned address of next byte of data
;   r5 = accumulated sum parameter
;   r13-r16 = the four accumulators
;   carry = not used so far
;   cr1_gt = "starting on odd address" flag

Lleft5:
        add     r8,r13,r14          ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)          ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9            ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32            ; get upper half of 64-bit sum
        addc    r2,r7,r8            ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup             ; merge r2, r5, and carry into a 16-bit checksum
392 | add r8,r13,r14 ; add the four accumulators together | |
393 | add r9,r15,r16 | |
394 | lwz r13,-4(r1) ; start to restore nonvolatiles from red zone | |
395 | lwz r14,-8(r1) | |
396 | add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators | |
397 | lwz r15,-12(r1) | |
398 | lwz r16,-16(r1) | |
399 | srdi r7,r8,32 ; get upper half of 64-bit sum | |
400 | addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry) | |
401 | b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum |