2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
// CR1's GT bit (CR bit 5) carries the "data started on an odd address" flag
// across the whole routine, so no GPR has to be reserved for it.
24 #define cr1_gt 5 // bit 1 of cr1
27 * short xsum_assym( short *p, int len, short xsum, boolean odd);
29 * r3 - Pointer to data
31 * r5 - Accumulated sum value
32 * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
34 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
35 * correctly handle the case where the flag is set and the address is odd.
37 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
38 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
39 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
40 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
41 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
42 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
43 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
44 * all 64-bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
45 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
46 * is set on the low 32-bits of the sum.)
48 * Using Altivec is tempting, but the performance impact of the greatly increased
49 * number of exceptions and register save/restore traffic probably make it impractical
; NOTE(review): the routine's entry label (presumably _xsum_assym:) and the
; definition of the kShort threshold are not visible in this excerpt -- confirm
; against the full file before modifying. _xsum_nop_if_32bit marks the branch
; below (see "b L64BitPath") that boot-time code patches to a nop on 32-bit CPUs.
53 .globl _xsum_nop_if_32bit
57 cmplwi cr0,r4,kShort ; too short to word align?
58 rlwinm r2,r3,0,0x3 ; get byte offset in word
59 dcbt 0,r3 ; touch in 1st cache line
60 cmpwi cr6,r2,0 ; is address word aligned?
61 ble cr0,Lshort ; skip if too short to bother aligning
63 subfic r0,r2,4 ; get #bytes in partial word
64 cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set
; carry must be clear before the first "adde" in the 32-bit loop
65 addic r0,r0,0 ; turn off carry
66 beq cr6,Laligned ; skip if already word aligned (r2==0 if aligned)
68 ; Partial word at start: zero filled on left, it becomes initial checksum.
70 rlwinm r3,r3,0,0,29 ; word align address
; low bits of the byte offset land in cr7; bit 31 = "offset is odd"
71 mtcrf 0x01,r2 ; move byte offset to cr7
72 lwz r6,0(r3) ; get partial word
73 li r7,-1 ; start of mask for partial fill
74 slwi r8,r2,3 ; multiply byte offset by 8
75 sub r4,r4,r0 ; adjust length for bytes in partial word
; an odd byte offset swaps the halfword lanes, so flip the odd-address flag
; (CR bit 31 is the low bit of the byte offset, set by the mtcrf above)
76 crxor cr1_gt,31,cr1_gt; set flag if byte-lane swap will be necessary
77 srw r7,r7,r8 ; get mask for bytes to keep in partial word
78 addi r3,r3,4 ; point to next word of input
79 and r2,r6,r7 ; zero fill on left
81 ; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
82 ; r2 = initial checksum
83 ; r3 = word aligned address
84 ; r4 = length remaining
85 ; r5 = accumulated sum parameter
87 ; cr1_gt = "starting on odd address" flag
; NOTE(review): the Laligned: label itself is not visible in this excerpt.
90 srwi. r0,r4,5 ; get count of 32-byte chunks
; residual length (low 5 bits of r4) goes to cr6/cr7 for the leftover code
91 mtcrf 0x02,r4 ; move residual length to cr6 and cr7
93 beq cr0,Lleftovers ; no chunks
95 mtctr r0 ; set up loop count
96 li r4,32 ; offset to next chunk
; this is the _xsum_nop_if_32bit patch point: on 32-bit machines the branch is
; overwritten with a nop at boot, so execution falls through to the 32-bit loop
98 b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine)
99 dcbt r4,r3 ; touch in 2nd cache line
100 li r0,96 ; get touch offset
101 b LInnerLoop32 ; enter 32-bit loop
103 ; Inner loop for 32-bit machines.
; NOTE(review): the entire LInnerLoop32 body (adde-based, serialized on the
; carry bit per the header comment) is missing from this excerpt.
127 ; Handle leftover bytes.
128 ; r2 = checksum so far
129 ; r3 = word aligned address
130 ; r5 = accumulated sum parameter
132 ; cr1_gt = "starting on odd address" flag
133 ; cr6,cr7 = residual length
; cr bit 27 = 0x10 bit of residual length (set by "mtcrf 0x02,r4" above)
136 bf 27,Lleftover8 ; test 0x10 bit of residual length
; NOTE(review): the intervening leftover-16/8/4/2-byte handling and the final
; byte load are missing here; only the last-byte lane shift survives.
166 slwi r4,r4,8 ; shift last byte into proper lane
169 ; All data bytes checksummed. Wrap up.
170 ; r2 = checksum so far (word parallel)
171 ; r5 = accumulated sum parameter
173 ; cr1_gt = "starting on odd address" flag
; NOTE(review): the Lwrapup: label (branched to from the 64-bit path below)
; is presumably attached to the first addze -- not visible in this excerpt.
176 addze r2,r2 ; add in last carry
177 addze r2,r2 ; in case the "addze" carries
178 Lwrapupx: ; here from short-operand case, with xer(ca) undefined
; fold the 32-bit word-parallel sum down to 16 bits (1s-complement fold)
179 srwi r6,r2,16 ; top half of 32-bit checksum
180 rlwinm r7,r2,0,0xFFFF ; lower half
181 add r2,r6,r7 ; add them together
182 srwi r6,r2,16 ; then do it again, in case first carried
183 rlwinm r7,r2,0,0xFFFF
; NOTE(review): the add combining r6/r7 after the second fold appears to be
; missing from this excerpt.
185 bf cr1_gt,Lswapped ; test "starting on odd address" flag
187 ; The checksum began on an odd address, so swap bytes.
189 rlwinm r6,r2,24,0x00FF ; move top byte to bottom
190 rlwinm r7,r2,8,0xFF00 ; bottom to top
; NOTE(review): the or/add merging r6,r7 back into r2, and the Lswapped:
; label, are missing from this excerpt.
193 ; Finally, add in checksum passed in as a parameter.
196 add r2,r2,r5 ; add passed-in checksum
; fold once more, since adding the parameter may overflow 16 bits
197 srwi r6,r2,16 ; top half of 32-bit checksum
198 rlwinm r7,r2,0,0xFFFF ; lower half
199 add r2,r6,r7 ; add them together
200 srwi r6,r2,16 ; then do it again, in case first carried
201 rlwinm r7,r2,0,0xFFFF
; r3 is the return-value register for this routine
202 add r3,r6,r7 ; steer result into r3
; NOTE(review): the final blr is missing from this excerpt.
205 ; Handle short operands. Do a halfword at a time.
207 ; r4 = length (<= kShort)
208 ; r5 = accumulated sum parameter
209 ; r6 = "starting on odd byte" flag
; NOTE(review): the Lshort: label itself is not visible in this excerpt.
212 cmpwi cr6,r4,2 ; at least two bytes?
213 andi. r0,r4,1 ; odd length?
214 li r2,0 ; initialize checksum
215 cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set
216 blt cr6,Lshort2 ; fewer than two bytes, so skip
; NOTE(review): the Lshort1: loop label and the lhz halfword load are missing
; from this excerpt.
218 cmpwi cr6,r4,4 ; two more bytes (after we decrement)?
; sums here stay well under 32 bits, so plain add suffices (no carry chain)
222 add r2,r2,r7 ; note no need for "adde"
223 bge cr6,Lshort1 ; loop for 2 more bytes
; NOTE(review): the Lshort2: label and the trailing lbz byte load are missing;
; the odd-length flag (cr0 from andi. above) selects whether a last byte exists.
225 beq Lwrapupx ; no byte at end, proceed to checkout with carry undefined
227 slwi r7,r7,8 ; shift last byte into proper lane
; NOTE(review): the final add of r7 into r2 and the branch/fall-through to
; Lwrapupx are missing from this excerpt.
231 ; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
232 ; four parallel 32-bit accumulators, which carry into the upper half naturally so we
233 ; do not have to use "adde", which serializes on the carry bit. Note that we cannot
234 ; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
235 ; r2 = checksum so far (ie, the zero-filled partial first word)
236 ; r3 = word aligned address
237 ; r5 = accumulated sum parameter
238 ; ctr = number of 32-byte chunks of input
239 ; carry = unused in this code
240 ; cr1_gt = "starting on odd address" flag
241 ; cr6,cr7 = residual length
; NOTE(review): the L64BitPath: label and the stores of r14-r16 into the red
; zone (presumably at -8..-16(r1)) are missing from this excerpt.
244 stw r13,-4(r1) ; save a few nonvolatile regs in red zone so we can use them
248 li r0,128 ; to touch next line
; NOTE(review): comment said "r13-r15" but the block comments at original
; lines 325/381 and the final fold use FOUR accumulators, r13-r16 -- the
; remaining initializations are not visible in this excerpt.
249 li r13,0 ; r13-r15 are the accumulators, so initialize them
250 dcbt r3,r0 ; touch in next cache line, and keep loads away from the above stores
251 lwz r4,0(r3) ; start pipeline by loading first 32 bytes into r4, r6-r12
254 mr r14,r2 ; just copy incoming partial word into one of the accumulators
259 li r0,256 ; get touch offset
; NOTE(review): the loads of r6-r11 (words 4..24 of the chunk) are missing
; from this excerpt.
262 lwz r12,28(r3) ; load last word of previous chunk
263 addi r3,r3,32 ; skip past the chunk
; "++" / "--" suffixes are static branch-prediction hints (likely/unlikely)
264 bdnz++ LInnerLoop64 ; enter loop if another chunk to go
266 b LAddLastChunk ; only one chunk
268 ; Inner loop for 64-bit processors. This loop is scheduled for the 970.
269 ; It is pipelined (loads are one iteration ahead of adds), and unrolled.
270 ; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
273 LInnerLoop64: ; 64 bytes/iteration
274 add r13,r13,r4 ; cycle 1
276 dcbt r3,r0 ; touch in 2 lines ahead
279 add r15,r15,r7 ; cycle 2, etc
; NOTE(review): nearly all of the unrolled, 970-scheduled loop body (original
; lines 275-321) is missing from this excerpt -- do not reconstruct it by hand;
; the instruction order is deliberate scheduling.
293 bdz-- LEarlyExit ; early exit if no more chunks
315 nop ; position last load in 2nd dispatch slot
; NOTE(review): the loop-closing bdnz back to LInnerLoop64 is missing here.
322 ; Add in the last 32-byte chunk, and any leftover bytes.
323 ; r3 = word aligned address of next byte of data
324 ; r5 = accumulated sum parameter
325 ; r13-r16 = the four accumulators
326 ; cr1_gt = "starting on odd address" flag
327 ; cr6,cr7 = residual length
329 LEarlyExit: ; here from middle of inner loop
330 lwz r12,28(r3) ; load last word of last chunk
; NOTE(review): the addi advancing r3 past the chunk (between original lines
; 330 and 332) is missing from this excerpt.
332 LAddLastChunk: ; last 32-byte chunk of input is in r4,r6-r12
333 add r13,r13,r4 ; add in last chunk
; 32-bit words accumulate into 64-bit registers, so no carry chain is needed
334 add r14,r14,r6 ; these are 64-bit adds
; NOTE(review): the adds of r7-r12 into the accumulators are missing here.
342 ; Handle leftover bytes, if any.
; cr bit 27 = 0x10 bit of residual length (set by the earlier mtcrf)
344 bf 27,Lleft1 ; test 0x10 bit of residual length
; NOTE(review): the Lleft* leftover-byte ladder (original lines 345-373) is
; missing from this excerpt; only the last-byte lane shift survives.
374 slwi r4,r4,8 ; shift last byte into proper lane
377 ; All data bytes have been checksummed. Now we must add together the four
378 ; accumulators and restore the regs from the red zone.
379 ; r3 = word aligned address of next byte of data
380 ; r5 = accumulated sum parameter
381 ; r13-r16 = the four accumulators
382 ; carry = not used so far
383 ; cr1_gt = "starting on odd address" flag
386 add r8,r13,r14 ; add the four accumulators together
; NOTE(review): the add of r15,r16 into r9 (original line 387) is missing
; from this excerpt.
388 lwz r13,-4(r1) ; start to restore nonvolatiles from red zone
390 add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators
; NOTE(review): the remaining red-zone restores of r14-r16 are missing here.
; srdi is a 64-bit shift: legal here because this path only runs on 64-bit CPUs
393 srdi r7,r8,32 ; get upper half of 64-bit sum
; addc (not add) so the carry out of the 32-bit add reaches Lwrapup's addze
394 addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry)
; NOTE(review): the Lwrapup target label is in the wrap-up section above but
; is not visible in this excerpt.
395 b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum