 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
#define cr1_gt  5                       // bit 1 of cr1
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 * r3 - Pointer to data
 * r4 - Length of data
 * r5 - Accumulated sum value
 * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
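 *
 * For illustration only, a hypothetical C caller might look like the sketch
 * below. The wrapper name, the rendering of "boolean" as int, and the final
 * one's-complement inversion are assumptions of the sketch, not part of this
 * file:
 *
 *      extern short xsum_assym(short *p, int len, short xsum, int odd);
 *
 *      unsigned short ip_cksum_example(short *buf, int len) {
 *          // even start, no accumulated sum carried in
 *          unsigned short sum = (unsigned short)xsum_assym(buf, len, 0, 0);
 *          return (unsigned short)~sum;    // IP headers store the complement
 *      }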
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set based on the low 32 bits of the sum).
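 *
 * As a rough C sketch of that idea (illustrative only; the function name is
 * invented), the whole algorithm reduces to:
 *
 *      static unsigned short cksum_sketch(const unsigned short *p, int nwords) {
 *          unsigned int sum = 0;               // 32-bit accumulator
 *          for (int i = 0; i < nwords; i++)
 *              sum += p[i];                    // carries pile up in the high half
 *          sum = (sum >> 16) + (sum & 0xFFFF); // fold carries back in
 *          sum += (sum >> 16);                 // the fold itself can carry once
 *          return (unsigned short)sum;
 *      }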
 *
 * Using AltiVec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it
 * impractical.
        .globl  _xsum_nop_if_32bit

        cmplwi  cr0,r4,kShort       ; too short to word align?
        rlwinm  r2,r3,0,0x3         ; get byte offset in word
        dcbt    0,r3                ; touch in 1st cache line
        cmpwi   cr6,r2,0            ; is address word aligned?
        ble     cr0,Lshort          ; skip if too short to bother aligning

        subfic  r0,r2,4             ; get #bytes in partial word
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0             ; turn off carry
        beq     cr6,Laligned        ; skip if already word aligned (r2==0 if aligned)
; Partial word at start: zero filled on left, it becomes the initial checksum.

        rlwinm  r3,r3,0,0,29        ; word align address
        mtcrf   0x01,r2             ; move byte offset to cr7
        lwz     r6,0(r3)            ; get partial word
        li      r7,-1               ; start of mask for partial fill
        slwi    r8,r2,3             ; multiply byte offset by 8
        sub     r4,r4,r0            ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt    ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8            ; get mask for bytes to keep in partial word
        addi    r3,r3,4             ; point to next word of input
        and     r2,r6,r7            ; zero fill on left
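; In C terms, the zero fill above is roughly the following sketch (names are
; invented). On big-endian PPC the bytes before the start of the data occupy
; the high-order lanes of the aligned word, so a right-shifted all-ones mask
; keeps only the wanted trailing bytes:
;
;       unsigned long addr = (unsigned long)p;                // byte address
;       unsigned int  off  = addr & 3;                        // byte offset in word
;       unsigned int  word = *(unsigned int *)(addr & ~3UL);  // aligned word
;       unsigned int  mask = 0xFFFFFFFFU >> (off * 8);        // keep trailing bytes
;       unsigned int  csum = word & mask;                     // junk bytes zeroed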
; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
;     r2 = initial checksum
;     r3 = word aligned address
;     r4 = length remaining
;     r5 = accumulated sum parameter
;     cr1_gt = "starting on odd address" flag

Laligned:
        srwi.   r0,r4,5             ; get count of 32-byte chunks
        mtcrf   0x02,r4             ; move residual length to cr6 and cr7
        beq     cr0,Lleftovers      ; no chunks

        mtctr   r0                  ; set up loop count
        li      r4,32               ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath          ; use the 64-bit path (patched to a nop on 32-bit machines)
        dcbt    r4,r3               ; touch in 2nd cache line
        li      r0,96               ; get touch offset
        b       LInnerLoop32        ; enter 32-bit loop

; Inner loop for 32-bit machines.
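; The 32-bit loop strings "lwz" loads together with a chain of "adde" adds.
; A hedged C model of one such "adde" chain, with invented names ("words",
; "nwords"), for illustration only:
;
;       unsigned int sum = 0, carry = 0;
;       for (int i = 0; i < nwords; i++) {
;           unsigned int prev = sum;
;           sum += words[i] + carry;            // adde: add plus carry-in
;           carry = (sum < prev) || (sum == prev && carry);  // carry-out
;       }
;       sum += carry;                           // final addze-style step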
; Handle leftover bytes.
;     r2 = checksum so far
;     r3 = word aligned address
;     r5 = accumulated sum parameter
;     cr1_gt = "starting on odd address" flag
;     cr6,cr7 = residual length

Lleftovers:
        bf      27,Lleftover8       ; test 0x10 bit of residual length
        slwi    r4,r4,8             ; shift last byte into proper lane
; All data bytes checksummed. Wrap up.
;     r2 = checksum so far (word parallel)
;     r5 = accumulated sum parameter
;     cr1_gt = "starting on odd address" flag

Lwrapup:
        addze   r2,r2               ; add in last carry
        addze   r2,r2               ; in case the first "addze" carried

Lwrapupx:                           ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case the first add carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7

        bf      cr1_gt,Lswapped     ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF     ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00      ; bottom to top
        or      r2,r6,r7            ; rejoin the halves

Lswapped:

; Finally, add in the checksum passed in as a parameter.

        add     r2,r2,r5            ; add passed-in checksum
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case the first add carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7            ; steer result into r3
        blr
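; Hedged C equivalent of the wrap-up above (names invented for illustration):
;
;       unsigned int fold16(unsigned int s) {   // 32-bit sum -> 16-bit sum
;           s = (s >> 16) + (s & 0xFFFF);       // add the two halves
;           s = (s >> 16) + (s & 0xFFFF);       // again, in case that carried
;           return s;
;       }
;
;       sum = fold16(sum);
;       if (started_on_odd_address)             // undo the byte-lane rotation
;           sum = ((sum >> 8) & 0x00FF) | ((sum << 8) & 0xFF00);
;       return fold16(sum + xsum_param);        // merge caller's partial sum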
; Handle short operands. Do a halfword at a time.
;     r4 = length (<= kShort)
;     r5 = accumulated sum parameter
;     r6 = "starting on odd byte" flag

Lshort:
        cmpwi   cr6,r4,2            ; at least two bytes?
        andi.   r0,r4,1             ; odd length?
        li      r2,0                ; initialize checksum
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2         ; fewer than two bytes, so skip

Lshort1:
        cmpwi   cr6,r4,4            ; at least two more bytes (after we decrement)?
        lhz     r7,0(r3)            ; get next halfword
        addi    r3,r3,2             ; advance pointer
        subi    r4,r4,2             ; decrement length
        add     r2,r2,r7            ; note no need for "adde"
        bge     cr6,Lshort1         ; loop for 2 more bytes
Lshort2:                            ; handle odd byte at end, if any
        beq     Lwrapupx            ; no byte at end, so wrap up with carry undefined
        lbz     r7,0(r3)            ; get last byte
        slwi    r7,r7,8             ; shift last byte into proper lane
        add     r2,r2,r7            ; add it into the checksum
        b       Lwrapupx            ; wrap up with carry undefined
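; A hedged C sketch of this short path (illustrative only). Because the
; operand is at most kShort bytes, the 32-bit sum cannot overflow, so plain
; adds suffice and no "adde" is needed:
;
;       unsigned int sum = 0;
;       const unsigned char *b = (const unsigned char *)p;
;       while (len >= 2) {
;           sum += *(const unsigned short *)b;  // halfword at a time
;           b += 2; len -= 2;
;       }
;       if (len)                                // odd trailing byte
;           sum += (unsigned int)*b << 8;       // byte goes in the high lane
;       // then fall into the common fold/swap wrap-up (Lwrapupx)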
; Handle 64-bit machines. The major improvement over the 32-bit path is that we use
; four parallel accumulators in 64-bit registers: 32-bit words carry naturally into
; the upper half of each accumulator, so we do not have to use "adde", which
; serializes on the carry bit. Note that we cannot do 64-bit "adde"s, because we run
; in 32-bit mode, so carry would not be set correctly.
;     r2 = checksum so far (ie, the zero-filled partial first word)
;     r3 = word aligned address
;     r5 = accumulated sum parameter
;     ctr = number of 32-byte chunks of input
;     carry = unused in this code
;     cr1_gt = "starting on odd address" flag
;     cr6,cr7 = residual length
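; Hedged C sketch of the four-accumulator idea (illustrative only; "w" is the
; input viewed as 32-bit words, and leftovers are handled separately):
;
;       unsigned long long a0 = 0, a1 = 0, a2 = 0, a3 = 0;
;       for (int i = 0; i + 8 <= nwords; i += 8) {   // 32 bytes per pass
;           a0 += w[i+0]; a1 += w[i+1];              // four independent chains,
;           a2 += w[i+2]; a3 += w[i+3];              // so no serialization on
;           a0 += w[i+4]; a1 += w[i+5];              // a single carry bit
;           a2 += w[i+6]; a3 += w[i+7];
;       }
;       unsigned long long acc = a0 + a1 + a2 + a3;  // folded to 16 bits later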
L64BitPath:
        stw     r13,-4(r1)          ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128              ; to touch next line
        li      r13,0               ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0               ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)            ; start pipeline by loading first 32 bytes into r4, r6-r12
        mr      r14,r2              ; just copy incoming partial word into one of the accumulators
        li      r0,256              ; get touch offset
        lwz     r12,28(r3)          ; load last word of previous chunk
        addi    r3,r3,32            ; skip past the chunk
        bdnz++  LInnerLoop64        ; enter loop if another chunk to go

        b       LAddLastChunk       ; only one chunk

; Inner loop for 64-bit processors. This loop is scheduled for the 970.
; It is pipelined (loads run one iteration ahead of the adds), and unrolled.
; It should take 9-10 cycles per iteration, consuming 64 bytes of input.
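; The pipelining pattern, as a hedged C sketch (illustrative only; memcpy
; stands in for the eight "lwz" loads, needs <string.h>, and "p", "chunks",
; and the acc[4] accumulators are assumed set up as in the sketch above):
;
;       unsigned int cur[8], nxt[8];
;       memcpy(cur, p, 32); p += 32;                 // prime: load chunk 0
;       while (--chunks) {
;           memcpy(nxt, p, 32); p += 32;             // loads run a chunk ahead
;           for (int j = 0; j < 8; j++)              // adds for previous chunk
;               acc[j & 3] += cur[j];
;           memcpy(cur, nxt, 32);
;       }
;       for (int j = 0; j < 8; j++)                  // drain the pipeline
;           acc[j & 3] += cur[j];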
LInnerLoop64:                       ; 64 bytes/iteration
        add     r13,r13,r4          ; cycle 1
        dcbt    r3,r0               ; touch in 2 lines ahead
        add     r15,r15,r7          ; cycle 2, etc
        bdz--   LEarlyExit          ; early exit if no more chunks
        nop                         ; position last load in 2nd dispatch slot

; Add in the last 32-byte chunk, and any leftover bytes.
;     r3 = word aligned address of next byte of data
;     r5 = accumulated sum parameter
;     r13-r16 = the four accumulators
;     cr1_gt = "starting on odd address" flag
;     cr6,cr7 = residual length

LEarlyExit:                         ; here from middle of inner loop
        lwz     r12,28(r3)          ; load last word of last chunk

LAddLastChunk:                      ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4          ; add in last chunk
        add     r14,r14,r6          ; these are 64-bit adds

; Handle leftover bytes, if any.

        bf      27,Lleft1           ; test 0x10 bit of residual length
        slwi    r4,r4,8             ; shift last byte into proper lane

; All data bytes have been checksummed. Now we must add together the four
; accumulators and restore the regs from the red zone.
;     r3 = word aligned address of next byte of data
;     r5 = accumulated sum parameter
;     r13-r16 = the four accumulators
;     carry = not used so far
;     cr1_gt = "starting on odd address" flag

        add     r8,r13,r14          ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)          ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9            ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32            ; get upper half of 64-bit sum
        addc    r2,r7,r8            ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup             ; merge r2, r5, and carry into a 16-bit checksum
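; Hedged C equivalent of this final fold (illustrative only; "acc" is the
; 64-bit sum of the four accumulators, as in the sketch near L64BitPath):
;
;       unsigned int hi  = (unsigned int)(acc >> 32);   // srdi r7,r8,32
;       unsigned int lo  = (unsigned int)acc;
;       unsigned int sum = hi + lo;                     // addc (records carry)
;       unsigned int cy  = (sum < hi);                  // consumed by addze
;       // Lwrapup adds cy back in, folds to 16 bits, optionally byte-swaps,
;       // and merges the caller's partial sum.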