/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#define cr1_gt  5       // bit 1 of cr1
/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 * r3 - Pointer to data
 * r4 - Length of data
 * r5 - Accumulated sum value
 * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even.  Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers.  1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC.  Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits.  On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit.  On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end.  We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set based on the low 32 bits of the sum).
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it impractical.
 */
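/*
 * A minimal C sketch of the whole routine (illustrative only; the function
 * and variable names below are not from the kernel sources):
 *
 *	#include <stdint.h>
 *
 *	uint16_t cksum16(const uint8_t *p, long len, uint16_t xsum, int odd)
 *	{
 *		uint64_t sum = 0;
 *		while (len >= 2) {                  // 1s-complement sum of the data,
 *			sum += (p[0] << 8) | p[1];      // treated as 16-bit big-endian ints
 *			p += 2; len -= 2;
 *		}
 *		if (len)                            // odd trailing byte
 *			sum += (uint32_t)p[0] << 8;
 *		while (sum >> 16)                   // fold carries back into 16 bits
 *			sum = (sum & 0xFFFF) + (sum >> 16);
 *		if (odd)                            // started on an odd address:
 *			sum = ((sum >> 8) | (sum << 8)) & 0xFFFF;   // swap byte lanes
 *		sum += xsum;                        // add accumulated sum parameter
 *		while (sum >> 16)
 *			sum = (sum & 0xFFFF) + (sum >> 16);
 *		return (uint16_t)sum;
 *	}
 */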
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit

_xsum_assym:
        cmplwi  cr0,r4,kShort       ; too short to word align?
        rlwinm  r2,r3,0,0x3         ; get byte offset in word
        dcbt    0,r3                ; touch in 1st cache line
        cmpwi   cr6,r2,0            ; is address word aligned?
        ble     cr0,Lshort          ; skip if too short to bother aligning

        subfic  r0,r2,4             ; get #bytes in partial word
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0             ; turn off carry
        beq     cr6,Laligned        ; skip if already word aligned (r2==0 if aligned)
; Partial word at start: zero filled on left, it becomes initial checksum.

        rlwinm  r3,r3,0,0,29        ; word align address
        mtcrf   0x01,r2             ; move byte offset to cr7
        lwz     r6,0(r3)            ; get partial word
        li      r7,-1               ; start of mask for partial fill
        slwi    r8,r2,3             ; multiply byte offset by 8
        sub     r4,r4,r0            ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt    ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8            ; get mask for bytes to keep in partial word
        addi    r3,r3,4             ; point to next word of input
        and     r2,r6,r7            ; zero fill on left
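
; In C terms (illustrative variable names), the zero-filled partial word is:
;   uint32_t offset = addr & 3;                     // byte offset in word
;   uint32_t mask   = 0xFFFFFFFFu >> (8 * offset);  // keep the rightmost bytes
;   uint32_t sum0   = *(uint32_t *)(addr & ~3u) & mask;
; Note that word-aligning an odd byte offset flips halfword parity, which is
; what the crxor on the "odd address" flag above accounts for.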
; Address is now word aligned.  Prepare for inner loop over 32-byte chunks.
;   r2 = initial checksum
;   r3 = word aligned address
;   r4 = length remaining
;   r5 = accumulated sum parameter
;   carry = off
;   cr1_gt = "starting on odd address" flag
Laligned:
        srwi.   r0,r4,5             ; get count of 32-byte chunks
        mtcrf   0x02,r4             ; move residual length to cr6 and cr7
        mtcrf   0x01,r4
        beq     cr0,Lleftovers      ; no chunks

        mtctr   r0                  ; set up loop count
        li      r4,32               ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath          ; use the 64-bit path (patched to nop on 32-bit machine)
        dcbt    r4,r3               ; touch in 2nd cache line
        li      r0,96               ; get touch offset
        b       LInnerLoop32        ; enter 32-bit loop
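
; The chunking logic, in C terms (illustrative):
;   uint32_t chunks   = len >> 5;       // count of 32-byte chunks (srwi.)
;   uint32_t residual = len & 0x1F;     // low 5 bits, kept in cr6/cr7
; "bf 27,..." below tests the 0x10 bit of the residual, i.e. CR bit 27,
; which mtcrf loaded from bit 27 (value 0x10) of r4.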
; Inner loop for 32-bit machines.
; Handle leftover bytes.
;   r2 = checksum so far
;   r3 = word aligned address
;   r5 = accumulated sum parameter
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length
Lleftovers:
        bf      27,Lleftover8       ; test 0x10 bit of residual length
        slwi    r4,r4,8             ; shift last byte into proper lane
; All data bytes checksummed.  Wrap up.
;   r2 = checksum so far (word parallel)
;   r5 = accumulated sum parameter
;   carry = carry bit, which must be added in
;   cr1_gt = "starting on odd address" flag
Lwrapup:
        addze   r2,r2               ; add in last carry
        addze   r2,r2               ; in case the "addze" carries
Lwrapupx:                           ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
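
; Worked example of the double fold: if r2 = 0x0001FFFF, the first fold
; yields 0x0001 + 0xFFFF = 0x00010000 (a carry out of the low half), and
; the second yields 0x0001 + 0x0000 = 0x00000001, which fits in 16 bits.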
        bf      cr1_gt,Lswapped     ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF     ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00      ; bottom to top
        or      r2,r6,r7            ; rejoin the swapped halves
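
; In C terms: sum = ((sum >> 8) | (sum << 8)) & 0xFFFF.  A 1s-complement
; sum is endian-agnostic except for byte order, so starting on an odd
; address only costs this single byte-lane swap at the end.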
; Finally, add in the checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5            ; add passed-in checksum
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7            ; steer result into r3
        blr                         ; return checksum in r3
; Handle short operands.  Do a halfword at a time.
;   r3 = data
;   r4 = length (<= kShort)
;   r5 = accumulated sum parameter
;   r6 = "starting on odd byte" flag
Lshort:
        cmpwi   cr6,r4,2            ; at least two bytes?
        andi.   r0,r4,1             ; odd length?
        li      r2,0                ; initialize checksum
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2         ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4            ; two more bytes (after we decrement)?
        lhz     r7,0(r3)            ; get next halfword
        addi    r3,r3,2             ; advance to next halfword
        subi    r4,r4,2             ; decrement length remaining
        add     r2,r2,r7            ; note no need for "adde"
        bge     cr6,Lshort1         ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx            ; no byte at end, proceed to checkout with carry undefined
        lbz     r7,0(r3)            ; get last byte
        slwi    r7,r7,8             ; shift last byte into proper lane
        add     r2,r2,r7            ; add it in
        b       Lwrapupx            ; wrap up, with carry undefined
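
; A C sketch of this short path (illustrative; no carries can occur because
; at most kShort bytes are summed into a 32-bit accumulator):
;   uint32_t sum = 0;
;   while (len >= 2) {
;       sum += *(const uint16_t *)p;    // halfword at a time
;       p += 2; len -= 2;
;   }
;   if (len)                            // odd trailing byte
;       sum += (uint32_t)*p << 8;       // goes in the high lane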
; Handle 64-bit machine.  The major improvement over the 32-bit path is that we use
; four parallel 32-bit accumulators, which carry into the upper half naturally so we
; do not have to use "adde", which serializes on the carry bit.  Note that we cannot
; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
;   r2 = checksum so far (i.e., the zero-filled partial first word)
;   r3 = word aligned address
;   r5 = accumulated sum parameter
;   ctr = number of 32-byte chunks of input
;   carry = unused in this code
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length
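
; The accumulation scheme, sketched in C (illustrative; w points at the
; word-aligned input, "partial" is the zero-filled first word in r2):
;   uint64_t a0 = 0, a1 = partial, a2 = 0, a3 = 0;
;   while (chunks--) {                        // 32 bytes per iteration
;       a0 += w[0]; a1 += w[1]; a2 += w[2]; a3 += w[3];   // independent adds,
;       a0 += w[4]; a1 += w[5]; a2 += w[6]; a3 += w[7];   // no carry chain
;       w += 8;
;   }
;   uint64_t sum = a0 + a1 + a2 + a3;         // folded to 16 bits at Lwrapup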
L64BitPath:
        stw     r13,-4(r1)          ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128              ; to touch next line
        li      r13,0               ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0               ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)            ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2              ; just copy incoming partial word into one of the accumulators
        li      r15,0
        li      r16,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r0,256              ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)          ; load last word of the chunk
        addi    r3,r3,32            ; skip past the chunk
        bdnz++  LInnerLoop64        ; enter loop if another chunk to go
        b       LAddLastChunk       ; only one chunk
; Inner loop for 64-bit processors.  This loop is scheduled for the 970.
; It is pipelined (loads are one iteration ahead of adds), and unrolled.
; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
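
; The software pipelining, sketched in C (illustrative):
;   uint32_t cur[8], nxt[8];
;   memcpy(cur, p, 32); p += 8;               // prologue: first chunk loaded
;   while (--chunks) {
;       memcpy(nxt, p, 32); p += 8;           // loads for the NEXT iteration
;       for (int i = 0; i < 8; i++)           // overlap the adds for this one
;           acc[i & 3] += cur[i];
;       memcpy(cur, nxt, sizeof cur);
;   }
;   for (int i = 0; i < 8; i++)               // epilogue: add in last chunk
;       acc[i & 3] += cur[i];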
LInnerLoop64:                       ; 64 bytes/iteration
        add     r13,r13,r4          ; cycle 1
        dcbt    r3,r0               ; touch in 2 lines ahead
        add     r15,r15,r7          ; cycle 2, etc
        bdz--   LEarlyExit          ; early exit if no more chunks
        nop                         ; position last load in 2nd dispatch slot
; Add in the last 32-byte chunk, and any leftover bytes.
;   r3 = word aligned address of next byte of data
;   r5 = accumulated sum parameter
;   r13-r16 = the four accumulators
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length
LEarlyExit:                         ; here from middle of inner loop
        lwz     r12,28(r3)          ; load last word of last chunk

LAddLastChunk:                      ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4          ; add in last chunk
        add     r14,r14,r6          ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12
; Handle leftover bytes, if any.

        bf      27,Lleft1           ; test 0x10 bit of residual length
        slwi    r4,r4,8             ; shift last byte into proper lane
; All data bytes have been checksummed.  Now we must add together the four
; accumulators and restore the regs from the red zone.
;   r3 = word aligned address of next byte of data
;   r5 = accumulated sum parameter
;   r13-r16 = the four accumulators
;   carry = not used so far
;   cr1_gt = "starting on odd address" flag
        add     r8,r13,r14          ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)          ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9            ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32            ; get upper half of 64-bit sum
        addc    r2,r7,r8            ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup             ; merge r2, r5, and carry into a 16-bit checksum
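
; The final 64-to-32-bit fold, in C terms (illustrative):
;   uint32_t hi = (uint32_t)(sum >> 32);    // srdi r7,r8,32
;   uint32_t lo = (uint32_t)sum;
;   uint32_t s  = hi + lo;                  // addc records any carry in XER[CA]
; Lwrapup then adds that carry back in (the addze pair) and folds 32 -> 16.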