/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
*
* @APPLE_LICENSE_HEADER_END@
*/
-#define STANDALONE 0
-
-#if STANDALONE
-#include "asm.h"
-#include "assym.h"
-#include "proc_reg.h" /* For CACHE_LINE_SIZE */
-
-#else
-
-#include <mach/ppc/asm.h>
-#if 0
-/* #include <assym.h> */
-#include <ppc/proc_reg.h> /* For CACHE_LINE_SIZE */
-#endif 0
-#endif
+
+#define kShort 11
+#define cr1_gt 5 // bit 1 of cr1
/*
- * Reg 3 - Pointer to data
- * Reg 4 - Length of data
- * Reg 5 - Accumulated sum value
- * Reg 6 - Starting on odd boundary flag (relative to byte 0 of the checksumed data)
- */
-
-ENTRY(xsum_assym, TAG_NO_FRAME_USED)
-
- mr r11, r6 ; Swapped flag
- addi r8, 0, 0
- addi r10, 0, 0x1f
- addi r7, 0, 1
- addic r7, r7, 0 ; This clears the carry bit!
- mr r12, r5 ; Save the passed-in checksum value
-
- /*
- * Sum bytes before cache line boundary
- */
-
- cmpi cr0,0,r4,0 ; Check for length of 0
- beq Lleftovers
-
- and. r9, r3, r10
- beq Laligned32 ; 32 byte aligned
-
- andi. r9, r3, 0x3
- beq Laligned4
-
- andi. r9, r3, 0x1
- beq Laligned2 ; 2 byte aligned
-
- addi r11, 0, 1 ; swap bytes at end
- lbz r8, 0(r3)
- add r3, r3, r7
- subf. r4, r7, r4
- beq Ldone
-
-Laligned2:
- cmpi cr0,0,r4,2 ; If remaining length is less than two - go to wrap-up
- blt Lleftovers
- andi. r9, r3, 0x3 ; If aligned on a 4-byte boundary, go to that code
- beq Laligned4
- lhz r5, 0(r3) ; Load and add a halfword to the checksum
- adde r8, r8, r5
- slwi r7, r7, 1
- add r3, r3, r7
- subf. r4, r7, r4
- beq Ldone
-
-
- /*
- Add longwords up to the 32 byte boundary
- */
-
-Laligned4:
- addi r7, 0, 4
-Lloop4:
- cmpi cr0,0,r4,4
- blt Lleftovers
- and. r9, r3, r10
- beq Laligned32
- lwz r5, 0(r3)
- adde r8, r8, r5
- add r3, r3, r7
- subf. r4, r7, r4
- bne Lloop4
- b Ldone
-
-
- /*
- We're aligned on a 32 byte boundary now - add 8 longwords to checksum
- until the remaining length is less than 32
- */
-Laligned32:
- andis. r6, r4, 0xffff
- bne Lmainloop
- andi. r6, r4, 0xffe0
- beq Lleftovers
-
-Lmainloop:
- addi r9, 0, 64
- addi r10, 0, 32
- cmpi cr0,0,r4,64
- blt Lnopretouch
- dcbt r3, r10 ; Touch one cache-line ahead
-Lnopretouch:
- lwz r5, 0(r3)
-
- /*
- * This is the main meat of the checksum. I attempted to arrange this code
- * such that the processor would execute as many instructions as possible
- * in parallel.
- */
-
-Lloop:
- cmpi cr0,0,r4,96
- blt Lnotouch
- dcbt r3, r9 ; Touch two cache lines ahead
-Lnotouch:
- adde r8, r8, r5
- lwz r5, 4(r3)
- lwz r6, 8(r3)
- lwz r7, 12(r3)
- adde r8, r8, r5
- lwz r5, 16(r3)
- adde r8, r8, r6
- lwz r6, 20(r3)
- adde r8, r8, r7
- lwz r7, 24(r3)
- adde r8, r8, r5
- lwz r5, 28(r3)
- add r3, r3, r10
- adde r8, r8, r6
- adde r8, r8, r7
- adde r8, r8, r5
- subf r4, r10, r4
- andi. r6, r4, 0xffe0
- beq Lleftovers
- lwz r5, 0(r3)
- b Lloop
-
- /*
- * Handle whatever bytes are left
- */
-
-Lleftovers:
- /*
- * Handle leftover bytes
- */
- cmpi cr0,0,r4,0
- beq Ldone
-
- addi r7, 0, 1
- addi r10, 0, 0x7ffc
-
- and. r9, r4, r10
- bne Lfourormore
- srw r10, r10, r7
- and. r9, r4, r10
- bne Ltwoormore
- b Loneleft
-
-Lfourormore:
- addi r10, 0, 4
-
-Lfourloop:
- lwz r5, 0(r3)
- adde r8, r8, r5
- add r3, r3, r10
- subf r4, r10, r4
- andi. r6, r4, 0xfffc
- bne Lfourloop
-
-Ltwoormore:
- andi. r6, r4, 0xfffe
- beq Loneleft
- lhz r5, 0(r3)
- adde r8, r8, r5
- addi r3, r3, 2
- subi r4, r4, 2
-
-Loneleft:
- cmpi cr0,0,r4,0
- beq Ldone
- lbz r5, 0(r3)
- slwi r5, r5, 8
- adde r8, r8, r5
-
- /*
- * Wrap the longword around, adding the two 16-bit portions
- * to each other along with any previous and subsequent carries.
- */
-Ldone:
- addze r8, r8 ; Add the carry
- addze r8, r8 ; Add the carry again (the last add may have carried)
- andis. r6, r8, 0xffff ; Stuff r6 with the high order 16 bits of sum word
- srwi r6, r6, 16 ; Shift it to the low order word
- andi. r8, r8, 0xffff ; Zero out the high order word
- add r8, r8, r6 ; Add the two halves
-
- andis. r6, r8, 0xffff ; Do the above again in case we carried into the
- srwi r6, r6, 16 ; high order word with the last add.
- andi. r8, r8, 0xffff
- add r3, r8, r6
-
- cmpi cr0,0,r11,0 ; Check to see if we need to swap the bytes
- beq Ldontswap
-
- /*
- * Our buffer began on an odd boundary, so we need to swap
- * the checksum bytes.
- */
- slwi r8, r3, 8 ; shift byte 0 to byte 1
- clrlwi r8, r8, 16 ; Clear top 16 bits
- srwi r3, r3, 8 ; shift byte 1 to byte 0
- or r3, r8, r3 ; or them
-
-Ldontswap:
- add r3, r3, r12 ; Add in the passed-in checksum
- andis. r6, r3, 0xffff ; Wrap and add any carries into the top 16 bits
- srwi r6, r6, 16
- andi. r3, r3, 0xffff
- add r3, r3, r6
-
- andis. r6, r3, 0xffff ; Do the above again in case we carried into the
- srwi r6, r6, 16 ; high order word with the last add.
- andi. r3, r3, 0xffff
- add r3, r3, r6
- blr
-
-
+ * short xsum_assym( short *p, int len, short xsum, boolean odd);
+ *
+ * r3 - Pointer to data
+ * r4 - Length of data
+ * r5 - Accumulated sum value
+ * r6 -"Starting on odd address" flag (relative to byte 0 of the checksumed data)
+ *
+ * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
+ * correctly handle the case where the flag is set and the address is odd.
+ *
+ * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
+ * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
+ * via "add with carry" operations on a 2s-complement machine like PPC. Note that
+ * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
+ * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
+ * perfect except that it serializes the adds on the carry bit. On 64-bit machines
+ * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
+ * all 64-bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
+ * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
+ * is set from the low 32 bits of the sum).
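+ *
+ * For reference (illustrative C, not part of this file), folding a 32-bit partial
+ * sum "s" down to the final 16 bits is just:
+ *
+ *    s = (s >> 16) + (s & 0xFFFF);   // first fold; result can be as large as 0x1FFFE
+ *    s = (s >> 16) + (s & 0xFFFF);   // second fold picks up any carry from the first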
+ *
+ * Using Altivec is tempting, but the performance impact of the greatly increased
+ * number of exceptions and register save/restore traffic probably makes it impractical
+ * for now.
+ */
+ .globl _xsum_assym
+ .globl _xsum_nop_if_32bit
+ .text
+ .align 5
+_xsum_assym:
+ cmplwi cr0,r4,kShort ; too short to word align?
+ rlwinm r2,r3,0,0x3 ; get byte offset in word
+ dcbt 0,r3 ; touch in 1st cache line
+ cmpwi cr6,r2,0 ; is address word aligned?
+ ble cr0,Lshort ; skip if too short to bother aligning
+
+ subfic r0,r2,4 ; get #bytes in partial word
+ cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set
+ addic r0,r0,0 ; turn off carry
+ beq cr6,Laligned ; skip if already word aligned (r2==0 if aligned)
+
+; Partial word at start: zero filled on left, it becomes initial checksum.
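+; (For example, a byte offset of 2 produces the mask 0x0000FFFF below, keeping the two
+; data bytes in the low-order lanes of the big-endian word and zeroing the two bytes
+; that precede the buffer.)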
+
+ rlwinm r3,r3,0,0,29 ; word align address
+ mtcrf 0x01,r2 ; move byte offset to cr7
+ lwz r6,0(r3) ; get partial word
+ li r7,-1 ; start of mask for partial fill
+ slwi r8,r2,3 ; multiply byte offset by 8
+ sub r4,r4,r0 ; adjust length for bytes in partial word
+ crxor cr1_gt,31,cr1_gt ; set flag if byte-lane swap will be necessary
+ srw r7,r7,r8 ; get mask for bytes to keep in partial word
+ addi r3,r3,4 ; point to next word of input
+ and r2,r6,r7 ; zero fill on left
+
+; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
+; r2 = initial checksum
+; r3 = word aligned address
+; r4 = length remaining
+; r5 = accumulated sum parameter
+; carry = off
+; cr1_gt = "starting on odd address" flag
+
+Laligned:
+ srwi. r0,r4,5 ; get count of 32-byte chunks
+ mtcrf 0x02,r4 ; move residual length to cr6 and cr7
+ mtcrf 0x01,r4
+ beq cr0,Lleftovers ; no chunks
+
+ mtctr r0 ; set up loop count
+ li r4,32 ; offset to next chunk
+_xsum_nop_if_32bit:
+ b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine)
+ dcbt r4,r3 ; touch in 2nd cache line
+ li r0,96 ; get touch offset
+ b LInnerLoop32 ; enter 32-bit loop
+
+; Inner loop for 32-bit machines.
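+; Each iteration consumes 32 bytes: eight word loads feed a chain of "adde"s, so every
+; carry out of a 32-bit add is folded back in via XER[CA] on the next add, and the loads
+; are interleaved with the adds to hide load latency.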
+
+ .align 4
+LInnerLoop32:
+ lwz r4,0(r3)
+ lwz r6,4(r3)
+ lwz r7,8(r3)
+ lwz r8,12(r3)
+ adde r2,r2,r4
+ lwz r9,16(r3)
+ adde r2,r2,r6
+ lwz r10,20(r3)
+ adde r2,r2,r7
+ lwz r11,24(r3)
+ adde r2,r2,r8
+ lwz r12,28(r3)
+ adde r2,r2,r9
+ dcbt r3,r0
+ adde r2,r2,r10
+ addi r3,r3,32
+ adde r2,r2,r11
+ adde r2,r2,r12
+ bdnz+ LInnerLoop32
+
+; Handle leftover bytes.
+; r2 = checksum so far
+; r3 = word aligned address
+; r5 = accumulated sum parameter
+; carry = live
+; cr1_gt = "starting on odd address" flag
+; cr6,cr7 = residual length
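+; The residual length was copied into cr6/cr7 by the "mtcrf"s above, so CR bits 27-31
+; directly test the 0x10, 0x08, 0x04, 0x02, and 0x01 bits of the remaining byte count.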
+
+Lleftovers:
+ bf 27,Lleftover8 ; test 0x10 bit of residual length
+ lwz r4,0(r3)
+ lwz r6,4(r3)
+ lwz r7,8(r3)
+ lwz r8,12(r3)
+ addi r3,r3,16
+ adde r2,r2,r4
+ adde r2,r2,r6
+ adde r2,r2,r7
+ adde r2,r2,r8
+Lleftover8:
+ bf 28,Lleftover4
+ lwz r4,0(r3)
+ lwz r6,4(r3)
+ addi r3,r3,8
+ adde r2,r2,r4
+ adde r2,r2,r6
+Lleftover4:
+ bf 29,Lleftover2
+ lwz r4,0(r3)
+ addi r3,r3,4
+ adde r2,r2,r4
+Lleftover2:
+ bf 30,Lleftover1
+ lhz r4,0(r3)
+ addi r3,r3,2
+ adde r2,r2,r4
+Lleftover1:
+ bf 31,Lwrapup
+ lbz r4,0(r3)
+ slwi r4,r4,8 ; shift last byte into proper lane
+ adde r2,r2,r4
+
+; All data bytes checksummed. Wrap up.
+; r2 = checksum so far (word parallel)
+; r5 = accumulated sum parameter
+; carry = live
+; cr1_gt = "starting on odd address" flag
+
+Lwrapup:
+ addze r2,r2 ; add in last carry
+ addze r2,r2 ; in case the "addze" carries
+Lwrapupx: ; here from short-operand case, with xer(ca) undefined
+ srwi r6,r2,16 ; top half of 32-bit checksum
+ rlwinm r7,r2,0,0xFFFF ; lower half
+ add r2,r6,r7 ; add them together
+ srwi r6,r2,16 ; then do it again, in case first carried
+ rlwinm r7,r2,0,0xFFFF
+ add r2,r6,r7
+ bf cr1_gt,Lswapped ; test "starting on odd address" flag
+
+; The checksum began on an odd address, so swap bytes.
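+; (For example, a folded sum of 0x1234 becomes 0x3412.)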
+
+ rlwinm r6,r2,24,0x00FF ; move top byte to bottom
+ rlwinm r7,r2,8,0xFF00 ; bottom to top
+ or r2,r6,r7 ; rejoin
+
+; Finally, add in checksum passed in as a parameter.
+
+Lswapped:
+ add r2,r2,r5 ; add passed-in checksum
+ srwi r6,r2,16 ; top half of 32-bit checksum
+ rlwinm r7,r2,0,0xFFFF ; lower half
+ add r2,r6,r7 ; add them together
+ srwi r6,r2,16 ; then do it again, in case first carried
+ rlwinm r7,r2,0,0xFFFF
+ add r3,r6,r7 ; steer result into r3
+ blr
+
+; Handle short operands. Do a halfword at a time.
+; r3 = address
+; r4 = length (<= kShort)
+; r5 = accumulated sum parameter
+; r6 = "starting on odd byte" flag
+
+Lshort:
+ cmpwi cr6,r4,2 ; at least two bytes?
+ andi. r0,r4,1 ; odd length?
+ li r2,0 ; initialize checksum
+ cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set
+ blt cr6,Lshort2 ; fewer than two bytes, so skip
+Lshort1:
+ cmpwi cr6,r4,4 ; two more bytes (after we decrement)?
+ lhz r7,0(r3)
+ subi r4,r4,2
+ addi r3,r3,2
+ add r2,r2,r7 ; note no need for "adde"
+ bge cr6,Lshort1 ; loop for 2 more bytes
+Lshort2:
+ beq Lwrapupx ; no byte at end, so proceed to wrap-up with carry undefined
+ lbz r7,0(r3)
+ slwi r7,r7,8 ; shift last byte into proper lane
+ add r2,r2,r7
+ b Lwrapupx
+
+; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
+; four parallel 32-bit accumulators, which carry into the upper half naturally so we
+; do not have to use "adde", which serializes on the carry bit. Note that we cannot
+; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
+; r2 = checksum so far (ie, the zero-filled partial first word)
+; r3 = word aligned address
+; r5 = accumulated sum parameter
+; ctr = number of 32-byte chunks of input
+; carry = unused in this code
+; cr1_gt = "starting on odd address" flag
+; cr6,cr7 = residual length
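+; Because each accumulator is a full 64-bit GPR, carries out of the low 32 bits simply
+; collect in the upper half; the end-around-carry property of 1s-complement arithmetic
+; lets us fold that upper half back in at the very end (see Lleft5).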
+
+L64BitPath:
+ stw r13,-4(r1) ; save a few nonvolatile regs in red zone so we can use them
+ stw r14,-8(r1)
+ stw r15,-12(r1)
+ stw r16,-16(r1)
+ li r0,128 ; to touch next line
+ li r13,0 ; r13-r16 are the accumulators, so initialize them
+ dcbt r3,r0 ; touch in next cache line, and keep loads away from the above stores
+ lwz r4,0(r3) ; start pipeline by loading first 32 bytes into r4, r6-r12
+ lwz r6,4(r3)
+ lwz r7,8(r3)
+ mr r14,r2 ; just copy incoming partial word into one of the accumulators
+ li r15,0
+ lwz r8,12(r3)
+ lwz r9,16(r3)
+ li r16,0
+ li r0,256 ; get touch offset
+ lwz r10,20(r3)
+ lwz r11,24(r3)
+ lwz r12,28(r3) ; load last word of this first chunk
+ addi r3,r3,32 ; skip past the chunk
+ bdnz++ LInnerLoop64 ; enter loop if another chunk to go
+
+ b LAddLastChunk ; only one chunk
+
+; Inner loop for 64-bit processors. This loop is scheduled for the 970.
+; It is pipelined (loads are one iteration ahead of adds), and unrolled.
+; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
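+; The sixteen word loads per iteration are summed round-robin into the four
+; accumulators r13-r16, so no add depends on the add immediately before it.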
+
+ .align 5
+LInnerLoop64: ; 64 bytes/iteration
+ add r13,r13,r4 ; cycle 1
+ add r14,r14,r6
+ dcbt r3,r0 ; touch in 2 lines ahead
+ lwz r4,0(r3)
+
+ add r15,r15,r7 ; cycle 2, etc
+ lwz r6,4(r3)
+ lwz r7,8(r3)
+ add r16,r16,r8
+
+ lwz r8,12(r3)
+ add r13,r13,r9
+ add r14,r14,r10
+ lwz r9,16(r3)
+
+ add r15,r15,r11
+ lwz r10,20(r3)
+ lwz r11,24(r3)
+ add r16,r16,r12
+ bdz-- LEarlyExit ; early exit if no more chunks
+
+ lwz r12,28(r3)
+ add r13,r13,r4
+ add r14,r14,r6
+ lwz r4,32(r3)
+
+ add r15,r15,r7
+ lwz r6,36(r3)
+ lwz r7,40(r3)
+ add r16,r16,r8
+
+ lwz r8,44(r3)
+ add r13,r13,r9
+ add r14,r14,r10
+ lwz r9,48(r3)
+
+ add r15,r15,r11
+ lwz r10,52(r3)
+ lwz r11,56(r3)
+ add r16,r16,r12
+
+ nop ; position last load in 2nd dispatch slot
+ lwz r12,60(r3)
+ addi r3,r3,64
+ bdnz++ LInnerLoop64
+
+ b LAddLastChunk
+
+; Add in the last 32-byte chunk, and any leftover bytes.
+; r3 = word aligned address of next byte of data
+; r5 = accumulated sum parameter
+; r13-r16 = the four accumulators
+; cr1_gt = "starting on odd address" flag
+; cr6,cr7 = residual length
+
+LEarlyExit: ; here from middle of inner loop
+ lwz r12,28(r3) ; load last word of last chunk
+ addi r3,r3,32
+LAddLastChunk: ; last 32-byte chunk of input is in r4,r6-r12
+ add r13,r13,r4 ; add in last chunk
+ add r14,r14,r6 ; these are 64-bit adds
+ add r15,r15,r7
+ add r16,r16,r8
+ add r13,r13,r9
+ add r14,r14,r10
+ add r15,r15,r11
+ add r16,r16,r12
+
+; Handle leftover bytes, if any.
+
+ bf 27,Lleft1 ; test 0x10 bit of residual length
+ lwz r4,0(r3)
+ lwz r6,4(r3)
+ lwz r7,8(r3)
+ lwz r8,12(r3)
+ addi r3,r3,16
+ add r13,r13,r4
+ add r14,r14,r6
+ add r15,r15,r7
+ add r16,r16,r8
+Lleft1:
+ bf 28,Lleft2
+ lwz r4,0(r3)
+ lwz r6,4(r3)
+ addi r3,r3,8
+ add r13,r13,r4
+ add r14,r14,r6
+Lleft2:
+ bf 29,Lleft3
+ lwz r4,0(r3)
+ addi r3,r3,4
+ add r14,r14,r4
+Lleft3:
+ bf 30,Lleft4
+ lhz r4,0(r3)
+ addi r3,r3,2
+ add r15,r15,r4
+Lleft4:
+ bf 31,Lleft5
+ lbz r4,0(r3)
+ slwi r4,r4,8 ; shift last byte into proper lane
+ add r16,r16,r4
+
+; All data bytes have been checksummed. Now we must add together the four
+; accumulators and restore the regs from the red zone.
+; r3 = word aligned address of next byte of data
+; r5 = accumulated sum parameter
+; r13-r16 = the four accumulators
+; carry = not used so far
+; cr1_gt = "starting on odd address" flag
+
+Lleft5:
+ add r8,r13,r14 ; add the four accumulators together
+ add r9,r15,r16
+ lwz r13,-4(r1) ; start to restore nonvolatiles from red zone
+ lwz r14,-8(r1)
+ add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators
+ lwz r15,-12(r1)
+ lwz r16,-16(r1)
+ srdi r7,r8,32 ; get upper half of 64-bit sum
+ addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry)
+ b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum