X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/a3d08fcd5120d2aa8303b6349ca8b14e3f284af3..91447636331957f3d9b5ca5b508f07c526b0074d:/bsd/dev/ppc/xsumas.s

diff --git a/bsd/dev/ppc/xsumas.s b/bsd/dev/ppc/xsumas.s
index c83a688f1..dae54fb13 100644
--- a/bsd/dev/ppc/xsumas.s
+++ b/bsd/dev/ppc/xsumas.s
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -19,231 +19,377 @@
  *
  * @APPLE_LICENSE_HEADER_END@
  */
-#define STANDALONE 0
-
-#if STANDALONE
-#include "asm.h"
-#include "assym.h"
-#include "proc_reg.h"   /* For CACHE_LINE_SIZE */
-
-#else
-
-#include 
-#if 0
-/* #include  */
-#include       /* For CACHE_LINE_SIZE */
-#endif 0
-#endif
+
+#define kShort  11
+#define cr1_gt  5       // bit 1 of cr1
 
 /*
- * Reg 3 - Pointer to data
- * Reg 4 - Length of data
- * Reg 5 - Accumulated sum value
- * Reg 6 - Starting on odd boundary flag (relative to byte 0 of the checksumed data)
- */
-
-ENTRY(xsum_assym, TAG_NO_FRAME_USED)
-
-        mr      r11, r6         ; Swapped flag
-        addi    r8, 0, 0
-        addi    r10, 0, 0x1f
-        addi    r7, 0, 1
-        addic   r7, r7, 0       ; This clears the carry bit!
-        mr      r12, r5         ; Save the passed-in checksum value
-
-        /*
-         * Sum bytes before cache line boundary
-         */
-
-        cmpi    cr0,0,r4,0      ; Check for length of 0
-        beq     Lleftovers
-
-        and.    r9, r3, r10
-        beq     Laligned32      ; 32 byte aligned
-
-        andi.   r9, r3, 0x3
-        beq     Laligned4
-
-        andi.   r9, r3, 0x1
-        beq     Laligned2       ; 2 byte aligned
-
-        addi    r11, 0, 1       ; swap bytes at end
-        lbz     r8, 0(r3)
-        add     r3, r3, r7
-        subf.   r4, r7, r4
-        beq     Ldone
-
-Laligned2:
-        cmpi    cr0,0,r4,2      ; If remaining length is less than two - go to wrap-up
-        blt     Lleftovers
-        andi.   r9, r3, 0x3     ; If aligned on a 4-byte boundary, go to that code
-        beq     Laligned4
-        lhz     r5, 0(r3)       ; Load and add a halfword to the checksum
-        adde    r8, r8, r5
-        slwi    r7, r7, 1
-        add     r3, r3, r7
-        subf.   r4, r7, r4
-        beq     Ldone
-
-
-        /*
-           Add longwords up to the 32 byte boundary
-         */
-
-Laligned4:
-        addi    r7, 0, 4
-Lloop4:
-        cmpi    cr0,0,r4,4
-        blt     Lleftovers
-        and.    r9, r3, r10
-        beq     Laligned32
-        lwz     r5, 0(r3)
-        adde    r8, r8, r5
-        add     r3, r3, r7
-        subf.   r4, r7, r4
-        bne     Lloop4
-        b       Ldone
-
-
-        /*
-           We're aligned on a 32 byte boundary now - add 8 longwords to checksum
-           until the remaining length is less than 32
-         */
-Laligned32:
-        andis.  r6, r4, 0xffff
-        bne     Lmainloop
-        andi.   r6, r4, 0xffe0
-        beq     Lleftovers
-
-Lmainloop:
-        addi    r9, 0, 64
-        addi    r10, 0, 32
-        cmpi    cr0,0,r4,64
-        blt     Lnopretouch
-        dcbt    r3, r10         ; Touch one cache-line ahead
-Lnopretouch:
-        lwz     r5, 0(r3)
-
-        /*
-         * This is the main meat of the checksum. I attempted to arrange this code
-         * such that the processor would execute as many instructions as possible
-         * in parallel.
-         */
-
-Lloop:
-        cmpi    cr0,0,r4,96
-        blt     Lnotouch
-        dcbt    r3, r9          ; Touch two cache lines ahead
-Lnotouch:
-        adde    r8, r8, r5
-        lwz     r5, 4(r3)
-        lwz     r6, 8(r3)
-        lwz     r7, 12(r3)
-        adde    r8, r8, r5
-        lwz     r5, 16(r3)
-        adde    r8, r8, r6
-        lwz     r6, 20(r3)
-        adde    r8, r8, r7
-        lwz     r7, 24(r3)
-        adde    r8, r8, r5
-        lwz     r5, 28(r3)
-        add     r3, r3, r10
-        adde    r8, r8, r6
-        adde    r8, r8, r7
-        adde    r8, r8, r5
-        subf    r4, r10, r4
-        andi.   r6, r4, 0xffe0
-        beq     Lleftovers
-        lwz     r5, 0(r3)
-        b       Lloop
-
-        /*
-         * Handle whatever bytes are left
-         */
-
-Lleftovers:
-        /*
-         * Handle leftover bytes
-         */
-        cmpi    cr0,0,r4,0
-        beq     Ldone
-
-        addi    r7, 0, 1
-        addi    r10, 0, 0x7ffc
-
-        and.    r9, r4, r10
-        bne     Lfourormore
-        srw     r10, r10, r7
-        and.    r9, r4, r10
-        bne     Ltwoormore
-        b       Loneleft
-
-Lfourormore:
-        addi    r10, 0, 4
-
-Lfourloop:
-        lwz     r5, 0(r3)
-        adde    r8, r8, r5
-        add     r3, r3, r10
-        subf    r4, r10, r4
-        andi.   r6, r4, 0xfffc
-        bne     Lfourloop
-
-Ltwoormore:
-        andi.   r6, r4, 0xfffe
-        beq     Loneleft
-        lhz     r5, 0(r3)
-        adde    r8, r8, r5
-        addi    r3, r3, 2
-        subi    r4, r4, 2
-
-Loneleft:
-        cmpi    cr0,0,r4,0
-        beq     Ldone
-        lbz     r5, 0(r3)
-        slwi    r5, r5, 8
-        adde    r8, r8, r5
-
-        /*
-         * Wrap the longword around, adding the two 16-bit portions
-         * to each other along with any previous and subsequent carries.
-         */
-Ldone:
-        addze   r8, r8          ; Add the carry
-        addze   r8, r8          ; Add the carry again (the last add may have carried)
-        andis.  r6, r8, 0xffff  ; Stuff r6 with the high order 16 bits of sum word
-        srwi    r6, r6, 16      ; Shift it to the low order word
-        andi.   r8, r8, 0xffff  ; Zero out the high order word
-        add     r8, r8, r6      ; Add the two halves
-
-        andis.  r6, r8, 0xffff  ; Do the above again in case we carried into the
-        srwi    r6, r6, 16      ; high order word with the last add.
-        andi.   r8, r8, 0xffff
-        add     r3, r8, r6
-
-        cmpi    cr0,0,r11,0     ; Check to see if we need to swap the bytes
-        beq     Ldontswap
-
-        /*
-         * Our buffer began on an odd boundary, so we need to swap
-         * the checksum bytes.
-         */
-        slwi    r8, r3, 8       ; shift byte 0 to byte 1
-        clrlwi  r8, r8, 16      ; Clear top 16 bits
-        srwi    r3, r3, 8       ; shift byte 1 to byte 0
-        or      r3, r8, r3      ; or them
-
-Ldontswap:
-        add     r3, r3, r12     ; Add in the passed-in checksum
-        andis.  r6, r3, 0xffff  ; Wrap and add any carries into the top 16 bits
-        srwi    r6, r6, 16
-        andi.   r3, r3, 0xffff
-        add     r3, r3, r6
-
-        andis.  r6, r3, 0xffff  ; Do the above again in case we carried into the
-        srwi    r6, r6, 16      ; high order word with the last add.
-        andi.   r3, r3, 0xffff
-        add     r3, r3, r6
-        blr
-
-
+ * short xsum_assym( short *p, int len, short xsum, boolean odd);
+ *
+ * r3 - Pointer to data
+ * r4 - Length of data
+ * r5 - Accumulated sum value
+ * r6 - "Starting on odd address" flag (relative to byte 0 of the checksumed data)
+ *
+ * Note: If the "odd" flag is set, the address in r3 will be even.  Nonetheless, we
+ * correctly handle the case where the flag is set and the address is odd.
+ *
+ * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
+ * of the data, treated as an array of 16-bit integers.  1s-complement sums are done
+ * via "add with carry" operations on a 2s-complement machine like PPC.  Note that
+ * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
+ * final sum is folded down to 16 bits.  On 32-bit machines we use "adde", which is
+ * perfect except that it serializes the adds on the carry bit.  On 64-bit machines
+ * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
+ * all 64-bits into a 16-bit sum at the end.  We cannot use "adde" on 64-bit sums,
+ * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
+ * is set on the low 32-bits of the sum.)
+ *
+ * Using Altivec is tempting, but the performance impact of the greatly increased
+ * number of exceptions and register save/restore traffic probably make it impractical
+ * for now.
+ */
+        .globl  _xsum_assym
+        .globl  _xsum_nop_if_32bit
+        .text
+        .align  5
+_xsum_assym:
+        cmplwi  cr0,r4,kShort           ; too short to word align?
+        rlwinm  r2,r3,0,0x3             ; get byte offset in word
+        dcbt    0,r3                    ; touch in 1st cache line
+        cmpwi   cr6,r2,0                ; is address word aligned?
+        ble     cr0,Lshort              ; skip if too short to bother aligning
+
+        subfic  r0,r2,4                 ; get #bytes in partial word
+        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
+        addic   r0,r0,0                 ; turn off carry
+        beq     cr6,Laligned            ; skip if already word aligned (r2==0 if aligned)
+
+; Partial word at start: zero filled on left, it becomes initial checksum.
+
+        rlwinm  r3,r3,0,0,29            ; word align address
+        mtcrf   0x01,r2                 ; move byte offset to cr7
+        lwz     r6,0(r3)                ; get partial word
+        li      r7,-1                   ; start of mask for partial fill
+        slwi    r8,r2,3                 ; multiply byte offset by 8
+        sub     r4,r4,r0                ; adjust length for bytes in partial word
+        crxor   cr1_gt,31,cr1_gt        ; set flag if byte-lane swap will be necessary
+        srw     r7,r7,r8                ; get mask for bytes to keep in partial word
+        addi    r3,r3,4                 ; point to next word of input
+        and     r2,r6,r7                ; zero fill on left
+
+; Address is now word aligned.  Prepare for inner loop over 32-byte chunks.
+; r2 = initial checksum
+; r3 = word aligned address
+; r4 = length remaining
+; r5 = accumulated sum parameter
+; carry = off
+; cr1_gt = "starting on odd address" flag
+
+Laligned:
+        srwi.   r0,r4,5                 ; get count of 32-byte chunks
+        mtcrf   0x02,r4                 ; move residual length to cr6 and cr7
+        mtcrf   0x01,r4
+        beq     cr0,Lleftovers          ; no chunks
+
+        mtctr   r0                      ; set up loop count
+        li      r4,32                   ; offset to next chunk
+_xsum_nop_if_32bit:
+        b       L64BitPath              ; use the 64-bit path (patched to nop on 32-bit machine)
+        dcbt    r4,r3                   ; touch in 2nd cache line
+        li      r0,96                   ; get touch offset
+        b       LInnerLoop32            ; enter 32-bit loop
+
+; Inner loop for 32-bit machines.
+
+        .align  4
+LInnerLoop32:
+        lwz     r4,0(r3)
+        lwz     r6,4(r3)
+        lwz     r7,8(r3)
+        lwz     r8,12(r3)
+        adde    r2,r2,r4
+        lwz     r9,16(r3)
+        adde    r2,r2,r6
+        lwz     r10,20(r3)
+        adde    r2,r2,r7
+        lwz     r11,24(r3)
+        adde    r2,r2,r8
+        lwz     r12,28(r3)
+        adde    r2,r2,r9
+        dcbt    r3,r0
+        adde    r2,r2,r10
+        addi    r3,r3,32
+        adde    r2,r2,r11
+        adde    r2,r2,r12
+        bdnz+   LInnerLoop32
+
+; Handle leftover bytes.
+; r2 = checksum so far
+; r3 = word aligned address
+; r5 = accumulated sum parameter
+; carry = live
+; cr1_gt = "starting on odd address" flag
+; cr6,cr7 = residual length
+
+Lleftovers:
+        bf      27,Lleftover8           ; test 0x10 bit of residual length
+        lwz     r4,0(r3)
+        lwz     r6,4(r3)
+        lwz     r7,8(r3)
+        lwz     r8,12(r3)
+        addi    r3,r3,16
+        adde    r2,r2,r4
+        adde    r2,r2,r6
+        adde    r2,r2,r7
+        adde    r2,r2,r8
+Lleftover8:
+        bf      28,Lleftover4
+        lwz     r4,0(r3)
+        lwz     r6,4(r3)
+        addi    r3,r3,8
+        adde    r2,r2,r4
+        adde    r2,r2,r6
+Lleftover4:
+        bf      29,Lleftover2
+        lwz     r4,0(r3)
+        addi    r3,r3,4
+        adde    r2,r2,r4
+Lleftover2:
+        bf      30,Lleftover1
+        lhz     r4,0(r3)
+        addi    r3,r3,2
+        adde    r2,r2,r4
+Lleftover1:
+        bf      31,Lwrapup
+        lbz     r4,0(r3)
+        slwi    r4,r4,8                 ; shift last byte into proper lane
+        adde    r2,r2,r4
+
+; All data bytes checksummed.  Wrap up.
+; r2 = checksum so far (word parallel)
+; r5 = accumulated sum parameter
+; carry = live
+; cr1_gt = "starting on odd address" flag
+
+Lwrapup:
+        addze   r2,r2                   ; add in last carry
+        addze   r2,r2                   ; in case the "addze" carries
+Lwrapupx:                               ; here from short-operand case, with xer(ca) undefined
+        srwi    r6,r2,16                ; top half of 32-bit checksum
+        rlwinm  r7,r2,0,0xFFFF          ; lower half
+        add     r2,r6,r7                ; add them together
+        srwi    r6,r2,16                ; then do it again, in case first carried
+        rlwinm  r7,r2,0,0xFFFF
+        add     r2,r6,r7
+        bf      cr1_gt,Lswapped         ; test "starting on odd address" flag
+
+; The checksum began on an odd address, so swap bytes.
+
+        rlwinm  r6,r2,24,0x00FF         ; move top byte to bottom
+        rlwinm  r7,r2,8,0xFF00          ; bottom to top
+        or      r2,r6,r7                ; rejoin
+
+; Finally, add in checksum passed in as a parameter.
+
+Lswapped:
+        add     r2,r2,r5                ; add passed-in checksum
+        srwi    r6,r2,16                ; top half of 32-bit checksum
+        rlwinm  r7,r2,0,0xFFFF          ; lower half
+        add     r2,r6,r7                ; add them together
+        srwi    r6,r2,16                ; then do it again, in case first carried
+        rlwinm  r7,r2,0,0xFFFF
+        add     r3,r6,r7                ; steer result into r3
+        blr
+
+; Handle short operands.  Do a halfword at a time.
+; r3 = address
+; r4 = length (<= kShort)
+; r5 = accumulated sum parameter
+; r6 = "starting on odd byte" flag
+
+Lshort:
+        cmpwi   cr6,r4,2                ; at least two bytes?
+        andi.   r0,r4,1                 ; odd length?
+        li      r2,0                    ; initialize checksum
+        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
+        blt     cr6,Lshort2             ; fewer than two bytes, so skip
+Lshort1:
+        cmpwi   cr6,r4,4                ; two more bytes (after we decrement)?
+        lhz     r7,0(r3)
+        subi    r4,r4,2
+        addi    r3,r3,2
+        add     r2,r2,r7                ; note no need for "adde"
+        bge     cr6,Lshort1             ; loop for 2 more bytes
+Lshort2:
+        beq     Lwrapupx                ; no byte at end, proceed to checkout with carry undefined
+        lbz     r7,0(r3)
+        slwi    r7,r7,8                 ; shift last byte into proper lane
+        add     r2,r2,r7
+        b       Lwrapupx
+
+; Handle 64-bit machine.  The major improvement over the 32-bit path is that we use
+; four parallel 32-bit accumulators, which carry into the upper half naturally so we
+; do not have to use "adde", which serializes on the carry bit.  Note that we cannot
+; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
+; r2 = checksum so far (ie, the zero-filled partial first word)
+; r3 = word aligned address
+; r5 = accumulated sum parameter
+; ctr = number of 32-byte chunks of input
+; carry = unused in this code
+; cr1_gt = "starting on odd address" flag
+; cr6,cr7 = residual length
+
+L64BitPath:
+        stw     r13,-4(r1)              ; save a few nonvolatile regs in red zone so we can use them
+        stw     r14,-8(r1)
+        stw     r15,-12(r1)
+        stw     r16,-16(r1)
+        li      r0,128                  ; to touch next line
+        li      r13,0                   ; r13-r15 are the accumulators, so initialize them
+        dcbt    r3,r0                   ; touch in next cache line, and keep loads away from the above stores
+        lwz     r4,0(r3)                ; start pipeline by loading first 32 bytes into r4, r6-r12
+        lwz     r6,4(r3)
+        lwz     r7,8(r3)
+        mr      r14,r2                  ; just copy incoming partial word into one of the accumulators
+        li      r15,0
+        lwz     r8,12(r3)
+        lwz     r9,16(r3)
+        li      r16,0
+        li      r0,256                  ; get touch offset
+        lwz     r10,20(r3)
+        lwz     r11,24(r3)
+        lwz     r12,28(r3)              ; load last word of previous chunk
+        addi    r3,r3,32                ; skip past the chunk
+        bdnz++  LInnerLoop64            ; enter loop if another chunk to go
+
+        b       LAddLastChunk           ; only one chunk
+
+; Inner loop for 64-bit processors.  This loop is scheduled for the 970.
+; It is pipelined (loads are one iteration ahead of adds), and unrolled.
+; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
+
+        .align  5
+LInnerLoop64:                           ; 64 bytes/iteration
+        add     r13,r13,r4              ; cycle 1
+        add     r14,r14,r6
+        dcbt    r3,r0                   ; touch in 2 lines ahead
+        lwz     r4,0(r3)
+
+        add     r15,r15,r7              ; cycle 2, etc
+        lwz     r6,4(r3)
+        lwz     r7,8(r3)
+        add     r16,r16,r8
+
+        lwz     r8,12(r3)
+        add     r13,r13,r9
+        add     r14,r14,r10
+        lwz     r9,16(r3)
+
+        add     r15,r15,r11
+        lwz     r10,20(r3)
+        lwz     r11,24(r3)
+        add     r16,r16,r12
+        bdz--   LEarlyExit              ; early exit if no more chunks
+
+        lwz     r12,28(r3)
+        add     r13,r13,r4
+        add     r14,r14,r6
+        lwz     r4,32(r3)
+
+        add     r15,r15,r7
+        lwz     r6,36(r3)
+        lwz     r7,40(r3)
+        add     r16,r16,r8
+
+        lwz     r8,44(r3)
+        add     r13,r13,r9
+        add     r14,r14,r10
+        lwz     r9,48(r3)
+
+        add     r15,r15,r11
+        lwz     r10,52(r3)
+        lwz     r11,56(r3)
+        add     r16,r16,r12
+
+        nop                             ; position last load in 2nd dispatch slot
+        lwz     r12,60(r3)
+        addi    r3,r3,64
+        bdnz++  LInnerLoop64
+
+        b       LAddLastChunk
+
+; Add in the last 32-byte chunk, and any leftover bytes.
+; r3 = word aligned address of next byte of data
+; r5 = accumulated sum parameter
+; r13-r16 = the four accumulators
+; cr1_gt = "starting on odd address" flag
+; cr6,cr7 = residual length
+
+LEarlyExit:                             ; here from middle of inner loop
+        lwz     r12,28(r3)              ; load last word of last chunk
+        addi    r3,r3,32
+LAddLastChunk:                          ; last 32-byte chunk of input is in r4,r6-r12
+        add     r13,r13,r4              ; add in last chunk
+        add     r14,r14,r6              ; these are 64-bit adds
+        add     r15,r15,r7
+        add     r16,r16,r8
+        add     r13,r13,r9
+        add     r14,r14,r10
+        add     r15,r15,r11
+        add     r16,r16,r12
+
+; Handle leftover bytes, if any.
+
+        bf      27,Lleft1               ; test 0x10 bit of residual length
+        lwz     r4,0(r3)
+        lwz     r6,4(r3)
+        lwz     r7,8(r3)
+        lwz     r8,12(r3)
+        addi    r3,r3,16
+        add     r13,r13,r4
+        add     r14,r14,r6
+        add     r15,r15,r7
+        add     r16,r16,r8
+Lleft1:
+        bf      28,Lleft2
+        lwz     r4,0(r3)
+        lwz     r6,4(r3)
+        addi    r3,r3,8
+        add     r13,r13,r4
+        add     r14,r14,r6
+Lleft2:
+        bf      29,Lleft3
+        lwz     r4,0(r3)
+        addi    r3,r3,4
+        add     r14,r14,r4
+Lleft3:
+        bf      30,Lleft4
+        lhz     r4,0(r3)
+        addi    r3,r3,2
+        add     r15,r15,r4
+Lleft4:
+        bf      31,Lleft5
+        lbz     r4,0(r3)
+        slwi    r4,r4,8                 ; shift last byte into proper lane
+        add     r16,r16,r4
+
+; All data bytes have been checksummed.  Now we must add together the four
+; accumulators and restore the regs from the red zone.
+; r3 = word aligned address of next byte of data
+; r5 = accumulated sum parameter
+; r13-r16 = the four accumulators
+; carry = not used so far
+; cr1_gt = "starting on odd address" flag
+
+Lleft5:
+        add     r8,r13,r14              ; add the four accumulators together
+        add     r9,r15,r16
+        lwz     r13,-4(r1)              ; start to restore nonvolatiles from red zone
+        lwz     r14,-8(r1)
+        add     r8,r8,r9                ; now r8 is 64-bit sum of the four accumulators
+        lwz     r15,-12(r1)
+        lwz     r16,-16(r1)
+        srdi    r7,r8,32                ; get upper half of 64-bit sum
+        addc    r2,r7,r8                ; finally, do a 32-bit add of the two halves of r8 (setting carry)
+        b       Lwrapup                 ; merge r2, r5, and carry into a 16-bit checksum
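
For reference, the checksum semantics described in the new file's header comment can be modeled in C. This is a minimal sketch, not part of the commit: the name xsum_ref is hypothetical, and it assumes the running sum of 16-bit words fits in a 32-bit accumulator before folding (the assembly avoids that assumption by carrying end-around with "adde" as it goes). As on big-endian PPC, the data is treated as big-endian 16-bit words.

#include <stdint.h>
#include <stddef.h>

/*
 * Reference model (hypothetical name) of the value xsum_assym computes:
 * sum the data as big-endian 16-bit words, fold the 32-bit partial sum down
 * to 16 bits, byte-swap if the data logically began on an odd address, then
 * fold in the caller's accumulated checksum.
 */
static uint16_t
xsum_ref(const uint8_t *p, size_t len, uint16_t prev, int odd)
{
    uint32_t sum = 0;

    while (len >= 2) {                      /* sum 16-bit big-endian words */
        sum += ((uint32_t)p[0] << 8) | p[1];
        p += 2;
        len -= 2;
    }
    if (len)                                /* trailing byte fills the high lane */
        sum += (uint32_t)p[0] << 8;

    sum = (sum >> 16) + (sum & 0xffff);     /* fold 32 bits to 16 */
    sum = (sum >> 16) + (sum & 0xffff);     /* absorb a possible carry */

    if (odd)                                /* data started on an odd boundary */
        sum = ((sum << 8) | (sum >> 8)) & 0xffff;

    sum += prev;                            /* fold in caller's partial checksum */
    sum = (sum >> 16) + (sum & 0xffff);
    sum = (sum >> 16) + (sum & 0xffff);

    return (uint16_t)sum;
}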
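The comment block ahead of L64BitPath explains the 64-bit strategy: accumulate 32-bit words into wider sums so the adds need no carry chain, then fold 64 -> 32 -> 16 bits once at the end. A hedged C sketch of that folding step follows; fold64_to_16 is an illustrative name, and it assumes the input is a whole number of aligned 32-bit big-endian words, which the real routine does not require.

#include <stdint.h>
#include <stddef.h>

/*
 * Illustrative only: accumulate 32-bit words into a 64-bit sum (independent
 * adds, no carry serialization), then fold the result down to a 16-bit
 * ones'-complement checksum.
 */
static uint16_t
fold64_to_16(const uint32_t *words, size_t nwords)
{
    uint64_t sum = 0;

    for (size_t i = 0; i < nwords; i++)
        sum += words[i];                        /* carries collect in the upper half */

    sum = (sum >> 32) + (sum & 0xffffffffu);    /* fold 64 bits to 33 */
    sum = (sum >> 32) + (sum & 0xffffffffu);    /* absorb the carry */
    sum = (sum >> 16) + (sum & 0xffff);         /* fold 32 bits to 16 */
    sum = (sum >> 16) + (sum & 0xffff);

    return (uint16_t)sum;
}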