/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/* Operands of this length (in bytes) or shorter take the halfword-at-a-time path. */
#define kShort 11
#define cr1_gt 5 // bit 1 of cr1

/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 * In:
 *   r3 - Pointer to data
 *   r4 - Length of data
 *   r5 - Accumulated sum value
 *   r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Out:
 *   r3 - Sum of the data and the r5 parameter, folded down to 16 bits.
 *        No complement is applied by this routine.
 *
 * Registers written: r0, r2, r4, r6-r12, cr0, cr1, cr6, cr7, ctr, xer(ca).
 * The 64-bit path additionally uses r13-r16, which it saves to and restores
 * from the 16 bytes just below r1 (the red zone).
 *
 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64-bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set on the low 32-bits of the sum.)
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably make it impractical
 * for now.
 */
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit
        .text
        .align  5
_xsum_assym:
        cmplwi  cr0,r4,kShort           ; too short to word align?
        rlwinm  r2,r3,0,0x3             ; get byte offset in word
        dcbt    0,r3                    ; touch in 1st cache line
        cmpwi   cr6,r2,0                ; is address word aligned?
        ble     cr0,Lshort              ; skip if too short to bother aligning

        subfic  r0,r2,4                 ; get #bytes in partial word
        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0                 ; turn off carry
        beq     cr6,Laligned            ; skip if already word aligned (r2==0 if aligned)

;       Partial word at start: zero filled on left, it becomes initial checksum.
;       The load is done from the rounded-down word address, and the bytes that
;       precede the start of the data are masked off.

        rlwinm  r3,r3,0,0,29            ; word align address
        mtcrf   0x01,r2                 ; move byte offset to cr7
        lwz     r6,0(r3)                ; get partial word
        li      r7,-1                   ; start of mask for partial fill
        slwi    r8,r2,3                 ; multiply byte offset by 8
        sub     r4,r4,r0                ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt        ; set flag if byte-lane swap will be necessary
                                        ; (CR bit 31 is the low bit of the byte offset, so
                                        ; an odd offset toggles the "odd" flag)
        srw     r7,r7,r8                ; get mask for bytes to keep in partial word
        addi    r3,r3,4                 ; point to next word of input
        and     r2,r6,r7                ; zero fill on left

;       Address is now word aligned.  Prepare for inner loop over 32-byte chunks.
;               r2 = initial checksum
;               r3 = word aligned address
;               r4 = length remaining
;               r5 = accumulated sum parameter
;               carry = off
;               cr1_gt = "starting on odd address" flag

Laligned:
        srwi.   r0,r4,5                 ; get count of 32-byte chunks
        mtcrf   0x02,r4                 ; move residual length to cr6 and cr7
        mtcrf   0x01,r4
        beq     cr0,Lleftovers          ; no chunks

        mtctr   r0                      ; set up loop count
        li      r4,32                   ; offset to next chunk

;       The branch at this global label selects the 64-bit path.  Per the comment
;       on it, it is patched to a nop on 32-bit machines; the patching code is not
;       in this file.
_xsum_nop_if_32bit:
        b       L64BitPath              ; use the 64-bit path (patched to nop on 32-bit machine)
        dcbt    r4,r3                   ; touch in 2nd cache line
        li      r0,96                   ; get touch offset
        b       LInnerLoop32            ; enter 32-bit loop

;       Inner loop for 32-bit machines.
;       Each iteration adds eight words (32 bytes) into r2 with a chained "adde".

        .align  4
LInnerLoop32:
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        adde    r2,r2,r4
        lwz     r9,16(r3)
        adde    r2,r2,r6
        lwz     r10,20(r3)
        adde    r2,r2,r7
        lwz     r11,24(r3)
        adde    r2,r2,r8
        lwz     r12,28(r3)
        adde    r2,r2,r9
        dcbt    r3,r0                   ; touch ahead by the offset in r0 (96)
        adde    r2,r2,r10
        addi    r3,r3,32
        adde    r2,r2,r11
        adde    r2,r2,r12
        bdnz+   LInnerLoop32

;       Handle leftover bytes.
;               r2 = checksum so far
;               r3 = word aligned address
;               r5 = accumulated sum parameter
;               carry = live
;               cr1_gt = "starting on odd address" flag
;               cr6,cr7 = residual length

Lleftovers:
        bf      27,Lleftover8           ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        adde    r2,r2,r4
        adde    r2,r2,r6
        adde    r2,r2,r7
        adde    r2,r2,r8
Lleftover8:
        bf      28,Lleftover4           ; test 0x08 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        adde    r2,r2,r4
        adde    r2,r2,r6
Lleftover4:
        bf      29,Lleftover2           ; test 0x04 bit of residual length
        lwz     r4,0(r3)
        addi    r3,r3,4
        adde    r2,r2,r4
Lleftover2:
        bf      30,Lleftover1           ; test 0x02 bit of residual length
        lhz     r4,0(r3)
        addi    r3,r3,2
        adde    r2,r2,r4
Lleftover1:
        bf      31,Lwrapup              ; test 0x01 bit of residual length
        lbz     r4,0(r3)
        slwi    r4,r4,8                 ; shift last byte into proper lane
        adde    r2,r2,r4

;       All data bytes checksummed.  Wrap up.
;               r2 = checksum so far (word parallel)
;               r5 = accumulated sum parameter
;               carry = live
;               cr1_gt = "starting on odd address" flag

Lwrapup:
        addze   r2,r2                   ; add in last carry
        addze   r2,r2                   ; in case the "addze" carries
Lwrapupx:                               ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16                ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF          ; lower half
        add     r2,r6,r7                ; add them together
        srwi    r6,r2,16                ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
        bf      cr1_gt,Lswapped         ; test "starting on odd address" flag

;       The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF         ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00          ; bottom to top
        or      r2,r6,r7                ; rejoin

;       Finally, add in checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5                ; add passed-in checksum
        srwi    r6,r2,16                ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF          ; lower half
        add     r2,r6,r7                ; add them together
        srwi    r6,r2,16                ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7                ; steer result into r3
        blr

;       Handle short operands.  Do a halfword at a time.
;               r3 = address
;               r4 = length (<= kShort)
;               r5 = accumulated sum parameter
;               r6 = "starting on odd byte" flag
;       The address is not aligned on this path, and the flag in cr1_gt is taken
;       from r6 unchanged.

Lshort:
        cmpwi   cr6,r4,2                ; at least two bytes?
        andi.   r0,r4,1                 ; odd length? (result stays in cr0 for Lshort2)
        li      r2,0                    ; initialize checksum
        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2             ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4                ; two more bytes (after we decrement)?
        lhz     r7,0(r3)
        subi    r4,r4,2
        addi    r3,r3,2
        add     r2,r2,r7                ; note no need for "adde"
        bge     cr6,Lshort1             ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx                ; no byte at end, proceed to checkout with carry undefined
        lbz     r7,0(r3)
        slwi    r7,r7,8                 ; shift last byte into proper lane
        add     r2,r2,r7
        b       Lwrapupx

;       Handle 64-bit machine.  The major improvement over the 32-bit path is that we use
;       four parallel 32-bit accumulators, which carry into the upper half naturally so we
;       do not have to use "adde", which serializes on the carry bit.  Note that we cannot
;       do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
;               r2 = checksum so far (ie, the zero-filled partial first word)
;               r3 = word aligned address
;               r5 = accumulated sum parameter
;               ctr = number of 32-byte chunks of input
;               carry = unused in this code
;               cr1_gt = "starting on odd address" flag
;               cr6,cr7 = residual length

L64BitPath:
        stw     r13,-4(r1)              ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128                  ; to touch next line
        li      r13,0                   ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0                   ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)                ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2                  ; just copy incoming partial word into one of the accumulators
        li      r15,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r16,0
        li      r0,256                  ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)              ; load last word of previous chunk
        addi    r3,r3,32                ; skip past the chunk
        bdnz++  LInnerLoop64            ; enter loop if another chunk to go
        b       LAddLastChunk           ; only one chunk

;       Inner loop for 64-bit processors.  This loop is scheduled for the 970.
;       It is pipelined (loads are one iteration ahead of adds), and unrolled.
;       It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
;       On entry r4,r6-r12 already hold one loaded-but-not-yet-added chunk, and the
;       same is true at both exits, so LAddLastChunk always has one chunk to add.

        .align  5
LInnerLoop64:                           ; 64 bytes/iteration
        add     r13,r13,r4              ; cycle 1
        add     r14,r14,r6
        dcbt    r3,r0                   ; touch in 2 lines ahead
        lwz     r4,0(r3)

        add     r15,r15,r7              ; cycle 2, etc
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        add     r16,r16,r8

        lwz     r8,12(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,16(r3)

        add     r15,r15,r11
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        add     r16,r16,r12
        bdz--   LEarlyExit              ; early exit if no more chunks

        lwz     r12,28(r3)
        add     r13,r13,r4
        add     r14,r14,r6
        lwz     r4,32(r3)

        add     r15,r15,r7
        lwz     r6,36(r3)
        lwz     r7,40(r3)
        add     r16,r16,r8

        lwz     r8,44(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,48(r3)

        add     r15,r15,r11
        lwz     r10,52(r3)
        lwz     r11,56(r3)
        add     r16,r16,r12

        nop                             ; position last load in 2nd dispatch slot
        lwz     r12,60(r3)
        addi    r3,r3,64
        bdnz++  LInnerLoop64

        b       LAddLastChunk

;       Add in the last 32-byte chunk, and any leftover bytes.
;               r3 = word aligned address of next byte of data
;               r5 = accumulated sum parameter
;               r13-r16 = the four accumulators
;               cr1_gt = "starting on odd address" flag
;               cr6,cr7 = residual length

LEarlyExit:                             ; here from middle of inner loop
        lwz     r12,28(r3)              ; load last word of last chunk
        addi    r3,r3,32
LAddLastChunk:                          ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4              ; add in last chunk
        add     r14,r14,r6              ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12

;       Handle leftover bytes, if any.

        bf      27,Lleft1               ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        add     r13,r13,r4
        add     r14,r14,r6
        add     r15,r15,r7
        add     r16,r16,r8
Lleft1:
        bf      28,Lleft2               ; test 0x08 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        add     r13,r13,r4
        add     r14,r14,r6
Lleft2:
        bf      29,Lleft3               ; test 0x04 bit of residual length
        lwz     r4,0(r3)
        addi    r3,r3,4
        add     r14,r14,r4
Lleft3:
        bf      30,Lleft4               ; test 0x02 bit of residual length
        lhz     r4,0(r3)
        addi    r3,r3,2
        add     r15,r15,r4
Lleft4:
        bf      31,Lleft5               ; test 0x01 bit of residual length
        lbz     r4,0(r3)
        slwi    r4,r4,8                 ; shift last byte into proper lane
        add     r16,r16,r4

;       All data bytes have been checksummed.  Now we must add together the four
;       accumulators and restore the regs from the red zone.
;               r3 = word aligned address of next byte of data
;               r5 = accumulated sum parameter
;               r13-r16 = the four accumulators
;               carry = not used so far
;               cr1_gt = "starting on odd address" flag

Lleft5:
        add     r8,r13,r14              ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)              ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9                ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32                ; get upper half of 64-bit sum
        addc    r2,r7,r8                ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup                 ; merge r2, r5, and carry into a 16-bit checksum