/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#define kShort  11
#define cr1_gt  5                       // bit 1 of cr1

/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 * r3 - Pointer to data
 * r4 - Length of data
 * r5 - Accumulated sum value
 * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (the carry bit
 * reflects only the low 32 bits of the sum).
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it
 * impractical for now.
 */
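
; For reference, a minimal C model of this routine (an illustrative sketch only,
; not part of the build; the names are hypothetical, and the "odd" byte-lane swap
; is omitted for clarity). It computes the same big-endian 16-bit 1s-complement
; sum, folding carries back into the low 16 bits at the end:
;
;       static unsigned short
;       xsum_model(const unsigned char *p, long len, unsigned long sum)
;       {
;               while (len > 1) {                       // sum 16-bit big-endian words
;                       sum += (p[0] << 8) | p[1];
;                       p += 2;
;                       len -= 2;
;               }
;               if (len)                                // trailing byte fills the high lane
;                       sum += p[0] << 8;
;               sum = (sum >> 16) + (sum & 0xFFFF);     // fold carries back in...
;               sum = (sum >> 16) + (sum & 0xFFFF);     // ...twice, in case the first fold carried
;               return (unsigned short)sum;             // two folds suffice for packet-sized inputs
;       }
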
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit
        .text
        .align  5
_xsum_assym:
        cmplwi  cr0,r4,kShort           ; too short to word align?
        rlwinm  r2,r3,0,0x3             ; get byte offset in word
        dcbt    0,r3                    ; touch in 1st cache line
        cmpwi   cr6,r2,0                ; is address word aligned?
        ble     cr0,Lshort              ; skip if too short to bother aligning

        subfic  r0,r2,4                 ; get #bytes in partial word
        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0                 ; turn off carry
        beq     cr6,Laligned            ; skip if already word aligned (r2==0 if aligned)

; Partial word at start: zero filled on left, it becomes the initial checksum.

        rlwinm  r3,r3,0,0,29            ; word align address
        mtcrf   0x01,r2                 ; move byte offset to cr7
        lwz     r6,0(r3)                ; get partial word
        li      r7,-1                   ; start of mask for partial fill
        slwi    r8,r2,3                 ; multiply byte offset by 8
        sub     r4,r4,r0                ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt        ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8                ; get mask for bytes to keep in partial word
        addi    r3,r3,4                 ; point to next word of input
        and     r2,r6,r7                ; zero fill on left
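
; In C terms, the zero fill above is (illustrative sketch only; names are hypothetical):
;       off  = addr & 3;                                // byte offset in word
;       mask = 0xFFFFFFFF >> (8 * off);                 // keep bytes at and after p
;       sum0 = (*(uint32_t *)(addr & ~3)) & mask;       // bytes before p contribute zero
; The mtcrf above also puts the low bit of the byte offset into cr bit 31; an odd
; offset flips byte-lane parity, which is why cr1_gt is inverted via the crxor.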

; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
; r2 = initial checksum
; r3 = word aligned address
; r4 = length remaining
; r5 = accumulated sum parameter
; carry = off
; cr1_gt = "starting on odd address" flag

Laligned:
        srwi.   r0,r4,5                 ; get count of 32-byte chunks
        mtcrf   0x02,r4                 ; move residual length to cr6 and cr7
        mtcrf   0x01,r4
        beq     cr0,Lleftovers          ; no chunks

        mtctr   r0                      ; set up loop count
        li      r4,32                   ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath              ; use the 64-bit path (patched to nop on 32-bit machine)
        dcbt    r4,r3                   ; touch in 2nd cache line
        li      r0,96                   ; get touch offset
        b       LInnerLoop32            ; enter 32-bit loop

; Inner loop for 32-bit machines.

        .align  4
LInnerLoop32:
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        adde    r2,r2,r4
        lwz     r9,16(r3)
        adde    r2,r2,r6
        lwz     r10,20(r3)
        adde    r2,r2,r7
        lwz     r11,24(r3)
        adde    r2,r2,r8
        lwz     r12,28(r3)
        adde    r2,r2,r9
        dcbt    r3,r0
        adde    r2,r2,r10
        addi    r3,r3,32
        adde    r2,r2,r11
        adde    r2,r2,r12
        bdnz+   LInnerLoop32

; Handle leftover bytes.
; r2 = checksum so far
; r3 = word aligned address
; r5 = accumulated sum parameter
; carry = live
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

Lleftovers:
        bf      27,Lleftover8           ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        adde    r2,r2,r4
        adde    r2,r2,r6
        adde    r2,r2,r7
        adde    r2,r2,r8
Lleftover8:
        bf      28,Lleftover4
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        adde    r2,r2,r4
        adde    r2,r2,r6
Lleftover4:
        bf      29,Lleftover2
        lwz     r4,0(r3)
        addi    r3,r3,4
        adde    r2,r2,r4
Lleftover2:
        bf      30,Lleftover1
        lhz     r4,0(r3)
        addi    r3,r3,2
        adde    r2,r2,r4
Lleftover1:
        bf      31,Lwrapup
        lbz     r4,0(r3)
        slwi    r4,r4,8                 ; shift last byte into proper lane
        adde    r2,r2,r4

; All data bytes checksummed. Wrap up.
; r2 = checksum so far (word parallel)
; r5 = accumulated sum parameter
; carry = live
; cr1_gt = "starting on odd address" flag

Lwrapup:
        addze   r2,r2                   ; add in last carry
        addze   r2,r2                   ; in case the "addze" carries
Lwrapupx:                               ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16                ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF          ; lower half
        add     r2,r6,r7                ; add them together
        srwi    r6,r2,16                ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
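; (One fold leaves at most 0xFFFF + 0xFFFF = 0x1FFFE; folding that again gives
; 1 + 0xFFFE = 0xFFFF, so two folds always reduce the sum to 16 bits.)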
        bf      cr1_gt,Lswapped         ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF         ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00          ; bottom to top
        or      r2,r6,r7                ; rejoin
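; In C terms (illustrative only):
;       sum = ((sum >> 8) & 0x00FF) | ((sum << 8) & 0xFF00);
; The data was summed one byte lane out of phase with the caller's view of it,
; so the two byte lanes of the 16-bit result are exchanged.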

; Finally, add in checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5                ; add passed-in checksum
        srwi    r6,r2,16                ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF          ; lower half
        add     r2,r6,r7                ; add them together
        srwi    r6,r2,16                ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7                ; steer result into r3
        blr

; Handle short operands. Do a halfword at a time.
; r3 = address
; r4 = length (<= kShort)
; r5 = accumulated sum parameter
; r6 = "starting on odd byte" flag

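; In C terms (illustrative sketch only): with len <= kShort (11) there are at
; most 5 halfwords, so the plain 32-bit adds below can never carry out of 32 bits:
;       while (len >= 2) { sum += (p[0] << 8) | p[1]; p += 2; len -= 2; }
;       if (len) sum += p[0] << 8;
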
Lshort:
        cmpwi   cr6,r4,2                ; at least two bytes?
        andi.   r0,r4,1                 ; odd length?
        li      r2,0                    ; initialize checksum
        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2             ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4                ; two more bytes (after we decrement)?
        lhz     r7,0(r3)
        subi    r4,r4,2
        addi    r3,r3,2
        add     r2,r2,r7                ; no need for "adde": at most 5 halfwords cannot carry
        bge     cr6,Lshort1             ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx                ; no byte at end; proceed to wrapup with carry undefined
        lbz     r7,0(r3)
        slwi    r7,r7,8                 ; shift last byte into proper lane
        add     r2,r2,r7
        b       Lwrapupx

; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
; four parallel 64-bit accumulators: carries out of the low 32 bits spill into the
; upper half naturally, so we do not have to use "adde", which serializes on the
; carry bit. Note that we cannot do 64-bit "adde"s, because we run in 32-bit mode
; so carry would not be set correctly.
; r2 = checksum so far (ie, the zero-filled partial first word)
; r3 = word aligned address
; r5 = accumulated sum parameter
; ctr = number of 32-byte chunks of input
; carry = unused in this code
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length
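;
; A rough C model of this path (illustrative sketch only; names are hypothetical,
; and which accumulator receives the initial partial word does not matter):
;       uint64_t a0 = sum0, a1 = 0, a2 = 0, a3 = 0;
;       while (chunks--) {                      // 8 words = 32 bytes per chunk
;               a0 += w[0]; a1 += w[1]; a2 += w[2]; a3 += w[3];
;               a0 += w[4]; a1 += w[5]; a2 += w[6]; a3 += w[7];
;               w += 8;
;       }
;       uint64_t s = a0 + a1 + a2 + a3;         // cannot overflow for 32-bit lengths
; The 64-bit sum is then folded to 32 bits (the srdi/addc below) and finished in Lwrapup.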

L64BitPath:
        stw     r13,-4(r1)              ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128                  ; to touch next line
        li      r13,0                   ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0                   ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)                ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2                  ; just copy incoming partial word into one of the accumulators
        li      r15,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r16,0
        li      r0,256                  ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)              ; load last word of 1st chunk
        addi    r3,r3,32                ; skip past the chunk
        bdnz++  LInnerLoop64            ; enter loop if another chunk to go

        b       LAddLastChunk           ; only one chunk

; Inner loop for 64-bit processors. This loop is scheduled for the 970.
; It is pipelined (loads are one iteration ahead of adds), and unrolled.
; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.

        .align  5
LInnerLoop64:                           ; 64 bytes/iteration
        add     r13,r13,r4              ; cycle 1
        add     r14,r14,r6
        dcbt    r3,r0                   ; touch in 2 lines ahead
        lwz     r4,0(r3)

        add     r15,r15,r7              ; cycle 2, etc
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        add     r16,r16,r8

        lwz     r8,12(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,16(r3)

        add     r15,r15,r11
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        add     r16,r16,r12
        bdz--   LEarlyExit              ; early exit if no more chunks

        lwz     r12,28(r3)
        add     r13,r13,r4
        add     r14,r14,r6
        lwz     r4,32(r3)

        add     r15,r15,r7
        lwz     r6,36(r3)
        lwz     r7,40(r3)
        add     r16,r16,r8

        lwz     r8,44(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,48(r3)

        add     r15,r15,r11
        lwz     r10,52(r3)
        lwz     r11,56(r3)
        add     r16,r16,r12

        nop                             ; position last load in 2nd dispatch slot
        lwz     r12,60(r3)
        addi    r3,r3,64
        bdnz++  LInnerLoop64

        b       LAddLastChunk

; Add in the last 32-byte chunk, and any leftover bytes.
; r3 = word aligned address of next byte of data
; r5 = accumulated sum parameter
; r13-r16 = the four accumulators
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

LEarlyExit:                             ; here from middle of inner loop
        lwz     r12,28(r3)              ; load last word of last chunk
        addi    r3,r3,32
LAddLastChunk:                          ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4              ; add in last chunk
        add     r14,r14,r6              ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12

; Handle leftover bytes, if any.

        bf      27,Lleft1               ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        add     r13,r13,r4
        add     r14,r14,r6
        add     r15,r15,r7
        add     r16,r16,r8
Lleft1:
        bf      28,Lleft2
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        add     r13,r13,r4
        add     r14,r14,r6
Lleft2:
        bf      29,Lleft3
        lwz     r4,0(r3)
        addi    r3,r3,4
        add     r14,r14,r4
Lleft3:
        bf      30,Lleft4
        lhz     r4,0(r3)
        addi    r3,r3,2
        add     r15,r15,r4
Lleft4:
        bf      31,Lleft5
        lbz     r4,0(r3)
        slwi    r4,r4,8                 ; shift last byte into proper lane
        add     r16,r16,r4

; All data bytes have been checksummed. Now we must add together the four
; accumulators and restore the regs from the red zone.
; r3 = word aligned address of next byte of data
; r5 = accumulated sum parameter
; r13-r16 = the four accumulators
; carry = not used so far
; cr1_gt = "starting on odd address" flag

Lleft5:
        add     r8,r13,r14              ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)              ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9                ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32                ; get upper half of 64-bit sum
        addc    r2,r7,r8                ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup                 ; merge r2, r5, and carry into a 16-bit checksum