/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_lro.h>
#include <netinet/lro_ext.h>
#include <kern/locks.h>

unsigned int lrocount = 0;		/* A counter used for debugging only */
unsigned int lro_seq_outoforder = 0;	/* Counter for debugging */
unsigned int lro_seq_mismatch = 0;	/* Counter for debugging */
unsigned int lro_eject_req = 0;		/* Counter for tracking flow ejections */
unsigned int lro_flushes = 0;		/* Counter for tracking number of flushes */
unsigned int lro_single_flushes = 0;
unsigned int lro_double_flushes = 0;
unsigned int lro_good_flushes = 0;

unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
	&coalesc_sz, 0, "Max coalescing size");

unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
	&coalesc_time, 0, "Max coalescing time");

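/*
 * Illustrative note (not part of the original source): the SYSCTL_INT
 * entries above export these two knobs under net.inet.tcp.lro_sz and
 * net.inet.tcp.lro_time, so they can be tuned at run time, e.g.
 *
 *	sysctl -w net.inet.tcp.lro_sz=8		# max packets coalesced per flow
 *	sysctl -w net.inet.tcp.lro_time=10	# max buffering time, in TCP clock ticks
 *
 * The defaults come from LRO_MX_COALESCE_PKTS and LRO_MX_TIME_TO_BUFFER in
 * tcp_lro.h; the numeric values shown here are examples only, not the
 * values defined there.
 */
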
struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];

char lro_flow_map[TCP_LRO_FLOW_MAP];

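/*
 * Illustrative sketch (not part of the original source): lro_flow_map is a
 * direct-mapped table keyed by a hash of the connection 4-tuple; each entry
 * holds either TCP_LRO_FLOW_UNINIT or an index into lro_flow_list.  LRO_HASH
 * itself is defined in tcp_lro.h; a hypothetical shape would be
 *
 *	#define LRO_HASH(src, dst, sport, dport, mask) \
 *		(((src) ^ (dst) ^ (sport) ^ (dport)) & (mask))
 *
 * so hash collisions are possible and are handled explicitly below
 * (TCP_LRO_COLLISION, tcps_flowtbl_collision) rather than chained.
 */
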
static lck_attr_t *tcp_lro_mtx_attr = NULL;		/* mutex attributes */
static lck_grp_t *tcp_lro_mtx_grp = NULL;		/* mutex group */
static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL;	/* mutex group attrs */
decl_lck_mtx_data( ,tcp_lro_lock);	/* Used to synchronize updates */

unsigned int lro_byte_count = 0;

uint64_t lro_deadline = 0; /* LRO's sense of time - protected by tcp_lro_lock */
uint32_t lro_timer_set = 0;

u_int32_t lro_pkt_count = 0; /* Number of packets encountered in an LRO period */
thread_call_t tcp_lro_timer;

extern u_int32_t kipf_count;

static void	tcp_lro_timer_proc(void *, void *);
static void	lro_update_stats(struct mbuf *);
static void	lro_update_flush_stats(struct mbuf *);
static void	tcp_lro_flush_flows(void);
static void	tcp_lro_sched_timer(uint64_t);
static void	lro_proto_input(struct mbuf *);

static struct mbuf *lro_tcp_xsum_validate(struct mbuf *, struct ipovly *,
				struct tcphdr *);
static struct mbuf *tcp_lro_process_pkt(struct mbuf *, struct ip *, struct tcphdr *,
				int);

void
tcp_lro_init(void)
{
	int i;

	bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS);
	for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
		lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
	}

	/*
	 * allocate lock group attribute, group and attribute for tcp_lro_lock
	 */
	tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init();
	tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr);
	tcp_lro_mtx_attr = lck_attr_alloc_init();
	lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr);

	tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL);
	if (tcp_lro_timer == NULL) {
		panic_plain("%s: unable to allocate lro timer", __func__);
	}
}

static int
tcp_lro_matching_tuple(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int *hash,
			int *flow_id)
{
	struct lro_flow *flow;
	tcp_seq seqnum;
	unsigned int off = 0;
	int payload_len = 0;

	*hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
		tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));

	*flow_id = lro_flow_map[*hash];
	if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
		return TCP_LRO_NAN;
	}

	seqnum = tcp_hdr->th_seq;
	off = tcp_hdr->th_off << 2;
	payload_len = ip_hdr->ip_len - off;

	flow = &lro_flow_list[*flow_id];

	if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
		(flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
		(flow->lr_fport == tcp_hdr->th_sport) &&
		(flow->lr_lport == tcp_hdr->th_dport)) {
		if (flow->lr_tcphdr == NULL) {
			if (ntohl(seqnum) == flow->lr_seq) {
				return TCP_LRO_COALESCE;
			}
			if (lrodebug >= 4) {
				printf("%s: seqnum = %x, lr_seq = %x\n",
					__func__, ntohl(seqnum), flow->lr_seq);
			}
			lro_seq_mismatch++;
			if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
				lro_seq_outoforder++;
				/*
				 * Whenever we receive out of order packets it
				 * signals loss and recovery and LRO doesn't
				 * let flows recover quickly. So eject.
				 */
				flow->lr_flags |= LRO_EJECT_REQ;
			}
			return TCP_LRO_NAN;
		}

		if (flow->lr_flags & LRO_EJECT_REQ) {
			if (lrodebug)
				printf("%s: eject. \n", __func__);
			return TCP_LRO_EJECT_FLOW;
		}

		if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
			if (lrodebug) {
				printf("%s: th_ack = %x flow_ack = %x \n",
					__func__, tcp_hdr->th_ack,
					flow->lr_tcphdr->th_ack);
			}
			return TCP_LRO_EJECT_FLOW;
		}

		if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) +
			lro_flow_list[*flow_id].lr_len)) {
			return TCP_LRO_COALESCE;
		} else {
			/* LRO does not handle loss recovery well, eject */
			flow->lr_flags |= LRO_EJECT_REQ;
			return TCP_LRO_EJECT_FLOW;
		}
	}

	if (lrodebug) printf("tcp_lro_matching_tuple: collision \n");
	return TCP_LRO_COLLISION;
}

static void
tcp_lro_init_flow(int flow_id, struct ip *ip_hdr, struct tcphdr *tcp_hdr,
		int hash, u_int32_t timestamp, int payload_len)
{
	struct lro_flow *flow = NULL;

	flow = &lro_flow_list[flow_id];

	flow->lr_hash_map = hash;
	flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr;
	flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr;
	flow->lr_fport = tcp_hdr->th_sport;
	flow->lr_lport = tcp_hdr->th_dport;
	lro_flow_map[hash] = flow_id;
	flow->lr_timestamp = timestamp;
	flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len;
	flow->lr_flags = 0;
	return;
}

static void
tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
		int payload_len, int drop_hdrlen, struct tcpopt *topt,
		u_int32_t *tsval, u_int32_t *tsecr, int thflags)
{
	struct lro_flow *flow = NULL;
	struct mbuf *last;
	struct ip *ip = NULL;

	flow = &lro_flow_list[flow_id];
	if (flow->lr_mhead) {
		if (lrodebug)
			printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
				payload_len);
		m_adj(lro_mb, drop_hdrlen);

		last = flow->lr_mtail;
		while (last->m_next != NULL) {
			last = last->m_next;
		}
		last->m_next = lro_mb;

		flow->lr_mtail = lro_mb;

		ip = mtod(flow->lr_mhead, struct ip *);
		ip->ip_len += lro_mb->m_pkthdr.len;
		flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;

		if (flow->lr_len == 0) {
			panic_plain("%s: Inconsistent LRO flow state", __func__);
		}
		flow->lr_len += payload_len;
		flow->lr_seq += payload_len;
		/*
		 * This bit is re-OR'd each time a packet is added to the
		 * large coalesced packet.
		 */
		flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT;
		flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
		if (flow->lr_mhead->m_pkthdr.lro_pktlen <
				lro_mb->m_pkthdr.lro_pktlen) {
			/*
			 * For TCP Inter Arrival Jitter calculation, return max
			 * size encountered while coalescing a stream of pkts.
			 */
			flow->lr_mhead->m_pkthdr.lro_pktlen =
						lro_mb->m_pkthdr.lro_pktlen;
		}
		/* Update the timestamp value */
		if (topt->to_flags & TOF_TS) {
			if ((flow->lr_tsval) &&
				(TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
				*(flow->lr_tsval) = htonl(topt->to_tsval);
			}
			if ((flow->lr_tsecr) &&
				(topt->to_tsecr != 0) &&
				(TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
				if (lrodebug >= 2) {
					printf("%s: instantaneous RTT = %d \n", __func__,
						topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
				}
				*(flow->lr_tsecr) = htonl(topt->to_tsecr);
			}
		}
		/* Coalesce the flags */
		if (thflags) {
			flow->lr_tcphdr->th_flags |= thflags;
		}
		/* Update receive window */
		flow->lr_tcphdr->th_win = tcphdr->th_win;
	} else {
		if (lro_mb) {
			flow->lr_mhead = flow->lr_mtail = lro_mb;
			flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT;
			flow->lr_tcphdr = tcphdr;
			if ((topt) && (topt->to_flags & TOF_TS)) {
				ASSERT(tsval != NULL);
				ASSERT(tsecr != NULL);
				flow->lr_tsval = tsval;
				flow->lr_tsecr = tsecr;
			}
			flow->lr_len = payload_len;
			flow->lr_timestamp = tcp_now;
			tcp_lro_sched_timer(0);
		}
		flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;
	}
	if (lro_mb) {
		tcpstat.tcps_coalesced_pack++;
	}
	return;
}

static struct mbuf *
tcp_lro_eject_flow(int flow_id)
{
	struct mbuf *mb = NULL;

	mb = lro_flow_list[flow_id].lr_mhead;
	ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
	lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
	bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));

	return mb;
}

static struct mbuf *
tcp_lro_eject_coalesced_pkt(int flow_id)
{
	struct mbuf *mb = NULL;

	mb = lro_flow_list[flow_id].lr_mhead;
	lro_flow_list[flow_id].lr_mhead =
		lro_flow_list[flow_id].lr_mtail = NULL;
	lro_flow_list[flow_id].lr_tcphdr = NULL;
	return mb;
}

static struct mbuf *
tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
			struct tcphdr *tcp_hdr, int payload_len,
			int drop_hdrlen, int hash, struct tcpopt *topt,
			u_int32_t *tsval, u_int32_t *tsecr)
{
	int i;
	int slot_available = 0;
	int candidate_flow = 0;
	u_int32_t oldest_timestamp;
	struct mbuf *mb = NULL;
	int collision = 0;

	oldest_timestamp = tcp_now;

	/* handle collision */
	if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
		if (lrodebug) {
			collision = 1;
		}
		candidate_flow = lro_flow_map[hash];
		tcpstat.tcps_flowtbl_collision++;
		goto kick_flow;
	}

	for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) {
		if (lro_flow_list[i].lr_mhead == NULL) {
			candidate_flow = i;
			slot_available = 1;
			break;
		}
		if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) {
			candidate_flow = i;
			oldest_timestamp = lro_flow_list[i].lr_timestamp;
		}
	}

	if (!slot_available) {
		tcpstat.tcps_flowtbl_full++;
kick_flow:
		/* kick the oldest flow */
		mb = tcp_lro_eject_flow(candidate_flow);

		if (lrodebug) {
			if (!slot_available) {
				printf("%s: slot unavailable.\n", __func__);
			}
			if (collision) {
				printf("%s: collision.\n", __func__);
			}
		}
	} else {
		candidate_flow = i; /* this is now the flow to be used */
	}

	tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
				tcp_now, payload_len);
	tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
				drop_hdrlen, topt, tsval, tsecr, 0);
	return mb;
}

static struct mbuf *
tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr,
				struct tcphdr *tcp_hdr, int drop_hdrlen)
{
	int flow_id = TCP_LRO_FLOW_UNINIT;
	int hash;
	unsigned int off = 0;
	int eject_flow = 0;
	int optlen;
	int retval = 0;
	struct mbuf *mb = NULL;
	int payload_len = 0;
	u_char *optp = NULL;
	int thflags = 0;
	struct tcpopt to;
	int ret_response = TCP_LRO_CONSUMED;
	int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
	u_int8_t ecn;

	if (lro_mb->m_len < (int32_t)sizeof (struct tcpiphdr)) {
		if ((lro_mb = m_pullup(lro_mb, sizeof(struct tcpiphdr))) == 0) {
			tcpstat.tcps_rcvshort++;
			if (lrodebug) {
				printf("tcp_lro_process_pkt:mbuf too short.\n");
			}
			return NULL;
		}
	}

	if ((lro_mb = lro_tcp_xsum_validate(lro_mb,
			(struct ipovly *)ip_hdr, tcp_hdr)) == NULL) {
		if (lrodebug) {
			printf("tcp_lro_process_pkt: TCP xsum failed.\n");
		}
		return NULL;
	}

	/* Update stats */
	lro_pkt_count++;

	/* Avoids checksumming in tcp_input */
	lro_mb->m_pkthdr.aux_flags |= MAUXF_SW_LRO_DID_CSUM;

	off = tcp_hdr->th_off << 2;
	optlen = off - sizeof (struct tcphdr);
	payload_len = ip_hdr->ip_len - off;
	optp = (u_char *)(tcp_hdr + 1);
	/*
	 * Do quick retrieval of timestamp options ("options
	 * prediction?"). If timestamp is the only option and it's
	 * formatted as recommended in RFC 1323 appendix A, we
	 * quickly get the values now and not bother calling
	 * tcp_dooptions(), etc.
	 */
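	/*
	 * For reference (editorial note, not in the original source): an
	 * RFC 1323 Appendix A timestamp option block is 12 bytes,
	 *
	 *	optp[0..3]  = NOP, NOP, kind 8 (TCPOPT_TIMESTAMP), length 10
	 *	optp[4..7]  = TSval (network byte order)
	 *	optp[8..11] = TSecr (network byte order)
	 *
	 * which is why TCPOPT_TSTAMP_HDR is compared against the first
	 * 32-bit word and the values are then read at optp + 4 and optp + 8.
	 */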
	if ((optlen == TCPOLEN_TSTAMP_APPA ||
			(optlen > TCPOLEN_TSTAMP_APPA &&
			optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
			*(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
			(tcp_hdr->th_flags & TH_SYN) == 0) {
		to.to_flags |= TOF_TS;
		to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
		to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
	} else {
		/*
		 * If TCP timestamps are not in use, or not the first option,
		 * skip LRO path since timestamps are used to avoid LRO
		 * from introducing additional latencies for retransmissions
		 * and other slow-paced transmissions.
		 */
		to.to_flags = to.to_tsecr = 0;
		eject_flow = 1;
	}

	/* list all the conditions that can trigger a flow ejection here */

	thflags = tcp_hdr->th_flags;
	if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
		eject_flow = tcpflags = 1;
	}

	if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
			(to.to_flags & TOF_TS))) {
		eject_flow = unknown_tcpopts = 1;
	}

	if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */
		eject_flow = 1;
	}

	/* Can't coalesce ECN marked packets. */
	ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		/*
		 * ECN needs quick notification
		 */
		if (lrodebug) {
			printf("%s: ECE bits set.\n", __func__);
		}
		eject_flow = 1;
	}

	lck_mtx_lock_spin(&tcp_lro_lock);

	retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id);

	switch (retval) {
	case TCP_LRO_NAN:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	case TCP_LRO_COALESCE:
		if ((payload_len != 0) && (unknown_tcpopts == 0) &&
			(tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
			tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
				drop_hdrlen, &to,
				(to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
				(to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
				thflags);
			if (lrodebug >= 2) {
				printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
					lro_flow_list[flow_id].lr_len, flow_id,
					payload_len, drop_hdrlen, optlen,
					ntohs(lro_flow_list[flow_id].lr_lport),
					ntohl(tcp_hdr->th_seq));
			}
			if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
				eject_flow = 1;
				coalesced = 1;
			}
			if (eject_flow) {
				mb = tcp_lro_eject_coalesced_pkt(flow_id);
				lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
					payload_len;
			}
			lck_mtx_unlock(&tcp_lro_lock);
			if (mb) {
				lro_update_stats(mb);
				lro_proto_input(mb);
			}
		} else {
			lck_mtx_unlock(&tcp_lro_lock);
			if (lrodebug >= 2) {
				printf("%s: pkt payload_len = %d \n", __func__, payload_len);
			}
			lro_proto_input(lro_mb);
		}
		break;

	case TCP_LRO_EJECT_FLOW:
		mb = tcp_lro_eject_coalesced_pkt(flow_id);
		lck_mtx_unlock(&tcp_lro_lock);
		if (mb) {
			if (lrodebug)
				printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len);
			lro_update_stats(mb);
			lro_proto_input(mb);
		}

		lro_proto_input(lro_mb);
		break;

	case TCP_LRO_COLLISION:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	default:
		lck_mtx_unlock(&tcp_lro_lock);
		panic_plain("%s: unrecognized type %d", __func__, retval);
		break;
	}

	if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
		lro_proto_input(lro_mb);
	}
	return NULL;
}

static void
tcp_lro_timer_proc(void *arg1, void *arg2)
{
#pragma unused(arg1, arg2)

	lck_mtx_lock_spin(&tcp_lro_lock);
	lro_timer_set = 0;
	lck_mtx_unlock(&tcp_lro_lock);
	tcp_lro_flush_flows();
}

static void
tcp_lro_flush_flows(void)
{
	int i = 0;
	struct mbuf *mb;
	struct lro_flow *flow;
	int active_flows = 0;
	int outstanding_flows = 0;
	int tcpclock_updated = 0;

	lck_mtx_lock(&tcp_lro_lock);

	while (i < TCP_LRO_NUM_FLOWS) {
		flow = &lro_flow_list[i];
		if (flow->lr_mhead != NULL) {
			active_flows++;
			if (!tcpclock_updated) {
				calculate_tcp_clock();
				tcpclock_updated = 1;
			}
			if (((tcp_now - flow->lr_timestamp) >= coalesc_time) ||
				(flow->lr_mhead->m_pkthdr.lro_npkts >=
					coalesc_sz)) {

				if (lrodebug >= 2)
					printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
						flow->lr_len,
						flow->lr_mhead->m_pkthdr.lro_npkts,
						flow->lr_timestamp, tcp_now);

				mb = tcp_lro_eject_flow(i);

				if (mb) {
					lck_mtx_unlock(&tcp_lro_lock);
					lro_update_flush_stats(mb);
					lro_proto_input(mb);
					lck_mtx_lock(&tcp_lro_lock);
				}
			} else {
				tcp_lro_sched_timer(0);
				outstanding_flows++;
				if (lrodebug >= 2) {
					printf("tcp_lro_flush_flows: did not flush flow of len =%d deadline = %x timestamp = %x \n",
						flow->lr_len, tcp_now, flow->lr_timestamp);
				}
			}
		}
		if (flow->lr_flags & LRO_EJECT_REQ) {
			mb = tcp_lro_eject_flow(i);
			if (mb) {
				lck_mtx_unlock(&tcp_lro_lock);
				lro_proto_input(mb);
				lro_eject_req++;
				lck_mtx_lock(&tcp_lro_lock);
			}
		}
		i++;
	}
	lck_mtx_unlock(&tcp_lro_lock);

	if (lrocount == 900) {
		printf("%s: %d %d %d %d oo: %d mismatch: %d ej_req: %d coll: %d \n",
			__func__,
			tcpstat.tcps_coalesced_pack,
			tcpstat.tcps_lro_twopack,
			tcpstat.tcps_lro_multpack,
			tcpstat.tcps_lro_largepack,
			lro_seq_outoforder,
			lro_seq_mismatch,
			lro_eject_req,
			tcpstat.tcps_flowtbl_collision);
		printf("%s: all: %d single: %d double: %d good: %d \n",
			__func__, lro_flushes, lro_single_flushes,
			lro_double_flushes, lro_good_flushes);
		lrocount = 0;
	} else {
		lrocount++;
	}
	if ((lrodebug >= 2) && (active_flows > 1)) {
		printf("lro_flush_flows: active_flows = %d \n", active_flows);
	}
}

/*
 * Must be called with tcp_lro_lock held.
 * The hint is non-zero for longer waits. The wait time dictated by coalesc_time
 * takes precedence, so lro_timer_set is not set for the hint case
 */
static void
tcp_lro_sched_timer(uint64_t hint)
{
	if (lro_timer_set) {
		return;
	}

	lro_timer_set = 1;
	if (!hint) {
		/* the intent is to wake up every coalesc_time msecs */
		clock_interval_to_deadline(coalesc_time,
			(NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
	} else {
		clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
			&lro_deadline);
	}
	thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
}

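/*
 * Worked example (editorial note, values assumed rather than taken from this
 * file): coalesc_time is expressed in TCP clock ticks, and the scale passed
 * to clock_interval_to_deadline() is NSEC_PER_SEC / TCP_RETRANSHZ, i.e. the
 * length of one tick in nanoseconds.  Assuming TCP_RETRANSHZ == 1000
 * (1 ms ticks) and coalesc_time == 10, the computed deadline is
 *
 *	10 * (1,000,000,000 / 1000) ns = 10,000,000 ns = 10 ms
 *
 * from now, which is when tcp_lro_timer_proc() runs and flushes any flows
 * that have been buffering for longer than coalesc_time.
 */
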
struct mbuf *
tcp_lro(struct mbuf *m, unsigned int hlen)
{
	struct ip *ip_hdr;
	unsigned int tlen;
	struct tcphdr *tcp_hdr = NULL;
	unsigned int off = 0;

	if (kipf_count != 0)
		return m;

	/*
	 * Experiments on cellular show that the RTT is much higher
	 * than the coalescing time of 5 msecs, causing lro to flush
	 * 80% of the time on a single packet. Increasing
	 * coalescing time for cellular does not show marked
	 * improvement to throughput either. Loopback perf is hurt
	 * by the 5 msec latency and it already sends large packets.
	 */
	if ((m->m_pkthdr.rcvif->if_type == IFT_CELLULAR) ||
		(m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
		return m;
	}

	ip_hdr = mtod(m, struct ip *);

	/* only TCP is coalesced */
	if (ip_hdr->ip_p != IPPROTO_TCP) {
		return m;
	}

	if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) {
		if (lrodebug) printf("tcp_lro m_pullup \n");
		if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
			tcpstat.tcps_rcvshort++;
			if (lrodebug) {
				printf("ip_lro: rcvshort.\n");
			}
			return NULL;
		}
	}

	tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
	tlen = ip_hdr->ip_len; //ignore IP header bytes len
	m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
	m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
	off = tcp_hdr->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		if (lrodebug) {
			printf("ip_lro: TCP off greater than TCP header.\n");
		}
		return m;
	}

	return (tcp_lro_process_pkt(m, ip_hdr, tcp_hdr, hlen + off));
}

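/*
 * Illustrative caller sketch (editorial note, not part of this file): the
 * IPv4 input path is expected to hand validated TCP segments to tcp_lro()
 * and stop processing any packet that LRO consumed, roughly:
 *
 *	extern int sw_lro;
 *
 *	if (sw_lro) {
 *		m = tcp_lro(m, hlen);	// hlen: IP header length in bytes
 *		if (m == NULL)
 *			return;		// consumed, coalesced or freed by LRO
 *	}
 *	// otherwise continue with the normal protocol dispatch
 *
 * The sw_lro switch and the exact call site live in ip_input(); the details
 * shown here are assumptions for illustration only.
 */
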
static void
lro_proto_input(struct mbuf *m)
{
	struct ip *ip_hdr = mtod(m, struct ip *);

	if (lrodebug >= 3) {
		printf("lro_proto_input: ip_len = %d \n",
			ip_hdr->ip_len);
	}
	lro_update_stats(m);
	ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
}

static struct mbuf *
lro_tcp_xsum_validate(struct mbuf *m, struct ipovly *ipov, struct tcphdr *th)
{
	struct ip *ip = (struct ip *)ipov;
	int tlen = ip->ip_len;
	int len;
	struct ifnet *ifp = ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) ?
		m->m_pkthdr.rcvif : NULL;

	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
		if (m->m_pkthdr.csum_flags & CSUM_TCP_SUM16) {
			u_short pseudo;
			char b[9];

			bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
			bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
			ipov->ih_len = (u_short)tlen;
#if BYTE_ORDER != BIG_ENDIAN
			HTONS(ipov->ih_len);
#endif
			pseudo = in_cksum(m, sizeof (struct ip));
			bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));

			th->th_sum = in_addword(pseudo, (m->m_pkthdr.csum_data & 0xFFFF));
		} else {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
					ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data +
					ip->ip_len + IPPROTO_TCP));
		}
		th->th_sum ^= 0xffff;
	} else {
		char b[9];
		/*
		 * Checksum extended TCP header and data.
		 */
		bcopy(ipov->ih_x1, b, sizeof (ipov->ih_x1));
		bzero(ipov->ih_x1, sizeof (ipov->ih_x1));
		ipov->ih_len = (u_short)tlen;
#if BYTE_ORDER != BIG_ENDIAN
		HTONS(ipov->ih_len);
#endif
		len = sizeof (struct ip) + tlen;
		th->th_sum = in_cksum(m, len);
		bcopy(b, ipov->ih_x1, sizeof (ipov->ih_x1));

		tcp_in_cksum_stats(len);
	}
	if (th->th_sum) {
		tcpstat.tcps_rcvbadsum++;
		if (ifp != NULL && ifp->if_tcp_stat != NULL) {
			atomic_add_64(&ifp->if_tcp_stat->badformat, 1);
		}
		if (lrodebug)
			printf("lro_tcp_xsum_validate: bad xsum and drop m = %p.\n", m);
		m_freem(m);
		return NULL;
	}
	/* revert back the order as IP will look into this again. */
#if BYTE_ORDER != BIG_ENDIAN
	NTOHS(ipov->ih_len);
#endif
	return m;
}

/*
 * When TCP detects a stable, steady flow without out of ordering,
 * with a sufficiently high cwnd, it invokes LRO.
 */
int
tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
{
	int hash;
	int flow_id;
	struct mbuf *eject_mb;
	struct lro_flow *lf;

	hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
		tcp_hdr->th_sport, tcp_hdr->th_dport,
		(TCP_LRO_FLOW_MAP - 1));

	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
		lf = &lro_flow_list[flow_id];
		if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
		    (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
		    (lf->lr_fport == tcp_hdr->th_sport) &&
		    (lf->lr_lport == tcp_hdr->th_dport)) {
			if ((lf->lr_tcphdr == NULL) &&
			    (lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
				lf->lr_seq = tcp_hdr->th_seq + tlen;
			}
			lf->lr_flags &= ~LRO_EJECT_REQ;
		}
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}

	HTONL(tcp_hdr->th_seq);
	HTONL(tcp_hdr->th_ack);
	eject_mb =
		tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
			NULL, NULL, NULL);

	lck_mtx_unlock(&tcp_lro_lock);

	NTOHL(tcp_hdr->th_seq);
	NTOHL(tcp_hdr->th_ack);
	if (lrodebug >= 3) {
		printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
			__func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
			tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
	}
	ASSERT(eject_mb == NULL);
	return 0;
}

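/*
 * Illustrative usage sketch (editorial note, not part of this file): the TCP
 * input path is expected to drive these entry points from connection state,
 * along the lines of
 *
 *	// steady flow, large cwnd, no reordering observed
 *	tcp_start_coalescing(ip_hdr, tcp_hdr, tlen);
 *
 *	// on retransmission, loss recovery or idle
 *	tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
 *	    inp->inp_lport, inp->inp_fport);
 *
 * The exact heuristics, argument mapping and call sites belong to the TCP
 * input/timer code and are assumptions here.
 */
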
/*
 * When TCP detects loss or idle condition, it stops offloading
 * to LRO.
 */
int
tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
			unsigned short sport, unsigned short dport)
{
	int hash, flow_id;
	struct lro_flow *lf;

	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
		(TCP_LRO_FLOW_MAP - 1));
	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id == TCP_LRO_FLOW_UNINIT) {
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}
	lf = &lro_flow_list[flow_id];
	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
	    (lf->lr_laddr.s_addr == saddr.s_addr) &&
	    (lf->lr_fport == dport) &&
	    (lf->lr_lport == sport)) {
		if (lrodebug) {
			printf("%s: %x %x\n", __func__,
				lf->lr_flags, lf->lr_seq);
		}
		lf->lr_flags |= LRO_EJECT_REQ;
	}
	lck_mtx_unlock(&tcp_lro_lock);
	return 0;
}

void
tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
			unsigned short sport, unsigned short dport)
{
	int hash, flow_id;
	struct lro_flow *lf;

	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
		(TCP_LRO_FLOW_MAP - 1));
	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id == TCP_LRO_FLOW_UNINIT) {
		lck_mtx_unlock(&tcp_lro_lock);
		return;
	}
	lf = &lro_flow_list[flow_id];
	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
	    (lf->lr_laddr.s_addr == saddr.s_addr) &&
	    (lf->lr_fport == dport) &&
	    (lf->lr_lport == sport) &&
	    (lf->lr_tcphdr == NULL)) {
		lf->lr_seq = (tcp_seq)rcv_nxt;
	}
	lck_mtx_unlock(&tcp_lro_lock);
	return;
}

static void
lro_update_stats(struct mbuf *m)
{
	switch (m->m_pkthdr.lro_npkts) {
	case 0: /* fall through */
	case 1:
		break;

	case 2:
		tcpstat.tcps_lro_twopack++;
		break;

	case 3: /* fall through */
	case 4:
		tcpstat.tcps_lro_multpack++;
		break;

	default:
		tcpstat.tcps_lro_largepack++;
		break;
	}
	return;
}

static void
lro_update_flush_stats(struct mbuf *m)
{
	lro_flushes++;
	switch (m->m_pkthdr.lro_npkts) {
	case 0: ASSERT(0);
	case 1: lro_single_flushes++;
		break;
	case 2: lro_double_flushes++;
		break;
	default: lro_good_flushes++;
		break;
	}
	return;
}