/*
 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_lro.h>
#include <netinet/lro_ext.h>
#include <kern/locks.h>
unsigned int lrocount = 0;		/* A counter used for debugging only */
unsigned int lro_seq_outoforder = 0;	/* Counter for debugging */
unsigned int lro_seq_mismatch = 0;	/* Counter for debugging */
unsigned int lro_flushes = 0;		/* Counter for tracking number of flushes */
unsigned int lro_single_flushes = 0;
unsigned int lro_double_flushes = 0;
unsigned int lro_good_flushes = 0;

unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
	&coalesc_sz, 0, "Max coalescing size");
unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
	&coalesc_time, 0, "Max coalescing time");
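/*
 * Since both knobs are registered under _net_inet_tcp with CTLFLAG_RW, they
 * appear as net.inet.tcp.lro_sz and net.inet.tcp.lro_time and can be tuned
 * at run time (for example with sysctl(8)): lro_sz caps how many packets are
 * coalesced into one chain, lro_time caps how long a chain is buffered.
 */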
struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];

char lro_flow_map[TCP_LRO_FLOW_MAP];

static lck_attr_t *tcp_lro_mtx_attr = NULL;		/* mutex attributes */
static lck_grp_t *tcp_lro_mtx_grp = NULL;		/* mutex group */
static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL;	/* mutex group attrs */
decl_lck_mtx_data( ,tcp_lro_lock);	/* Used to synchronize updates */
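/*
 * Flow lookup is a two-level scheme: LRO_HASH() over the connection 4-tuple
 * indexes lro_flow_map, and each map entry is either TCP_LRO_FLOW_UNINIT or
 * an index into lro_flow_list, where the per-flow coalescing state (mbuf
 * chain, expected sequence number, timestamp pointers) lives. All updates to
 * both tables are serialized by tcp_lro_lock.
 */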
unsigned int lro_byte_count = 0;

uint64_t lro_deadline = 0;	/* LRO's sense of time - protected by tcp_lro_lock */
uint32_t lro_timer_set = 0;

u_int32_t lro_pkt_count = 0;	/* Number of packets encountered in an LRO period */
thread_call_t tcp_lro_timer;

extern u_int32_t kipf_count;
static void	tcp_lro_timer_proc(void*, void*);
static void	lro_update_stats(struct mbuf*);
static void	lro_update_flush_stats(struct mbuf*);
static void	tcp_lro_flush_flows(void);
static void	tcp_lro_sched_timer(uint64_t);
static void	lro_proto_input(struct mbuf *);

static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ip *,
				struct tcphdr*);
static struct mbuf *tcp_lro_process_pkt(struct mbuf*, struct ip*, struct tcphdr*,
				int);
void
tcp_lro_init(void)
{
	int i;

	bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS);
	for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
		lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
	}

	/*
	 * allocate lock group attribute, group and attribute for tcp_lro_lock
	 */
	tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init();
	tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr);
	tcp_lro_mtx_attr = lck_attr_alloc_init();
	lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr);

	tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL);
	if (tcp_lro_timer == NULL) {
		panic_plain("%s: unable to allocate lro timer", __func__);
	}
}
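/*
 * tcp_lro_matching_tuple() classifies an incoming segment against the flow
 * table: TCP_LRO_COALESCE means the segment extends the tracked flow in
 * order and may be merged; TCP_LRO_EJECT_FLOW means the buffered chain must
 * be flushed first (eject requested, ACK advanced, or a sequence gap);
 * TCP_LRO_COLLISION means the hash slot is owned by a different 4-tuple;
 * TCP_LRO_NAN means there is nothing to coalesce against yet.
 */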
static int
tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash,
			int *flow_id)
{
	struct lro_flow *flow;
	tcp_seq seqnum;
	unsigned int off = 0;
	int payload_len = 0;

	*hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
		tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));

	*flow_id = lro_flow_map[*hash];
	if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
		return TCP_LRO_NAN;
	}

	seqnum = tcp_hdr->th_seq;
	off = tcp_hdr->th_off << 2;
	payload_len = ip_hdr->ip_len - off;

	flow = &lro_flow_list[*flow_id];

	if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
		(flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
		(flow->lr_fport == tcp_hdr->th_sport) &&
		(flow->lr_lport == tcp_hdr->th_dport)) {
		if (flow->lr_tcphdr == NULL) {
			if (ntohl(seqnum) == flow->lr_seq) {
				return TCP_LRO_COALESCE;
			}
			if (lrodebug) {
				printf("%s: seqnum = %x, lr_seq = %x\n",
					__func__, ntohl(seqnum), flow->lr_seq);
			}
			lro_seq_mismatch++;
			if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
				lro_seq_outoforder++;
				/*
				 * Whenever we receive out of order packets it
				 * signals loss and recovery and LRO doesn't
				 * let flows recover quickly. So eject.
				 */
				flow->lr_flags |= LRO_EJECT_REQ;
			}
			return TCP_LRO_NAN;
		}

		if (flow->lr_flags & LRO_EJECT_REQ) {
			if (lrodebug)
				printf("%s: eject. \n", __func__);
			return TCP_LRO_EJECT_FLOW;
		}
		if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
			if (lrodebug) {
				printf("%s: th_ack = %x flow_ack = %x \n",
					__func__, tcp_hdr->th_ack,
					flow->lr_tcphdr->th_ack);
			}
			return TCP_LRO_EJECT_FLOW;
		}

		if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) {
			return TCP_LRO_COALESCE;
		} else {
			/* LRO does not handle loss recovery well, eject */
			flow->lr_flags |= LRO_EJECT_REQ;
			return TCP_LRO_EJECT_FLOW;
		}
	}

	if (lrodebug) printf("tcp_lro_matching_tuple: collision \n");
	return TCP_LRO_COLLISION;
}
static void
tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr,
		int hash, u_int32_t timestamp, int payload_len)
{
	struct lro_flow *flow = NULL;

	flow = &lro_flow_list[flow_id];

	flow->lr_hash_map = hash;
	flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr;
	flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr;
	flow->lr_fport = tcp_hdr->th_sport;
	flow->lr_lport = tcp_hdr->th_dport;
	lro_flow_map[hash] = flow_id;
	flow->lr_timestamp = timestamp;
	flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len;
	flow->lr_flags = 0;
}
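/*
 * tcp_lro_coalesce() has two paths. If the flow already has a chain
 * (lr_mhead != NULL), the new segment is stripped of its IP/TCP headers with
 * m_adj() and appended at the tail, and the head's ip_len, pkthdr.len,
 * lro_npkts, timestamps, TCP flags and receive window are updated in place.
 * Otherwise the segment becomes the head of a new chain and the flush timer
 * is armed via tcp_lro_sched_timer().
 */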
static void
tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
	int payload_len, int drop_hdrlen, struct tcpopt *topt,
	u_int32_t* tsval, u_int32_t* tsecr, int thflags)
{
	struct lro_flow *flow = NULL;
	struct mbuf *last;
	struct ip *ip = NULL;

	flow = &lro_flow_list[flow_id];
	if (flow->lr_mhead) {
		if (lrodebug)
			printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
				payload_len);
		m_adj(lro_mb, drop_hdrlen);

		last = flow->lr_mtail;
		while (last->m_next != NULL) {
			last = last->m_next;
		}
		last->m_next = lro_mb;

		flow->lr_mtail = lro_mb;

		ip = mtod(flow->lr_mhead, struct ip *);
		ip->ip_len += lro_mb->m_pkthdr.len;
		flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;

		if (flow->lr_len == 0) {
			panic_plain("%s: Inconsistent LRO flow state", __func__);
		}
		flow->lr_len += payload_len;
		flow->lr_seq += payload_len;
		/*
		 * This bit is re-OR'd each time a packet is added to the
		 * large coalesced packet.
		 */
		flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
		flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
		if (flow->lr_mhead->m_pkthdr.lro_pktlen <
				lro_mb->m_pkthdr.lro_pktlen) {
			/*
			 * For TCP Inter Arrival Jitter calculation, return max
			 * size encountered while coalescing a stream of pkts.
			 */
			flow->lr_mhead->m_pkthdr.lro_pktlen =
						lro_mb->m_pkthdr.lro_pktlen;
		}
		/* Update the timestamp value */
		if (topt->to_flags & TOF_TS) {
			if ((flow->lr_tsval) &&
				(TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
				*(flow->lr_tsval) = htonl(topt->to_tsval);
			}
			if ((flow->lr_tsecr) &&
				(topt->to_tsecr != 0) &&
				(TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
				if (lrodebug >= 2) {
					printf("%s: instantaneous RTT = %d \n", __func__,
						topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
				}
				*(flow->lr_tsecr) = htonl(topt->to_tsecr);
			}
		}
		/* Coalesce the flags */
		if (thflags) {
			flow->lr_tcphdr->th_flags |= thflags;
		}
		/* Update receive window */
		flow->lr_tcphdr->th_win = tcphdr->th_win;
	} else {
		if (lro_mb) {
			flow->lr_mhead = flow->lr_mtail = lro_mb;
			flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
			flow->lr_tcphdr = tcphdr;
			if ((topt) && (topt->to_flags & TOF_TS)) {
				ASSERT(tsval != NULL);
				ASSERT(tsecr != NULL);
				flow->lr_tsval = tsval;
				flow->lr_tsecr = tsecr;
			}
			flow->lr_len = payload_len;
			calculate_tcp_clock();
			flow->lr_timestamp = tcp_now;
			tcp_lro_sched_timer(0);
		}
		flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;
	}
	if (lro_mb) {
		tcpstat.tcps_coalesced_pack++;
	}
}
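/*
 * Two flavors of eject: tcp_lro_eject_flow() releases the entire flow slot
 * (the hash-map entry is reset and the lro_flow entry is zeroed), while
 * tcp_lro_eject_coalesced_pkt() only detaches the buffered mbuf chain and
 * keeps the flow entry so that later segments can still match it. Both
 * return the chain so the caller can hand it to lro_proto_input().
 */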
static struct mbuf*
tcp_lro_eject_flow(int flow_id)
{
	struct mbuf *mb = NULL;

	mb = lro_flow_list[flow_id].lr_mhead;
	ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
	lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
	bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));

	return mb;
}
static struct mbuf*
tcp_lro_eject_coalesced_pkt(int flow_id)
{
	struct mbuf *mb = NULL;

	mb = lro_flow_list[flow_id].lr_mhead;
	lro_flow_list[flow_id].lr_mhead =
		lro_flow_list[flow_id].lr_mtail = NULL;
	lro_flow_list[flow_id].lr_tcphdr = NULL;

	return mb;
}
static struct mbuf*
tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
			struct tcphdr *tcp_hdr, int payload_len,
			int drop_hdrlen, int hash, struct tcpopt *topt,
			u_int32_t *tsval, u_int32_t *tsecr)
{
	int i;
	int slot_available = 0;
	int candidate_flow = 0;
	u_int32_t oldest_timestamp;
	struct mbuf *mb = NULL;
	int collision = 0;

	oldest_timestamp = tcp_now;

	/* handle collision */
	if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
		if (lrodebug) {
			collision = 1;
		}
		candidate_flow = lro_flow_map[hash];
		tcpstat.tcps_flowtbl_collision++;
		goto kick_flow;
	}

	for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) {
		if (lro_flow_list[i].lr_mhead == NULL) {
			candidate_flow = i;
			slot_available = 1;
			break;
		}
		if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) {
			candidate_flow = i;
			oldest_timestamp = lro_flow_list[i].lr_timestamp;
		}
	}

	if (!slot_available) {
		tcpstat.tcps_flowtbl_full++;
kick_flow:
		/* kick the oldest flow */
		mb = tcp_lro_eject_flow(candidate_flow);

		if (lrodebug) {
			if (!slot_available) {
				printf("%s: slot unavailable.\n",__func__);
			}
			if (collision) {
				printf("%s: collision.\n",__func__);
			}
		}
	} else {
		candidate_flow = i; /* this is now the flow to be used */
	}

	tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
				tcp_now, payload_len);
	tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
				drop_hdrlen, topt, tsval, tsecr, 0);
	return mb;
}
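/*
 * tcp_lro_process_pkt() always consumes the segment it is given: the segment
 * is either merged into a flow, flushed up the stack together with a
 * previously coalesced chain, or handed to lro_proto_input() unmodified when
 * no flow matches. The function therefore returns NULL in every case.
 */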
static struct mbuf*
tcp_lro_process_pkt(struct mbuf *lro_mb, struct ip *ip_hdr,
			struct tcphdr *tcp_hdr, int drop_hdrlen)
{
	int flow_id = TCP_LRO_FLOW_UNINIT;
	int hash;
	unsigned int off = 0;
	int eject_flow = 0;
	int optlen;
	int retval = 0;
	struct mbuf *mb = NULL;
	int payload_len = 0;
	u_char *optp = NULL;
	int thflags = 0;
	struct tcpopt to;
	int ret_response = TCP_LRO_CONSUMED;
	int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
	u_int8_t ecn;

	if (lro_mb->m_len < (int32_t)sizeof (struct tcpiphdr)) {
		if ((lro_mb = m_pullup(lro_mb, sizeof(struct tcpiphdr))) == 0) {
			tcpstat.tcps_rcvshort++;
			if (lrodebug) {
				printf("tcp_lro_process_pkt:mbuf too short.\n");
			}
			return (NULL);
		}
	}

	/* Just in case */
	lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM;

	if ((lro_mb = lro_tcp_xsum_validate(lro_mb, ip_hdr, tcp_hdr)) == NULL) {
		if (lrodebug) {
			printf("tcp_lro_process_pkt: TCP xsum failed.\n");
		}
		return (NULL);
	}

	/* Update stats */
	lro_pkt_count++;

	/* Avoids checksumming in tcp_input */
	lro_mb->m_pkthdr.pkt_flags |= PKTF_SW_LRO_DID_CSUM;

	off = tcp_hdr->th_off << 2;
	optlen = off - sizeof (struct tcphdr);
	payload_len = ip_hdr->ip_len - off;
	optp = (u_char *)(tcp_hdr + 1);
	/*
	 * Do quick retrieval of timestamp options ("options
	 * prediction?"). If timestamp is the only option and it's
	 * formatted as recommended in RFC 1323 appendix A, we
	 * quickly get the values now and not bother calling
	 * tcp_dooptions(), etc.
	 */
	if ((optlen == TCPOLEN_TSTAMP_APPA ||
			(optlen > TCPOLEN_TSTAMP_APPA &&
			optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
			*(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
			(tcp_hdr->th_flags & TH_SYN) == 0) {
		to.to_flags |= TOF_TS;
		to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
		to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
	} else {
		/*
		 * If TCP timestamps are not in use, or not the first option,
		 * skip LRO path since timestamps are used to avoid LRO
		 * from introducing additional latencies for retransmissions
		 * and other slow-paced transmissions.
		 */
		to.to_flags = to.to_tsecr = 0;
		eject_flow = 1;
	}

	/* list all the conditions that can trigger a flow ejection here */

	thflags = tcp_hdr->th_flags;
	if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
		eject_flow = tcpflags = 1;
	}

	if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
			(to.to_flags & TOF_TS))) {
		eject_flow = unknown_tcpopts = 1;
	}

	if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */
		eject_flow = 1;
	}

	/* Can't coalesce ECN marked packets. */
	ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		/*
		 * ECN needs quick notification
		 */
		if (lrodebug) {
			printf("%s: ECE bits set.\n", __func__);
		}
		eject_flow = 1;
	}

	lck_mtx_lock_spin(&tcp_lro_lock);

	retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id);

	switch (retval) {
	case TCP_LRO_NAN:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	case TCP_LRO_COALESCE:
		if ((payload_len != 0) && (unknown_tcpopts == 0) &&
			(tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
			tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
				drop_hdrlen, &to,
				(to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
				(to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
				thflags);
			if (lrodebug >= 2) {
				printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
					lro_flow_list[flow_id].lr_len, flow_id,
					payload_len, drop_hdrlen, optlen,
					ntohs(lro_flow_list[flow_id].lr_lport),
					ntohl(tcp_hdr->th_seq));
			}
			if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
				eject_flow = 1;
			}
			coalesced = 1;
		}
		if (eject_flow) {
			mb = tcp_lro_eject_coalesced_pkt(flow_id);
			lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
				payload_len;
			calculate_tcp_clock();
			u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
			lck_mtx_unlock(&tcp_lro_lock);
			if (mb) {
				mb->m_pkthdr.lro_elapsed = timestamp;
				lro_update_flush_stats(mb);
				lro_proto_input(mb);
			}
			if (!coalesced) {
				if (lrodebug >= 2) {
					printf("%s: pkt payload_len = %d \n", __func__,
						payload_len);
				}
				lro_proto_input(lro_mb);
			}
		} else {
			lck_mtx_unlock(&tcp_lro_lock);
		}
		break;

	case TCP_LRO_EJECT_FLOW:
		mb = tcp_lro_eject_coalesced_pkt(flow_id);
		calculate_tcp_clock();
		u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
		lck_mtx_unlock(&tcp_lro_lock);
		if (mb) {
			if (lrodebug)
				printf("tcp_lro_process_pkt eject_flow, len = %d\n",
					mb->m_pkthdr.len);
			mb->m_pkthdr.lro_elapsed = timestamp;
			lro_update_flush_stats(mb);
			lro_proto_input(mb);
		}

		lro_proto_input(lro_mb);
		break;

	case TCP_LRO_COLLISION:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	default:
		lck_mtx_unlock(&tcp_lro_lock);
		panic_plain("%s: unrecognized type %d", __func__, retval);
		break;
	}

	if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
		lro_proto_input(lro_mb);
	}
	return (NULL);
}
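/*
 * Timer path: tcp_lro_init() allocates tcp_lro_timer as a thread call bound
 * to tcp_lro_timer_proc(), which in turn calls tcp_lro_flush_flows() to walk
 * the flow table and push any buffered chains up the stack, so no flow stays
 * buffered much longer than the coalesc_time interval.
 */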
static void
tcp_lro_timer_proc(void *arg1, void *arg2)
{
#pragma unused(arg1, arg2)

	lck_mtx_lock_spin(&tcp_lro_lock);
	lro_timer_set = 0;
	lck_mtx_unlock(&tcp_lro_lock);
	tcp_lro_flush_flows();
}
static void
tcp_lro_flush_flows(void)
{
	int i = 0;
	struct mbuf *mb;
	struct lro_flow *flow;
	int tcpclock_updated = 0;

	lck_mtx_lock(&tcp_lro_lock);

	while (i < TCP_LRO_NUM_FLOWS) {
		flow = &lro_flow_list[i];
		if (flow->lr_mhead != NULL) {

			if (!tcpclock_updated) {
				calculate_tcp_clock();
				tcpclock_updated = 1;
			}

			if (lrodebug >= 2)
				printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
					flow->lr_len,
					flow->lr_mhead->m_pkthdr.lro_npkts,
					flow->lr_timestamp, tcp_now);

			u_int8_t timestamp = tcp_now - flow->lr_timestamp;

			mb = tcp_lro_eject_flow(i);

			if (mb) {
				mb->m_pkthdr.lro_elapsed = timestamp;
				lck_mtx_unlock(&tcp_lro_lock);
				lro_update_flush_stats(mb);
				lro_proto_input(mb);
				lck_mtx_lock(&tcp_lro_lock);
			}
		}
		i++;
	}
	lck_mtx_unlock(&tcp_lro_lock);
}
/*
 * Must be called with tcp_lro_lock held.
 * The hint is non-zero for longer waits. The wait time dictated by coalesc_time
 * takes precedence, so lro_timer_set is not set for the hint case
 */
static void
tcp_lro_sched_timer(uint64_t hint)
{
	if (lro_timer_set) {
		return;
	}

	if (!hint) {
		lro_timer_set = 1;
		/* the intent is to wake up every coalesc_time msecs */
		clock_interval_to_deadline(coalesc_time,
			(NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
	} else {
		clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
			&lro_deadline);
	}
	thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
}
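/*
 * tcp_lro() is the entry point from IP input. It bails out early, returning
 * the untouched mbuf, for cellular and loopback interfaces, packets with IP
 * options, non-TCP packets, and segments with a bad TCP data offset; only
 * clean candidates are handed to tcp_lro_process_pkt() for coalescing.
 */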
struct mbuf*
tcp_lro(struct mbuf *m, unsigned int hlen)
{
	struct ip *ip_hdr;
	unsigned int tlen;
	struct tcphdr * tcp_hdr = NULL;
	unsigned int off = 0;

	if (kipf_count != 0)
		return (m);

	/*
	 * Experiments on cellular show that the RTT is much higher
	 * than the coalescing time of 5 msecs, causing lro to flush
	 * 80% of the time on a single packet. Increasing
	 * coalescing time for cellular does not show marked
	 * improvement to throughput either. Loopback perf is hurt
	 * by the 5 msec latency and it already sends large packets.
	 */
	if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) ||
		(m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
		return (m);
	}

	ip_hdr = mtod(m, struct ip*);

	/* don't deal with IP options */
	if (hlen > sizeof (struct ip))
		return (m);

	/* only TCP is coalesced */
	if (ip_hdr->ip_p != IPPROTO_TCP) {
		return (m);
	}

	if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) {
		if (lrodebug) printf("tcp_lro m_pullup \n");
		if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
			tcpstat.tcps_rcvshort++;
			if (lrodebug) {
				printf("ip_lro: rcvshort.\n");
			}
			return (NULL);
		}
		ip_hdr = mtod(m, struct ip*);
	}

	tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
	tlen = ip_hdr->ip_len; //ignore IP header bytes len
	m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
	m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
	m->m_pkthdr.lro_elapsed = 0; /* Initialize the field to carry elapsed time */
	off = tcp_hdr->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		if (lrodebug) {
			printf("ip_lro: TCP off greater than TCP header.\n");
		}
		return (m);
	}

	return (tcp_lro_process_pkt(m, ip_hdr, tcp_hdr, hlen + off));
}
static void
lro_proto_input(struct mbuf *m)
{
	struct ip* ip_hdr = mtod(m, struct ip*);

	if (lrodebug >= 3) {
		printf("lro_proto_input: ip_len = %d \n",
			ip_hdr->ip_len);
	}
	lro_update_stats(m);
	ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
}
static struct mbuf*
lro_tcp_xsum_validate(struct mbuf *m, struct ip *ip, struct tcphdr * th)
{
	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	/* we shouldn't get here for IP with options; hence sizeof (ip) */
	if (tcp_input_checksum(AF_INET, m, th, sizeof (*ip), ip->ip_len)) {
		if (lrodebug)
			printf("%s: bad xsum and drop m = 0x%llx.\n", __func__,
				(uint64_t)VM_KERNEL_ADDRPERM(m));
		m_freem(m);
		return (NULL);
	}

	return (m);
}
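/*
 * The TCP checksum is verified once here; tcp_lro_process_pkt() then sets
 * PKTF_SW_LRO_DID_CSUM on the packet header so that tcp_input() does not
 * have to checksum the (possibly much larger) coalesced packet again.
 */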
/*
 * When TCP detects a stable, steady flow without out of ordering,
 * with a sufficiently high cwnd, it invokes LRO.
 */
int
tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
{
	int hash;
	int flow_id;
	struct mbuf *eject_mb;
	struct lro_flow *lf;

	hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
		tcp_hdr->th_sport, tcp_hdr->th_dport,
		(TCP_LRO_FLOW_MAP - 1));

	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
		lf = &lro_flow_list[flow_id];
		if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
			(lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
			(lf->lr_fport == tcp_hdr->th_sport) &&
			(lf->lr_lport == tcp_hdr->th_dport)) {
			if ((lf->lr_tcphdr == NULL) &&
				(lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
				lf->lr_seq = tcp_hdr->th_seq + tlen;
			}
			lf->lr_flags &= ~LRO_EJECT_REQ;
		}
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}

	HTONL(tcp_hdr->th_seq);
	HTONL(tcp_hdr->th_ack);
	eject_mb =
		tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
			NULL, NULL, NULL);

	lck_mtx_unlock(&tcp_lro_lock);

	NTOHL(tcp_hdr->th_seq);
	NTOHL(tcp_hdr->th_ack);
	if (lrodebug >= 3) {
		printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
			__func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
			tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
	}
	ASSERT(eject_mb == NULL);
	return 0;
}
/*
 * When TCP detects loss or idle condition, it stops offloading
 * to LRO.
 */
int
tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
		unsigned short sport, unsigned short dport)
{
	int hash, flow_id;
	struct lro_flow *lf;

	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
		(TCP_LRO_FLOW_MAP - 1));
	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id == TCP_LRO_FLOW_UNINIT) {
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}
	lf = &lro_flow_list[flow_id];
	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
		(lf->lr_laddr.s_addr == saddr.s_addr) &&
		(lf->lr_fport == dport) &&
		(lf->lr_lport == sport)) {
		if (lrodebug) {
			printf("%s: %x %x\n", __func__,
				lf->lr_flags, lf->lr_seq);
		}
		lf->lr_flags |= LRO_EJECT_REQ;
	}
	lck_mtx_unlock(&tcp_lro_lock);
	return 0;
}
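/*
 * tcp_update_lro_seq() lets TCP keep the flow's expected sequence number in
 * step with rcv_nxt while nothing is buffered (lr_tcphdr == NULL), so that
 * the next data segment can still be matched for coalescing.
 */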
void
tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
		unsigned short sport, unsigned short dport)
{
	int hash, flow_id;
	struct lro_flow *lf;

	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
		(TCP_LRO_FLOW_MAP - 1));
	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id == TCP_LRO_FLOW_UNINIT) {
		lck_mtx_unlock(&tcp_lro_lock);
		return;
	}
	lf = &lro_flow_list[flow_id];
	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
		(lf->lr_laddr.s_addr == saddr.s_addr) &&
		(lf->lr_fport == dport) &&
		(lf->lr_lport == sport) &&
		(lf->lr_tcphdr == NULL)) {
		lf->lr_seq = (tcp_seq)rcv_nxt;
	}
	lck_mtx_unlock(&tcp_lro_lock);
	return;
}
static void
lro_update_stats(struct mbuf *m)
{
	switch (m->m_pkthdr.lro_npkts) {
	case 0: /* fall through */
	case 1:
		break;

	case 2:
		tcpstat.tcps_lro_twopack++;
		break;

	case 3: /* fall through */
	case 4:
		tcpstat.tcps_lro_multpack++;
		break;

	default:
		tcpstat.tcps_lro_largepack++;
		break;
	}
	return;
}
static void
lro_update_flush_stats(struct mbuf *m)
{
	lro_flushes++;
	switch (m->m_pkthdr.lro_npkts) {
	case 1: lro_single_flushes++;
		break;
	case 2: lro_double_flushes++;
		break;
	default: lro_good_flushes++;
		break;
	}
	return;
}