/*
 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_lro.h>
#include <netinet/lro_ext.h>
#include <kern/locks.h>
unsigned int lrocount = 0;              /* A counter used for debugging only */
unsigned int lro_seq_outoforder = 0;    /* Counter for debugging */
unsigned int lro_seq_mismatch = 0;      /* Counter for debugging */
unsigned int lro_flushes = 0;           /* Counter for tracking number of flushes */
unsigned int lro_single_flushes = 0;
unsigned int lro_double_flushes = 0;
unsigned int lro_good_flushes = 0;

unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
    &coalesc_sz, 0, "Max coalescing size");

unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
    &coalesc_time, 0, "Max coalescing time");
struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];

char lro_flow_map[TCP_LRO_FLOW_MAP];

static lck_attr_t *tcp_lro_mtx_attr = NULL;             /* mutex attributes */
static lck_grp_t *tcp_lro_mtx_grp = NULL;               /* mutex group */
static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL;     /* mutex group attrs */
decl_lck_mtx_data( ,tcp_lro_lock);      /* Used to synchronize updates */

unsigned int lro_byte_count = 0;

uint64_t lro_deadline = 0;      /* LRO's sense of time - protected by tcp_lro_lock */
uint32_t lro_timer_set = 0;

u_int32_t lro_pkt_count = 0;    /* Number of packets encountered in an LRO period */
thread_call_t tcp_lro_timer;

extern u_int32_t kipf_count;
static void     tcp_lro_timer_proc(void*, void*);
static void     lro_update_stats(struct mbuf*);
static void     lro_update_flush_stats(struct mbuf *);
static void     tcp_lro_flush_flows(void);
static void     tcp_lro_sched_timer(uint64_t);
static void     lro_proto_input(struct mbuf *);
static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ip *,
    struct tcphdr*);
static struct mbuf *tcp_lro_process_pkt(struct mbuf*, int);
    bzero(lro_flow_list, sizeof (struct lro_flow) * TCP_LRO_NUM_FLOWS);
    for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
        lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
    }
    /*
     * allocate lock group attribute, group and attribute for tcp_lro_lock
     */
    tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init();
    tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr);
    tcp_lro_mtx_attr = lck_attr_alloc_init();
    lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr);

    tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL);
    if (tcp_lro_timer == NULL) {
        panic_plain("%s: unable to allocate lro timer", __func__);
    }
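
/*
 * Match an incoming segment against the LRO flow table using a hash of the
 * (source address, destination address, source port, destination port)
 * tuple. Returns TCP_LRO_COALESCE when the segment continues an existing
 * flow in sequence, TCP_LRO_EJECT_FLOW when the flow should be flushed
 * (an eject was previously requested, the ACK advanced, or the data is not
 * contiguous with what has been coalesced), and TCP_LRO_COLLISION when the
 * hash slot belongs to a different connection.
 */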
tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash,
    int *flow_id)
{
    struct lro_flow *flow;
    unsigned int off = 0;

    *hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
        tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));

    *flow_id = lro_flow_map[*hash];
    if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {

    seqnum = tcp_hdr->th_seq;
    off = tcp_hdr->th_off << 2;
    payload_len = ip_hdr->ip_len - off;

    flow = &lro_flow_list[*flow_id];

    if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
        (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
        (flow->lr_fport == tcp_hdr->th_sport) &&
        (flow->lr_lport == tcp_hdr->th_dport)) {
        if (flow->lr_tcphdr == NULL) {
            if (ntohl(seqnum) == flow->lr_seq) {
                return TCP_LRO_COALESCE;
            }
            printf("%s: seqnum = %x, lr_seq = %x\n",
                __func__, ntohl(seqnum), flow->lr_seq);
            if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
                lro_seq_outoforder++;
                /*
                 * Whenever we receive out of order packets it
                 * signals loss and recovery and LRO doesn't
                 * let flows recover quickly. So eject.
                 */
                flow->lr_flags |= LRO_EJECT_REQ;

        if (flow->lr_flags & LRO_EJECT_REQ) {
            printf("%s: eject. \n", __func__);
            return TCP_LRO_EJECT_FLOW;
        }
        if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
            printf("%s: th_ack = %x flow_ack = %x \n",
                __func__, tcp_hdr->th_ack,
                flow->lr_tcphdr->th_ack);
            return TCP_LRO_EJECT_FLOW;
        }

        if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) +
            lro_flow_list[*flow_id].lr_len)) {
            return TCP_LRO_COALESCE;
        } else {
            /* LRO does not handle loss recovery well, eject */
            flow->lr_flags |= LRO_EJECT_REQ;
            return TCP_LRO_EJECT_FLOW;
        }
    }

    if (lrodebug) printf("tcp_lro_matching_tuple: collision \n");
    return TCP_LRO_COLLISION;
tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr,
    int hash, u_int32_t timestamp, int payload_len)
{
    struct lro_flow *flow = NULL;

    flow = &lro_flow_list[flow_id];

    flow->lr_hash_map = hash;
    flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr;
    flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr;
    flow->lr_fport = tcp_hdr->th_sport;
    flow->lr_lport = tcp_hdr->th_dport;
    lro_flow_map[hash] = flow_id;
    flow->lr_timestamp = timestamp;
    flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len;
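
/*
 * Append a segment to the flow identified by flow_id. If a coalesced chain
 * already exists (lr_mhead), the new mbuf is stripped of its IP/TCP headers
 * and linked at the tail, and the head packet's IP length, packet-header
 * length, flags, timestamps and receive window are updated. Otherwise the
 * segment becomes the head of a new chain and the flush timer is armed.
 */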
tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
    int payload_len, int drop_hdrlen, struct tcpopt *topt,
    u_int32_t* tsval, u_int32_t* tsecr, int thflags)
{
    struct lro_flow *flow = NULL;
    struct mbuf *last;
    struct ip *ip = NULL;

    flow = &lro_flow_list[flow_id];
    if (flow->lr_mhead) {
        printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
            payload_len);
        m_adj(lro_mb, drop_hdrlen);

        last = flow->lr_mtail;
        while (last->m_next != NULL) {
            last = last->m_next;
        }
        last->m_next = lro_mb;

        flow->lr_mtail = lro_mb;

        ip = mtod(flow->lr_mhead, struct ip *);
        ip->ip_len += lro_mb->m_pkthdr.len;
        flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;

        if (flow->lr_len == 0) {
            panic_plain("%s: Inconsistent LRO flow state", __func__);
        }
        flow->lr_len += payload_len;
        flow->lr_seq += payload_len;
        /*
         * This bit is re-OR'd each time a packet is added to the
         * large coalesced packet.
         */
        flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
        flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
        if (flow->lr_mhead->m_pkthdr.lro_pktlen <
            lro_mb->m_pkthdr.lro_pktlen) {
            /*
             * For TCP Inter Arrival Jitter calculation, return max
             * size encountered while coalescing a stream of pkts.
             */
            flow->lr_mhead->m_pkthdr.lro_pktlen =
                lro_mb->m_pkthdr.lro_pktlen;
        }
        /* Update the timestamp value */
        if (topt->to_flags & TOF_TS) {
            if ((flow->lr_tsval) &&
                (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
                *(flow->lr_tsval) = htonl(topt->to_tsval);
            }
            if ((flow->lr_tsecr) &&
                (topt->to_tsecr != 0) &&
                (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
                printf("%s: instantaneous RTT = %d \n", __func__,
                    topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
                *(flow->lr_tsecr) = htonl(topt->to_tsecr);
            }
        }
        /* Coalesce the flags */
        flow->lr_tcphdr->th_flags |= thflags;
        /* Update receive window */
        flow->lr_tcphdr->th_win = tcphdr->th_win;
    } else {
        flow->lr_mhead = flow->lr_mtail = lro_mb;
        flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
        flow->lr_tcphdr = tcphdr;
        if ((topt) && (topt->to_flags & TOF_TS)) {
            ASSERT(tsval != NULL);
            ASSERT(tsecr != NULL);
            flow->lr_tsval = tsval;
            flow->lr_tsecr = tsecr;
        }
        flow->lr_len = payload_len;
        calculate_tcp_clock();
        flow->lr_timestamp = tcp_now;
        tcp_lro_sched_timer(0);
    }
    flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;

    tcpstat.tcps_coalesced_pack++;
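
/*
 * Remove a flow from the flow table entirely: return its coalesced mbuf
 * chain (if any) to the caller, clear its slot in lro_flow_map and zero
 * the lro_flow entry. Called with tcp_lro_lock held.
 */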
tcp_lro_eject_flow(int flow_id)
{
    struct mbuf *mb = NULL;

    mb = lro_flow_list[flow_id].lr_mhead;
    ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
    lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
    bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));
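
/*
 * Detach and return the coalesced mbuf chain for a flow, but keep the flow
 * table entry itself so new segments can start a fresh chain.
 */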
tcp_lro_eject_coalesced_pkt(int flow_id)
{
    struct mbuf *mb = NULL;

    mb = lro_flow_list[flow_id].lr_mhead;
    lro_flow_list[flow_id].lr_mhead =
        lro_flow_list[flow_id].lr_mtail = NULL;
    lro_flow_list[flow_id].lr_tcphdr = NULL;
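
/*
 * Find a flow table slot for a new flow. On a hash collision, or when the
 * table is full, the colliding or oldest flow is ejected and its chain is
 * returned to the caller before the slot is reused for the new flow.
 */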
tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
    struct tcphdr *tcp_hdr, int payload_len,
    int drop_hdrlen, int hash, struct tcpopt *topt,
    u_int32_t *tsval, u_int32_t *tsecr)
{
    int slot_available = 0;
    int candidate_flow = 0;
    u_int32_t oldest_timestamp;
    struct mbuf *mb = NULL;

    oldest_timestamp = tcp_now;

    /* handle collision */
    if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
        candidate_flow = lro_flow_map[hash];
        tcpstat.tcps_flowtbl_collision++;

    for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) {
        if (lro_flow_list[i].lr_mhead == NULL) {

        if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) {
            oldest_timestamp = lro_flow_list[i].lr_timestamp;

    if (!slot_available) {
        tcpstat.tcps_flowtbl_full++;
        /* kick the oldest flow */
        mb = tcp_lro_eject_flow(candidate_flow);

        if (!slot_available) {
            printf("%s: slot unavailable.\n",__func__);
        printf("%s: collision.\n",__func__);

    candidate_flow = i;     /* this is now the flow to be used */

    tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
        tcp_now, payload_len);
    tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
        drop_hdrlen, topt, tsval, tsecr, 0);
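
/*
 * Main per-packet LRO path: pull up the headers, validate the TCP checksum,
 * parse the timestamp option, then either coalesce the segment into a flow,
 * flush the flow, or pass the packet through to the regular IP protocol
 * input path.
 */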
static struct mbuf *
tcp_lro_process_pkt(struct mbuf *lro_mb, int drop_hdrlen)
{
    int flow_id = TCP_LRO_FLOW_UNINIT;
    unsigned int off = 0;
    struct mbuf *mb = NULL;
    int ret_response = TCP_LRO_CONSUMED;
    int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
    struct tcphdr *tcp_hdr;

    if (lro_mb->m_len < drop_hdrlen) {
        if ((lro_mb = m_pullup(lro_mb, drop_hdrlen)) == NULL) {
            tcpstat.tcps_rcvshort++;
            printf("tcp_lro_process_pkt:mbuf too short.\n");

    ip_hdr = mtod(lro_mb, struct ip *);
    tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + sizeof(struct ip));

    lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM;

    if ((lro_mb = lro_tcp_xsum_validate(lro_mb, ip_hdr, tcp_hdr)) == NULL) {
        printf("tcp_lro_process_pkt: TCP xsum failed.\n");

    /* Avoids checksumming in tcp_input */
    lro_mb->m_pkthdr.pkt_flags |= PKTF_SW_LRO_DID_CSUM;

    off = tcp_hdr->th_off << 2;
    optlen = off - sizeof (struct tcphdr);
    payload_len = ip_hdr->ip_len - off;
    optp = (u_char *)(tcp_hdr + 1);
    /*
     * Do quick retrieval of timestamp options ("options
     * prediction?"). If timestamp is the only option and it's
     * formatted as recommended in RFC 1323 appendix A, we
     * quickly get the values now and not bother calling
     * tcp_dooptions(), etc.
     */
    bzero(&to, sizeof(to));
    if ((optlen == TCPOLEN_TSTAMP_APPA ||
        (optlen > TCPOLEN_TSTAMP_APPA &&
        optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
        *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
        (tcp_hdr->th_flags & TH_SYN) == 0) {
        to.to_flags |= TOF_TS;
        to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
        to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
    } else {
        /*
         * If TCP timestamps are not in use, or not the first option,
         * skip LRO path since timestamps are used to avoid LRO
         * from introducing additional latencies for retransmissions
         * and other slow-paced transmissions.
         */
        to.to_flags = to.to_tsecr = 0;
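
    /*
     * For reference, the fast path above matches the canonical 12-byte
     * timestamp encoding recommended by RFC 1323 appendix A
     * (TCPOPT_TSTAMP_HDR):
     *
     *     byte 0   byte 1   byte 2      byte 3
     *     NOP      NOP      TS kind=8   TS len=10
     *     bytes 4-7:  TS value      (read at optp + 4)
     *     bytes 8-11: TS echo reply (read at optp + 8)
     */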
    /* list all the conditions that can trigger a flow ejection here */

    thflags = tcp_hdr->th_flags;
    if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
        eject_flow = tcpflags = 1;
    }

    if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
        (to.to_flags & TOF_TS))) {
        eject_flow = unknown_tcpopts = 1;
    }

    if (payload_len <= LRO_MIN_COALESC_SZ) { /* zero payload ACK */

    /* Can't coalesce ECN marked packets. */
    ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK;
    if (ecn == IPTOS_ECN_CE) {
        /*
         * ECN needs quick notification
         */
        printf("%s: ECE bits set.\n", __func__);

    lck_mtx_lock_spin(&tcp_lro_lock);

    retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id);
        lck_mtx_unlock(&tcp_lro_lock);
        ret_response = TCP_LRO_FLOW_NOTFOUND;

    case TCP_LRO_COALESCE:
        if ((payload_len != 0) && (unknown_tcpopts == 0) &&
            (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
            tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
                drop_hdrlen, &to,
                (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
                (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
                thflags);

            printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
                lro_flow_list[flow_id].lr_len, flow_id,
                payload_len, drop_hdrlen, optlen,
                ntohs(lro_flow_list[flow_id].lr_lport),
                ntohl(tcp_hdr->th_seq));

            if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
                mb = tcp_lro_eject_coalesced_pkt(flow_id);
                lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
                    payload_len;
                calculate_tcp_clock();
                u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
                lck_mtx_unlock(&tcp_lro_lock);
                mb->m_pkthdr.lro_elapsed = timestamp;

            printf("%s: pkt payload_len = %d \n", __func__, payload_len);
            lro_proto_input(lro_mb);

            lck_mtx_unlock(&tcp_lro_lock);

    case TCP_LRO_EJECT_FLOW:
        mb = tcp_lro_eject_coalesced_pkt(flow_id);
        calculate_tcp_clock();
        u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
        lck_mtx_unlock(&tcp_lro_lock);
        printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len);
        mb->m_pkthdr.lro_elapsed = timestamp;
        lro_proto_input(lro_mb);

    case TCP_LRO_COLLISION:
        lck_mtx_unlock(&tcp_lro_lock);
        ret_response = TCP_LRO_FLOW_NOTFOUND;

    default:
        lck_mtx_unlock(&tcp_lro_lock);
        panic_plain("%s: unrecognized type %d", __func__, retval);

    if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
        lro_proto_input(lro_mb);
static void
tcp_lro_timer_proc(void *arg1, void *arg2)
{
#pragma unused(arg1, arg2)

    lck_mtx_lock_spin(&tcp_lro_lock);
    lck_mtx_unlock(&tcp_lro_lock);
    tcp_lro_flush_flows();
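
/*
 * Walk the flow table and flush every flow that has a coalesced chain
 * pending, stamping each chain with the time it spent being buffered and
 * updating the flush statistics. Runs from the LRO timer.
 */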
static void
tcp_lro_flush_flows(void)
{
    struct lro_flow *flow;
    int tcpclock_updated = 0;

    lck_mtx_lock(&tcp_lro_lock);

    while (i < TCP_LRO_NUM_FLOWS) {
        flow = &lro_flow_list[i];
        if (flow->lr_mhead != NULL) {
            if (!tcpclock_updated) {
                calculate_tcp_clock();
                tcpclock_updated = 1;
            }

            printf("tcp_lro_flush_flows: len =%d n_pkts = %d %d %d \n",
                flow->lr_mhead->m_pkthdr.lro_npkts,
                flow->lr_timestamp, tcp_now);

            u_int8_t timestamp = tcp_now - flow->lr_timestamp;

            mb = tcp_lro_eject_flow(i);

            mb->m_pkthdr.lro_elapsed = timestamp;
            lck_mtx_unlock(&tcp_lro_lock);
            lro_update_flush_stats(mb);
            lck_mtx_lock(&tcp_lro_lock);

    lck_mtx_unlock(&tcp_lro_lock);
/*
 * Must be called with tcp_lro_lock held.
 * The hint is non-zero for longer waits. The wait time dictated by coalesc_time
 * takes precedence, so lro_timer_set is not set for the hint case.
 */
static void
tcp_lro_sched_timer(uint64_t hint)
{
    /* the intent is to wake up every coalesc_time msecs */
    clock_interval_to_deadline(coalesc_time,
        (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);

    clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
        &lro_deadline);

    thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
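
/*
 * Entry point from the IP input path: decide whether this packet is a
 * candidate for software LRO at all (IPv4 TCP, no IP options, not received
 * on a cellular or loopback interface) before handing it to
 * tcp_lro_process_pkt().
 */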
tcp_lro(struct mbuf *m, unsigned int hlen)
{
    struct tcphdr * tcp_hdr = NULL;
    unsigned int off = 0;

    /*
     * Experiments on cellular show that the RTT is much higher
     * than the coalescing time of 5 msecs, causing lro to flush
     * 80% of the time on a single packet. Increasing
     * coalescing time for cellular does not show marked
     * improvement to throughput either. Loopback perf is hurt
     * by the 5 msec latency and it already sends large packets.
     */
    if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) ||
        (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {

    ip_hdr = mtod(m, struct ip *);

    /* don't deal with IP options */
    if (hlen != sizeof (struct ip))

    /* only TCP is coalesced */
    if (ip_hdr->ip_p != IPPROTO_TCP) {

    if (m->m_len < (int32_t) sizeof (struct tcpiphdr)) {
        if (lrodebug) printf("tcp_lro m_pullup \n");
        if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) {
            tcpstat.tcps_rcvshort++;
            printf("ip_lro: rcvshort.\n");

    ip_hdr = mtod(m, struct ip *);

    tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
    tlen = ip_hdr->ip_len;  /* ip_len excludes the IP header bytes at this point */
    m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
    m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
    m->m_pkthdr.lro_elapsed = 0; /* Initialize the field to carry elapsed time */
    off = tcp_hdr->th_off << 2;
    if (off < sizeof (struct tcphdr) || off > tlen) {
        tcpstat.tcps_rcvbadoff++;
        printf("ip_lro: TCP off greater than TCP header.\n");

    return (tcp_lro_process_pkt(m, hlen + off));
static void
lro_proto_input(struct mbuf *m)
{
    struct ip* ip_hdr = mtod(m, struct ip*);

    printf("lro_proto_input: ip_len = %d \n",
        ip_hdr->ip_len);

    ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
static struct mbuf *
lro_tcp_xsum_validate(struct mbuf *m, struct ip *ip, struct tcphdr * th)
{
    /* Expect 32-bit aligned data pointer on strict-align platforms */
    MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

    /* we shouldn't get here for IP with options; hence sizeof (ip) */
    if (tcp_input_checksum(AF_INET, m, th, sizeof (*ip), ip->ip_len)) {
        printf("%s: bad xsum and drop m = 0x%llx.\n", __func__,
            (uint64_t)VM_KERNEL_ADDRPERM(m));
/*
 * When TCP detects a stable, steady flow without out of ordering,
 * with a sufficiently high cwnd, it invokes LRO.
 */
tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
{
    struct mbuf *eject_mb;

    hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
        tcp_hdr->th_sport, tcp_hdr->th_dport,
        (TCP_LRO_FLOW_MAP - 1));

    lck_mtx_lock_spin(&tcp_lro_lock);
    flow_id = lro_flow_map[hash];
    if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
        lf = &lro_flow_list[flow_id];
        if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
            (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
            (lf->lr_fport == tcp_hdr->th_sport) &&
            (lf->lr_lport == tcp_hdr->th_dport)) {
            if ((lf->lr_tcphdr == NULL) &&
                (lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
                lf->lr_seq = tcp_hdr->th_seq + tlen;
            }
            lf->lr_flags &= ~LRO_EJECT_REQ;
        }
        lck_mtx_unlock(&tcp_lro_lock);

    HTONL(tcp_hdr->th_seq);
    HTONL(tcp_hdr->th_ack);
    tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
        NULL, NULL, NULL);

    lck_mtx_unlock(&tcp_lro_lock);

    NTOHL(tcp_hdr->th_seq);
    NTOHL(tcp_hdr->th_ack);
    printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
        __func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
        tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
    ASSERT(eject_mb == NULL);
/*
 * When TCP detects loss or idle condition, it stops offloading
 * to LRO.
 */
tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
    unsigned short sport, unsigned short dport)
{
    hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
        (TCP_LRO_FLOW_MAP - 1));
    lck_mtx_lock_spin(&tcp_lro_lock);
    flow_id = lro_flow_map[hash];
    if (flow_id == TCP_LRO_FLOW_UNINIT) {
        lck_mtx_unlock(&tcp_lro_lock);

    lf = &lro_flow_list[flow_id];
    if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
        (lf->lr_laddr.s_addr == saddr.s_addr) &&
        (lf->lr_fport == dport) &&
        (lf->lr_lport == sport)) {
        printf("%s: %x %x\n", __func__,
            lf->lr_flags, lf->lr_seq);
        lf->lr_flags |= LRO_EJECT_REQ;
    }
    lck_mtx_unlock(&tcp_lro_lock);
tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
    unsigned short sport, unsigned short dport)
{
    hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
        (TCP_LRO_FLOW_MAP - 1));
    lck_mtx_lock_spin(&tcp_lro_lock);
    flow_id = lro_flow_map[hash];
    if (flow_id == TCP_LRO_FLOW_UNINIT) {
        lck_mtx_unlock(&tcp_lro_lock);

    lf = &lro_flow_list[flow_id];
    if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
        (lf->lr_laddr.s_addr == saddr.s_addr) &&
        (lf->lr_fport == dport) &&
        (lf->lr_lport == sport) &&
        (lf->lr_tcphdr == NULL)) {
        lf->lr_seq = (tcp_seq)rcv_nxt;
    }
    lck_mtx_unlock(&tcp_lro_lock);
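
/*
 * Account for how many packets each coalesced chain carried when it was
 * handed up to TCP, using the tcps_lro_twopack / tcps_lro_multpack /
 * tcps_lro_largepack counters as a coarse histogram of lro_npkts.
 */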
static void
lro_update_stats(struct mbuf *m)
{
    switch(m->m_pkthdr.lro_npkts) {
    case 0: /* fall through */

        tcpstat.tcps_lro_twopack++;

    case 3: /* fall through */

        tcpstat.tcps_lro_multpack++;

        tcpstat.tcps_lro_largepack++;
static void
lro_update_flush_stats(struct mbuf *m)
{
    switch(m->m_pkthdr.lro_npkts) {
    case 1: lro_single_flushes++;
        break;
    case 2: lro_double_flushes++;
        break;
    default: lro_good_flushes++;
        break;