/*
 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <net/if.h>
#include <net/dlil.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_lro.h>
#include <netinet/lro_ext.h>
#include <kern/locks.h>

unsigned int lrocount = 0;              /* A counter used for debugging only */
unsigned int lro_seq_outoforder = 0;    /* Counter for debugging */
unsigned int lro_seq_mismatch = 0;      /* Counter for debugging */
unsigned int lro_flushes = 0;           /* Counter for tracking number of flushes */
unsigned int lro_single_flushes = 0;
unsigned int lro_double_flushes = 0;
unsigned int lro_good_flushes = 0;

unsigned int coalesc_sz = LRO_MX_COALESCE_PKTS;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_sz, CTLFLAG_RW | CTLFLAG_LOCKED,
    &coalesc_sz, 0, "Max coalescing size");

unsigned int coalesc_time = LRO_MX_TIME_TO_BUFFER;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lro_time, CTLFLAG_RW | CTLFLAG_LOCKED,
    &coalesc_time, 0, "Max coalescing time");

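/*
 * Illustrative note: both knobs above are RW sysctls, so they can be
 * inspected or tuned at runtime, e.g. (values below are only an example):
 *
 *	sysctl net.inet.tcp.lro_sz
 *	sysctl -w net.inet.tcp.lro_time=10
 */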
struct lro_flow lro_flow_list[TCP_LRO_NUM_FLOWS];

char lro_flow_map[TCP_LRO_FLOW_MAP];

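/*
 * lro_flow_map is the lookup index: LRO_HASH() folds a connection's
 * 4-tuple into a slot, and each slot holds either TCP_LRO_FLOW_UNINIT
 * or the index of the owning entry in lro_flow_list.
 */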
static lck_attr_t *tcp_lro_mtx_attr = NULL;             /* mutex attributes */
static lck_grp_t *tcp_lro_mtx_grp = NULL;               /* mutex group */
static lck_grp_attr_t *tcp_lro_mtx_grp_attr = NULL;     /* mutex group attrs */
decl_lck_mtx_data(, tcp_lro_lock);      /* Used to synchronize updates */

unsigned int lro_byte_count = 0;

uint64_t lro_deadline = 0;      /* LRO's sense of time - protected by tcp_lro_lock */
uint32_t lro_timer_set = 0;

/* Some LRO stats */
u_int32_t lro_pkt_count = 0;    /* Number of packets encountered in an LRO period */
thread_call_t tcp_lro_timer;

extern u_int32_t kipf_count;

static void tcp_lro_timer_proc(void*, void*);
static void lro_update_stats(struct mbuf*);
static void lro_update_flush_stats(struct mbuf *);
static void tcp_lro_flush_flows(void);
static void tcp_lro_sched_timer(uint64_t);
static void lro_proto_input(struct mbuf *);

static struct mbuf *lro_tcp_xsum_validate(struct mbuf*, struct ip *,
    struct tcphdr*);
static struct mbuf *tcp_lro_process_pkt(struct mbuf*, int);

void
tcp_lro_init(void)
{
	int i;

	bzero(lro_flow_list, sizeof(struct lro_flow) * TCP_LRO_NUM_FLOWS);
	for (i = 0; i < TCP_LRO_FLOW_MAP; i++) {
		lro_flow_map[i] = TCP_LRO_FLOW_UNINIT;
	}

	/*
	 * allocate lock group attribute, group and attribute for tcp_lro_lock
	 */
	tcp_lro_mtx_grp_attr = lck_grp_attr_alloc_init();
	tcp_lro_mtx_grp = lck_grp_alloc_init("tcplro", tcp_lro_mtx_grp_attr);
	tcp_lro_mtx_attr = lck_attr_alloc_init();
	lck_mtx_init(&tcp_lro_lock, tcp_lro_mtx_grp, tcp_lro_mtx_attr);

	tcp_lro_timer = thread_call_allocate(tcp_lro_timer_proc, NULL);
	if (tcp_lro_timer == NULL) {
		panic_plain("%s: unable to allocate lro timer", __func__);
	}

	return;
}

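/*
 * Match a segment against the flow table.  Returns TCP_LRO_COALESCE
 * when the segment is the in-order continuation of a tracked flow,
 * TCP_LRO_NAN when there is no usable flow (including sequence
 * mismatches that bypass LRO), TCP_LRO_EJECT_FLOW when the coalesced
 * chain must be flushed to the stack first, and TCP_LRO_COLLISION
 * when a different 4-tuple occupies the hash slot.
 */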
static int
tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash,
    int *flow_id)
{
	struct lro_flow *flow;
	tcp_seq seqnum;
	unsigned int off = 0;
	int payload_len = 0;

	*hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
	    tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));

	*flow_id = lro_flow_map[*hash];
	if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
		return TCP_LRO_NAN;
	}

	seqnum = tcp_hdr->th_seq;
	off = tcp_hdr->th_off << 2;
	payload_len = ip_hdr->ip_len - off;

	flow = &lro_flow_list[*flow_id];

	if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
	    (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
	    (flow->lr_fport == tcp_hdr->th_sport) &&
	    (flow->lr_lport == tcp_hdr->th_dport)) {
		if (flow->lr_tcphdr == NULL) {
			if (ntohl(seqnum) == flow->lr_seq) {
				return TCP_LRO_COALESCE;
			}
			if (lrodebug >= 4) {
				printf("%s: seqnum = %x, lr_seq = %x\n",
				    __func__, ntohl(seqnum), flow->lr_seq);
			}
			lro_seq_mismatch++;
			if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
				lro_seq_outoforder++;
				/*
				 * Out-of-order packets signal loss and
				 * recovery, and LRO doesn't let flows
				 * recover quickly. So eject.
				 */
				flow->lr_flags |= LRO_EJECT_REQ;
			}
			return TCP_LRO_NAN;
		}

		if (flow->lr_flags & LRO_EJECT_REQ) {
			if (lrodebug) {
				printf("%s: eject. \n", __func__);
			}
			return TCP_LRO_EJECT_FLOW;
		}
		if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) {
			if (lrodebug) {
				printf("%s: th_ack = %x flow_ack = %x \n",
				    __func__, tcp_hdr->th_ack,
				    flow->lr_tcphdr->th_ack);
			}
			return TCP_LRO_EJECT_FLOW;
		}

		if (ntohl(seqnum) ==
		    (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) +
		    lro_flow_list[*flow_id].lr_len)) {
			return TCP_LRO_COALESCE;
		} else {
			/* LRO does not handle loss recovery well, eject */
			flow->lr_flags |= LRO_EJECT_REQ;
			return TCP_LRO_EJECT_FLOW;
		}
	}
	if (lrodebug) {
		printf("tcp_lro_matching_tuple: collision \n");
	}
	return TCP_LRO_COLLISION;
}

static void
tcp_lro_init_flow(int flow_id, struct ip* ip_hdr, struct tcphdr *tcp_hdr,
    int hash, u_int32_t timestamp, int payload_len)
{
	struct lro_flow *flow = NULL;

	flow = &lro_flow_list[flow_id];

	flow->lr_hash_map = hash;
	flow->lr_faddr.s_addr = ip_hdr->ip_src.s_addr;
	flow->lr_laddr.s_addr = ip_hdr->ip_dst.s_addr;
	flow->lr_fport = tcp_hdr->th_sport;
	flow->lr_lport = tcp_hdr->th_dport;
	lro_flow_map[hash] = flow_id;
	flow->lr_timestamp = timestamp;
	flow->lr_seq = ntohl(tcp_hdr->th_seq) + payload_len;
	flow->lr_flags = 0;
	return;
}

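/*
 * Append a segment to a flow.  When a chain already exists, the TCP/IP
 * headers of lro_mb are trimmed with m_adj() and the payload is linked
 * onto lr_mtail; the head mbuf's ip_len, pkthdr.len and lro_npkts then
 * describe the whole coalesced packet.  Otherwise lro_mb (if any)
 * becomes the new head and the flush timer is armed.
 */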
static void
tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr,
    int payload_len, int drop_hdrlen, struct tcpopt *topt,
    u_int32_t* tsval, u_int32_t* tsecr, int thflags)
{
	struct lro_flow *flow = NULL;
	struct mbuf *last;
	struct ip *ip = NULL;

	flow = &lro_flow_list[flow_id];
	if (flow->lr_mhead) {
		if (lrodebug) {
			printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
			    payload_len);
		}
		m_adj(lro_mb, drop_hdrlen);

		last = flow->lr_mtail;
		while (last->m_next != NULL) {
			last = last->m_next;
		}
		last->m_next = lro_mb;

		flow->lr_mtail = lro_mb;

		ip = mtod(flow->lr_mhead, struct ip *);
		ip->ip_len += lro_mb->m_pkthdr.len;
		flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;

		if (flow->lr_len == 0) {
			panic_plain("%s: Inconsistent LRO flow state", __func__);
		}
		flow->lr_len += payload_len;
		flow->lr_seq += payload_len;
		/*
		 * This bit is re-OR'd each time a packet is added to the
		 * large coalesced packet.
		 */
		flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
		flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
		if (flow->lr_mhead->m_pkthdr.lro_pktlen <
		    lro_mb->m_pkthdr.lro_pktlen) {
			/*
			 * For TCP Inter Arrival Jitter calculation, return max
			 * size encountered while coalescing a stream of pkts.
			 */
			flow->lr_mhead->m_pkthdr.lro_pktlen =
			    lro_mb->m_pkthdr.lro_pktlen;
		}
		/* Update the timestamp value */
		if (topt->to_flags & TOF_TS) {
			if ((flow->lr_tsval) &&
			    (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
				*(flow->lr_tsval) = htonl(topt->to_tsval);
			}
			if ((flow->lr_tsecr) &&
			    (topt->to_tsecr != 0) &&
			    (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
				if (lrodebug >= 2) {
					printf("%s: instantaneous RTT = %d \n", __func__,
					    topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
				}
				*(flow->lr_tsecr) = htonl(topt->to_tsecr);
			}
		}
		/* Coalesce the flags */
		if (thflags) {
			flow->lr_tcphdr->th_flags |= thflags;
		}
		/* Update receive window */
		flow->lr_tcphdr->th_win = tcphdr->th_win;
	} else {
		if (lro_mb) {
			flow->lr_mhead = flow->lr_mtail = lro_mb;
			flow->lr_mhead->m_pkthdr.pkt_flags |= PKTF_SW_LRO_PKT;
			flow->lr_tcphdr = tcphdr;
			if ((topt) && (topt->to_flags & TOF_TS)) {
				ASSERT(tsval != NULL);
				ASSERT(tsecr != NULL);
				flow->lr_tsval = tsval;
				flow->lr_tsecr = tsecr;
			}
			flow->lr_len = payload_len;
			calculate_tcp_clock();
			flow->lr_timestamp = tcp_now;
			tcp_lro_sched_timer(0);
		}
		flow->lr_seq = ntohl(tcphdr->th_seq) + payload_len;
	}
	if (lro_mb) {
		tcpstat.tcps_coalesced_pack++;
	}
	return;
}

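/*
 * Two flavors of ejection: tcp_lro_eject_flow() releases the entire
 * table entry (hash mapping included), while
 * tcp_lro_eject_coalesced_pkt() only detaches the coalesced mbuf chain
 * and leaves the flow entry in place so coalescing can resume.
 */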
static struct mbuf *
tcp_lro_eject_flow(int flow_id)
{
	struct mbuf *mb = NULL;

	mb = lro_flow_list[flow_id].lr_mhead;
	ASSERT(lro_flow_map[lro_flow_list[flow_id].lr_hash_map] == flow_id);
	lro_flow_map[lro_flow_list[flow_id].lr_hash_map] = TCP_LRO_FLOW_UNINIT;
	bzero(&lro_flow_list[flow_id], sizeof(struct lro_flow));

	return mb;
}

static struct mbuf*
tcp_lro_eject_coalesced_pkt(int flow_id)
{
	struct mbuf *mb = NULL;
	mb = lro_flow_list[flow_id].lr_mhead;
	lro_flow_list[flow_id].lr_mhead =
	    lro_flow_list[flow_id].lr_mtail = NULL;
	lro_flow_list[flow_id].lr_tcphdr = NULL;
	return mb;
}

static struct mbuf*
tcp_lro_insert_flow(struct mbuf *lro_mb, struct ip *ip_hdr,
    struct tcphdr *tcp_hdr, int payload_len,
    int drop_hdrlen, int hash, struct tcpopt *topt,
    u_int32_t *tsval, u_int32_t *tsecr)
{
	int i;
	int slot_available = 0;
	int candidate_flow = 0;
	u_int32_t oldest_timestamp;
	struct mbuf *mb = NULL;
	int collision = 0;

	oldest_timestamp = tcp_now;

	/* handle collision */
	if (lro_flow_map[hash] != TCP_LRO_FLOW_UNINIT) {
		if (lrodebug) {
			collision = 1;
		}
		candidate_flow = lro_flow_map[hash];
		tcpstat.tcps_flowtbl_collision++;
		goto kick_flow;
	}

	for (i = 0; i < TCP_LRO_NUM_FLOWS; i++) {
		if (lro_flow_list[i].lr_mhead == NULL) {
			candidate_flow = i;
			slot_available = 1;
			break;
		}
		if (oldest_timestamp >= lro_flow_list[i].lr_timestamp) {
			candidate_flow = i;
			oldest_timestamp = lro_flow_list[i].lr_timestamp;
		}
	}

	if (!slot_available) {
		tcpstat.tcps_flowtbl_full++;
kick_flow:
		/* kick the oldest flow */
		mb = tcp_lro_eject_flow(candidate_flow);

		if (lrodebug) {
			if (!slot_available) {
				printf("%s: slot unavailable.\n", __func__);
			}
			if (collision) {
				printf("%s: collision.\n", __func__);
			}
		}
	} else {
		candidate_flow = i;     /* this is now the flow to be used */
	}

	tcp_lro_init_flow(candidate_flow, ip_hdr, tcp_hdr, hash,
	    tcp_now, payload_len);
	tcp_lro_coalesce(candidate_flow, lro_mb, tcp_hdr, payload_len,
	    drop_hdrlen, topt, tsval, tsecr, 0);
	return mb;
}

struct mbuf*
tcp_lro_process_pkt(struct mbuf *lro_mb, int drop_hdrlen)
{
	int flow_id = TCP_LRO_FLOW_UNINIT;
	int hash;
	unsigned int off = 0;
	int eject_flow = 0;
	int optlen;
	int retval = 0;
	struct mbuf *mb = NULL;
	int payload_len = 0;
	u_char *optp = NULL;
	int thflags = 0;
	struct tcpopt to;
	int ret_response = TCP_LRO_CONSUMED;
	int coalesced = 0, tcpflags = 0, unknown_tcpopts = 0;
	u_int8_t ecn;
	struct ip *ip_hdr;
	struct tcphdr *tcp_hdr;

	if (lro_mb->m_len < drop_hdrlen) {
		if ((lro_mb = m_pullup(lro_mb, drop_hdrlen)) == NULL) {
			tcpstat.tcps_rcvshort++;
			m_freem(lro_mb);
			if (lrodebug) {
				printf("tcp_lro_process_pkt: mbuf too short.\n");
			}
			return NULL;
		}
	}

	ip_hdr = mtod(lro_mb, struct ip*);
	tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + sizeof(struct ip));

	/* Just in case */
	lro_mb->m_pkthdr.pkt_flags &= ~PKTF_SW_LRO_DID_CSUM;

	if ((lro_mb = lro_tcp_xsum_validate(lro_mb, ip_hdr, tcp_hdr)) == NULL) {
		if (lrodebug) {
			printf("tcp_lro_process_pkt: TCP xsum failed.\n");
		}
		return NULL;
	}

	/* Update stats */
	lro_pkt_count++;

	/* Avoids checksumming in tcp_input */
	lro_mb->m_pkthdr.pkt_flags |= PKTF_SW_LRO_DID_CSUM;

	off = tcp_hdr->th_off << 2;
	optlen = off - sizeof(struct tcphdr);
	payload_len = ip_hdr->ip_len - off;
	optp = (u_char *)(tcp_hdr + 1);
	/*
	 * Do quick retrieval of timestamp options ("options
	 * prediction?"). If timestamp is the only option and it's
	 * formatted as recommended in RFC 1323 appendix A, we
	 * quickly get the values now and not bother calling
	 * tcp_dooptions(), etc.
	 */
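	/*
	 * For reference, the RFC 1323 appendix A encoding matched below
	 * is NOP(1) NOP(1) TIMESTAMP(8) LEN(10) followed by the 4-byte
	 * tsval and tsecr fields, 12 bytes total (TCPOLEN_TSTAMP_APPA);
	 * hence the reads at optp + 4 and optp + 8.
	 */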
	bzero(&to, sizeof(to));
	if ((optlen == TCPOLEN_TSTAMP_APPA ||
	    (optlen > TCPOLEN_TSTAMP_APPA &&
	    optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
	    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
	    (tcp_hdr->th_flags & TH_SYN) == 0) {
		to.to_flags |= TOF_TS;
		to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
		to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
	} else {
		/*
		 * If TCP timestamps are not in use, or are not the first
		 * option, skip the LRO path: timestamps are used to keep
		 * LRO from introducing additional latency for
		 * retransmissions and other slow-paced transmissions.
		 */
		to.to_flags = to.to_tsecr = 0;
		eject_flow = 1;
	}

	/* list all the conditions that can trigger a flow ejection here */

	thflags = tcp_hdr->th_flags;
	if (thflags & (TH_SYN | TH_URG | TH_ECE | TH_CWR | TH_PUSH | TH_RST | TH_FIN)) {
		eject_flow = tcpflags = 1;
	}

	if (optlen && !((optlen == TCPOLEN_TSTAMP_APPA) &&
	    (to.to_flags & TOF_TS))) {
		eject_flow = unknown_tcpopts = 1;
	}

	if (payload_len <= LRO_MIN_COALESC_SZ) {        /* zero payload ACK */
		eject_flow = 1;
	}

	/* Can't coalesce ECN marked packets. */
	ecn = ip_hdr->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		/*
		 * ECN needs quick notification
		 */
		if (lrodebug) {
			printf("%s: CE bit set.\n", __func__);
		}
		eject_flow = 1;
	}

	lck_mtx_lock_spin(&tcp_lro_lock);

	retval = tcp_lro_matching_tuple(ip_hdr, tcp_hdr, &hash, &flow_id);

	switch (retval) {
	case TCP_LRO_NAN:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	case TCP_LRO_COALESCE:
		if ((payload_len != 0) && (unknown_tcpopts == 0) &&
		    (tcpflags == 0) && (ecn != IPTOS_ECN_CE) && (to.to_flags & TOF_TS)) {
			tcp_lro_coalesce(flow_id, lro_mb, tcp_hdr, payload_len,
			    drop_hdrlen, &to,
			    (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 4) : NULL,
			    (to.to_flags & TOF_TS) ? (u_int32_t *)(void *)(optp + 8) : NULL,
			    thflags);
			if (lrodebug >= 2) {
				printf("tcp_lro_process_pkt: coalesce len = %d. flow_id = %d payload_len = %d drop_hdrlen = %d optlen = %d lport = %d seqnum = %x.\n",
				    lro_flow_list[flow_id].lr_len, flow_id,
				    payload_len, drop_hdrlen, optlen,
				    ntohs(lro_flow_list[flow_id].lr_lport),
				    ntohl(tcp_hdr->th_seq));
			}
			if (lro_flow_list[flow_id].lr_mhead->m_pkthdr.lro_npkts >= coalesc_sz) {
				eject_flow = 1;
			}
			coalesced = 1;
		}
		if (eject_flow) {
			mb = tcp_lro_eject_coalesced_pkt(flow_id);
			lro_flow_list[flow_id].lr_seq = ntohl(tcp_hdr->th_seq) +
			    payload_len;
			calculate_tcp_clock();
			u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
			lck_mtx_unlock(&tcp_lro_lock);
			if (mb) {
				mb->m_pkthdr.lro_elapsed = timestamp;
				lro_proto_input(mb);
			}
			if (!coalesced) {
				if (lrodebug >= 2) {
					printf("%s: pkt payload_len = %d \n", __func__, payload_len);
				}
				lro_proto_input(lro_mb);
			}
		} else {
			lck_mtx_unlock(&tcp_lro_lock);
		}
		break;

	case TCP_LRO_EJECT_FLOW:
		mb = tcp_lro_eject_coalesced_pkt(flow_id);
		calculate_tcp_clock();
		u_int8_t timestamp = tcp_now - lro_flow_list[flow_id].lr_timestamp;
		lck_mtx_unlock(&tcp_lro_lock);
		if (mb) {
			if (lrodebug) {
				printf("tcp_lro_process_pkt eject_flow, len = %d\n", mb->m_pkthdr.len);
			}
			mb->m_pkthdr.lro_elapsed = timestamp;
			lro_proto_input(mb);
		}

		lro_proto_input(lro_mb);
		break;

	case TCP_LRO_COLLISION:
		lck_mtx_unlock(&tcp_lro_lock);
		ret_response = TCP_LRO_FLOW_NOTFOUND;
		break;

	default:
		lck_mtx_unlock(&tcp_lro_lock);
		panic_plain("%s: unrecognized type %d", __func__, retval);
		break;
	}

	if (ret_response == TCP_LRO_FLOW_NOTFOUND) {
		lro_proto_input(lro_mb);
	}
	return NULL;
}

static void
tcp_lro_timer_proc(void *arg1, void *arg2)
{
#pragma unused(arg1, arg2)

	lck_mtx_lock_spin(&tcp_lro_lock);
	lro_timer_set = 0;
	lck_mtx_unlock(&tcp_lro_lock);
	tcp_lro_flush_flows();
}

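/*
 * Timer work: walk the flow table and hand every pending coalesced
 * chain up the stack, stamping each head mbuf with the elapsed
 * buffering time (lro_elapsed).
 */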
static void
tcp_lro_flush_flows(void)
{
	int i = 0;
	struct mbuf *mb;
	struct lro_flow *flow;
	int tcpclock_updated = 0;

	lck_mtx_lock(&tcp_lro_lock);

	while (i < TCP_LRO_NUM_FLOWS) {
		flow = &lro_flow_list[i];
		if (flow->lr_mhead != NULL) {
			if (!tcpclock_updated) {
				calculate_tcp_clock();
				tcpclock_updated = 1;
			}

			if (lrodebug >= 2) {
				printf("tcp_lro_flush_flows: len = %d n_pkts = %d %d %d \n",
				    flow->lr_len,
				    flow->lr_mhead->m_pkthdr.lro_npkts,
				    flow->lr_timestamp, tcp_now);
			}

			u_int8_t timestamp = tcp_now - flow->lr_timestamp;

			mb = tcp_lro_eject_flow(i);

			if (mb) {
				mb->m_pkthdr.lro_elapsed = timestamp;
				lck_mtx_unlock(&tcp_lro_lock);
				lro_update_flush_stats(mb);
				lro_proto_input(mb);
				lck_mtx_lock(&tcp_lro_lock);
			}
		}
		i++;
	}
	lck_mtx_unlock(&tcp_lro_lock);
}

/*
 * Must be called with tcp_lro_lock held.
 * The hint is non-zero for longer waits.  If a timer is already
 * pending (lro_timer_set), the deadline that was armed first
 * (normally the one dictated by coalesc_time) takes precedence and
 * no new timer is scheduled.
 */
static void
tcp_lro_sched_timer(uint64_t hint)
{
	if (lro_timer_set) {
		return;
	}

	lro_timer_set = 1;
	if (!hint) {
		/* the intent is to wake up every coalesc_time msecs */
		clock_interval_to_deadline(coalesc_time,
		    (NSEC_PER_SEC / TCP_RETRANSHZ), &lro_deadline);
	} else {
		clock_interval_to_deadline(hint, NSEC_PER_SEC / TCP_RETRANSHZ,
		    &lro_deadline);
	}
	thread_call_enter_delayed(tcp_lro_timer, lro_deadline);
}

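/*
 * Entry point into software LRO, called with a received packet and its
 * IP header length (presumably from the IPv4 input path; the caller is
 * not in this file).  Returns the mbuf unchanged when the packet should
 * take the normal input path, or NULL once LRO has consumed or freed it.
 */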
struct mbuf*
tcp_lro(struct mbuf *m, unsigned int hlen)
{
	struct ip *ip_hdr;
	unsigned int tlen;
	struct tcphdr *tcp_hdr = NULL;
	unsigned int off = 0;

	if (kipf_count != 0) {
		return m;
	}

	/*
	 * Experiments on cellular show that the RTT is much higher
	 * than the coalescing time of 5 msecs, causing lro to flush
	 * 80% of the time on a single packet. Increasing
	 * coalescing time for cellular does not show marked
	 * improvement to throughput either. Loopback perf is hurt
	 * by the 5 msec latency and it already sends large packets.
	 */
	if (IFNET_IS_CELLULAR(m->m_pkthdr.rcvif) ||
	    (m->m_pkthdr.rcvif->if_type == IFT_LOOP)) {
		return m;
	}

	ip_hdr = mtod(m, struct ip*);

	/* don't deal with IP options */
	if (hlen != sizeof(struct ip)) {
		return m;
	}

	/* only TCP is coalesced */
	if (ip_hdr->ip_p != IPPROTO_TCP) {
		return m;
	}

	if (m->m_len < (int32_t) sizeof(struct tcpiphdr)) {
		if (lrodebug) {
			printf("tcp_lro m_pullup \n");
		}
		if ((m = m_pullup(m, sizeof(struct tcpiphdr))) == NULL) {
			tcpstat.tcps_rcvshort++;
			if (lrodebug) {
				printf("ip_lro: rcvshort.\n");
			}
			return NULL;
		}
		ip_hdr = mtod(m, struct ip*);
	}

	tcp_hdr = (struct tcphdr *)((caddr_t)ip_hdr + hlen);
	tlen = ip_hdr->ip_len;  /* ip_len does not include the IP header's own length here */
	m->m_pkthdr.lro_pktlen = tlen; /* Used to return max pkt encountered to tcp */
	m->m_pkthdr.lro_npkts = 1; /* Initialize a counter to hold num pkts coalesced */
	m->m_pkthdr.lro_elapsed = 0; /* Initialize the field to carry elapsed time */
	off = tcp_hdr->th_off << 2;
	if (off < sizeof(struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		if (lrodebug) {
			printf("ip_lro: TCP data offset out of range.\n");
		}
		return m;
	}

	return tcp_lro_process_pkt(m, hlen + off);
}

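/*
 * Deliver a packet (single or coalesced chain) to the inbound protocol
 * handler via the IP dispatch wrapper, updating LRO stats on the way.
 */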
static void
lro_proto_input(struct mbuf *m)
{
	struct ip* ip_hdr = mtod(m, struct ip*);

	if (lrodebug >= 3) {
		printf("lro_proto_input: ip_len = %d \n",
		    ip_hdr->ip_len);
	}
	lro_update_stats(m);
	ip_proto_dispatch_in_wrapper(m, ip_hdr->ip_hl << 2, ip_hdr->ip_p);
}

static struct mbuf *
lro_tcp_xsum_validate(struct mbuf *m, struct ip *ip, struct tcphdr * th)
{
	/* Expect 32-bit aligned data pointer on strict-align platforms */
	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);

	/* we shouldn't get here for IP with options; hence sizeof (ip) */
	if (tcp_input_checksum(AF_INET, m, th, sizeof(*ip), ip->ip_len)) {
		if (lrodebug) {
			printf("%s: bad xsum and drop m = 0x%llx.\n", __func__,
			    (uint64_t)VM_KERNEL_ADDRPERM(m));
		}
		m_freem(m);
		return NULL;
	}

	return m;
}

/*
 * When TCP detects a stable, steady flow without out-of-order delivery,
 * with a sufficiently high cwnd, it invokes LRO.
 */
int
tcp_start_coalescing(struct ip *ip_hdr, struct tcphdr *tcp_hdr, int tlen)
{
	int hash;
	int flow_id;
	struct mbuf *eject_mb;
	struct lro_flow *lf;

	hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
	    tcp_hdr->th_sport, tcp_hdr->th_dport,
	    (TCP_LRO_FLOW_MAP - 1));

	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id != TCP_LRO_FLOW_NOTFOUND) {
		lf = &lro_flow_list[flow_id];
		if ((lf->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
		    (lf->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
		    (lf->lr_fport == tcp_hdr->th_sport) &&
		    (lf->lr_lport == tcp_hdr->th_dport)) {
			if ((lf->lr_tcphdr == NULL) &&
			    (lf->lr_seq != (tcp_hdr->th_seq + tlen))) {
				lf->lr_seq = tcp_hdr->th_seq + tlen;
			}
			lf->lr_flags &= ~LRO_EJECT_REQ;
		}
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}

	HTONL(tcp_hdr->th_seq);
	HTONL(tcp_hdr->th_ack);
	eject_mb = tcp_lro_insert_flow(NULL, ip_hdr, tcp_hdr, tlen, 0, hash,
	    NULL, NULL, NULL);

	lck_mtx_unlock(&tcp_lro_lock);

	NTOHL(tcp_hdr->th_seq);
	NTOHL(tcp_hdr->th_ack);
	if (lrodebug >= 3) {
		printf("%s: src = %x dst = %x sport = %d dport = %d seq %x \n",
		    __func__, ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr,
		    tcp_hdr->th_sport, tcp_hdr->th_dport, tcp_hdr->th_seq);
	}
	ASSERT(eject_mb == NULL);
	return 0;
}

/*
 * When TCP detects loss or an idle condition, it stops offloading
 * to LRO.
 */
int
tcp_lro_remove_state(struct in_addr saddr, struct in_addr daddr,
    unsigned short sport, unsigned short dport)
{
	int hash, flow_id;
	struct lro_flow *lf;

	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
	    (TCP_LRO_FLOW_MAP - 1));
	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id == TCP_LRO_FLOW_UNINIT) {
		lck_mtx_unlock(&tcp_lro_lock);
		return 0;
	}
	lf = &lro_flow_list[flow_id];
	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
	    (lf->lr_laddr.s_addr == saddr.s_addr) &&
	    (lf->lr_fport == dport) &&
	    (lf->lr_lport == sport)) {
		if (lrodebug) {
			printf("%s: %x %x\n", __func__,
			    lf->lr_flags, lf->lr_seq);
		}
		lf->lr_flags |= LRO_EJECT_REQ;
	}
	lck_mtx_unlock(&tcp_lro_lock);
	return 0;
}

void
tcp_update_lro_seq(__uint32_t rcv_nxt, struct in_addr saddr, struct in_addr daddr,
    unsigned short sport, unsigned short dport)
{
	int hash, flow_id;
	struct lro_flow *lf;

	hash = LRO_HASH(daddr.s_addr, saddr.s_addr, dport, sport,
	    (TCP_LRO_FLOW_MAP - 1));
	lck_mtx_lock_spin(&tcp_lro_lock);
	flow_id = lro_flow_map[hash];
	if (flow_id == TCP_LRO_FLOW_UNINIT) {
		lck_mtx_unlock(&tcp_lro_lock);
		return;
	}
	lf = &lro_flow_list[flow_id];
	if ((lf->lr_faddr.s_addr == daddr.s_addr) &&
	    (lf->lr_laddr.s_addr == saddr.s_addr) &&
	    (lf->lr_fport == dport) &&
	    (lf->lr_lport == sport) &&
	    (lf->lr_tcphdr == NULL)) {
		lf->lr_seq = (tcp_seq)rcv_nxt;
	}
	lck_mtx_unlock(&tcp_lro_lock);
	return;
}

static void
lro_update_stats(struct mbuf *m)
{
	switch (m->m_pkthdr.lro_npkts) {
	case 0: /* fall through */
	case 1:
		break;

	case 2:
		tcpstat.tcps_lro_twopack++;
		break;

	case 3: /* fall through */
	case 4:
		tcpstat.tcps_lro_multpack++;
		break;

	default:
		tcpstat.tcps_lro_largepack++;
		break;
	}
	return;
}

static void
lro_update_flush_stats(struct mbuf *m)
{
	lro_flushes++;
	switch (m->m_pkthdr.lro_npkts) {
	case 0: ASSERT(0);
	case 1: lro_single_flushes++;
		break;
	case 2: lro_double_flushes++;
		break;
	default: lro_good_flushes++;
		break;
	}
	return;
}