/*
 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * A note on the MPTCP/NECP-interactions:
 *
 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
 * MPTCP registers to these events at the MPTCP-layer for interface-events
 * through a call to necp_client_register_multipath_cb.
 * To get per-flow events (aka per TCP-subflow), we register to it with
 * necp_client_register_socket_flow. Both registrations happen by using the
 * necp-client-uuid that comes from the app.
 *
 * The locking is rather tricky. In general, we expect the lock-ordering to
 * happen from necp-fd -> necp-client -> mpp_lock.
 *
 * There are however some subtleties.
 *
 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
 * safe, because it is the very first time this MPTCP-connection goes into NECP.
 * As we go into NECP we take the NECP-locks and thus are guaranteed that no
 * NECP-locks will deadlock us, because these NECP-events will also first take
 * the NECP-locks. Either they win the race and thus won't find our
 * MPTCP-connection. Or, MPTCP wins the race and thus it will safely install
 * the callbacks while holding the NECP lock.
 *
 * 2. When registering the subflow-callbacks we must unlock the mpp_lock,
 * because we have already registered callbacks and we might race against an
 * NECP-event that will match on our socket. So, we have to unlock to be safe.
 *
 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(), after the
 * so_usecount has reached 0. We must be careful to not remove the mpp_socket
 * pointers before we unregistered the callback, because, again, we might be
 * racing against an NECP-event. Unregistering must happen with an unlocked
 * mpp_lock, because of the lock-ordering constraint. It could be that
 * an NECP-event triggers before we had a chance to unregister. That's why
 * we need to check for the so_usecount in mptcp_session_necp_cb. If we get
 * there while the socket is being garbage-collected, the use-count will go
 * down to 0 and we exit. Removal of the multipath_cb again happens by taking
 * the NECP-locks, so any running NECP-events will finish first and exit cleanly.
 *
 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
 * the socket-lock must be unlocked for lock-ordering constraints. This gets a
 * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
 * gets it, it will realize that the subflow became non-MPTCP and retry (see
 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
 * for the NECP-lock (held by the other thread that is taking care of the NECP-
 * event). So, the event now finally gets the subflow-lock and then hits an
 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
 * the NECP callback.
 */
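
/*
 * Condensed, illustrative summary of the note above (no additional
 * constraints, just the ordering rules restated):
 *
 *	necp-fd lock  ->  necp-client lock  ->  mpp_lock
 *
 *	register multipath_cb:    mpp_lock may stay held (first NECP contact)
 *	register subflow cb:      mpp_lock must be dropped first
 *	unregister multipath_cb:  mp_pcbdispose(), mpp_lock dropped,
 *	                          so_usecount re-checked in the callback
 *	unregister subflow cb:    in_pcbdispose(), socket-lock dropped
 */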

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/protosw.h>

#include <kern/zalloc.h>
#include <kern/locks.h>

#include <mach/sdt.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_timer.h>

int mptcp_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_enable, 0, "Enable Multipath TCP Support");

/* Number of times to try negotiating MPTCP on SYN retransmissions */
int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");

/*
 * By default, DSS checksum is turned off, revisit if we ever do
 * MPTCP for non-SSL traffic.
 */
int mptcp_dss_csum = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_dss_csum, 0, "Enable DSS checksum");

/*
 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 * is attempted on a different path.
 */
int mptcp_fail_thresh = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_fail_thresh, 0, "Failover threshold");


/*
 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
 * Some carrier networks have a timeout of 10 or 15 minutes.
 */
int mptcp_subflow_keeptime = 60*14;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_subflow_keeptime, 0, "Keepalive in seconds");

int mptcp_rtthist_rtthresh = 600;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_rtthist_rtthresh, 0, "Rtt threshold");

/*
 * Use RTO history for sending new data
 */
int mptcp_use_rto = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_use_rto, 0, "Disable RTO for subflow selection");

int mptcp_rtothresh = 1500;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_rtothresh, 0, "RTO threshold");

/*
 * Probe the preferred path, when it is not in use
 */
uint32_t mptcp_probeto = 1000;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_probeto, 0, "Disable probing by setting to 0");

uint32_t mptcp_probecnt = 5;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_probecnt, 0, "Number of probe writes");

/*
 * Static declarations
 */
static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
	uint32_t, uint16_t, uint16_t, uint16_t);

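/*
 * Walk the MPTCP reassembly queue and append every segment that is now
 * in-order (its DSN equals mpt_rcvnxt) to the MP socket's receive buffer,
 * advancing mpt_rcvnxt. Returns non-zero if the last segment handed up
 * carried a DATA_FIN.
 */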
static int
mptcp_reass_present(struct socket *mp_so)
{
	struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
	struct tseg_qent *q;
	int dowakeup = 0;
	int flags = 0;

	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
		return (flags);
	q = LIST_FIRST(&mp_tp->mpt_segq);
	if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt)
		return (flags);

	/*
	 * If there is already another thread doing reassembly for this
	 * connection, it is better to let it finish the job --
	 * (radar 16316196)
	 */
	if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG)
		return (flags);

	mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;

	do {
		mp_tp->mpt_rcvnxt += q->tqe_len;
		LIST_REMOVE(q, tqe_q);
		if (mp_so->so_state & SS_CANTRCVMORE) {
			m_freem(q->tqe_m);
		} else {
			flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
			if (sbappendstream_rcvdemux(mp_so, q->tqe_m, 0, 0))
				dowakeup = 1;
		}
		zfree(tcp_reass_zone, q);
		mp_tp->mpt_reassqlen--;
		q = LIST_FIRST(&mp_tp->mpt_segq);
	} while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
	mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;

	if (dowakeup)
		sorwakeup(mp_so); /* done with socket lock held */
	return (flags);
}

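/*
 * Insert a segment into the MPTCP reassembly queue by DSN, trimming any
 * overlap with its neighbors, then try to hand in-order data to the user
 * via mptcp_reass_present().
 */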
static int
mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
{
	struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
	u_int64_t mb_dsn = phdr->mp_dsn;
	struct tseg_qent *q;
	struct tseg_qent *p = NULL;
	struct tseg_qent *nq;
	struct tseg_qent *te = NULL;
	u_int16_t qlimit;

	/*
	 * Limit the number of segments in the reassembly queue to prevent
	 * holding on to too many segments (and thus running out of mbufs).
	 * Make sure to let through the missing segment that caused this
	 * queue to build up. Always keep one global queue entry spare to be
	 * able to process the missing segment.
	 */
	qlimit = min(max(100, mp_so->so_rcv.sb_hiwat >> 10),
	    (tcp_autorcvbuf_max >> 10));
	if (mb_dsn != mp_tp->mpt_rcvnxt &&
	    (mp_tp->mpt_reassqlen + 1) >= qlimit) {
		tcpstat.tcps_mptcp_rcvmemdrop++;
		m_freem(m);
		*tlenp = 0;
		return (0);
	}

	/* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
	te = (struct tseg_qent *) zalloc(tcp_reass_zone);
	if (te == NULL) {
		tcpstat.tcps_mptcp_rcvmemdrop++;
		m_freem(m);
		return (0);
	}

	mp_tp->mpt_reassqlen++;

	/*
	 * Find a segment which begins after this one does.
	 */
	LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
		if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn))
			break;
		p = q;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already. If so, drop the data from the incoming
	 * segment. If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		int64_t i;
		/* conversion to int (in i) handles seq wraparound */
		i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
		if (i > 0) {
			if (i >= *tlenp) {
				tcpstat.tcps_mptcp_rcvduppack++;
				m_freem(m);
				zfree(tcp_reass_zone, te);
				te = NULL;
				mp_tp->mpt_reassqlen--;
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto out;
			}
			m_adj(m, i);
			*tlenp -= i;
			phdr->mp_dsn += i;
		}
	}

	tcpstat.tcps_mp_oodata++;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (q) {
		int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
		if (i <= 0)
			break;

		if (i < q->tqe_len) {
			q->tqe_m->m_pkthdr.mp_dsn += i;
			q->tqe_len -= i;
			m_adj(q->tqe_m, i);
			break;
		}

		nq = LIST_NEXT(q, tqe_q);
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		zfree(tcp_reass_zone, q);
		mp_tp->mpt_reassqlen--;
		q = nq;
	}

	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_th = NULL;
	te->tqe_len = *tlenp;

	if (p == NULL) {
		LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
	} else {
		LIST_INSERT_AFTER(p, te, tqe_q);
	}

out:
	return (mptcp_reass_present(mp_so));
}

/*
 * MPTCP input, called when data has been read from a subflow socket.
 */
void
mptcp_input(struct mptses *mpte, struct mbuf *m)
{
	struct socket *mp_so;
	struct mptcb *mp_tp = NULL;
	int count = 0, wakeup = 0;
	struct mbuf *save = NULL, *prev = NULL;
	struct mbuf *freelist = NULL, *tail = NULL;

	VERIFY(m->m_flags & M_PKTHDR);

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	DTRACE_MPTCP(input);

	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/*
	 * Each mbuf contains an MPTCP Data Sequence Map.
	 * Process the data for reassembly, delivery to the MPTCP socket
	 * client, etc.
	 */
	count = mp_so->so_rcv.sb_cc;

	/*
	 * In the degraded fallback case, data is accepted without DSS map
	 */
	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		struct mbuf *iter;
		int mb_dfin = 0;
fallback:
		mptcp_sbrcv_grow(mp_tp);

		for (iter = m; iter; iter = iter->m_next) {
			if ((iter->m_flags & M_PKTHDR) &&
			    (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
				mb_dfin = 1;
				break;
			}
		}

		/*
		 * Assume a degraded flow, as this may be the first packet
		 * without DSS, and the subflow state is not updated yet.
		 */
		if (sbappendstream_rcvdemux(mp_so, m, 0, 0))
			sorwakeup(mp_so);

		DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
		    struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte);
		count = mp_so->so_rcv.sb_cc - count;

		mp_tp->mpt_rcvnxt += count;

		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}

		mptcplog((LOG_DEBUG, "%s: Fallback read %d bytes\n", __func__,
		    count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		return;
	}

	do {
		u_int64_t mb_dsn;
		int32_t mb_datalen;
		int64_t todrop;
		int mb_dfin = 0;

		/* If fallback occurs, mbufs will not have PKTF_MPTCP set */
		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
			goto fallback;

		save = m->m_next;
		/*
		 * A single TCP packet formed of multiple mbufs
		 * holds DSS mapping in the first mbuf of the chain.
		 * Other mbufs in the chain may have M_PKTHDR set
		 * even though they belong to the same TCP packet
		 * and therefore use the DSS mapping stored in the
		 * first mbuf of the mbuf chain. mptcp_input() can
		 * get an mbuf chain with multiple TCP packets.
		 */
		while (save && (!(save->m_flags & M_PKTHDR) ||
		    !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
			prev = save;
			save = save->m_next;
		}
		if (prev)
			prev->m_next = NULL;
		else
			m->m_next = NULL;

		mb_dsn = m->m_pkthdr.mp_dsn;
		mb_datalen = m->m_pkthdr.mp_rlen;

		todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
		if (todrop > 0) {
			tcpstat.tcps_mptcp_rcvpackafterwin++;

			if (todrop >= mb_datalen) {
				if (freelist == NULL)
					freelist = m;
				else
					tail->m_next = m;

				if (prev != NULL)
					tail = prev;
				else
					tail = m;

				m = save;
				prev = save = NULL;
				continue;
			} else {
				m_adj(m, -todrop);
				mb_datalen -= todrop;
			}

			/*
			 * We drop from the right edge of the mbuf, thus the
			 * DATA_FIN is dropped as well
			 */
			m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
		}

		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
		    !LIST_EMPTY(&mp_tp->mpt_segq)) {
			mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);

			goto next;
		}
		mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);

		if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
			if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
			    mp_tp->mpt_rcvnxt)) {
				if (freelist == NULL)
					freelist = m;
				else
					tail->m_next = m;

				if (prev != NULL)
					tail = prev;
				else
					tail = m;

				m = save;
				prev = save = NULL;
				continue;
			} else {
				m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn));
			}
			mptcplog((LOG_INFO, "%s: Left Edge %llu\n", __func__,
			    mp_tp->mpt_rcvnxt),
			    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		}

		mptcp_sbrcv_grow(mp_tp);

		if (sbappendstream_rcvdemux(mp_so, m, 0, 0))
			wakeup = 1;

		DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte,
		    struct mptcb *, mp_tp);
		count = mp_so->so_rcv.sb_cc - count;
		tcpstat.tcps_mp_rcvtotal++;
		tcpstat.tcps_mp_rcvbytes += count;
		mptcplog((LOG_DEBUG, "%s: Read %d bytes\n", __func__, count),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);

		mp_tp->mpt_rcvnxt += count;

next:
		if (mb_dfin) {
			mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
			socantrcvmore(mp_so);
		}
		m = save;
		prev = save = NULL;
		count = mp_so->so_rcv.sb_cc;
	} while (m);

	if (freelist)
		m_freem(freelist);

	if (wakeup)
		sorwakeup(mp_so);
}

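/*
 * Check whether there is anything the MPTCP-level connection may send
 * right now; the individual conditions are spelled out inline below.
 */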
static boolean_t
mptcp_can_send_more(struct mptcb *mp_tp)
{
	struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

	/*
	 * Always send if there is data in the reinject-queue.
	 */
	if (mp_tp->mpt_mpte->mpte_reinjectq)
		return (TRUE);

	/*
	 * Don't send, if:
	 *
	 * 1. snd_nxt >= snd_max : Means, basically everything has been sent.
	 *    Except when using TFO, we might be doing a 0-byte write.
	 * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
	 * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
	 */

	if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax))
		return (FALSE);

	if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt))
		return (FALSE);

	if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
		return (FALSE);

	if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2)
		return (FALSE);

	return (TRUE);
}

/*
 * MPTCP output.
 */
int
mptcp_output(struct mptses *mpte)
{
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct mptsub *mpts_tried = NULL;
	struct socket *mp_so;
	struct mptsub *preferred_mpts = NULL;
	uint64_t old_snd_nxt;
	int error = 0;

	mpte_lock_assert_held(mpte);
	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;

	VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
	mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;

	mptcplog((LOG_DEBUG, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n",
	    __func__, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
	    (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_sndwnd,
	    mpte->mpte_reinjectq ? 1 : 0,
	    mp_tp->mpt_state),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	old_snd_nxt = mp_tp->mpt_sndnxt;
	while (mptcp_can_send_more(mp_tp)) {
		/* get the "best" subflow to be used for transmission */
		mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
		if (mpts == NULL) {
			mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
			break;
		}

		mptcplog((LOG_DEBUG, "%s: using id %u\n", __func__, mpts->mpts_connid),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

		/* In case there's just one flow, we reattempt later */
		if (mpts_tried != NULL &&
		    (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
			mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
			mpts_tried->mpts_flags |= MPTSF_ACTIVE;
			mptcp_start_timer(mpte, MPTT_REXMT);
			mptcplog((LOG_DEBUG, "%s: retry later\n", __func__),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			break;
		}

		/*
		 * Automatic sizing of send socket buffer. Increase the send
		 * socket buffer size if all of the following criteria are met
		 *	1. the receiver has enough buffer space for this data
		 *	2. send buffer is filled to 7/8th with data (so we actually
		 *	   have data to make use of it);
		 */
		if (tcp_do_autosendbuf == 1 &&
		    (mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
		    tcp_cansbgrow(&mp_so->so_snd)) {
			if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
			    mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
				if (sbreserve(&mp_so->so_snd,
				    min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
				    tcp_autosndbuf_max)) == 1) {
					mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;

					mptcplog((LOG_DEBUG, "%s: increased snd hiwat to %u lowat %u\n",
					    __func__, mp_so->so_snd.sb_hiwat,
					    mp_so->so_snd.sb_lowat),
					    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
				}
			}
		}

		DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
		    struct socket *, mp_so);
		error = mptcp_subflow_output(mpte, mpts, 0);
		if (error) {
			/* can be a temporary loss of source address or other error */
			mpts->mpts_flags |= MPTSF_FAILINGOVER;
			mpts->mpts_flags &= ~MPTSF_ACTIVE;
			mpts_tried = mpts;
			mptcplog((LOG_ERR, "%s: Error = %d mpts_flags %#x\n", __func__,
			    error, mpts->mpts_flags),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
			break;
		}
		/* The model is to have only one active flow at a time */
		mpts->mpts_flags |= MPTSF_ACTIVE;
		mpts->mpts_probesoon = mpts->mpts_probecnt = 0;

		/* Allows us to update the smoothed rtt */
		if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
			if (preferred_mpts->mpts_probesoon) {
				if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
					mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
					if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
						preferred_mpts->mpts_probesoon = 0;
						preferred_mpts->mpts_probecnt = 0;
					}
				}
			} else {
				preferred_mpts->mpts_probesoon = tcp_now;
				preferred_mpts->mpts_probecnt = 0;
			}
		}

		if (mpte->mpte_active_sub == NULL) {
			mpte->mpte_active_sub = mpts;
		} else if (mpte->mpte_active_sub != mpts) {
			struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
			struct tcpcb *acttp = sototcpcb(mpte->mpte_active_sub->mpts_socket);

			mptcplog((LOG_DEBUG, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__,
			    mpte->mpte_active_sub->mpts_connid, acttp->t_srtt >> TCP_RTT_SHIFT,
			    mpts->mpts_connid, tp->t_srtt >> TCP_RTT_SHIFT),
			    (MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG), MPTCP_LOGLVL_LOG);

			mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
			mpte->mpte_active_sub = mpts;

			mptcpstats_inc_switch(mpte, mpts);
		}
	}

	mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);

	/* subflow errors should not be percolated back up */
	return (0);
}


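/*
 * Compare a candidate subflow against the current best of its class and
 * return the winner: lower smoothed RTT is preferred, but a loss-free
 * subflow (t_rxtshift == 0) beats one that is retransmitting.
 */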
static struct mptsub *
mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

	/*
	 * Lower RTT? Take it, if it's our first one, or
	 * it doesn't have any loss, or the current one has
	 * loss as well.
	 */
	if (tp->t_srtt && *currtt > tp->t_srtt &&
	    (curbest == NULL || tp->t_rxtshift == 0 ||
	     sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
		*currtt = tp->t_srtt;
		return (mpts);
	}

	/*
	 * If we find a subflow without loss, take it always!
	 */
	if (curbest &&
	    sototcpcb(curbest->mpts_socket)->t_rxtshift &&
	    tp->t_rxtshift == 0) {
		*currtt = tp->t_srtt;
		return (mpts);
	}

	return (curbest != NULL ? curbest : mpts);
}

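/*
 * Hand a subflow back to the caller only if it still has space in its
 * congestion window; otherwise return NULL.
 */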
static struct mptsub *
mptcp_return_subflow(struct mptsub *mpts)
{
	if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0)
		return (NULL);

	return (mpts);
}

/*
 * Return the most eligible subflow to be used for sending data.
 */
struct mptsub *
mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
{
	struct tcpcb *besttp, *secondtp;
	struct inpcb *bestinp, *secondinp;
	struct mptsub *mpts;
	struct mptsub *best = NULL;
	struct mptsub *second_best = NULL;
	int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;

	/*
	 * First step:
	 * Choose the best subflow for cellular and non-cellular interfaces.
	 */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		struct socket *so = mpts->mpts_socket;
		struct tcpcb *tp = sototcpcb(so);
		struct inpcb *inp = sotoinpcb(so);

		mptcplog((LOG_DEBUG, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
		    __func__, mpts->mpts_connid, ignore ? ignore->mpts_connid : -1, mpts->mpts_flags,
		    INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
		    inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
		    tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
		    mptcp_subflow_cwnd_space(so)),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		/*
		 * First, the hard conditions to reject subflows
		 * (e.g., not connected,...)
		 */
		if (mpts == ignore || inp->inp_last_outifp == NULL)
			continue;

		if (INP_WAIT_FOR_IF_FEEDBACK(inp))
			continue;

		/* There can only be one subflow in degraded state */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			best = mpts;
			break;
		}

		/*
		 * If this subflow is waiting to finally send, do it!
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA)
			return (mptcp_return_subflow(mpts));

		/*
		 * Only send if the subflow is MP_CAPABLE. The exceptions to
		 * this rule (degraded or TFO) have been taken care of above.
		 */
		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE))
			continue;

		if ((so->so_state & SS_ISDISCONNECTED) ||
		    !(so->so_state & SS_ISCONNECTED) ||
		    !TCPS_HAVEESTABLISHED(tp->t_state) ||
		    tp->t_state > TCPS_CLOSE_WAIT)
			continue;

		/*
		 * Second, the soft conditions to find the subflow with best
		 * conditions for each set (aka cellular vs non-cellular)
		 */
		if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
			second_best = mptcp_choose_subflow(mpts, second_best,
			    &exp_rtt);
		else
			best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
	}

	/*
	 * If there is no preferred or backup subflow, and there is no active
	 * subflow, use the last usable subflow.
	 */
	if (best == NULL)
		return (mptcp_return_subflow(second_best));

	if (second_best == NULL)
		return (mptcp_return_subflow(best));

	besttp = sototcpcb(best->mpts_socket);
	bestinp = sotoinpcb(best->mpts_socket);
	secondtp = sototcpcb(second_best->mpts_socket);
	secondinp = sotoinpcb(second_best->mpts_socket);

	if (preferred != NULL)
		*preferred = mptcp_return_subflow(best);

	/*
	 * Second step: among best and second_best, choose the one that is
	 * most appropriate for this particular service-type.
	 */
	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
		/*
		 * Only handover if Symptoms tells us to do so.
		 */
		if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
		    mptcp_is_wifi_unusable() &&
		    besttp->t_rxtshift >= mptcp_fail_thresh)
			return (mptcp_return_subflow(second_best));

		return (mptcp_return_subflow(best));
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
		int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
		int rto_thresh = mptcp_rtothresh;

		/* Adjust with symptoms information */
		if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
		    mptcp_is_wifi_unusable()) {
			rtt_thresh /= 2;
			rto_thresh /= 2;
		}

		if (besttp->t_srtt && secondtp->t_srtt &&
		    besttp->t_srtt >= rtt_thresh &&
		    secondtp->t_srtt < rtt_thresh) {
			tcpstat.tcps_mp_sel_rtt++;
			mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d, second cid %d at rtt %d\n", __func__,
			    best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
			    second_best->mpts_connid,
			    secondtp->t_srtt >> TCP_RTT_SHIFT),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
			return (mptcp_return_subflow(second_best));
		}

		if (besttp->t_rxtshift >= mptcp_fail_thresh &&
		    secondtp->t_rxtshift == 0) {
			return (mptcp_return_subflow(second_best));
		}

		/* Compare RTOs, select second_best if best's rto exceeds rtothresh */
		if (besttp->t_rxtcur && secondtp->t_rxtcur &&
		    besttp->t_rxtcur >= rto_thresh &&
		    secondtp->t_rxtcur < rto_thresh) {
			tcpstat.tcps_mp_sel_rto++;
			mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
			    best->mpts_connid, besttp->t_rxtcur,
			    second_best->mpts_connid, secondtp->t_rxtcur),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);

			return (mptcp_return_subflow(second_best));
		}

		/*
		 * None of the above conditions for sending on the secondary
		 * were true. So, let's schedule on the best one, if it still
		 * has some space in the congestion-window.
		 */
		return (mptcp_return_subflow(best));
	} else if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
		struct mptsub *tmp;

		/*
		 * We only care about RTT when aggregating
		 */
		if (besttp->t_srtt > secondtp->t_srtt) {
			tmp = best;
			best = second_best;
			besttp = secondtp;
			bestinp = secondinp;

			second_best = tmp;
			secondtp = sototcpcb(second_best->mpts_socket);
			secondinp = sotoinpcb(second_best->mpts_socket);
		}

		/* Is there still space in the congestion window? */
		if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0)
			return (mptcp_return_subflow(second_best));

		return (mptcp_return_subflow(best));
	} else {
		panic("Unknown service-type configured for MPTCP");
	}

	return (NULL);
}

static const char *
mptcp_event_to_str(uint32_t event)
{
	const char *c = "UNDEFINED";
	switch (event) {
	case MPCE_CLOSE:
		c = "MPCE_CLOSE";
		break;
	case MPCE_RECV_DATA_ACK:
		c = "MPCE_RECV_DATA_ACK";
		break;
	case MPCE_RECV_DATA_FIN:
		c = "MPCE_RECV_DATA_FIN";
		break;
	}
	return (c);
}

static const char *
mptcp_state_to_str(mptcp_state_t state)
{
	const char *c = "UNDEFINED";
	switch (state) {
	case MPTCPS_CLOSED:
		c = "MPTCPS_CLOSED";
		break;
	case MPTCPS_LISTEN:
		c = "MPTCPS_LISTEN";
		break;
	case MPTCPS_ESTABLISHED:
		c = "MPTCPS_ESTABLISHED";
		break;
	case MPTCPS_CLOSE_WAIT:
		c = "MPTCPS_CLOSE_WAIT";
		break;
	case MPTCPS_FIN_WAIT_1:
		c = "MPTCPS_FIN_WAIT_1";
		break;
	case MPTCPS_CLOSING:
		c = "MPTCPS_CLOSING";
		break;
	case MPTCPS_LAST_ACK:
		c = "MPTCPS_LAST_ACK";
		break;
	case MPTCPS_FIN_WAIT_2:
		c = "MPTCPS_FIN_WAIT_2";
		break;
	case MPTCPS_TIME_WAIT:
		c = "MPTCPS_TIME_WAIT";
		break;
	case MPTCPS_TERMINATE:
		c = "MPTCPS_TERMINATE";
		break;
	}
	return (c);
}

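/*
 * MPTCP-level connection state machine: advance mpt_state on a local
 * close (MPCE_CLOSE) or on receipt of a DATA_ACK/DATA_FIN from the peer,
 * mirroring the TCP close states at the data level.
 */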
void
mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
{
	mpte_lock_assert_held(mp_tp->mpt_mpte);
	mptcp_state_t old_state = mp_tp->mpt_state;

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);

	switch (mp_tp->mpt_state) {
	case MPTCPS_CLOSED:
	case MPTCPS_LISTEN:
		mp_tp->mpt_state = MPTCPS_CLOSED;
		break;

	case MPTCPS_ESTABLISHED:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
		} else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
		}
		break;

	case MPTCPS_CLOSE_WAIT:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_LAST_ACK;
			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
		}
		break;

	case MPTCPS_FIN_WAIT_1:
		if (event == MPCE_RECV_DATA_ACK) {
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
		} else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSING;
		}
		break;

	case MPTCPS_CLOSING:
		if (event == MPCE_RECV_DATA_ACK)
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		break;

	case MPTCPS_LAST_ACK:
		if (event == MPCE_RECV_DATA_ACK)
			mptcp_close(mp_tp->mpt_mpte, mp_tp);
		break;

	case MPTCPS_FIN_WAIT_2:
		if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		}
		break;

	case MPTCPS_TIME_WAIT:
	case MPTCPS_TERMINATE:
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}
	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);
	mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
	    mptcp_state_to_str(old_state),
	    mptcp_state_to_str(mp_tp->mpt_state),
	    mptcp_event_to_str(event)),
	    MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
}

/* If you change this function, match up mptcp_update_rcv_state_f */
void
mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
    uint16_t csum)
{
	struct mptcb *mp_tp = tptomptp(tp);
	u_int64_t full_dsn = 0;

	NTOHL(dss_info->mdss_dsn);
	NTOHL(dss_info->mdss_subflow_seqn);
	NTOHS(dss_info->mdss_data_len);

	/* XXX for autosndbuf grow sb here */
	MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
	mptcp_update_rcv_state_meat(mp_tp, tp,
	    full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
	    csum);
}

void
mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
    u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
    uint16_t csum)
{
	if (mdss_data_len == 0) {
		mptcplog((LOG_INFO, "%s: Infinite Mapping.\n", __func__),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);

		if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
			mptcplog((LOG_ERR, "%s: Bad checksum %x \n", __func__,
			    csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
		}
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		return;
	}
	mptcplog((LOG_DEBUG,
	    "%s: seqn = %x len = %x full = %llx rcvnxt = %llu \n", __func__,
	    seqn, mdss_data_len, full_dsn, mp_tp->mpt_rcvnxt),
	    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);

	mptcp_notify_mpready(tp->t_inpcb->inp_socket);

	tp->t_rcv_map.mpt_dsn = full_dsn;
	tp->t_rcv_map.mpt_sseq = seqn;
	tp->t_rcv_map.mpt_len = mdss_data_len;
	tp->t_rcv_map.mpt_csum = csum;
	tp->t_mpflags |= TMPF_EMBED_DSN;
}

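/*
 * Ensure the mbuf carries no more payload than its DSS mapping claims;
 * if it does, the mapping is unacceptable and we fall back to TCP by
 * signalling MP_FAIL.
 */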
static int
mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
    int hdrlen)
{
	u_int32_t datalen;

	if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
		return 0;

	datalen = m->m_pkthdr.mp_rlen;

	/* unacceptable DSS option, fallback to TCP */
	if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
		mptcplog((LOG_ERR, "%s: mbuf len %d, MPTCP expected %d",
		    __func__, m->m_pkthdr.len, datalen),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
	} else {
		return 0;
	}
	tp->t_mpflags |= TMPF_SND_MPFAIL;
	mptcp_notify_mpfail(so);
	m_freem(m);
	return -1;
}

int
mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int drop_hdrlen)
{
	mptcp_insert_rmap(tp, m, th);
	if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
	    drop_hdrlen) != 0)
		return -1;
	return 0;
}

/*
 * MPTCP Checksum support
 * The checksum is calculated whenever the MPTCP DSS option is included
 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
 * header and the actual data indicated by the length specified in the
 * DSS option.
 */

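/*
 * Sketch of the checksum input, as used by the helpers below: the
 * pseudo-header summed together with the mapped payload is
 *
 *	data sequence number (DSN)	8 bytes
 *	subflow sequence number		4 bytes
 *	data-level length		2 bytes
 *	DSS checksum field		2 bytes
 *
 * (cf. the in_pseudo64() calls in mptcp_input_csum()/mptcp_output_csum()).
 */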
int
mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
    uint32_t sseq, uint16_t dlen, uint16_t csum, uint16_t dfin)
{
	uint16_t mptcp_csum;

	mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
	if (mptcp_csum) {
		tp->t_mpflags |= TMPF_SND_MPFAIL;
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		m_freem(m);
		tcpstat.tcps_mp_badcsum++;
		return (-1);
	}
	return (0);
}

static uint16_t
mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
    uint16_t dlen, uint16_t csum, uint16_t dfin)
{
	struct mptcb *mp_tp = tptomptp(tp);
	uint16_t real_len = dlen - dfin;
	uint32_t sum = 0;

	if (mp_tp == NULL)
		return (0);

	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
		return (0);

	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
		return (0);

	/*
	 * The remote side may send a packet with fewer bytes than the
	 * claimed DSS checksum length.
	 */
	if ((int)m_length2(m, NULL) < real_len) {
		return (0xffff);
	}

	if (real_len != 0)
		sum = m_sum16(m, 0, real_len);

	sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
	ADDCARRY(sum);
	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
	    uint32_t, sum);

	mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
	    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
	return (~sum & 0xffff);
}

uint32_t
mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
{
	u_int32_t sum = 0;

	if (dlen)
		sum = m_sum16(m, 0, dlen);

	dss_val = mptcp_hton64(dss_val);
	sseq = htonl(sseq);
	dlen = htons(dlen);
	sum += in_pseudo64(dss_val, sseq, dlen);

	ADDCARRY(sum);
	sum = ~sum & 0xffff;
	DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
	mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	return sum;
}

/*
 * When WiFi signal starts fading, there's more loss and RTT spikes.
 * Check if there has been a large spike by comparing against
 * a tolerable RTT spike threshold.
 */
boolean_t
mptcp_no_rto_spike(struct socket *so)
{
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	int32_t spike = 0;

	if (tp->t_rxtcur > mptcp_rtothresh) {
		spike = tp->t_rxtcur - mptcp_rtothresh;

		mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
		    __func__, spike,
		    tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
		    tp->t_rttcur),
		    (MPTCP_SOCKET_DBG|MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
	}

	if (spike > 0) {
		return (FALSE);
	} else {
		return (TRUE);
	}
}

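/*
 * Clear the given deferral flag and replay any socket wakeups, workloop
 * runs, or cell-icon updates that were postponed while it was set.
 */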
void
mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
{
	VERIFY(mpp->mpp_flags & flag);
	mpp->mpp_flags &= ~flag;

	if (mptcp_should_defer_upcall(mpp))
		return;

	if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
		mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;

		mptcp_subflow_workloop(mpp->mpp_pcbe);
	}

	if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
		mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;

		sorwakeup(mpp->mpp_socket);
	}

	if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
		mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;

		sowwakeup(mpp->mpp_socket);
	}

	if (mpp->mpp_flags & MPP_SET_CELLICON) {
		mpp->mpp_flags &= ~MPP_SET_CELLICON;

		mptcp_set_cellicon(mpp->mpp_pcbe);
	}

	if (mpp->mpp_flags & MPP_UNSET_CELLICON) {
		mpp->mpp_flags &= ~MPP_UNSET_CELLICON;

		mptcp_unset_cellicon();
	}
}

static void
mptcp_ask_for_nat64(struct ifnet *ifp)
{
	in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);

	mptcplog((LOG_DEBUG, "%s: asked for NAT64-prefix on %s\n",
	    __func__, ifp->if_name), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
}

static void
mptcp_reset_itfinfo(struct mpt_itf_info *info)
{
	info->ifindex = 0;
	info->has_v4_conn = 0;
	info->has_v6_conn = 0;
}

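/*
 * Per-session NECP callback: maintain the list of viable interfaces
 * (mpte_itfinfo) as flows become viable/non-viable and schedule subflow
 * creation accordingly. See the MPTCP/NECP note at the top of this file
 * for the locking rules observed here.
 */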
void
mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
{
	struct mppcb *mp = (struct mppcb *)handle;
	struct mptses *mpte = mptompte(mp);
	struct socket *mp_so;
	struct mptcb *mp_tp;
	int locked = 0;
	uint32_t i, ifindex;

	ifindex = flow->interface_index;
	VERIFY(ifindex != IFSCOPE_NONE);

	/* ToDo - remove after rdar://problem/32007628 */
	if (!IF_INDEX_IN_RANGE(ifindex))
		printf("%s 1 ifindex %u not in range of flow %p action %d\n",
		    __func__, ifindex, flow, action);

	/* About to be garbage-collected (see note about MPTCP/NECP interactions) */
	if (mp->mpp_socket->so_usecount == 0)
		return;

	if (action != NECP_CLIENT_CBACTION_INITIAL) {
		mpte_lock(mpte);
		locked = 1;

		/* Check again, because it might have changed while waiting */
		if (mp->mpp_socket->so_usecount == 0)
			goto out;
	}

	mp_tp = mpte->mpte_mptcb;
	mp_so = mptetoso(mpte);

	mptcplog((LOG_DEBUG, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u\n",
	    __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state),
	    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

	/* No need on fallen back sockets */
	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)
		goto out;

	if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == ifindex)
				mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
		}

		mptcp_sched_create_subflows(mpte);
	} else if (action == NECP_CLIENT_CBACTION_VIABLE ||
		   action == NECP_CLIENT_CBACTION_INITIAL) {
		int found_empty = 0, empty_index = -1;
		struct ifnet *ifp;

		/* ToDo - remove after rdar://problem/32007628 */
		if (!IF_INDEX_IN_RANGE(ifindex))
			printf("%s 2 ifindex %u not in range of flow %p action %d\n",
			    __func__, ifindex, flow, action);

		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		/* ToDo - remove after rdar://problem/32007628 */
		if (!IF_INDEX_IN_RANGE(ifindex))
			printf("%s 3 ifindex %u not in range of flow %p action %d\n",
			    __func__, ifindex, flow, action);

		if (ifp == NULL)
			goto out;

		if (IFNET_IS_EXPENSIVE(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
			goto out;

		if (IFNET_IS_CELLULAR(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
			goto out;

		for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
			if (mpte->mpte_itfinfo[i].ifindex == 0) {
				found_empty = 1;
				empty_index = i;
			}

			if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
				/* Ok, it's already there */
				goto out;
			}
		}

		if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) &&
		    !(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4) &&
		    ifnet_get_nat64prefix(ifp, NULL) == ENOENT) {
			mptcp_ask_for_nat64(ifp);
			goto out;
		}

		if (found_empty == 0) {
			int new_size = mpte->mpte_itfinfo_size * 2;
			struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);

			if (info == NULL) {
				mptcplog((LOG_ERR, "%s malloc failed for %u\n", __func__, new_size),
				    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
				goto out;
			}

			memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));

			if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
				_FREE(mpte->mpte_itfinfo, M_TEMP);

			/* We allocated a new one, thus the first must be empty */
			empty_index = mpte->mpte_itfinfo_size;

			mpte->mpte_itfinfo = info;
			mpte->mpte_itfinfo_size = new_size;

			mptcplog((LOG_DEBUG, "%s Needed to realloc to %u\n", __func__, new_size),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
		}

		VERIFY(empty_index >= 0 && empty_index < (int)mpte->mpte_itfinfo_size);
		mpte->mpte_itfinfo[empty_index].ifindex = ifindex;
		mpte->mpte_itfinfo[empty_index].has_v4_conn = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
		mpte->mpte_itfinfo[empty_index].has_v6_conn = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);

		mptcp_sched_create_subflows(mpte);
	}

out:
	if (locked)
		mpte_unlock(mpte);
}

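/*
 * Re-apply the socket's cellular/expensive restrictions to the interface
 * list, dropping entries the socket is no longer allowed to use.
 */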
void
mptcp_set_restrictions(struct socket *mp_so)
{
	struct mptses *mpte = mpsotompte(mp_so);
	uint32_t i;

	mpte_lock_assert_held(mpte);

	ifnet_head_lock_shared();

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
		uint32_t ifindex = info->ifindex;
		struct ifnet *ifp;

		if (ifindex == IFSCOPE_NONE)
			continue;

		ifp = ifindex2ifnet[ifindex];

		if (IFNET_IS_EXPENSIVE(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
			info->ifindex = IFSCOPE_NONE;

		if (IFNET_IS_CELLULAR(ifp) &&
		    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
			info->ifindex = IFSCOPE_NONE;
	}

	ifnet_head_done();
}