1 /*
2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * A note on the MPTCP/NECP-interactions:
31 *
32 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
33 * MPTCP registers to these events at the MPTCP-layer for interface-events
34 * through a call to necp_client_register_multipath_cb.
35 * To get per-flow events (aka per TCP-subflow), we register to it with
36 * necp_client_register_socket_flow. Both registrations happen by using the
37 * necp-client-uuid that comes from the app.
38 *
39 * The locking is rather tricky. In general, we expect the lock-ordering to
40 * happen from necp-fd -> necp-client -> mpp_lock.
41 *
42 * There are however some subtleties.
43 *
44 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
45 * safe, because it is the very first time this MPTCP-connection goes into NECP.
46 * As we go into NECP we take the NECP-locks and are thus guaranteed that no
47 * NECP-event will deadlock us, because these NECP-events will also first take
48 * the NECP-locks. Either they win the race and thus won't find our
49 * MPTCP-connection, or MPTCP wins the race and safely installs the callbacks
50 * while holding the NECP lock.
51 *
52 * 2. When registering the subflow-callbacks we must unlock the mpp_lock. This
53 * is because we have already registered callbacks and might race against an
54 * NECP-event that will match on our socket, so we have to unlock to be safe.
55 *
56 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(), once the
57 * so_usecount has reached 0. We must be careful not to remove the mpp_socket
58 * pointers before we have unregistered the callback, because again we might be
59 * racing against an NECP-event. Unregistering must happen with an unlocked
60 * mpp_lock, because of the lock-ordering constraint. It could be that an
61 * NECP-event triggers before we had a chance to unregister. That's why
62 * we need to check the so_usecount in mptcp_session_necp_cb. If we get
63 * there while the socket is being garbage-collected, the use-count will go
64 * down to 0 and we exit. Removal of the multipath_cb again happens by taking
65 * the NECP-locks, so any running NECP-events will finish first and exit cleanly.
66 *
67 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
68 * the socket-lock must be unlocked for lock-ordering constraints. This gets a
69 * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
70 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
71 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
72 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
73 * gets it, it will realize that the subflow became non-MPTCP and retry (see
74 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
75 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
76 * for the NECP-lock (held by the other thread that is taking care of the NECP-
77 * event). So, the event now finally gets the subflow-lock and then hits an
78 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
79 * the NECP callback.
80 */
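
/*
 * Illustrative sketch only (not part of the original sources): the shape a
 * subflow-callback registration has to take in order to respect the
 * lock-ordering described in point 2 above. example_register_subflow_cb() is
 * a hypothetical name and the arguments to necp_client_register_socket_flow()
 * are elided; the point is only the drop-then-retake pattern around mpp_lock,
 * so that the necp-fd -> necp-client -> mpp_lock order is never inverted.
 *
 *	static int
 *	example_register_subflow_cb(struct mptses *mpte, struct socket *so)
 *	{
 *		int error;
 *
 *		mpte_unlock(mpte);	// give up mpp_lock before entering NECP
 *		error = necp_client_register_socket_flow( ...elided... );
 *		mpte_lock(mpte);	// retake it once NECP has returned
 *
 *		return error;
 *	}
 */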
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/kernel.h>
85 #include <sys/mbuf.h>
86 #include <sys/mcache.h>
87 #include <sys/socket.h>
88 #include <sys/socketvar.h>
89 #include <sys/syslog.h>
90 #include <sys/protosw.h>
91
92 #include <kern/zalloc.h>
93 #include <kern/locks.h>
94
95 #include <mach/sdt.h>
96
97 #include <net/if.h>
98 #include <netinet/in.h>
99 #include <netinet/in_var.h>
100 #include <netinet/tcp.h>
101 #include <netinet/tcp_fsm.h>
102 #include <netinet/tcp_seq.h>
103 #include <netinet/tcp_var.h>
104 #include <netinet/mptcp_var.h>
105 #include <netinet/mptcp.h>
106 #include <netinet/mptcp_seq.h>
107 #include <netinet/mptcp_opt.h>
108 #include <netinet/mptcp_timer.h>
109
110 int mptcp_enable = 1;
111 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
112 &mptcp_enable, 0, "Enable Multipath TCP Support");
113
114 /* Number of times to try negotiating MPTCP on SYN retransmissions */
115 int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
116 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
117 CTLFLAG_RW | CTLFLAG_LOCKED,
118 &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
119
120 /*
121 * By default, DSS checksum is turned off; revisit if we ever do
122 * MPTCP for non-SSL traffic.
123 */
124 int mptcp_dss_csum = 0;
125 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
126 &mptcp_dss_csum, 0, "Enable DSS checksum");
127
128 /*
129 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
130 * is attempted on a different path.
131 */
132 int mptcp_fail_thresh = 1;
133 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
134 &mptcp_fail_thresh, 0, "Failover threshold");
135
136
137 /*
138 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
139 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
140 * Some carrier networks have a timeout of 10 or 15 minutes.
141 */
142 int mptcp_subflow_keeptime = 60 * 14;
143 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
144 &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
145
146 int mptcp_rtthist_rtthresh = 600;
147 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
148 &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
149
150 /*
151 * Use RTO history for sending new data
152 */
153 int mptcp_use_rto = 1;
154 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
155 &mptcp_use_rto, 0, "Disable RTO for subflow selection");
156
157 int mptcp_rtothresh = 1500;
158 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
159 &mptcp_rtothresh, 0, "RTO threshold");
160
161 /*
162 * Probe the preferred path, when it is not in use
163 */
164 uint32_t mptcp_probeto = 1000;
165 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
166 &mptcp_probeto, 0, "Disable probing by setting to 0");
167
168 uint32_t mptcp_probecnt = 5;
169 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
170 &mptcp_probecnt, 0, "Number of probe writes");
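
/*
 * The tunables above are all attached to the net.inet.mptcp sysctl node:
 * enable, mptcp_cap_retr, dss_csum, fail, keepalive, rtthist_thresh, userto,
 * rto_thresh, probeto and probecnt. A hedged user-space example (standard
 * sysctl(8) usage; the value shown is simply the default from this file):
 *
 *	$ sysctl net.inet.mptcp.enable
 *	net.inet.mptcp.enable: 1
 *	$ sudo sysctl net.inet.mptcp.probeto=2000
 */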
171
172 /*
173 * Static declarations
174 */
175 static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
176 uint32_t, uint16_t, uint16_t, uint16_t);
177
178 static int
179 mptcp_reass_present(struct socket *mp_so)
180 {
181 struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
182 struct tseg_qent *q;
183 int dowakeup = 0;
184 int flags = 0;
185
186 /*
187 * Present data to user, advancing rcv_nxt through
188 * completed sequence space.
189 */
190 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
191 return flags;
192 }
193 q = LIST_FIRST(&mp_tp->mpt_segq);
194 if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
195 return flags;
196 }
197
198 /*
199 * If there is already another thread doing reassembly for this
200 * connection, it is better to let it finish the job --
201 * (radar 16316196)
202 */
203 if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
204 return flags;
205 }
206
207 mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
208
209 do {
210 mp_tp->mpt_rcvnxt += q->tqe_len;
211 LIST_REMOVE(q, tqe_q);
212 if (mp_so->so_state & SS_CANTRCVMORE) {
213 m_freem(q->tqe_m);
214 } else {
215 flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
216 if (sbappendstream_rcvdemux(mp_so, q->tqe_m, 0, 0)) {
217 dowakeup = 1;
218 }
219 }
220 zfree(tcp_reass_zone, q);
221 mp_tp->mpt_reassqlen--;
222 q = LIST_FIRST(&mp_tp->mpt_segq);
223 } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
224 mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
225
226 if (dowakeup) {
227 sorwakeup(mp_so); /* done with socket lock held */
228 }
229 return flags;
230 }
231
232 static int
233 mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
234 {
235 struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
236 u_int64_t mb_dsn = phdr->mp_dsn;
237 struct tseg_qent *q;
238 struct tseg_qent *p = NULL;
239 struct tseg_qent *nq;
240 struct tseg_qent *te = NULL;
241 u_int16_t qlimit;
242
243 /*
244 * Limit the number of segments in the reassembly queue to prevent
245 * holding on to too many segments (and thus running out of mbufs).
246 * Make sure to let through the missing segment that caused this
247 * queue to build up. Always keep one global queue entry spare to be
248 * able to process the missing segment.
249 */
250 qlimit = min(max(100, mp_so->so_rcv.sb_hiwat >> 10),
251 (tcp_autorcvbuf_max >> 10));
252 if (mb_dsn != mp_tp->mpt_rcvnxt &&
253 (mp_tp->mpt_reassqlen + 1) >= qlimit) {
254 tcpstat.tcps_mptcp_rcvmemdrop++;
255 m_freem(m);
256 *tlenp = 0;
257 return 0;
258 }
259
260 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
261 te = (struct tseg_qent *) zalloc(tcp_reass_zone);
262 if (te == NULL) {
263 tcpstat.tcps_mptcp_rcvmemdrop++;
264 m_freem(m);
265 return 0;
266 }
267
268 mp_tp->mpt_reassqlen++;
269
270 /*
271 * Find a segment which begins after this one does.
272 */
273 LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
274 if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
275 break;
276 }
277 p = q;
278 }
279
280 /*
281 * If there is a preceding segment, it may provide some of
282 * our data already. If so, drop the data from the incoming
283 * segment. If it provides all of our data, drop us.
284 */
285 if (p != NULL) {
286 int64_t i;
287 /* conversion to int64_t (in i) handles seq wraparound */
288 i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
289 if (i > 0) {
290 if (i >= *tlenp) {
291 tcpstat.tcps_mptcp_rcvduppack++;
292 m_freem(m);
293 zfree(tcp_reass_zone, te);
294 te = NULL;
295 mp_tp->mpt_reassqlen--;
296 /*
297 * Try to present any queued data
298 * at the left window edge to the user.
299 * This is needed after the 3-WHS
300 * completes.
301 */
302 goto out;
303 }
304 m_adj(m, i);
305 *tlenp -= i;
306 phdr->mp_dsn += i;
307 }
308 }
309
310 tcpstat.tcps_mp_oodata++;
311
312 /*
313 * While we overlap succeeding segments trim them or,
314 * if they are completely covered, dequeue them.
315 */
316 while (q) {
317 int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
318 if (i <= 0) {
319 break;
320 }
321
322 if (i < q->tqe_len) {
323 q->tqe_m->m_pkthdr.mp_dsn += i;
324 q->tqe_len -= i;
325 m_adj(q->tqe_m, i);
326 break;
327 }
328
329 nq = LIST_NEXT(q, tqe_q);
330 LIST_REMOVE(q, tqe_q);
331 m_freem(q->tqe_m);
332 zfree(tcp_reass_zone, q);
333 mp_tp->mpt_reassqlen--;
334 q = nq;
335 }
336
337 /* Insert the new segment queue entry into place. */
338 te->tqe_m = m;
339 te->tqe_th = NULL;
340 te->tqe_len = *tlenp;
341
342 if (p == NULL) {
343 LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
344 } else {
345 LIST_INSERT_AFTER(p, te, tqe_q);
346 }
347
348 out:
349 return mptcp_reass_present(mp_so);
350 }
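
/*
 * Worked example for the qlimit clamp in mptcp_reass() above. The values are
 * assumptions for illustration, not defaults taken from this file:
 *
 *	sb_hiwat           = 131072  ->  sb_hiwat >> 10           = 128
 *	tcp_autorcvbuf_max = 2097152 ->  tcp_autorcvbuf_max >> 10 = 2048
 *	qlimit = min(max(100, 128), 2048) = 128
 *
 * So at most 128 out-of-order segments get queued; an arriving segment that
 * does not fill the hole (mb_dsn != mpt_rcvnxt) is dropped once
 * mpt_reassqlen + 1 >= qlimit, which keeps a spare entry for the missing
 * segment itself.
 */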
351
352 /*
353 * MPTCP input, called when data has been read from a subflow socket.
354 */
355 void
356 mptcp_input(struct mptses *mpte, struct mbuf *m)
357 {
358 struct socket *mp_so;
359 struct mptcb *mp_tp = NULL;
360 int count = 0, wakeup = 0;
361 struct mbuf *save = NULL, *prev = NULL;
362 struct mbuf *freelist = NULL, *tail = NULL;
363
364 VERIFY(m->m_flags & M_PKTHDR);
365
366 mpte_lock_assert_held(mpte); /* same as MP socket lock */
367
368 mp_so = mptetoso(mpte);
369 mp_tp = mpte->mpte_mptcb;
370
371 DTRACE_MPTCP(input);
372
373 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
374
375 /*
376 * Each mbuf contains MPTCP Data Sequence Map
377 * Process the data for reassembly, delivery to MPTCP socket
378 * client, etc.
379 *
380 */
381 count = mp_so->so_rcv.sb_cc;
382
383 /*
384 * In the degraded fallback case, data is accepted without DSS map
385 */
386 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
387 struct mbuf *iter;
388 int mb_dfin = 0;
389 fallback:
390 mptcp_sbrcv_grow(mp_tp);
391
392 iter = m;
393 while (iter) {
394 if ((iter->m_flags & M_PKTHDR) &&
395 (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
396 mb_dfin = 1;
397 }
398
399 if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
400 /* Don't add zero-length packets, so skip this one. */
401 if (prev == NULL) {
402 m = iter->m_next;
403 m_free(iter);
404 iter = m;
405 } else {
406 prev->m_next = iter->m_next;
407 m_free(iter);
408 iter = prev->m_next;
409 }
410
411 /* It was a zero-length packet so next one must be a pkthdr */
412 VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
413 } else {
414 prev = iter;
415 iter = iter->m_next;
416 }
417 }
418
419 /*
420 * assume degraded flow as this may be the first packet
421 * without DSS, and the subflow state is not updated yet.
422 */
423 if (sbappendstream_rcvdemux(mp_so, m, 0, 0)) {
424 sorwakeup(mp_so);
425 }
426
427 DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
428 struct socket *, mp_so,
429 struct sockbuf *, &mp_so->so_rcv,
430 struct sockbuf *, &mp_so->so_snd,
431 struct mptses *, mpte);
432 count = mp_so->so_rcv.sb_cc - count;
433
434 mp_tp->mpt_rcvnxt += count;
435
436 if (mb_dfin) {
437 mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
438 socantrcvmore(mp_so);
439 }
440
441 mptcplog((LOG_DEBUG, "%s: Fallback read %d bytes\n", __func__,
442 count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
443 return;
444 }
445
446 do {
447 u_int64_t mb_dsn;
448 int32_t mb_datalen;
449 int64_t todrop;
450 int mb_dfin = 0;
451
452 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
453 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
454 goto fallback;
455 }
456
457 save = m->m_next;
458 /*
459 * A single TCP packet formed of multiple mbufs
460 * holds DSS mapping in the first mbuf of the chain.
461 * Other mbufs in the chain may have M_PKTHDR set
462 * even though they belong to the same TCP packet
463 * and therefore use the DSS mapping stored in the
464 * first mbuf of the mbuf chain. mptcp_input() can
465 * get an mbuf chain with multiple TCP packets.
466 */
467 while (save && (!(save->m_flags & M_PKTHDR) ||
468 !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
469 prev = save;
470 save = save->m_next;
471 }
472 if (prev) {
473 prev->m_next = NULL;
474 } else {
475 m->m_next = NULL;
476 }
477
478 mb_dsn = m->m_pkthdr.mp_dsn;
479 mb_datalen = m->m_pkthdr.mp_rlen;
480
481 todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
482 if (todrop > 0) {
483 tcpstat.tcps_mptcp_rcvpackafterwin++;
484
485 if (todrop >= mb_datalen) {
486 if (freelist == NULL) {
487 freelist = m;
488 } else {
489 tail->m_next = m;
490 }
491
492 if (prev != NULL) {
493 tail = prev;
494 } else {
495 tail = m;
496 }
497
498 m = save;
499 prev = save = NULL;
500 continue;
501 } else {
502 m_adj(m, -todrop);
503 mb_datalen -= todrop;
504 }
505
506 /*
507 * We drop from the right edge of the mbuf, thus the
508 * DATA_FIN is dropped as well
509 */
510 m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
511 }
512
513
514 if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
515 if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
516 mp_tp->mpt_rcvnxt)) {
517 if (freelist == NULL) {
518 freelist = m;
519 } else {
520 tail->m_next = m;
521 }
522
523 if (prev != NULL) {
524 tail = prev;
525 } else {
526 tail = m;
527 }
528
529 m = save;
530 prev = save = NULL;
531 continue;
532 } else {
533 m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn));
534 }
535 mptcplog((LOG_INFO, "%s: Left Edge %llu\n", __func__,
536 mp_tp->mpt_rcvnxt),
537 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
538 }
539
540 if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
541 !LIST_EMPTY(&mp_tp->mpt_segq)) {
542 mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
543
544 goto next;
545 }
546 mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
547
548 mptcp_sbrcv_grow(mp_tp);
549
550 if (sbappendstream_rcvdemux(mp_so, m, 0, 0)) {
551 wakeup = 1;
552 }
553
554 DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
555 struct sockbuf *, &mp_so->so_rcv,
556 struct sockbuf *, &mp_so->so_snd,
557 struct mptses *, mpte,
558 struct mptcb *, mp_tp);
559 count = mp_so->so_rcv.sb_cc - count;
560 tcpstat.tcps_mp_rcvtotal++;
561 tcpstat.tcps_mp_rcvbytes += count;
562 mptcplog((LOG_DEBUG, "%s: Read %d bytes\n", __func__, count),
563 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
564
565 mp_tp->mpt_rcvnxt += count;
566
567 next:
568 if (mb_dfin) {
569 mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
570 socantrcvmore(mp_so);
571 }
572 m = save;
573 prev = save = NULL;
574 count = mp_so->so_rcv.sb_cc;
575 } while (m);
576
577 if (freelist) {
578 m_freem(freelist);
579 }
580
581 if (wakeup) {
582 sorwakeup(mp_so);
583 }
584 }
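
/*
 * Illustrative picture (not from the original sources) of how mptcp_input()
 * above walks its mbuf chain: the DSS map of a TCP segment lives in the first
 * mbuf that carries both M_PKTHDR and PKTF_MPTCP, and the inner while-loop
 * cuts the chain right before the next such mbuf, so each pass of the outer
 * do-loop handles exactly one mapping.
 *
 *	m -> [PKTHDR|MPTCP, mp_dsn=100,  mp_rlen=3000] -> [cont] -> [cont]
 *	  -> [PKTHDR|MPTCP, mp_dsn=3100, mp_rlen=1448] -> [cont]
 *	     ^ 'save' points here, handled on the next pass
 */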
585
586 boolean_t
587 mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
588 {
589 struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
590
591 /*
592 * Always send if there is data in the reinject-queue.
593 */
594 if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
595 return TRUE;
596 }
597
598 /*
599 * Don't send, if:
600 *
601 * 1. snd_nxt >= snd_max: basically, everything has been sent. Except
602 * when using TFO, where we might be doing a 0-byte write.
603 * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
604 * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
605 */
606
607 if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
608 return FALSE;
609 }
610
611 if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
612 return FALSE;
613 }
614
615 if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
616 return FALSE;
617 }
618
619 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
620 return FALSE;
621 }
622
623 return TRUE;
624 }
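
/*
 * Worked example (hypothetical numbers) for the checks in
 * mptcp_can_send_more() above, ignoring the reinject-queue and TFO cases:
 *
 *	mpt_snduna = 1000, mpt_sndnxt = 1400, mpt_sndmax = 2000, mpt_sndwnd = 300
 *
 *	check 1: sndnxt (1400) < sndmax (2000)        -> data is still pending
 *	check 2: snduna + sndwnd = 1300 <= sndnxt     -> receiver window is full,
 *	                                                 return FALSE
 *
 * With mpt_sndwnd = 1000 instead, snduna + sndwnd = 2000 > sndnxt, so the
 * function goes on to the DATA_FIN and state checks and returns TRUE.
 */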
625
626 /*
627 * MPTCP output.
628 */
629 int
630 mptcp_output(struct mptses *mpte)
631 {
632 struct mptcb *mp_tp;
633 struct mptsub *mpts;
634 struct mptsub *mpts_tried = NULL;
635 struct socket *mp_so;
636 struct mptsub *preferred_mpts = NULL;
637 uint64_t old_snd_nxt;
638 int error = 0;
639
640 mpte_lock_assert_held(mpte);
641 mp_so = mptetoso(mpte);
642 mp_tp = mpte->mpte_mptcb;
643
644 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
645 mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
646
647 mptcplog((LOG_DEBUG, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n",
648 __func__, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
649 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_sndwnd,
650 mpte->mpte_reinjectq ? 1 : 0,
651 mp_tp->mpt_state),
652 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
653
654 old_snd_nxt = mp_tp->mpt_sndnxt;
655 while (mptcp_can_send_more(mp_tp, FALSE)) {
656 /* get the "best" subflow to be used for transmission */
657 mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
658 if (mpts == NULL) {
659 mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
660 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
661 break;
662 }
663
664 mptcplog((LOG_DEBUG, "%s: using id %u\n", __func__, mpts->mpts_connid),
665 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
666
667 /* In case there's just one flow, we reattempt later */
668 if (mpts_tried != NULL &&
669 (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
670 mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
671 mpts_tried->mpts_flags |= MPTSF_ACTIVE;
672 mptcp_start_timer(mpte, MPTT_REXMT);
673 mptcplog((LOG_DEBUG, "%s: retry later\n", __func__),
674 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
675 break;
676 }
677
678 /*
679 * Automatic sizing of send socket buffer. Increase the send
680 * socket buffer size if all of the following criteria are met:
681 * 1. the receiver has enough buffer space for this data, and
682 * 2. the send buffer is filled to 7/8th with data (so we actually
683 * have data to make use of it).
684 */
685 if (tcp_do_autosendbuf == 1 &&
686 (mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
687 tcp_cansbgrow(&mp_so->so_snd)) {
688 if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
689 mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
690 if (sbreserve(&mp_so->so_snd,
691 min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
692 tcp_autosndbuf_max)) == 1) {
693 mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
694
695 mptcplog((LOG_DEBUG, "%s: increased snd hiwat to %u lowat %u\n",
696 __func__, mp_so->so_snd.sb_hiwat,
697 mp_so->so_snd.sb_lowat),
698 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
699 }
700 }
701 }
702
703 DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
704 struct socket *, mp_so);
705 error = mptcp_subflow_output(mpte, mpts, 0);
706 if (error) {
707 /* can be a temporary loss of source address or other error */
708 mpts->mpts_flags |= MPTSF_FAILINGOVER;
709 mpts->mpts_flags &= ~MPTSF_ACTIVE;
710 mpts_tried = mpts;
711 if (error != ECANCELED) {
712 mptcplog((LOG_ERR, "%s: Error = %d mpts_flags %#x\n", __func__,
713 error, mpts->mpts_flags),
714 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
715 }
716 break;
717 }
718 /* The model is to have only one active flow at a time */
719 mpts->mpts_flags |= MPTSF_ACTIVE;
720 mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
721
722 /* Allows us to update the smoothed rtt */
723 if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
724 if (preferred_mpts->mpts_probesoon) {
725 if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
726 mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
727 if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
728 preferred_mpts->mpts_probesoon = 0;
729 preferred_mpts->mpts_probecnt = 0;
730 }
731 }
732 } else {
733 preferred_mpts->mpts_probesoon = tcp_now;
734 preferred_mpts->mpts_probecnt = 0;
735 }
736 }
737
738 if (mpte->mpte_active_sub == NULL) {
739 mpte->mpte_active_sub = mpts;
740 } else if (mpte->mpte_active_sub != mpts) {
741 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
742 struct tcpcb *acttp = sototcpcb(mpte->mpte_active_sub->mpts_socket);
743
744 mptcplog((LOG_DEBUG, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__,
745 mpte->mpte_active_sub->mpts_connid, acttp->t_srtt >> TCP_RTT_SHIFT,
746 mpts->mpts_connid, tp->t_srtt >> TCP_RTT_SHIFT),
747 (MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG), MPTCP_LOGLVL_LOG);
748
749 mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
750 mpte->mpte_active_sub = mpts;
751
752 mptcpstats_inc_switch(mpte, mpts);
753 }
754 }
755
756 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
757 if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
758 mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
759 mptcp_finish_usrclosed(mpte);
760 }
761 }
762
763 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
764
765 /* subflow errors should not be percolated back up */
766 return 0;
767 }
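
/*
 * Worked example (hypothetical numbers) for the send-buffer autosizing
 * condition inside mptcp_output() above:
 *
 *	sb_hiwat = 65536, mpt_sndwnd = 60000, sb_cc = 60000
 *
 *	mpt_sndwnd / 4 * 5 = 75000 >= sb_hiwat (65536)  -> receiver can take more
 *	sb_cc (60000) >= sb_hiwat / 8 * 7 (57344)       -> buffer is 7/8th full
 *
 * Both criteria hold, so sbreserve() grows the buffer by tcp_autosndbuf_inc
 * (capped at tcp_autosndbuf_max) and sb_idealsize follows the new sb_hiwat.
 */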
768
769
770 static struct mptsub *
771 mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
772 {
773 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
774
775 /*
776 * Lower RTT? Take it, if it's our first one, or
777 * it doesn't have any loss, or the current one has
778 * loss as well.
779 */
780 if (tp->t_srtt && *currtt > tp->t_srtt &&
781 (curbest == NULL || tp->t_rxtshift == 0 ||
782 sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
783 *currtt = tp->t_srtt;
784 return mpts;
785 }
786
787 /*
788 * If we find a subflow without loss, take it always!
789 */
790 if (curbest &&
791 sototcpcb(curbest->mpts_socket)->t_rxtshift &&
792 tp->t_rxtshift == 0) {
793 *currtt = tp->t_srtt;
794 return mpts;
795 }
796
797 return curbest != NULL ? curbest : mpts;
798 }
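
/*
 * Illustrative example (hypothetical values) of the preference rule in
 * mptcp_choose_subflow() above:
 *
 *	curbest: t_srtt ~ 200 ms, t_rxtshift = 0  (no recent loss)
 *	mpts:    t_srtt ~ 100 ms, t_rxtshift = 2  (recent loss)
 *
 * Although mpts has the lower smoothed RTT, it is not chosen: the first test
 * requires the candidate to be loss-free or curbest to be lossy, and the
 * second test only promotes a loss-free candidate over a lossy curbest.
 * curbest is kept, so stability wins over raw RTT.
 */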
799
800 static struct mptsub *
801 mptcp_return_subflow(struct mptsub *mpts)
802 {
803 if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
804 return NULL;
805 }
806
807 return mpts;
808 }
809
810 /*
811 * Return the most eligible subflow to be used for sending data.
812 */
813 struct mptsub *
814 mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
815 {
816 struct tcpcb *besttp, *secondtp;
817 struct inpcb *bestinp, *secondinp;
818 struct mptsub *mpts;
819 struct mptsub *best = NULL;
820 struct mptsub *second_best = NULL;
821 int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;
822
823 /*
824 * First Step:
825 * Choose the best subflow for cellular and non-cellular interfaces.
826 */
827
828 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
829 struct socket *so = mpts->mpts_socket;
830 struct tcpcb *tp = sototcpcb(so);
831 struct inpcb *inp = sotoinpcb(so);
832
833 mptcplog((LOG_DEBUG, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
834 __func__, mpts->mpts_connid, ignore ? ignore->mpts_connid : -1, mpts->mpts_flags,
835 INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
836 inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
837 tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
838 mptcp_subflow_cwnd_space(so)),
839 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
840
841 /*
842 * First, the hard conditions to reject subflows
843 * (e.g., not connected,...)
844 */
845 if (mpts == ignore || inp->inp_last_outifp == NULL) {
846 continue;
847 }
848
849 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
850 continue;
851 }
852
853 /* There can only be one subflow in degraded state */
854 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
855 best = mpts;
856 break;
857 }
858
859 /*
860 * If this subflow is waiting to finally send, do it!
861 */
862 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
863 return mptcp_return_subflow(mpts);
864 }
865
866 /*
867 * Only send if the subflow is MP_CAPABLE. The exceptions to
868 * this rule (degraded or TFO) have been taken care of above.
869 */
870 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
871 continue;
872 }
873
874 if ((so->so_state & SS_ISDISCONNECTED) ||
875 !(so->so_state & SS_ISCONNECTED) ||
876 !TCPS_HAVEESTABLISHED(tp->t_state) ||
877 tp->t_state > TCPS_CLOSE_WAIT) {
878 continue;
879 }
880
881 /*
882 * Second, the soft conditions to find the subflow with best
883 * conditions for each set (aka cellular vs non-cellular)
884 */
885 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
886 second_best = mptcp_choose_subflow(mpts, second_best,
887 &exp_rtt);
888 } else {
889 best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
890 }
891 }
892
893 /*
894 * If there is no preferred or backup subflow, and there is no active
895 * subflow, use the last usable subflow.
896 */
897 if (best == NULL) {
898 return mptcp_return_subflow(second_best);
899 }
900
901 if (second_best == NULL) {
902 return mptcp_return_subflow(best);
903 }
904
905 besttp = sototcpcb(best->mpts_socket);
906 bestinp = sotoinpcb(best->mpts_socket);
907 secondtp = sototcpcb(second_best->mpts_socket);
908 secondinp = sotoinpcb(second_best->mpts_socket);
909
910 if (preferred != NULL) {
911 *preferred = mptcp_return_subflow(best);
912 }
913
914 /*
915 * Second Step: Among best and second_best. Choose the one that is
916 * most appropriate for this particular service-type.
917 */
918 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
919 /*
920 * Only handover if Symptoms tells us to do so.
921 */
922 if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
923 mptcp_is_wifi_unusable(mpte) != 0 && mptcp_subflow_is_bad(mpte, best)) {
924 return mptcp_return_subflow(second_best);
925 }
926
927 return mptcp_return_subflow(best);
928 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
929 int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
930 int rto_thresh = mptcp_rtothresh;
931
932 /* Adjust with symptoms information */
933 if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
934 mptcp_is_wifi_unusable(mpte) != 0) {
935 rtt_thresh /= 2;
936 rto_thresh /= 2;
937 }
938
939 if (besttp->t_srtt && secondtp->t_srtt &&
940 besttp->t_srtt >= rtt_thresh &&
941 secondtp->t_srtt < rtt_thresh) {
942 tcpstat.tcps_mp_sel_rtt++;
943 mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d, second cid %d at rtt %d\n", __func__,
944 best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
945 second_best->mpts_connid,
946 secondtp->t_srtt >> TCP_RTT_SHIFT),
947 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
948 return mptcp_return_subflow(second_best);
949 }
950
951 if (mptcp_subflow_is_bad(mpte, best) &&
952 secondtp->t_rxtshift == 0) {
953 return mptcp_return_subflow(second_best);
954 }
955
956 /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
957 if (besttp->t_rxtcur && secondtp->t_rxtcur &&
958 besttp->t_rxtcur >= rto_thresh &&
959 secondtp->t_rxtcur < rto_thresh) {
960 tcpstat.tcps_mp_sel_rto++;
961 mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
962 best->mpts_connid, besttp->t_rxtcur,
963 second_best->mpts_connid, secondtp->t_rxtcur),
964 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
965
966 return mptcp_return_subflow(second_best);
967 }
968
969 /*
970 * None of the above conditions for sending on the secondary
971 * were true. So, let's schedule on the best one, if it still
972 * has some space in the congestion-window.
973 */
974 return mptcp_return_subflow(best);
975 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
976 struct mptsub *tmp;
977
978 /*
979 * We only care about RTT when aggregating
980 */
981 if (besttp->t_srtt > secondtp->t_srtt) {
982 tmp = best;
983 best = second_best;
984 besttp = secondtp;
985 bestinp = secondinp;
986
987 second_best = tmp;
988 secondtp = sototcpcb(second_best->mpts_socket);
989 secondinp = sotoinpcb(second_best->mpts_socket);
990 }
991
992 /* Is there still space in the congestion window? */
993 if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
994 return mptcp_return_subflow(second_best);
995 }
996
997 return mptcp_return_subflow(best);
998 } else {
999 panic("Unknown service-type configured for MPTCP");
1000 }
1001
1002 return NULL;
1003 }
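
/*
 * Informal summary of the second step in mptcp_get_subflow() above, derived
 * from the code rather than from any external specification:
 *
 *	HANDOVER:    stick with 'best' (the non-cellular pick); only fall back
 *	             to 'second_best' when Symptoms marks WiFi unusable and the
 *	             best subflow itself looks bad.
 *	INTERACTIVE: move to 'second_best' when best's srtt or t_rxtcur crosses
 *	             the rtthist/rto thresholds while second_best stays below
 *	             them, or when best looks bad while second_best is loss-free.
 *	AGGREGATE:   order the pair by srtt and fill the lower-RTT subflow's
 *	             congestion window first, overflowing into the other one.
 *
 * In every branch mptcp_return_subflow() still vetoes a pick whose congestion
 * window has no room left.
 */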
1004
1005 static const char *
1006 mptcp_event_to_str(uint32_t event)
1007 {
1008 const char *c = "UNDEFINED";
1009 switch (event) {
1010 case MPCE_CLOSE:
1011 c = "MPCE_CLOSE";
1012 break;
1013 case MPCE_RECV_DATA_ACK:
1014 c = "MPCE_RECV_DATA_ACK";
1015 break;
1016 case MPCE_RECV_DATA_FIN:
1017 c = "MPCE_RECV_DATA_FIN";
1018 break;
1019 }
1020 return c;
1021 }
1022
1023 static const char *
1024 mptcp_state_to_str(mptcp_state_t state)
1025 {
1026 const char *c = "UNDEFINED";
1027 switch (state) {
1028 case MPTCPS_CLOSED:
1029 c = "MPTCPS_CLOSED";
1030 break;
1031 case MPTCPS_LISTEN:
1032 c = "MPTCPS_LISTEN";
1033 break;
1034 case MPTCPS_ESTABLISHED:
1035 c = "MPTCPS_ESTABLISHED";
1036 break;
1037 case MPTCPS_CLOSE_WAIT:
1038 c = "MPTCPS_CLOSE_WAIT";
1039 break;
1040 case MPTCPS_FIN_WAIT_1:
1041 c = "MPTCPS_FIN_WAIT_1";
1042 break;
1043 case MPTCPS_CLOSING:
1044 c = "MPTCPS_CLOSING";
1045 break;
1046 case MPTCPS_LAST_ACK:
1047 c = "MPTCPS_LAST_ACK";
1048 break;
1049 case MPTCPS_FIN_WAIT_2:
1050 c = "MPTCPS_FIN_WAIT_2";
1051 break;
1052 case MPTCPS_TIME_WAIT:
1053 c = "MPTCPS_TIME_WAIT";
1054 break;
1055 case MPTCPS_TERMINATE:
1056 c = "MPTCPS_TERMINATE";
1057 break;
1058 }
1059 return c;
1060 }
1061
1062 void
1063 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
1064 {
1065 mpte_lock_assert_held(mp_tp->mpt_mpte);
1066 mptcp_state_t old_state = mp_tp->mpt_state;
1067
1068 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1069 uint32_t, event);
1070
1071 switch (mp_tp->mpt_state) {
1072 case MPTCPS_CLOSED:
1073 case MPTCPS_LISTEN:
1074 mp_tp->mpt_state = MPTCPS_TERMINATE;
1075 break;
1076
1077 case MPTCPS_ESTABLISHED:
1078 if (event == MPCE_CLOSE) {
1079 mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
1080 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1081 } else if (event == MPCE_RECV_DATA_FIN) {
1082 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1083 mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
1084 }
1085 break;
1086
1087 case MPTCPS_CLOSE_WAIT:
1088 if (event == MPCE_CLOSE) {
1089 mp_tp->mpt_state = MPTCPS_LAST_ACK;
1090 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1091 }
1092 break;
1093
1094 case MPTCPS_FIN_WAIT_1:
1095 if (event == MPCE_RECV_DATA_ACK) {
1096 mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
1097 } else if (event == MPCE_RECV_DATA_FIN) {
1098 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1099 mp_tp->mpt_state = MPTCPS_CLOSING;
1100 }
1101 break;
1102
1103 case MPTCPS_CLOSING:
1104 if (event == MPCE_RECV_DATA_ACK) {
1105 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1106 }
1107 break;
1108
1109 case MPTCPS_LAST_ACK:
1110 if (event == MPCE_RECV_DATA_ACK) {
1111 mptcp_close(mp_tp->mpt_mpte, mp_tp);
1112 }
1113 break;
1114
1115 case MPTCPS_FIN_WAIT_2:
1116 if (event == MPCE_RECV_DATA_FIN) {
1117 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1118 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1119 }
1120 break;
1121
1122 case MPTCPS_TIME_WAIT:
1123 case MPTCPS_TERMINATE:
1124 break;
1125
1126 default:
1127 VERIFY(0);
1128 /* NOTREACHED */
1129 }
1130 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1131 uint32_t, event);
1132 mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
1133 mptcp_state_to_str(old_state),
1134 mptcp_state_to_str(mp_tp->mpt_state),
1135 mptcp_event_to_str(event)),
1136 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
1137 }
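
/*
 * Example walks through the state machine above (event in parentheses):
 *
 *	active close:   ESTABLISHED (MPCE_CLOSE)         -> FIN_WAIT_1
 *	                FIN_WAIT_1  (MPCE_RECV_DATA_ACK) -> FIN_WAIT_2
 *	                FIN_WAIT_2  (MPCE_RECV_DATA_FIN) -> TIME_WAIT
 *
 *	passive close:  ESTABLISHED (MPCE_RECV_DATA_FIN) -> CLOSE_WAIT
 *	                CLOSE_WAIT  (MPCE_CLOSE)         -> LAST_ACK
 *	                LAST_ACK    (MPCE_RECV_DATA_ACK) -> mptcp_close()
 *
 * Sending a DATA_FIN (MPCE_CLOSE) bumps mpt_sndmax by one and receiving one
 * bumps mpt_rcvnxt by one, mirroring how a plain TCP FIN occupies one unit of
 * sequence space.
 */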
1138
1139 /* If you change this function, match up mptcp_update_rcv_state_f */
1140 void
1141 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
1142 uint16_t csum)
1143 {
1144 struct mptcb *mp_tp = tptomptp(tp);
1145 u_int64_t full_dsn = 0;
1146
1147 NTOHL(dss_info->mdss_dsn);
1148 NTOHL(dss_info->mdss_subflow_seqn);
1149 NTOHS(dss_info->mdss_data_len);
1150
1151 /* XXX for autosndbuf grow sb here */
1152 MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
1153 mptcp_update_rcv_state_meat(mp_tp, tp,
1154 full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
1155 csum);
1156 }
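
/*
 * Rough illustration (an assumption about MPTCP_EXTEND_DSN(), not a statement
 * of its exact implementation): the DSS option above carries only the low 32
 * bits of the data sequence number, and the macro reconstructs the full
 * 64-bit value by picking the candidate closest to the current mpt_rcvnxt,
 * e.g.:
 *
 *	mpt_rcvnxt = 0x00000001_00001000, mdss_dsn = 0x00001800
 *	-> full_dsn = 0x00000001_00001800
 */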
1157
1158 void
1159 mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
1160 u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
1161 uint16_t csum)
1162 {
1163 if (mdss_data_len == 0) {
1164 mptcplog((LOG_INFO, "%s: Infinite Mapping.\n", __func__),
1165 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1166
1167 if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
1168 mptcplog((LOG_ERR, "%s: Bad checksum %x \n", __func__,
1169 csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
1170 }
1171 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1172 return;
1173 }
1174 mptcplog((LOG_DEBUG,
1175 "%s: seqn = %u len = %u full = %u rcvnxt = %u \n", __func__,
1176 seqn, mdss_data_len, (uint32_t)full_dsn, (uint32_t)mp_tp->mpt_rcvnxt),
1177 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1178
1179 mptcp_notify_mpready(tp->t_inpcb->inp_socket);
1180
1181 tp->t_rcv_map.mpt_dsn = full_dsn;
1182 tp->t_rcv_map.mpt_sseq = seqn;
1183 tp->t_rcv_map.mpt_len = mdss_data_len;
1184 tp->t_rcv_map.mpt_csum = csum;
1185 tp->t_mpflags |= TMPF_EMBED_DSN;
1186 }
1187
1188
1189 static int
1190 mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
1191 int hdrlen)
1192 {
1193 u_int32_t datalen;
1194
1195 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1196 return 0;
1197 }
1198
1199 datalen = m->m_pkthdr.mp_rlen;
1200
1201 /* unacceptable DSS option, fallback to TCP */
1202 if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
1203 mptcplog((LOG_ERR, "%s: mbuf len %d, MPTCP expected %d",
1204 __func__, m->m_pkthdr.len, datalen),
1205 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1206 } else {
1207 return 0;
1208 }
1209 tp->t_mpflags |= TMPF_SND_MPFAIL;
1210 mptcp_notify_mpfail(so);
1211 m_freem(m);
1212 return -1;
1213 }
1214
1215 int
1216 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
1217 int drop_hdrlen)
1218 {
1219 mptcp_insert_rmap(tp, m, th);
1220 if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
1221 drop_hdrlen) != 0) {
1222 return -1;
1223 }
1224 return 0;
1225 }
1226
1227 /*
1228 * MPTCP Checksum support
1229 * The checksum is calculated whenever the MPTCP DSS option is included
1230 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
1231 * header and the actual data indicated by the length specified in the
1232 * DSS option.
1233 */
1234
1235 int
1236 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
1237 uint32_t sseq, uint16_t dlen, uint16_t csum, uint16_t dfin)
1238 {
1239 uint16_t mptcp_csum;
1240
1241 mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
1242 if (mptcp_csum) {
1243 tp->t_mpflags |= TMPF_SND_MPFAIL;
1244 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1245 m_freem(m);
1246 tcpstat.tcps_mp_badcsum++;
1247 return -1;
1248 }
1249 return 0;
1250 }
1251
1252 static uint16_t
1253 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
1254 uint16_t dlen, uint16_t csum, uint16_t dfin)
1255 {
1256 struct mptcb *mp_tp = tptomptp(tp);
1257 uint16_t real_len = dlen - dfin;
1258 uint32_t sum = 0;
1259
1260 if (mp_tp == NULL) {
1261 return 0;
1262 }
1263
1264 if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
1265 return 0;
1266 }
1267
1268 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
1269 return 0;
1270 }
1271
1272 /*
1273 * The remote side may send a packet with fewer bytes than the
1274 * claimed DSS checksum length.
1275 */
1276 if ((int)m_length2(m, NULL) < real_len) {
1277 return 0xffff;
1278 }
1279
1280 if (real_len != 0) {
1281 sum = m_sum16(m, 0, real_len);
1282 }
1283
1284 sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
1285 ADDCARRY(sum);
1286 DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1287 uint32_t, sum);
1288
1289 mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1290 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1291 return ~sum & 0xffff;
1292 }
1293
1294 uint32_t
1295 mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
1296 {
1297 uint32_t sum = 0;
1298
1299 if (dlen) {
1300 sum = m_sum16(m, 0, dlen);
1301 }
1302
1303 dss_val = mptcp_hton64(dss_val);
1304 sseq = htonl(sseq);
1305 dlen = htons(dlen);
1306 sum += in_pseudo64(dss_val, sseq, dlen);
1307
1308 ADDCARRY(sum);
1309 sum = ~sum & 0xffff;
1310 DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
1311 mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1312 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1313
1314 return sum;
1315 }
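
/*
 * Conceptual layout of the DSS pseudo-header that both checksum routines
 * above feed into in_pseudo64(), shown only as an illustration:
 *
 *	+------------------------------------+
 *	| data sequence number     (8 bytes) |
 *	| subflow sequence number  (4 bytes) |
 *	| data-level length        (2 bytes) |
 *	| DSS checksum field       (2 bytes) |  <- treated as 0 on output,
 *	+------------------------------------+     peer's value folded in on input
 *
 * On input, a non-zero folded result means the mapping failed verification
 * and mptcp_validate_csum() starts MP_FAIL handling; on output, the one's
 * complement of the sum is returned for insertion into the DSS option.
 */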
1316
1317 /*
1318 * When WiFi signal starts fading, there's more loss and RTT spikes.
1319 * Check if there has been a large spike by comparing against
1320 * a tolerable RTT spike threshold.
1321 */
1322 boolean_t
1323 mptcp_no_rto_spike(struct socket *so)
1324 {
1325 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1326 int32_t spike = 0;
1327
1328 if (tp->t_rxtcur > mptcp_rtothresh) {
1329 spike = tp->t_rxtcur - mptcp_rtothresh;
1330
1331 mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
1332 __func__, spike,
1333 tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1334 tp->t_rttcur),
1335 (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1336 }
1337
1338 if (spike > 0) {
1339 return FALSE;
1340 } else {
1341 return TRUE;
1342 }
1343 }
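
/*
 * Quick numeric illustration for mptcp_no_rto_spike() above, using the
 * default mptcp_rtothresh of 1500 from this file (the t_rxtcur values are
 * made up):
 *
 *	t_rxtcur = 2100  ->  spike = 600  ->  returns FALSE (RTO spiked)
 *	t_rxtcur = 1200  ->  spike = 0    ->  returns TRUE  (no spike)
 */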
1344
1345 void
1346 mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
1347 {
1348 VERIFY(mpp->mpp_flags & flag);
1349 mpp->mpp_flags &= ~flag;
1350
1351 if (mptcp_should_defer_upcall(mpp)) {
1352 return;
1353 }
1354
1355 if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
1356 mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;
1357
1358 mptcp_subflow_workloop(mpp->mpp_pcbe);
1359 }
1360
1361 if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
1362 mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;
1363
1364 sorwakeup(mpp->mpp_socket);
1365 }
1366
1367 if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
1368 mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;
1369
1370 sowwakeup(mpp->mpp_socket);
1371 }
1372
1373 if (mpp->mpp_flags & MPP_SET_CELLICON) {
1374 mpp->mpp_flags &= ~MPP_SET_CELLICON;
1375
1376 mptcp_set_cellicon(mpp->mpp_pcbe);
1377 }
1378
1379 if (mpp->mpp_flags & MPP_UNSET_CELLICON) {
1380 mpp->mpp_flags &= ~MPP_UNSET_CELLICON;
1381
1382 mptcp_unset_cellicon();
1383 }
1384 }
1385
1386 void
1387 mptcp_ask_for_nat64(struct ifnet *ifp)
1388 {
1389 in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);
1390
1391 os_log_info(mptcp_log_handle,
1392 "%s: asked for NAT64-prefix on %s\n", __func__,
1393 ifp->if_name);
1394 }
1395
1396 static void
1397 mptcp_reset_itfinfo(struct mpt_itf_info *info)
1398 {
1399 info->ifindex = 0;
1400 info->has_v4_conn = 0;
1401 info->has_v6_conn = 0;
1402 info->has_nat64_conn = 0;
1403 }
1404
1405 void
1406 mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
1407 uint32_t necp_flags, __unused bool *viable)
1408 {
1409 boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
1410 boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
1411 boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
1412 boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1413 struct mppcb *mp = (struct mppcb *)handle;
1414 struct mptses *mpte = mptompte(mp);
1415 struct socket *mp_so;
1416 struct mptcb *mp_tp;
1417 int locked = 0;
1418 uint32_t i, ifindex;
1419
1420 ifindex = interface_index;
1421 VERIFY(ifindex != IFSCOPE_NONE);
1422
1423 /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
1424 if (mp->mpp_socket->so_usecount == 0) {
1425 return;
1426 }
1427
1428 if (action != NECP_CLIENT_CBACTION_INITIAL) {
1429 mpte_lock(mpte);
1430 locked = 1;
1431
1432 /* Check again, because it might have changed while waiting */
1433 if (mp->mpp_socket->so_usecount == 0) {
1434 goto out;
1435 }
1436 }
1437
1438 mpte_lock_assert_held(mpte);
1439
1440 mp_tp = mpte->mpte_mptcb;
1441 mp_so = mptetoso(mpte);
1442
1443 os_log_info(mptcp_log_handle, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
1444 __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
1445 has_v4, has_v6, has_nat64, low_power);
1446
1447 /* Nothing to do on sockets that have fallen back to regular TCP */
1448 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
1449 goto out;
1450 }
1451
1452 /*
1453 * When the interface goes into low-power mode we don't want to establish
1454 * new subflows on it. Thus, mark it internally as non-viable.
1455 */
1456 if (low_power) {
1457 action = NECP_CLIENT_CBACTION_NONVIABLE;
1458 }
1459
1460 if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
1461 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1462 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1463 continue;
1464 }
1465
1466 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1467 mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
1468 }
1469 }
1470
1471 mptcp_sched_create_subflows(mpte);
1472 } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
1473 action == NECP_CLIENT_CBACTION_INITIAL) {
1474 int found_slot = 0, slot_index = -1;
1475 struct ifnet *ifp;
1476
1477 ifnet_head_lock_shared();
1478 ifp = ifindex2ifnet[ifindex];
1479 ifnet_head_done();
1480
1481 if (ifp == NULL) {
1482 goto out;
1483 }
1484
1485 if (IFNET_IS_EXPENSIVE(ifp) &&
1486 (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1487 goto out;
1488 }
1489
1490 if (IFNET_IS_CELLULAR(ifp) &&
1491 (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1492 goto out;
1493 }
1494
1495 if (IS_INTF_CLAT46(ifp)) {
1496 has_v4 = FALSE;
1497 }
1498
1499 /* Look for the slot in which to store/update the interface-info. */
1500 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1501 /* Found a potential empty slot where we can put it */
1502 if (mpte->mpte_itfinfo[i].ifindex == 0) {
1503 found_slot = 1;
1504 slot_index = i;
1505 }
1506
1507 /*
1508 * The interface is already in our array. Check if we
1509 * need to update it.
1510 */
1511 if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
1512 (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
1513 mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
1514 mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
1515 found_slot = 1;
1516 slot_index = i;
1517 break;
1518 }
1519
1520 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1521 /*
1522 * Ok, it's already there and we don't need
1523 * to update it
1524 */
1525 goto out;
1526 }
1527 }
1528
1529 if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) &&
1530 !has_nat64 && !has_v4) {
1531 if (found_slot) {
1532 mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1533 mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1534 mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1535 }
1536 mptcp_ask_for_nat64(ifp);
1537 goto out;
1538 }
1539
1540 if (found_slot == 0) {
1541 int new_size = mpte->mpte_itfinfo_size * 2;
1542 struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);
1543
1544 if (info == NULL) {
1545 os_log_error(mptcp_log_handle, "%s malloc failed for %u\n",
1546 __func__, new_size);
1547 goto out;
1548 }
1549
1550 memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
1551
1552 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
1553 _FREE(mpte->mpte_itfinfo, M_TEMP);
1554 }
1555
1556 /* We allocated a bigger array, so the first slot past the old entries is free */
1557 slot_index = mpte->mpte_itfinfo_size;
1558
1559 mpte->mpte_itfinfo = info;
1560 mpte->mpte_itfinfo_size = new_size;
1561 }
1562
1563 VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
1564 mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
1565 mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1566 mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1567 mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1568
1569 mptcp_sched_create_subflows(mpte);
1570 }
1571
1572 out:
1573 if (locked) {
1574 mpte_unlock(mpte);
1575 }
1576 }
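
/*
 * Note on the slot handling above: mpte_itfinfo starts out with
 * MPTE_ITFINFO_SIZE entries and is doubled whenever a new interface appears
 * and neither a free nor a matching slot exists:
 *
 *	MPTE_ITFINFO_SIZE -> 2 * MPTE_ITFINFO_SIZE -> 4 * MPTE_ITFINFO_SIZE -> ...
 *
 * The old entries are copied into the new array, the old buffer is freed only
 * when it was itself a grown allocation (size > MPTE_ITFINFO_SIZE, i.e.
 * presumably not the initial array embedded in the PCB), and the first slot
 * past the copied entries takes the new interface.
 */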
1577
1578 void
1579 mptcp_set_restrictions(struct socket *mp_so)
1580 {
1581 struct mptses *mpte = mpsotompte(mp_so);
1582 uint32_t i;
1583
1584 mpte_lock_assert_held(mpte);
1585
1586 ifnet_head_lock_shared();
1587
1588 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1589 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
1590 uint32_t ifindex = info->ifindex;
1591 struct ifnet *ifp;
1592
1593 if (ifindex == IFSCOPE_NONE) {
1594 continue;
1595 }
1596
1597 ifp = ifindex2ifnet[ifindex];
1598 if (ifp == NULL) {
1599 continue;
1600 }
1601
1602 if (IFNET_IS_EXPENSIVE(ifp) &&
1603 (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1604 info->ifindex = IFSCOPE_NONE;
1605 }
1606
1607 if (IFNET_IS_CELLULAR(ifp) &&
1608 (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1609 info->ifindex = IFSCOPE_NONE;
1610 }
1611 }
1612
1613 ifnet_head_done();
1614 }