1 /*
2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * A note on the MPTCP/NECP-interactions:
31 *
32 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
33 * MPTCP registers to these events at the MPTCP-layer for interface-events
34 * through a call to necp_client_register_multipath_cb.
35 * To get per-flow events (aka per TCP-subflow), we register to it with
36 * necp_client_register_socket_flow. Both registrations happen by using the
37 * necp-client-uuid that comes from the app.
38 *
39 * The locking is rather tricky. In general, we expect the lock-ordering to
40  * happen from necp-fd -> necp-client -> mpp_lock.
41 *
42 * There are however some subtleties.
43 *
44 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
45 * safe, because it is the very first time this MPTCP-connection goes into NECP.
46  * As we go into NECP we take the NECP-locks and are thus guaranteed that no
47  * NECP-event will deadlock us, because these NECP-events also take the
48  * NECP-locks first. Either they win the race and thus won't find our
49  * MPTCP-connection, or MPTCP wins the race and safely installs the
50  * callbacks while holding the NECP lock.
51 *
52  * 2. When registering the subflow-callbacks we must unlock the mpp_lock,
53  * because we have already registered callbacks and might race against an
54  * NECP-event that will match on our socket. So, we have to unlock to be safe.
55 *
56  * 3. When removing the multipath_cb, we do it in mp_pcbdispose(), after the
57  * so_usecount has reached 0. We must be careful not to remove the mpp_socket
58  * pointers before we have unregistered the callback, because, again, we might
59  * be racing against an NECP-event. Unregistering must happen with the
60  * mpp_lock unlocked, because of the lock-ordering constraint. An NECP-event
61  * could trigger before we had a chance to unregister. That's why we need to
62  * check the so_usecount in mptcp_session_necp_cb. If we get there while the
63  * socket is being garbage-collected, the use-count will have dropped to 0 and
64  * we exit. Removal of the multipath_cb again happens by taking the NECP-locks,
65  * so any running NECP-events will finish first and exit cleanly.
66 *
67 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
68  * the socket-lock must be unlocked because of lock-ordering constraints. This
69  * gets a bit tricky, as in tcp_garbage_collect we hold both the mp_so and the so lock.
70 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
71 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
72 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
73 * gets it, it will realize that the subflow became non-MPTCP and retry (see
74 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
75 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
76 * for the NECP-lock (held by the other thread that is taking care of the NECP-
77 * event). So, the event now finally gets the subflow-lock and then hits an
78 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
79 * the NECP callback.
80 */
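/*
 * Lock ordering, summarizing the cases above:
 *
 *     necp-fd lock  ->  necp-client lock  ->  mpp_lock (MP-socket lock)
 *
 * Callback registration and removal therefore happen either before any
 * NECP-event can find this connection (case 1), or with the mpp_lock
 * dropped (cases 2-4).
 */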
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/kernel.h>
85 #include <sys/mbuf.h>
86 #include <sys/mcache.h>
87 #include <sys/socket.h>
88 #include <sys/socketvar.h>
89 #include <sys/syslog.h>
90 #include <sys/protosw.h>
91
92 #include <kern/zalloc.h>
93 #include <kern/locks.h>
94
95 #include <mach/sdt.h>
96
97 #include <net/if.h>
98 #include <netinet/in.h>
99 #include <netinet/in_var.h>
100 #include <netinet/tcp.h>
101 #include <netinet/tcp_fsm.h>
102 #include <netinet/tcp_seq.h>
103 #include <netinet/tcp_var.h>
104 #include <netinet/mptcp_var.h>
105 #include <netinet/mptcp.h>
106 #include <netinet/mptcp_seq.h>
107 #include <netinet/mptcp_opt.h>
108 #include <netinet/mptcp_timer.h>
109
110 int mptcp_enable = 1;
111 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
112 &mptcp_enable, 0, "Enable Multipath TCP Support");
113
114 /* Number of times to try negotiating MPTCP on SYN retransmissions */
115 int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
116 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
117 CTLFLAG_RW | CTLFLAG_LOCKED,
118 &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
119
120 /*
121  * By default, DSS checksum is turned off; revisit if we ever do
122  * MPTCP for non-SSL traffic.
123 */
124 int mptcp_dss_csum = 0;
125 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
126 &mptcp_dss_csum, 0, "Enable DSS checksum");
127
128 /*
129 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
130 * is attempted on a different path.
131 */
132 int mptcp_fail_thresh = 1;
133 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
134 &mptcp_fail_thresh, 0, "Failover threshold");
135
136
137 /*
138 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
139 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
140 * Some carrier networks have a timeout of 10 or 15 minutes.
141 */
142 int mptcp_subflow_keeptime = 60*14;
143 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
144 &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
145
146 int mptcp_rtthist_rtthresh = 600;
147 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
148 &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
149
150 /*
151 * Use RTO history for sending new data
152 */
153 int mptcp_use_rto = 1;
154 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
155 &mptcp_use_rto, 0, "Disable RTO for subflow selection");
156
157 int mptcp_rtothresh = 1500;
158 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
159 &mptcp_rtothresh, 0, "RTO threshold");
160
161 /*
162 * Probe the preferred path, when it is not in use
163 */
164 uint32_t mptcp_probeto = 1000;
165 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
166 &mptcp_probeto, 0, "Disable probing by setting to 0");
167
168 uint32_t mptcp_probecnt = 5;
169 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
170 &mptcp_probecnt, 0, "Number of probe writes");
171
172 /*
173 * Static declarations
174 */
175 static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, uint64_t,
176 uint32_t, uint16_t, uint16_t);
177
178 static int
179 mptcp_reass_present(struct socket *mp_so)
180 {
181 struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
182 struct tseg_qent *q;
183 int dowakeup = 0;
184
185 /*
186 * Present data to user, advancing rcv_nxt through
187 * completed sequence space.
188 */
189 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
190 return (0);
191 q = LIST_FIRST(&mp_tp->mpt_segq);
192 if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt)
193 return (0);
194
195 /*
196 * If there is already another thread doing reassembly for this
197 * connection, it is better to let it finish the job --
198 * (radar 16316196)
199 */
200 if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG)
201 return (0);
202
203 mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
204
205 do {
206 mp_tp->mpt_rcvnxt += q->tqe_len;
207 LIST_REMOVE(q, tqe_q);
208 if (mp_so->so_state & SS_CANTRCVMORE) {
209 m_freem(q->tqe_m);
210 } else {
211 if (sbappendstream(&mp_so->so_rcv, q->tqe_m))
212 dowakeup = 1;
213 }
214 zfree(tcp_reass_zone, q);
215 mp_tp->mpt_reassqlen--;
216 q = LIST_FIRST(&mp_tp->mpt_segq);
217 } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
218 mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
219
220 if (dowakeup)
221 sorwakeup(mp_so); /* done with socket lock held */
222 return (0);
223
224 }
225
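/*
 * Insert an out-of-order segment (keyed by its MPTCP data sequence number)
 * into the reassembly queue, trimming any overlap with already-queued
 * segments, and then try to present newly contiguous data to the user.
 */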
226 static int
227 mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
228 {
229 struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
230 u_int64_t mb_dsn = phdr->mp_dsn;
231 struct tseg_qent *q;
232 struct tseg_qent *p = NULL;
233 struct tseg_qent *nq;
234 struct tseg_qent *te = NULL;
235 u_int16_t qlimit;
236
237 /*
238 * Limit the number of segments in the reassembly queue to prevent
239 * holding on to too many segments (and thus running out of mbufs).
240  * Make sure to let through the missing segment that caused this
241  * queue to build up. Always keep one global queue entry spare to be
242  * able to process the missing segment.
243 */
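/*
 * For example, with a 512 KB so_rcv high-water mark this allows
 * max(100, 512 KB >> 10) = 512 queued segments, further capped by
 * tcp_autorcvbuf_max >> 10.
 */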
244 qlimit = min(max(100, mp_so->so_rcv.sb_hiwat >> 10),
245 (tcp_autorcvbuf_max >> 10));
246 if (mb_dsn != mp_tp->mpt_rcvnxt &&
247 (mp_tp->mpt_reassqlen + 1) >= qlimit) {
248 tcpstat.tcps_mptcp_rcvmemdrop++;
249 m_freem(m);
250 *tlenp = 0;
251 return (0);
252 }
253
254 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
255 te = (struct tseg_qent *) zalloc(tcp_reass_zone);
256 if (te == NULL) {
257 tcpstat.tcps_mptcp_rcvmemdrop++;
258 m_freem(m);
259 return (0);
260 }
261
262 mp_tp->mpt_reassqlen++;
263
264 /*
265 * Find a segment which begins after this one does.
266 */
267 LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
268 if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn))
269 break;
270 p = q;
271 }
272
273 /*
274 * If there is a preceding segment, it may provide some of
275 * our data already. If so, drop the data from the incoming
276 * segment. If it provides all of our data, drop us.
277 */
278 if (p != NULL) {
279 int64_t i;
280 /* conversion to int (in i) handles seq wraparound */
281 i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
282 if (i > 0) {
283 if (i >= *tlenp) {
284 tcpstat.tcps_mptcp_rcvduppack++;
285 m_freem(m);
286 zfree(tcp_reass_zone, te);
287 te = NULL;
288 mp_tp->mpt_reassqlen--;
289 /*
290 * Try to present any queued data
291 * at the left window edge to the user.
292 * This is needed after the 3-WHS
293 * completes.
294 */
295 goto out;
296 }
297 m_adj(m, i);
298 *tlenp -= i;
299 phdr->mp_dsn += i;
300 }
301 }
302
303 tcpstat.tcps_mp_oodata++;
304
305 /*
306  * While we overlap succeeding segments, trim them or,
307  * if they are completely covered, dequeue them.
308 */
309 while (q) {
310 int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
311 if (i <= 0)
312 break;
313
314 if (i < q->tqe_len) {
315 q->tqe_m->m_pkthdr.mp_dsn += i;
316 q->tqe_len -= i;
317 m_adj(q->tqe_m, i);
318 break;
319 }
320
321 nq = LIST_NEXT(q, tqe_q);
322 LIST_REMOVE(q, tqe_q);
323 m_freem(q->tqe_m);
324 zfree(tcp_reass_zone, q);
325 mp_tp->mpt_reassqlen--;
326 q = nq;
327 }
328
329 /* Insert the new segment queue entry into place. */
330 te->tqe_m = m;
331 te->tqe_th = NULL;
332 te->tqe_len = *tlenp;
333
334 if (p == NULL) {
335 LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
336 } else {
337 LIST_INSERT_AFTER(p, te, tqe_q);
338 }
339
340 out:
341 return (mptcp_reass_present(mp_so));
342 }
343
344 /*
345 * MPTCP input, called when data has been read from a subflow socket.
346 */
347 void
348 mptcp_input(struct mptses *mpte, struct mbuf *m)
349 {
350 struct socket *mp_so;
351 struct mptcb *mp_tp = NULL;
352 int count = 0, wakeup = 0;
353 struct mbuf *save = NULL, *prev = NULL;
354 struct mbuf *freelist = NULL, *tail = NULL;
355
356 VERIFY(m->m_flags & M_PKTHDR);
357
358 mpte_lock_assert_held(mpte); /* same as MP socket lock */
359
360 mp_so = mptetoso(mpte);
361 mp_tp = mpte->mpte_mptcb;
362
363 DTRACE_MPTCP(input);
364
365 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
366
367  /*
368  * Each mbuf contains an MPTCP Data Sequence Map.
369  * Process the data for reassembly, delivery to the MPTCP socket
370  * client, etc.
371  *
372  */
373 count = mp_so->so_rcv.sb_cc;
374
375 /*
376 * In the degraded fallback case, data is accepted without DSS map
377 */
378 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
379 fallback:
380 mptcp_sbrcv_grow(mp_tp);
381
382 /*
383  * Assume a degraded flow, as this may be the first packet
384  * without DSS, and the subflow state is not updated yet.
385 */
386 if (sbappendstream(&mp_so->so_rcv, m))
387 sorwakeup(mp_so);
388 DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
389 struct socket *, mp_so,
390 struct sockbuf *, &mp_so->so_rcv,
391 struct sockbuf *, &mp_so->so_snd,
392 struct mptses *, mpte);
393 count = mp_so->so_rcv.sb_cc - count;
394 mptcplog((LOG_DEBUG, "%s: Fallback read %d bytes\n", __func__,
395 count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
396 return;
397 }
398
399 do {
400 u_int64_t mb_dsn;
401 int32_t mb_datalen;
402 int64_t todrop;
403
404 /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
405 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
406 goto fallback;
407
408 save = m->m_next;
409 /*
410 * A single TCP packet formed of multiple mbufs
411 * holds DSS mapping in the first mbuf of the chain.
412 * Other mbufs in the chain may have M_PKTHDR set
413 * even though they belong to the same TCP packet
414 * and therefore use the DSS mapping stored in the
415 * first mbuf of the mbuf chain. mptcp_input() can
416 * get an mbuf chain with multiple TCP packets.
417 */
418 while (save && (!(save->m_flags & M_PKTHDR) ||
419 !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
420 prev = save;
421 save = save->m_next;
422 }
423 if (prev)
424 prev->m_next = NULL;
425 else
426 m->m_next = NULL;
427
428 mb_dsn = m->m_pkthdr.mp_dsn;
429 mb_datalen = m->m_pkthdr.mp_rlen;
430
431 todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
432 if (todrop > 0) {
433 tcpstat.tcps_mptcp_rcvpackafterwin++;
434
435 if (todrop >= mb_datalen) {
436 if (freelist == NULL)
437 freelist = m;
438 else
439 tail->m_next = m;
440
441 if (prev != NULL)
442 tail = prev;
443 else
444 tail = m;
445
446 m = save;
447 prev = save = NULL;
448 continue;
449 } else {
450 m_adj(m, -todrop);
451 mb_datalen -= todrop;
452 }
453 }
454
455 if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
456 !LIST_EMPTY(&mp_tp->mpt_segq)) {
457 mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
458
459 goto next;
460 }
461
462 if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
463 if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
464 mp_tp->mpt_rcvnxt)) {
465 if (freelist == NULL)
466 freelist = m;
467 else
468 tail->m_next = m;
469
470 if (prev != NULL)
471 tail = prev;
472 else
473 tail = m;
474
475 m = save;
476 prev = save = NULL;
477 continue;
478 } else {
479 m_adj(m, (mp_tp->mpt_rcvnxt - mb_dsn));
480 }
481 mptcplog((LOG_INFO, "%s: Left Edge %llu\n", __func__,
482 mp_tp->mpt_rcvnxt),
483 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
484 }
485
486 mptcp_sbrcv_grow(mp_tp);
487
488 if (sbappendstream(&mp_so->so_rcv, m))
489 wakeup = 1;
490
491 DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
492 struct sockbuf *, &mp_so->so_rcv,
493 struct sockbuf *, &mp_so->so_snd,
494 struct mptses *, mpte,
495 struct mptcb *, mp_tp);
496 count = mp_so->so_rcv.sb_cc - count;
497 tcpstat.tcps_mp_rcvtotal++;
498 tcpstat.tcps_mp_rcvbytes += count;
499 mptcplog((LOG_DEBUG, "%s: Read %d bytes\n", __func__, count),
500 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
501
502 mp_tp->mpt_rcvnxt += count;
503
504 next:
505 m = save;
506 prev = save = NULL;
507 count = mp_so->so_rcv.sb_cc;
508 } while (m);
509
510 if (freelist)
511 m_freem(freelist);
512
513 if (wakeup)
514 sorwakeup(mp_so);
515 }
516
517 static boolean_t
518 mptcp_can_send_more(struct mptcb *mp_tp)
519 {
520 struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
521
522 /*
523 * Always send if there is data in the reinject-queue.
524 */
525 if (mp_tp->mpt_mpte->mpte_reinjectq)
526 return (TRUE);
527
528 /*
529 * Don't send, if:
530 *
531  * 1. snd_nxt >= snd_max: basically everything has been sent, except
532  * when using TFO, where we might be doing a 0-byte write.
533  * 2. snd_una + snd_wnd <= snd_nxt: no space left in the receiver's window.
534  * 3. snd_nxt + 1 == snd_max and we are closing: a DATA_FIN is scheduled.
535 */
536
537 if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax))
538 return (FALSE);
539
540 if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt))
541 return (FALSE);
542
543 if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
544 return (FALSE);
545
546 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2)
547 return (FALSE);
548
549 return (TRUE);
550 }
551
552 /*
553  * MPTCP output: pick the best subflow and push data out on it; called with the MP-socket lock held.
554 */
555 int
556 mptcp_output(struct mptses *mpte)
557 {
558 struct mptcb *mp_tp;
559 struct mptsub *mpts;
560 struct mptsub *mpts_tried = NULL;
561 struct socket *mp_so;
562 struct mptsub *preferred_mpts = NULL;
563 uint64_t old_snd_nxt;
564 int error = 0;
565
566 mpte_lock_assert_held(mpte);
567 mp_so = mptetoso(mpte);
568 mp_tp = mpte->mpte_mptcb;
569
570 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
571 mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
572
573 mptcplog((LOG_DEBUG, "%s: snxt %u sndmax %u suna %u swnd %u reinjectq %u state %u\n",
574 __func__, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
575 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_sndwnd,
576 mpte->mpte_reinjectq ? 1 : 0,
577 mp_tp->mpt_state),
578 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
579
580 old_snd_nxt = mp_tp->mpt_sndnxt;
581 while (mptcp_can_send_more(mp_tp)) {
582 /* get the "best" subflow to be used for transmission */
583 mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
584 if (mpts == NULL) {
585 mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
586 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
587 break;
588 }
589
590 mptcplog((LOG_DEBUG, "%s: using id %u\n", __func__, mpts->mpts_connid),
591 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
592
593 /* In case there's just one flow, we reattempt later */
594 if (mpts_tried != NULL &&
595 (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
596 mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
597 mpts_tried->mpts_flags |= MPTSF_ACTIVE;
598 mptcp_start_timer(mpte, MPTT_REXMT);
599 mptcplog((LOG_DEBUG, "%s: retry later\n", __func__),
600 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
601 break;
602 }
603
604 /*
605 * Automatic sizing of send socket buffer. Increase the send
606  * socket buffer size if all of the following criteria are met:
607  * 1. the receiver has enough buffer space for this data, and
608  * 2. the send buffer is filled to 7/8th with data (so we actually
609  * have data to make use of it).
610 */
611 if (tcp_do_autosendbuf == 1 &&
612 (mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
613 tcp_cansbgrow(&mp_so->so_snd)) {
614 if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
615 mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
616 if (sbreserve(&mp_so->so_snd,
617 min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
618 tcp_autosndbuf_max)) == 1) {
619 mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
620
621 mptcplog((LOG_DEBUG, "%s: increased snd hiwat to %u lowat %u\n",
622 __func__, mp_so->so_snd.sb_hiwat,
623 mp_so->so_snd.sb_lowat),
624 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
625 }
626 }
627 }
628
629 DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
630 struct socket *, mp_so);
631 error = mptcp_subflow_output(mpte, mpts, 0);
632 if (error) {
633 /* can be a temporary loss of source address or other error */
634 mpts->mpts_flags |= MPTSF_FAILINGOVER;
635 mpts->mpts_flags &= ~MPTSF_ACTIVE;
636 mpts_tried = mpts;
637 mptcplog((LOG_ERR, "%s: Error = %d mpts_flags %#x\n", __func__,
638 error, mpts->mpts_flags),
639 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
640 break;
641 }
642 /* The model is to have only one active flow at a time */
643 mpts->mpts_flags |= MPTSF_ACTIVE;
644 mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
645
646  /* Probe the preferred path, when not in use, to keep its smoothed RTT updated */
647 if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
648 if (preferred_mpts->mpts_probesoon) {
649 if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
650 mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
651 if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
652 preferred_mpts->mpts_probesoon = 0;
653 preferred_mpts->mpts_probecnt = 0;
654 }
655 }
656 } else {
657 preferred_mpts->mpts_probesoon = tcp_now;
658 preferred_mpts->mpts_probecnt = 0;
659 }
660 }
661
662 if (mpte->mpte_active_sub == NULL) {
663 mpte->mpte_active_sub = mpts;
664 } else if (mpte->mpte_active_sub != mpts) {
665 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
666 struct tcpcb *acttp = sototcpcb(mpte->mpte_active_sub->mpts_socket);
667
668 mptcplog((LOG_DEBUG, "%s: switch [%u, srtt %d] to [%u, srtt %d]\n", __func__,
669 mpte->mpte_active_sub->mpts_connid, acttp->t_srtt >> TCP_RTT_SHIFT,
670 mpts->mpts_connid, tp->t_srtt >> TCP_RTT_SHIFT),
671 (MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG), MPTCP_LOGLVL_LOG);
672
673 mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
674 mpte->mpte_active_sub = mpts;
675
676 mptcpstats_inc_switch(mpte, mpts);
677 }
678 }
679
680 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
681
682 /* subflow errors should not be percolated back up */
683 return (0);
684 }
685
686
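/*
 * Compare a candidate subflow against the current best one of its class
 * (cellular vs. non-cellular): a lower smoothed RTT wins, but a subflow
 * that is not retransmitting is always preferred over one that is.
 */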
687 static struct mptsub *
688 mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
689 {
690 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
691
692 /*
693  * Lower RTT? Take it, if it's our first one, or
694  * it doesn't have any loss, or the current one has
695  * loss as well.
696 */
697 if (tp->t_srtt && *currtt > tp->t_srtt &&
698 (curbest == NULL || tp->t_rxtshift == 0 ||
699 sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
700 *currtt = tp->t_srtt;
701 return (mpts);
702 }
703
704 /*
705 * If we find a subflow without loss, take it always!
706 */
707 if (curbest &&
708 sototcpcb(curbest->mpts_socket)->t_rxtshift &&
709 tp->t_rxtshift == 0) {
710 *currtt = tp->t_srtt;
711 return (mpts);
712 }
713
714 return (curbest != NULL ? curbest : mpts);
715 }
716
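/*
 * Hand back a subflow only if it still has space in its congestion
 * window; otherwise return NULL so that nothing gets scheduled on it.
 */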
717 static struct mptsub *
718 mptcp_return_subflow(struct mptsub *mpts)
719 {
720 if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0)
721 return (NULL);
722
723 return (mpts);
724 }
725
726 /*
727 * Return the most eligible subflow to be used for sending data.
728 */
729 struct mptsub *
730 mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
731 {
732 struct tcpcb *besttp, *secondtp;
733 struct inpcb *bestinp, *secondinp;
734 struct mptsub *mpts;
735 struct mptsub *best = NULL;
736 struct mptsub *second_best = NULL;
737 int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;
738
739 /*
740 * First Step:
741 * Choose the best subflow for cellular and non-cellular interfaces.
742 */
743
744 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
745 struct socket *so = mpts->mpts_socket;
746 struct tcpcb *tp = sototcpcb(so);
747 struct inpcb *inp = sotoinpcb(so);
748
749 mptcplog((LOG_DEBUG, "%s mpts %u ignore %d, mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
750 __func__, mpts->mpts_connid, ignore ? ignore->mpts_connid : -1, mpts->mpts_flags,
751 INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
752 inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
753 tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
754 mptcp_subflow_cwnd_space(so)),
755 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
756
757 /*
758 * First, the hard conditions to reject subflows
759 * (e.g., not connected,...)
760 */
761 if (mpts == ignore || inp->inp_last_outifp == NULL)
762 continue;
763
764 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
765 continue;
766
767 /* There can only be one subflow in degraded state */
768 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
769 best = mpts;
770 break;
771 }
772
773 /*
774 * If this subflow is waiting to finally send, do it!
775 */
776 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
777 return (mptcp_return_subflow(mpts));
778
779 /*
780 * Only send if the subflow is MP_CAPABLE. The exceptions to
781 * this rule (degraded or TFO) have been taken care of above.
782 */
783 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE))
784 continue;
785
786 if ((so->so_state & SS_ISDISCONNECTED) ||
787 !(so->so_state & SS_ISCONNECTED) ||
788 !TCPS_HAVEESTABLISHED(tp->t_state) ||
789 tp->t_state > TCPS_CLOSE_WAIT)
790 continue;
791
792 /*
793 * Second, the soft conditions to find the subflow with best
794 * conditions for each set (aka cellular vs non-cellular)
795 */
796 if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
797 second_best = mptcp_choose_subflow(mpts, second_best,
798 &exp_rtt);
799 else
800 best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
801 }
802
803 /*
804 * If there is no preferred or backup subflow, and there is no active
805  * subflow, use the last usable subflow.
806 */
807 if (best == NULL)
808 return (mptcp_return_subflow(second_best));
809
810 if (second_best == NULL)
811 return (mptcp_return_subflow(best));
812
813 besttp = sototcpcb(best->mpts_socket);
814 bestinp = sotoinpcb(best->mpts_socket);
815 secondtp = sototcpcb(second_best->mpts_socket);
816 secondinp = sotoinpcb(second_best->mpts_socket);
817
818 if (preferred != NULL)
819 *preferred = mptcp_return_subflow(best);
820
821 /*
822  * Second Step: among best and second_best, choose the one that is
823  * most appropriate for this particular service-type.
824 */
825 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
826 /*
827 * Only handover if Symptoms tells us to do so.
828 */
829 if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
830 mptcp_is_wifi_unusable() &&
831 besttp->t_rxtshift >= mptcp_fail_thresh)
832 return (mptcp_return_subflow(second_best));
833
834 return (mptcp_return_subflow(best));
835 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
836 int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
837 int rto_thresh = mptcp_rtothresh;
838
839 /* Adjust with symptoms information */
840 if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
841 mptcp_is_wifi_unusable()) {
842 rtt_thresh /= 2;
843 rto_thresh /= 2;
844 }
845
846 if (besttp->t_srtt && secondtp->t_srtt &&
847 besttp->t_srtt >= rtt_thresh &&
848 secondtp->t_srtt < rtt_thresh) {
849 tcpstat.tcps_mp_sel_rtt++;
850 mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d, second cid %d at rtt %d\n", __func__,
851 best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
852 second_best->mpts_connid,
853 secondtp->t_srtt >> TCP_RTT_SHIFT),
854 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
855 return (mptcp_return_subflow(second_best));
856 }
857
858 if (besttp->t_rxtshift >= mptcp_fail_thresh &&
859 secondtp->t_rxtshift == 0) {
860 return (mptcp_return_subflow(second_best));
861 }
862
863 /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
864 if (besttp->t_rxtcur && secondtp->t_rxtcur &&
865 besttp->t_rxtcur >= rto_thresh &&
866 secondtp->t_rxtcur < rto_thresh) {
867 tcpstat.tcps_mp_sel_rto++;
868 mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
869 best->mpts_connid, besttp->t_rxtcur,
870 second_best->mpts_connid, secondtp->t_rxtcur),
871 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
872
873 return (mptcp_return_subflow(second_best));
874 }
875
876 /*
877 * None of the above conditions for sending on the secondary
878  * were true. So, let's schedule on the best one, if it still
879 * has some space in the congestion-window.
880 */
881 return (mptcp_return_subflow(best));
882 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_AGGREGATE) {
883 struct mptsub *tmp;
884
885 /*
886 * We only care about RTT when aggregating
887 */
888 if (besttp->t_srtt > secondtp->t_srtt) {
889 tmp = best;
890 best = second_best;
891 besttp = secondtp;
892 bestinp = secondinp;
893
894 second_best = tmp;
895 secondtp = sototcpcb(second_best->mpts_socket);
896 secondinp = sotoinpcb(second_best->mpts_socket);
897 }
898
899 /* Is there still space in the congestion window? */
900 if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0)
901 return (mptcp_return_subflow(second_best));
902
903 return (mptcp_return_subflow(best));
904 } else {
905 panic("Unknown service-type configured for MPTCP");
906 }
907
908 return (NULL);
909 }
910
911 static const char *
912 mptcp_event_to_str(uint32_t event)
913 {
914 const char *c = "UNDEFINED";
915 switch (event) {
916 case MPCE_CLOSE:
917 c = "MPCE_CLOSE";
918 break;
919 case MPCE_RECV_DATA_ACK:
920 c = "MPCE_RECV_DATA_ACK";
921 break;
922 case MPCE_RECV_DATA_FIN:
923 c = "MPCE_RECV_DATA_FIN";
924 break;
925 }
926 return (c);
927 }
928
929 static const char *
930 mptcp_state_to_str(mptcp_state_t state)
931 {
932 const char *c = "UNDEFINED";
933 switch (state) {
934 case MPTCPS_CLOSED:
935 c = "MPTCPS_CLOSED";
936 break;
937 case MPTCPS_LISTEN:
938 c = "MPTCPS_LISTEN";
939 break;
940 case MPTCPS_ESTABLISHED:
941 c = "MPTCPS_ESTABLISHED";
942 break;
943 case MPTCPS_CLOSE_WAIT:
944 c = "MPTCPS_CLOSE_WAIT";
945 break;
946 case MPTCPS_FIN_WAIT_1:
947 c = "MPTCPS_FIN_WAIT_1";
948 break;
949 case MPTCPS_CLOSING:
950 c = "MPTCPS_CLOSING";
951 break;
952 case MPTCPS_LAST_ACK:
953 c = "MPTCPS_LAST_ACK";
954 break;
955 case MPTCPS_FIN_WAIT_2:
956 c = "MPTCPS_FIN_WAIT_2";
957 break;
958 case MPTCPS_TIME_WAIT:
959 c = "MPTCPS_TIME_WAIT";
960 break;
961 case MPTCPS_TERMINATE:
962 c = "MPTCPS_TERMINATE";
963 break;
964 }
965 return (c);
966 }
967
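/*
 * Advance the MPTCP connection-level state machine on close-related
 * events (local close, received DATA_ACK, received DATA_FIN), mirroring
 * the TCP close diagram at the data level.
 */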
968 void
969 mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
970 {
971 mpte_lock_assert_held(mp_tp->mpt_mpte);
972 mptcp_state_t old_state = mp_tp->mpt_state;
973
974 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
975 uint32_t, event);
976
977 switch (mp_tp->mpt_state) {
978 case MPTCPS_CLOSED:
979 case MPTCPS_LISTEN:
980 mp_tp->mpt_state = MPTCPS_CLOSED;
981 break;
982
983 case MPTCPS_ESTABLISHED:
984 if (event == MPCE_CLOSE) {
985 mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
986 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
987 } else if (event == MPCE_RECV_DATA_FIN) {
988 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
989 mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
990 }
991 break;
992
993 case MPTCPS_CLOSE_WAIT:
994 if (event == MPCE_CLOSE) {
995 mp_tp->mpt_state = MPTCPS_LAST_ACK;
996 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
997 }
998 break;
999
1000 case MPTCPS_FIN_WAIT_1:
1001 if (event == MPCE_RECV_DATA_ACK) {
1002 mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
1003 } else if (event == MPCE_RECV_DATA_FIN) {
1004 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1005 mp_tp->mpt_state = MPTCPS_CLOSING;
1006 }
1007 break;
1008
1009 case MPTCPS_CLOSING:
1010 if (event == MPCE_RECV_DATA_ACK)
1011 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1012 break;
1013
1014 case MPTCPS_LAST_ACK:
1015 if (event == MPCE_RECV_DATA_ACK)
1016 mptcp_close(mp_tp->mpt_mpte, mp_tp);
1017 break;
1018
1019 case MPTCPS_FIN_WAIT_2:
1020 if (event == MPCE_RECV_DATA_FIN) {
1021 mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1022 mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1023 }
1024 break;
1025
1026 case MPTCPS_TIME_WAIT:
1027 case MPTCPS_TERMINATE:
1028 break;
1029
1030 default:
1031 VERIFY(0);
1032 /* NOTREACHED */
1033 }
1034 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1035 uint32_t, event);
1036 mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
1037 mptcp_state_to_str(old_state),
1038 mptcp_state_to_str(mp_tp->mpt_state),
1039 mptcp_event_to_str(event)),
1040 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
1041 }
1042
1043 /* If you change this function, match up mptcp_update_rcv_state_f */
1044 void
1045 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
1046 uint16_t csum)
1047 {
1048 struct mptcb *mp_tp = tptomptp(tp);
1049 u_int64_t full_dsn = 0;
1050
1051 NTOHL(dss_info->mdss_dsn);
1052 NTOHL(dss_info->mdss_subflow_seqn);
1053 NTOHS(dss_info->mdss_data_len);
1054
1055 /* XXX for autosndbuf grow sb here */
1056 MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
1057 mptcp_update_rcv_state_meat(mp_tp, tp,
1058 full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
1059 csum);
1060
1061 }
1062
1063 void
1064 mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
1065 u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
1066 uint16_t csum)
1067 {
1068 if (mdss_data_len == 0) {
1069 mptcplog((LOG_INFO, "%s: Infinite Mapping.\n", __func__),
1070 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1071
1072 if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
1073 mptcplog((LOG_ERR, "%s: Bad checksum %x \n", __func__,
1074 csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
1075 }
1076 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1077 return;
1078 }
1079 mptcplog((LOG_DEBUG,
1080 "%s: seqn = %x len = %x full = %llx rcvnxt = %llu \n", __func__,
1081 seqn, mdss_data_len, full_dsn, mp_tp->mpt_rcvnxt),
1082 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1083
1084  /* Process a Data FIN packet, handled in mptcp_do_fin_opt */
1085 if ((seqn == 0) && (mdss_data_len == 1)) {
1086 mptcplog((LOG_INFO, "%s: Data FIN in %s state \n", __func__,
1087 mptcp_state_to_str(mp_tp->mpt_state)),
1088 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1089 return;
1090 }
1091 mptcp_notify_mpready(tp->t_inpcb->inp_socket);
1092 tp->t_rcv_map.mpt_dsn = full_dsn;
1093 tp->t_rcv_map.mpt_sseq = seqn;
1094 tp->t_rcv_map.mpt_len = mdss_data_len;
1095 tp->t_rcv_map.mpt_csum = csum;
1096 tp->t_mpflags |= TMPF_EMBED_DSN;
1097 }
1098
1099
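/*
 * Sanity-check the DSS mapping recorded on the mbuf against the actual
 * segment length; on a mismatch, signal MP_FAIL and fall back to TCP.
 */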
1100 static int
1101 mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
1102 int hdrlen)
1103 {
1104 u_int32_t datalen;
1105
1106 if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
1107 return 0;
1108
1109 datalen = m->m_pkthdr.mp_rlen;
1110
1111 /* unacceptable DSS option, fallback to TCP */
1112 if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
1113 mptcplog((LOG_ERR, "%s: mbuf len %d, MPTCP expected %d",
1114 __func__, m->m_pkthdr.len, datalen),
1115 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1116 } else {
1117 return 0;
1118 }
1119 tp->t_mpflags |= TMPF_SND_MPFAIL;
1120 mptcp_notify_mpfail(so);
1121 m_freem(m);
1122 return -1;
1123 }
1124
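/*
 * Per-segment MPTCP preprocessing on the subflow input path: record the
 * DSS receive mapping on the mbuf (mptcp_insert_rmap) and validate it
 * against the segment before the payload is handed up.
 */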
1125 int
1126 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, int drop_hdrlen)
1127 {
1128 mptcp_insert_rmap(tp, m);
1129 if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
1130 drop_hdrlen) != 0)
1131 return -1;
1132 return 0;
1133 }
1134
1135 /*
1136 * MPTCP Checksum support
1137 * The checksum is calculated whenever the MPTCP DSS option is included
1138  * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
1139 * header and the actual data indicated by the length specified in the
1140 * DSS option.
1141 */
1142
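/*
 * Sketch of what the DSS checksum covers (per RFC 6824): the 16-bit one's
 * complement sum is taken over the data-level payload plus a pseudo-header
 * of
 *
 *     64-bit data sequence number (DSN)
 *     32-bit subflow sequence number
 *     16-bit data-level length
 *     16-bit checksum field (zero on transmit)
 *
 * which mptcp_input_csum()/mptcp_output_csum() below fold in via
 * m_sum16() and in_pseudo64().
 */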
1143 int
1144 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
1145 uint32_t sseq, uint16_t dlen, uint16_t csum)
1146 {
1147 uint16_t mptcp_csum;
1148
1149 mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum);
1150 if (mptcp_csum) {
1151 tp->t_mpflags |= TMPF_SND_MPFAIL;
1152 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1153 m_freem(m);
1154 tcpstat.tcps_mp_badcsum++;
1155 return (-1);
1156 }
1157 return (0);
1158 }
1159
1160 static uint16_t
1161 mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
1162 uint16_t dlen, uint16_t csum)
1163 {
1164 struct mptcb *mp_tp = tptomptp(tp);
1165 uint32_t sum = 0;
1166
1167 if (mp_tp == NULL)
1168 return (0);
1169
1170 if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
1171 return (0);
1172
1173 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
1174 return (0);
1175
1176 /*
1177 * The remote side may send a packet with fewer bytes than the
1178 * claimed DSS checksum length.
1179 */
1180 if ((int)m_length2(m, NULL) < dlen)
1181 return (0xffff);
1182
1183 if (dlen != 0)
1184 sum = m_sum16(m, 0, dlen);
1185
1186 sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
1187 ADDCARRY(sum);
1188 DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1189 uint32_t, sum);
1190
1191 mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1192 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1193 return (~sum & 0xffff);
1194 }
1195
1196 uint32_t
1197 mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
1198 {
1199 u_int32_t sum = 0;
1200
1201 if (dlen)
1202 sum = m_sum16(m, 0, dlen);
1203
1204 dss_val = mptcp_hton64(dss_val);
1205 sseq = htonl(sseq);
1206 dlen = htons(dlen);
1207 sum += in_pseudo64(dss_val, sseq, dlen);
1208
1209 ADDCARRY(sum);
1210 sum = ~sum & 0xffff;
1211 DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
1212 mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1213 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1214
1215 return sum;
1216 }
1217
1218 /*
1219  * When the WiFi signal starts fading, there is more loss and the RTT spikes.
1220  * Check whether the retransmission timeout has spiked above the tolerable
1221  * threshold (mptcp_rtothresh); return TRUE if it has not.
1222 */
1223 boolean_t
1224 mptcp_no_rto_spike(struct socket *so)
1225 {
1226 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1227 int32_t spike = 0;
1228
1229 if (tp->t_rxtcur > mptcp_rtothresh) {
1230 spike = tp->t_rxtcur - mptcp_rtothresh;
1231
1232 mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
1233 __func__, spike,
1234 tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1235 tp->t_rttcur),
1236 (MPTCP_SOCKET_DBG|MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1237
1238 }
1239
1240  if (spike > 0) {
1241 return (FALSE);
1242 } else {
1243 return (TRUE);
1244 }
1245 }
1246
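/*
 * Run the work that was deferred while an upcall was outstanding: clear
 * 'flag', and unless upcalls are still being deferred, perform any pending
 * workloop run, read/write wakeups and cell-icon updates.
 */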
1247 void
1248 mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
1249 {
1250 VERIFY(mpp->mpp_flags & flag);
1251 mpp->mpp_flags &= ~flag;
1252
1253 if (mptcp_should_defer_upcall(mpp))
1254 return;
1255
1256 if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
1257 mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;
1258
1259 mptcp_subflow_workloop(mpp->mpp_pcbe);
1260 }
1261
1262 if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
1263 mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;
1264
1265 sorwakeup(mpp->mpp_socket);
1266 }
1267
1268 if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
1269 mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;
1270
1271 sowwakeup(mpp->mpp_socket);
1272 }
1273
1274 if (mpp->mpp_flags & MPP_SET_CELLICON) {
1275 mpp->mpp_flags &= ~MPP_SET_CELLICON;
1276
1277 mptcp_set_cellicon(mpp->mpp_pcbe);
1278 }
1279
1280 if (mpp->mpp_flags & MPP_UNSET_CELLICON) {
1281 mpp->mpp_flags &= ~MPP_UNSET_CELLICON;
1282
1283 mptcp_unset_cellicon();
1284 }
1285 }
1286
1287 static void
1288 mptcp_ask_for_nat64(struct ifnet *ifp)
1289 {
1290 in6_post_msg(ifp, KEV_INET6_REQUEST_NAT64_PREFIX, NULL, NULL);
1291
1292 mptcplog((LOG_DEBUG, "%s: asked for NAT64-prefix on %s\n",
1293 __func__, ifp->if_name), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1294 }
1295
1296 static void
1297 mptcp_reset_itfinfo(struct mpt_itf_info *info)
1298 {
1299 info->ifindex = 0;
1300 info->has_v4_conn = 0;
1301 info->has_v6_conn = 0;
1302 }
1303
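/*
 * NECP callback for the MPTCP session (registered through
 * necp_client_register_multipath_cb, see the note at the top of this
 * file). Updates mpte_itfinfo as interfaces become viable or non-viable
 * and schedules subflow creation accordingly.
 */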
1304 void
1305 mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
1306 {
1307 struct mppcb *mp = (struct mppcb *)handle;
1308 struct mptses *mpte = mptompte(mp);
1309 struct socket *mp_so;
1310 struct mptcb *mp_tp;
1311 int locked = 0;
1312 uint32_t i, ifindex;
1313
1314 ifindex = flow->interface_index;
1315 VERIFY(ifindex != IFSCOPE_NONE);
1316
1317 /* ToDo - remove after rdar://problem/32007628 */
1318 if (!IF_INDEX_IN_RANGE(ifindex))
1319 printf("%s 1 ifindex %u not in range of flow %p action %d\n",
1320 __func__, ifindex, flow, action);
1321
1322 /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
1323 if (mp->mpp_socket->so_usecount == 0)
1324 return;
1325
1326 if (action != NECP_CLIENT_CBACTION_INITIAL) {
1327 mpte_lock(mpte);
1328 locked = 1;
1329
1330 /* Check again, because it might have changed while waiting */
1331 if (mp->mpp_socket->so_usecount == 0)
1332 goto out;
1333 }
1334
1335 mp_tp = mpte->mpte_mptcb;
1336 mp_so = mptetoso(mpte);
1337
1338 mptcplog((LOG_DEBUG, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u\n",
1339 __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state),
1340 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1341
1342 /* No need on fallen back sockets */
1343 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)
1344 goto out;
1345
1346 if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
1347 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1348 if (mpte->mpte_itfinfo[i].ifindex == ifindex)
1349 mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
1350 }
1351
1352 mptcp_sched_create_subflows(mpte);
1353 } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
1354 action == NECP_CLIENT_CBACTION_INITIAL) {
1355 int found_empty = 0, empty_index = -1;
1356 struct ifnet *ifp;
1357
1358 /* ToDo - remove after rdar://problem/32007628 */
1359 if (!IF_INDEX_IN_RANGE(ifindex))
1360 printf("%s 2 ifindex %u not in range of flow %p action %d\n",
1361 __func__, ifindex, flow, action);
1362
1363 ifnet_head_lock_shared();
1364 ifp = ifindex2ifnet[ifindex];
1365 ifnet_head_done();
1366
1367 /* ToDo - remove after rdar://problem/32007628 */
1368 if (!IF_INDEX_IN_RANGE(ifindex))
1369 printf("%s 3 ifindex %u not in range of flow %p action %d\n",
1370 __func__, ifindex, flow, action);
1371
1372 if (ifp == NULL)
1373 goto out;
1374
1375 if (IFNET_IS_EXPENSIVE(ifp) &&
1376 (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
1377 goto out;
1378
1379 if (IFNET_IS_CELLULAR(ifp) &&
1380 (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
1381 goto out;
1382
1383 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1384 if (mpte->mpte_itfinfo[i].ifindex == 0) {
1385 found_empty = 1;
1386 empty_index = i;
1387 }
1388
1389 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1390 /* Ok, it's already there */
1391 goto out;
1392 }
1393 }
1394
1395 if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) &&
1396 !(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4) &&
1397 ifnet_get_nat64prefix(ifp, NULL) == ENOENT) {
1398 mptcp_ask_for_nat64(ifp);
1399 goto out;
1400 }
1401
1402 if (found_empty == 0) {
1403 int new_size = mpte->mpte_itfinfo_size * 2;
1404 struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);
1405
1406 if (info == NULL) {
1407 mptcplog((LOG_ERR, "%s malloc failed for %u\n", __func__, new_size),
1408 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1409 goto out;
1410 }
1411
1412 memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
1413
1414 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
1415 _FREE(mpte->mpte_itfinfo, M_TEMP);
1416
1417  /* We allocated a larger array, so the first new slot must be empty */
1418 empty_index = mpte->mpte_itfinfo_size;
1419
1420 mpte->mpte_itfinfo = info;
1421 mpte->mpte_itfinfo_size = new_size;
1422
1423 mptcplog((LOG_DEBUG, "%s Needed to realloc to %u\n", __func__, new_size),
1424 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1425 }
1426
1427 VERIFY(empty_index >= 0 && empty_index < (int)mpte->mpte_itfinfo_size);
1428 mpte->mpte_itfinfo[empty_index].ifindex = ifindex;
1429 mpte->mpte_itfinfo[empty_index].has_v4_conn = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
1430 mpte->mpte_itfinfo[empty_index].has_v6_conn = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
1431
1432 mptcp_sched_create_subflows(mpte);
1433 }
1434
1435 out:
1436 if (locked)
1437 mpte_unlock(mpte);
1438 }
1439
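/*
 * Re-apply the socket's interface restrictions: drop interfaces from
 * mpte_itfinfo that SO_RESTRICT_DENY_EXPENSIVE / SO_RESTRICT_DENY_CELLULAR
 * no longer allow.
 */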
1440 void
1441 mptcp_set_restrictions(struct socket *mp_so)
1442 {
1443 struct mptses *mpte = mpsotompte(mp_so);
1444 uint32_t i;
1445
1446 mpte_lock_assert_held(mpte);
1447
1448 ifnet_head_lock_shared();
1449
1450 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1451 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
1452 uint32_t ifindex = info->ifindex;
1453 struct ifnet *ifp;
1454
1455 if (ifindex == IFSCOPE_NONE)
1456 continue;
1457
1458 ifp = ifindex2ifnet[ifindex];
1459
1460 if (IFNET_IS_EXPENSIVE(ifp) &&
1461 (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
1462 info->ifindex = IFSCOPE_NONE;
1463
1464 if (IFNET_IS_CELLULAR(ifp) &&
1465 (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
1466 info->ifindex = IFSCOPE_NONE;
1467 }
1468
1469 ifnet_head_done();
1470 }
1471