]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/mptcp.c
xnu-3248.50.21.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp.c
1 /*
2 * Copyright (c) 2012-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/mbuf.h>
33 #include <sys/mcache.h>
34 #include <sys/socket.h>
35 #include <sys/socketvar.h>
36 #include <sys/syslog.h>
37 #include <sys/protosw.h>
38
39 #include <kern/zalloc.h>
40 #include <kern/locks.h>
41
42 #include <mach/thread_act.h>
43 #include <mach/sdt.h>
44
45 #include <dev/random/randomdev.h>
46
47 #include <net/if.h>
48 #include <netinet/in.h>
49 #include <netinet/in_var.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_fsm.h>
52 #include <netinet/tcp_seq.h>
53 #include <netinet/tcp_var.h>
54 #include <netinet/mptcp_var.h>
55 #include <netinet/mptcp.h>
56 #include <netinet/mptcp_seq.h>
57 #include <netinet/mptcp_opt.h>
58 #include <netinet/mptcp_timer.h>
59
/* Master on/off switch for MPTCP (net.inet.mptcp.enable) */
int mptcp_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_enable, 0, "Enable Multipath TCP Support");

/* Number of times to try negotiating MPTCP on SYN retransmissions */
int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");

/*
 * By default, DSS checksum is turned off, revisit if we ever do
 * MPTCP for non SSL Traffic.
 */
int mptcp_dss_csum = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_dss_csum, 0, "Enable DSS checksum");

/*
 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 * is attempted on a different path.
 */
int mptcp_fail_thresh = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_fail_thresh, 0, "Failover threshold");


/*
 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
 * Some carrier networks have a timeout of 10 or 15 minutes.
 */
int mptcp_subflow_keeptime = 60*14;	/* 14 minutes, in seconds */
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_subflow_keeptime, 0, "Keepalive in seconds");

/*
 * MP_PRIO option.
 */
int mptcp_mpprio_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mpprio, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_mpprio_enable, 0, "Enable MP_PRIO option");

/*
 * REMOVE_ADDR option.
 */
int mptcp_remaddr_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, remaddr, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_remaddr_enable, 0, "Enable REMOVE_ADDR option");

/*
 * FastJoin Option
 */
int mptcp_fastjoin = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fastjoin, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_fastjoin, 0, "Enable FastJoin Option");

/* Allow data to be written with zero RTT on a fast-joining subflow */
int mptcp_zerortt_fastjoin = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, zerortt_fastjoin, CTLFLAG_RW |
	CTLFLAG_LOCKED, &mptcp_zerortt_fastjoin, 0,
	"Enable Zero RTT Fast Join");

/*
 * R/W Notification on resume
 */
int mptcp_rwnotify = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rwnotify, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_rwnotify, 0, "Enable RW notify on resume");

/*
 * Using RTT history for sending new data
 */
int mptcp_use_rtthist = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_use_rtthist, 0, "Disable RTT History");

/* Floor for the RTT-history switch threshold; value compared in srtt units */
#define MPTCP_RTTHIST_MINTHRESH 500
int mptcp_rtthist_rtthresh = 600;	/* milliseconds, see mptcp_get_subflow() */
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_rtthist_rtthresh, 0, "Rtt threshold");

/*
 * Use RTO history for sending new data
 */
int mptcp_use_rto = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, userto, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_use_rto, 0, "Disable RTO for subflow selection");

/* Floor for the RTO-based switch threshold */
#define MPTCP_RTO_MINTHRESH 1000
int mptcp_rtothresh = 1500;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_rtothresh, 0, "RTO threshold");

/*
 * Use server's chosen path for sending new data
 */
int mptcp_peerswitch = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, use_peer, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_peerswitch, 0, "Use peer");

/* Minimum number of segments received on the alternate path before switching */
#define MPTCP_PEERSWITCH_CNTMIN 3
uint32_t mptcp_peerswitch_cnt = 3;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, peerswitchno, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_peerswitch_cnt, 0, "Set threshold based on peer's data arrival");

/*
 * Probe the preferred path, when it is not in use
 */
#define MPTCP_PROBETO_MIN 500	/* floor for the probe timeout, tcp_now ticks */
uint32_t mptcp_probeto = 1000;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_probeto, 0, "Disable probing by setting to 0");

#define MPTCP_PROBE_MX 15	/* hard cap on consecutive probe writes */
uint32_t mptcp_probecnt = 5;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_probecnt, 0, "Number of probe writes");

/*
 * Static declarations
 */
static int mptcp_validate_csum(struct tcpcb *, struct mbuf *, int);
static uint16_t mptcp_input_csum(struct tcpcb *, struct mbuf *, int);
183
/*
 * MPTCP input, called when data has been read from a subflow socket.
 *
 * Walks the mbuf chain handed up by the subflow.  Each TCP segment's
 * lead mbuf carries its DSS mapping (PKTF_MPTCP plus pkthdr mp_dsn /
 * mp_rlen); trailing mbufs of the same segment do not.  In-order data
 * is appended to the MP socket's receive buffer, fully duplicate
 * segments are collected on a free list and released at the end, and
 * out-of-order data is dropped (no reassembly queue yet; see the
 * comment inside the loop).
 */
void
mptcp_input(struct mptses *mpte, struct mbuf *m)
{
	struct socket *mp_so;
	struct mptcb *mp_tp = NULL;
	u_int64_t mb_dsn;	/* DSN of the segment being processed */
	u_int32_t mb_datalen;	/* DSS mapping length of that segment */
	int count = 0;
	struct mbuf *save = NULL, *prev = NULL;
	struct mbuf *freelist = NULL, *tail = NULL;
	boolean_t in_fallback = FALSE;

	VERIFY(m->m_flags & M_PKTHDR);

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	DTRACE_MPTCP(input);

	/*
	 * Each mbuf contains MPTCP Data Sequence Map
	 * Process the data for reassembly, delivery to MPTCP socket
	 * client, etc.
	 *
	 */
	count = mp_so->so_rcv.sb_cc;

	VERIFY(m != NULL);
	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	/* Ok to check for this flag without lock as its set in this thread */
	in_fallback = (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);

	/*
	 * In the degraded fallback case, data is accepted without DSS map
	 */
	if (in_fallback) {
fallback:
		/*
		 * assume degraded flow as this may be the first packet
		 * without DSS, and the subflow state is not updated yet.
		 */
		if (sbappendstream(&mp_so->so_rcv, m))
			sorwakeup(mp_so);
		DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
		    struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte);
		count = mp_so->so_rcv.sb_cc - count;
		mptcplog((LOG_DEBUG, "MPTCP Receiver: Fallback read %d bytes\n",
		    count), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		return;
	}

	MPT_LOCK(mp_tp);
	do {
		/* If fallback occurs, mbufs will not have PKTF_MPTCP set */
		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
			MPT_UNLOCK(mp_tp);
			goto fallback;
		}

		save = m->m_next;
		/*
		 * A single TCP packet formed of multiple mbufs
		 * holds DSS mapping in the first mbuf of the chain.
		 * Other mbufs in the chain may have M_PKTHDR set
		 * even though they belong to the same TCP packet
		 * and therefore use the DSS mapping stored in the
		 * first mbuf of the mbuf chain. mptcp_input() can
		 * get an mbuf chain with multiple TCP packets.
		 */
		while (save && (!(save->m_flags & M_PKTHDR) ||
		    !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
			prev = save;
			save = save->m_next;
		}
		/*
		 * Detach the current TCP packet (m .. prev) from the
		 * chain; `save' now heads the next packet, if any.
		 */
		if (prev)
			prev->m_next = NULL;
		else
			m->m_next = NULL;

		mb_dsn = m->m_pkthdr.mp_dsn;
		mb_datalen = m->m_pkthdr.mp_rlen;

		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvatmark)) {
			/* data beyond the expected DSN: drop it all */
			tcpstat.tcps_mp_oodata++;
			MPT_UNLOCK(mp_tp);
			m_freem(m);
			return;
			/*
			 * Reassembly queue support here in future. Per spec,
			 * senders must implement retransmission timer to
			 * retransmit unacked data. Dropping out of order
			 * gives a slight hit on performance but allows us to
			 * deploy MPTCP and protects us against in-window DoS
			 * attacks that attempt to use up memory by sending
			 * out of order data. When doing load sharing across
			 * subflows, out of order support is a must.
			 */
		}

		if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvatmark)) {
			if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
			    mp_tp->mpt_rcvatmark)) {
				/*
				 * Entire segment already delivered: stash
				 * it on the free list (freed after the
				 * loop, outside the mp_tp lock).
				 */
				if (freelist == NULL)
					freelist = m;
				else
					tail->m_next = m;

				if (prev != NULL)
					tail = prev;
				else
					tail = m;

				m = save;
				prev = save = NULL;
				continue;
			} else {
				/* partial overlap: trim already-seen bytes */
				m_adj(m, (mp_tp->mpt_rcvatmark - mb_dsn));
			}
			mptcplog((LOG_INFO, "MPTCP Receiver: Left Edge %llu\n",
			    mp_tp->mpt_rcvatmark),
			    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
		}

		/* drop mp_tp lock across the sockbuf append and wakeup */
		MPT_UNLOCK(mp_tp);
		if (sbappendstream(&mp_so->so_rcv, m)) {
			sorwakeup(mp_so);
		}
		DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mptses *, mpte,
		    struct mptcb *, mp_tp);
		MPT_LOCK(mp_tp);
		count = mp_so->so_rcv.sb_cc - count;
		tcpstat.tcps_mp_rcvtotal++;
		tcpstat.tcps_mp_rcvbytes += count;
		mptcplog((LOG_DEBUG, "MPTCP Receiver: Read %d bytes\n", count),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);

		/*
		 * The data received at the MPTCP layer will never exceed the
		 * receive window because anything to the right of the
		 * receive window will be trimmed at the subflow level.
		 */
		mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
		mp_tp->mpt_rcvatmark += count;
		m = save;
		prev = save = NULL;
		count = mp_so->so_rcv.sb_cc;
	} while (m);
	MPT_UNLOCK(mp_tp);

	if (freelist)
		m_freem(freelist);
}
347
/*
 * MPTCP output.
 *
 * Picks the best subflow via mptcp_get_subflow() and pushes pending
 * data down it.  On a send error the subflow is marked FAILINGOVER and
 * selection is retried once on a different subflow; if only the same
 * (or a failing) subflow comes back, the REXMT timer is armed for a
 * later retry.  May also fire a probe write on the preferred subflow
 * while an alternate is carrying the data (keeps its srtt fresh).
 * Subflow-level errors are absorbed here: returns 0, or EPIPE when the
 * MP socket can no longer send.
 */
int
mptcp_output(struct mptses *mpte)
{
	struct mptsub *mpts;
	struct mptsub *mpts_tried = NULL;	/* subflow that already failed */
	struct socket *mp_so;
	struct mptsub *preferred_mpts = NULL;	/* hint from mptcp_get_subflow() */
	int error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	if (mp_so->so_state & SS_CANTSENDMORE) {
		mptcplog((LOG_DEBUG, "MPTCP Sender: cantsendmore\n"),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
		return (EPIPE);
	}

try_again:
	/* get the "best" subflow to be used for transmission */
	mpts = mptcp_get_subflow(mpte, NULL, &preferred_mpts);
	if (mpts == NULL) {
		mptcplog((LOG_ERR, "MPTCP Sender: mp_so 0x%llx no subflow\n",
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
		goto out;
	}

	mptcplog((LOG_DEBUG, "MPTCP Sender: mp_so 0x%llx using cid %d \n",
	    (uint64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);

	/* In case there's just one flow, we reattempt later */
	MPTS_LOCK(mpts);
	if ((mpts_tried != NULL) && ((mpts == mpts_tried) ||
	    (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
		/*
		 * No better subflow than the one that just failed:
		 * restore it to ACTIVE and back off to the REXMT timer.
		 */
		MPTS_UNLOCK(mpts);
		MPTS_LOCK(mpts_tried);
		mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
		mpts_tried->mpts_flags |= MPTSF_ACTIVE;
		MPTS_UNLOCK(mpts_tried);
		mptcp_start_timer(mpte, MPTT_REXMT);
		mptcplog((LOG_DEBUG, "MPTCP Sender: mp_so 0x%llx retry later\n",
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
		goto out;
	}

	DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
	    struct socket *, mp_so);
	error = mptcp_subflow_output(mpte, mpts);
	if (error && error != EWOULDBLOCK) {
		/* can be a temporary loss of source address or other error */
		mpts->mpts_flags |= MPTSF_FAILINGOVER;
		mpts->mpts_flags &= ~MPTSF_ACTIVE;
		mpts_tried = mpts;
		MPTS_UNLOCK(mpts);
		mptcplog((LOG_INFO, "MPTCP Sender: %s Error = %d \n",
		    __func__, error),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
		goto try_again;
	}
	/* The model is to have only one active flow at a time */
	mpts->mpts_flags |= MPTSF_ACTIVE;
	mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
	MPTS_UNLOCK(mpts);

	/* Allows us to update the smoothed rtt */
	if ((mptcp_probeto) && (mptcp_probeto >= MPTCP_PROBETO_MIN) &&
	    (mpts != preferred_mpts) && (preferred_mpts != NULL)) {
		MPTS_LOCK(preferred_mpts);
		if (preferred_mpts->mpts_probesoon) {
			/* probe window open; fire once the timeout elapses */
			if ((tcp_now - preferred_mpts->mpts_probesoon) >
			    mptcp_probeto) {
				(void) mptcp_subflow_output(mpte, preferred_mpts);
				if (preferred_mpts->mpts_probecnt >=
				    MIN(mptcp_probecnt, MPTCP_PROBE_MX)) {
					/* probe budget spent; reset state */
					preferred_mpts->mpts_probesoon = 0;
					preferred_mpts->mpts_probecnt = 0;
				}
			}
		} else {
			/* start a new probe window */
			preferred_mpts->mpts_probesoon = tcp_now;
			preferred_mpts->mpts_probecnt = 0;
		}
		MPTS_UNLOCK(preferred_mpts);
	}

	if (mpte->mpte_active_sub == NULL) {
		mpte->mpte_active_sub = mpts;
	} else if (mpte->mpte_active_sub != mpts) {
		/* switching active subflow: demote the old one */
		mptcplog((LOG_DEBUG, "MPTCP Sender: switch [cid %d, srtt %d]"
		    "to [cid %d, srtt %d]\n",
		    mpte->mpte_active_sub->mpts_connid,
		    mpte->mpte_active_sub->mpts_srtt >> 5,
		    mpts->mpts_connid,
		    mpts->mpts_srtt >> 5),
		    MPTCP_SENDER_DBG | MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);

		MPTS_LOCK(mpte->mpte_active_sub);
		mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
		mpts->mpts_peerswitch = 0;
		MPTS_UNLOCK(mpte->mpte_active_sub);
		mpte->mpte_active_sub = mpts;
		tcpstat.tcps_mp_switches++;
	}
out:
	/* subflow errors should not be percolated back up */
	return (0);
}
460
/*
 * Return the most eligible subflow to be used for sending data.
 * This function also serves to check if any alternate subflow is available
 * or not. best and second_best flows are chosen by their priority. third_best
 * could be best or second_best but is under loss at the time of evaluation.
 *
 * NOTE(review): *preferred is only assigned on the path where both best
 * and second_best exist; on the early-return paths it is left untouched
 * (callers initialize it to NULL) — confirm this is intentional.
 */
struct mptsub *
mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **preferred)
{
	struct mptsub *mpts;
	struct mptsub *best = NULL;		/* degraded or PREFERRED subflow */
	struct mptsub *second_best = NULL;	/* first usable non-preferred */
	struct mptsub *third_best = NULL;	/* usable but failing over */
	struct mptsub *symptoms_best = NULL;
	struct socket *so = NULL;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		MPTS_LOCK(mpts);

		if ((ignore) && (mpts == ignore)) {
			MPTS_UNLOCK(mpts);
			continue;
		}

		/* There can only be one subflow in degraded state */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
			MPTS_UNLOCK(mpts);
			best = mpts;
			break;
		}

		/*
		 * Subflows with TFO or Fastjoin allow data to be written before
		 * the subflow is mp capable.
		 */
		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
		    !(mpts->mpts_flags & MPTSF_FASTJ_REQD) &&
		    !(mpts->mpts_flags & MPTSF_TFO_REQD)) {
			MPTS_UNLOCK(mpts);
			continue;
		}

		if (mpts->mpts_flags & MPTSF_SUSPENDED) {
			MPTS_UNLOCK(mpts);
			continue;
		}

		if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
		    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
			MPTS_UNLOCK(mpts);
			continue;
		}

		if (mpts->mpts_flags & MPTSF_FAILINGOVER) {
			/*
			 * A failing-over subflow is rehabilitated only when
			 * it has drained its send buffer and its RTO shows
			 * no spike; otherwise it is remembered as third_best.
			 * NOTE(review): so_flags is read here before
			 * socket_lock() is taken — confirm that is safe.
			 */
			so = mpts->mpts_socket;
			if ((so) && (!(so->so_flags & SOF_PCBCLEARING))) {
				socket_lock(so, 1);
				if ((so->so_snd.sb_cc == 0) &&
				    (mptcp_no_rto_spike(so))) {
					mpts->mpts_flags &= ~MPTSF_FAILINGOVER;
					so->so_flags &= ~SOF_MP_TRYFAILOVER;
					socket_unlock(so, 1);
				} else {
					third_best = mpts;
					mptcplog((LOG_DEBUG, "MPTCP Sender: "
					    "%s cid %d in failover\n",
					    __func__, third_best->mpts_connid),
					    MPTCP_SENDER_DBG,
					    MPTCP_LOGLVL_VERBOSE);
					socket_unlock(so, 1);
					MPTS_UNLOCK(mpts);
					continue;
				}
			} else {
				MPTS_UNLOCK(mpts);
				continue;
			}
		}

		/* When there are no preferred flows, use first one in list */
		if ((!second_best) && !(mpts->mpts_flags & MPTSF_PREFERRED))
			second_best = mpts;

		if (mpts->mpts_flags & MPTSF_PREFERRED) {
			best = mpts;
		}

		MPTS_UNLOCK(mpts);
	}

	/*
	 * If there is no preferred or backup subflow, and there is no active
	 * subflow use the last usable subflow.
	 */
	if (best == NULL) {
		return (second_best ? second_best : third_best);
	}

	if (second_best == NULL) {
		return (best ? best : third_best);
	}

	if (preferred != NULL)
		*preferred = best;

	/* Use a hint from symptomsd if it exists */
	symptoms_best = mptcp_use_symptoms_hints(best, second_best);
	if (symptoms_best != NULL)
		return (symptoms_best);

	/* Compare RTTs, select second_best if best's rtt exceeds rttthresh */
	if ((mptcp_use_rtthist) &&
	    (best->mpts_srtt) && (second_best->mpts_srtt) &&
	    (best->mpts_srtt > second_best->mpts_srtt) &&
	    (best->mpts_srtt >= MAX((MPTCP_RTTHIST_MINTHRESH << 5),
	    (mptcp_rtthist_rtthresh << 5)))) {
		tcpstat.tcps_mp_sel_rtt++;
		mptcplog((LOG_DEBUG, "MPTCP Sender: %s best cid %d"
		    " at rtt %d, second cid %d at rtt %d\n", __func__,
		    best->mpts_connid, best->mpts_srtt >> 5,
		    second_best->mpts_connid,
		    second_best->mpts_srtt >> 5),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
		return (second_best);
	}

	/* Compare RTOs, select second_best if best's rto exceeds rtothresh */
	if ((mptcp_use_rto) &&
	    (best->mpts_rxtcur) && (second_best->mpts_rxtcur) &&
	    (best->mpts_rxtcur > second_best->mpts_rxtcur) &&
	    (best->mpts_rxtcur >=
	    MAX(MPTCP_RTO_MINTHRESH, mptcp_rtothresh))) {
		tcpstat.tcps_mp_sel_rto++;
		mptcplog((LOG_DEBUG, "MPTCP Sender: %s best cid %d"
		    " at rto %d, second cid %d at rto %d\n", __func__,
		    best->mpts_connid, best->mpts_rxtcur,
		    second_best->mpts_connid, second_best->mpts_rxtcur),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);

		return (second_best);
	}

	/* If second_best received data, use second_best */
	if (mptcp_peerswitch &&
	    (second_best->mpts_peerswitch >
	    MAX(MPTCP_PEERSWITCH_CNTMIN, mptcp_peerswitch_cnt))) {
		tcpstat.tcps_mp_sel_peer++;
		mptcplog((LOG_DEBUG, "MPTCP Sender: %s: best cid %d"
		    " but using cid %d after receiving %d segments\n",
		    __func__, best->mpts_connid, second_best->mpts_connid,
		    second_best->mpts_peerswitch), MPTCP_SENDER_DBG,
		    MPTCP_LOGLVL_LOG);
		return (second_best);
	}
	return (best);
}
619
620 struct mptsub *
621 mptcp_get_pending_subflow(struct mptses *mpte, struct mptsub *ignore)
622 {
623 struct mptsub *mpts = NULL;
624
625 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
626
627 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
628 MPTS_LOCK(mpts);
629
630 if ((ignore) && (mpts == ignore)) {
631 MPTS_UNLOCK(mpts);
632 continue;
633 }
634
635 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
636 MPTS_UNLOCK(mpts);
637 break;
638 }
639
640 MPTS_UNLOCK(mpts);
641 }
642 return (mpts);
643 }
644
645 static const char *
646 mptcp_event_to_str(uint32_t event)
647 {
648 const char *c = "UNDEFINED";
649 switch (event) {
650 case MPCE_CLOSE:
651 c = "MPCE_CLOSE";
652 break;
653 case MPCE_RECV_DATA_ACK:
654 c = "MPCE_RECV_DATA_ACK";
655 break;
656 case MPCE_RECV_DATA_FIN:
657 c = "MPCE_RECV_DATA_FIN";
658 break;
659 }
660 return (c);
661 }
662
663 static const char *
664 mptcp_state_to_str(mptcp_state_t state)
665 {
666 const char *c = "UNDEFINED";
667 switch (state) {
668 case MPTCPS_CLOSED:
669 c = "MPTCPS_CLOSED";
670 break;
671 case MPTCPS_LISTEN:
672 c = "MPTCPS_LISTEN";
673 break;
674 case MPTCPS_ESTABLISHED:
675 c = "MPTCPS_ESTABLISHED";
676 break;
677 case MPTCPS_CLOSE_WAIT:
678 c = "MPTCPS_CLOSE_WAIT";
679 break;
680 case MPTCPS_FIN_WAIT_1:
681 c = "MPTCPS_FIN_WAIT_1";
682 break;
683 case MPTCPS_CLOSING:
684 c = "MPTCPS_CLOSING";
685 break;
686 case MPTCPS_LAST_ACK:
687 c = "MPTCPS_LAST_ACK";
688 break;
689 case MPTCPS_FIN_WAIT_2:
690 c = "MPTCPS_FIN_WAIT_2";
691 break;
692 case MPTCPS_TIME_WAIT:
693 c = "MPTCPS_TIME_WAIT";
694 break;
695 case MPTCPS_FASTCLOSE_WAIT:
696 c = "MPTCPS_FASTCLOSE_WAIT";
697 break;
698 case MPTCPS_TERMINATE:
699 c = "MPTCPS_TERMINATE";
700 break;
701 }
702 return (c);
703 }
704
/*
 * Advance the MPTCP connection-level close state machine on `event'
 * (MPCE_CLOSE, MPCE_RECV_DATA_ACK or MPCE_RECV_DATA_FIN).  Mirrors the
 * TCP close FSM at DSN granularity: a local close or a received Data
 * FIN consumes one unit of data sequence space.  Caller must hold the
 * mptcb lock.
 */
void
mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
{
	MPT_LOCK_ASSERT_HELD(mp_tp);
	mptcp_state_t old_state = mp_tp->mpt_state;

	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);

	switch (mp_tp->mpt_state) {
	case MPTCPS_CLOSED:
	case MPTCPS_LISTEN:
		mp_tp->mpt_state = MPTCPS_CLOSED;
		break;

	case MPTCPS_ESTABLISHED:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
			mp_tp->mpt_sndmax += 1;	/* adjust for Data FIN */
		}
		else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1;	/* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
		}
		break;

	case MPTCPS_CLOSE_WAIT:
		if (event == MPCE_CLOSE) {
			mp_tp->mpt_state = MPTCPS_LAST_ACK;
			mp_tp->mpt_sndmax += 1;	/* adjust for Data FIN */
		}
		break;

	case MPTCPS_FIN_WAIT_1:
		if (event == MPCE_RECV_DATA_ACK)
			mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
		else if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1;	/* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_CLOSING;
		}
		break;

	case MPTCPS_CLOSING:
		if (event == MPCE_RECV_DATA_ACK)
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		break;

	case MPTCPS_LAST_ACK:
		if (event == MPCE_RECV_DATA_ACK)
			mp_tp->mpt_state = MPTCPS_TERMINATE;
		break;

	case MPTCPS_FIN_WAIT_2:
		if (event == MPCE_RECV_DATA_FIN) {
			mp_tp->mpt_rcvnxt += 1;	/* adj remote data FIN */
			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
		}
		break;

	case MPTCPS_TIME_WAIT:
		break;

	case MPTCPS_FASTCLOSE_WAIT:
		if (event == MPCE_CLOSE) {
			/* no need to adjust for data FIN */
			mp_tp->mpt_state = MPTCPS_TERMINATE;
		}
		break;
	case MPTCPS_TERMINATE:
		break;
	default:
		VERIFY(0);
		/* NOTREACHED */
	}
	/* second probe fires after the transition, pairing with the first */
	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, event);
	mptcplog((LOG_INFO, "MPTCP State: %s to %s on event %s\n",
	    mptcp_state_to_str(old_state),
	    mptcp_state_to_str(mp_tp->mpt_state),
	    mptcp_event_to_str(event)),
	    MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
}
787
/*
 * Update the mptcb send state variables, but the actual sbdrop occurs
 * in MPTCP layer
 *
 * `full_dack' is the 64-bit cumulative Data ACK received on subflow
 * `tp'; advances mpt_snduna by the newly acked amount and, when the
 * Data FIN has been fully acked, drives the close FSM.
 */
void
mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack)
{
	u_int64_t acked = 0;

	acked = full_dack - mp_tp->mpt_snduna;

	if (acked) {
		mp_tp->mpt_snduna += acked;
		/* In degraded mode, we may get some Data ACKs */
		if ((tp->t_mpflags & TMPF_TCP_FALLBACK) &&
		    !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
		    MPTCP_SEQ_GT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
			/* bring back sndnxt to retransmit MPTCP data */
			mp_tp->mpt_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
			/* one-shot: only rewind once per fallback */
			mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
			tp->t_inpcb->inp_socket->so_flags1 |=
			    SOF1_POST_FALLBACK_SYNC;
		}
	}
	/* everything (incl. the Data FIN) acked while closing */
	if ((full_dack == mp_tp->mpt_sndmax) &&
	    (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1)) {
		mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_ACK);
		tp->t_mpflags &= ~TMPF_SEND_DFIN;
	}
}
818
819 /* If you change this function, match up mptcp_update_rcv_state_f */
820 void
821 mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
822 uint16_t csum)
823 {
824 struct mptcb *mp_tp = tptomptp(tp);
825 u_int64_t full_dsn = 0;
826
827 NTOHL(dss_info->mdss_dsn);
828 NTOHL(dss_info->mdss_subflow_seqn);
829 NTOHS(dss_info->mdss_data_len);
830
831 /* XXX for autosndbuf grow sb here */
832 MPT_LOCK(mp_tp);
833 MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
834 MPT_UNLOCK(mp_tp);
835 mptcp_update_rcv_state_meat(mp_tp, tp,
836 full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
837 csum);
838
839 }
840
/*
 * Common receive-path DSS processing: validate the (already host-order)
 * mapping and stash it in tp->t_rcv_map for the input path to consume.
 * A zero-length mapping signals an infinite mapping (fallback); a
 * mapping of subflow seq 0 / length 1 is the Data FIN, handled in
 * mptcp_do_fin_opt, so it is ignored here.
 */
void
mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
    u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
    uint16_t csum)
{
	if (mdss_data_len == 0) {
		/* infinite mapping => peer fell back to plain TCP */
		mptcplog((LOG_INFO, "MPTCP Receiver: Infinite Mapping.\n"),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);

		if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
			mptcplog((LOG_ERR, "MPTCP Receiver: Bad checksum %x \n",
			    csum), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
		}
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		return;
	}
	MPT_LOCK(mp_tp);
	mptcplog((LOG_DEBUG,
	    "MPTCP Receiver: seqn = %x len = %x full = %llx "
	    "rcvnxt = %llu \n",
	    seqn, mdss_data_len, full_dsn, mp_tp->mpt_rcvnxt),
	    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);

	/* Process a Data FIN packet , handled in mptcp_do_fin_opt */
	if ((seqn == 0) && (mdss_data_len == 1)) {
		mptcplog((LOG_INFO, "MPTCP Receiver: Data FIN in %s state \n",
		    mptcp_state_to_str(mp_tp->mpt_state)),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
		MPT_UNLOCK(mp_tp);
		return;
	}
	MPT_UNLOCK(mp_tp);
	mptcp_notify_mpready(tp->t_inpcb->inp_socket);
	/* record the mapping for the segment about to be processed */
	tp->t_rcv_map.mpt_dsn = full_dsn;
	tp->t_rcv_map.mpt_sseq = seqn;
	tp->t_rcv_map.mpt_len = mdss_data_len;
	tp->t_rcv_map.mpt_csum = csum;
	tp->t_mpflags |= TMPF_EMBED_DSN;
}
880
881
882 void
883 mptcp_update_rcv_state_f(struct mptcp_dss_ack_opt *dss_info, struct tcpcb *tp,
884 uint16_t csum)
885 {
886 u_int64_t full_dsn = 0;
887 struct mptcb *mp_tp = tptomptp(tp);
888
889 /*
890 * May happen, because the caller of this function does an soevent.
891 * Review after rdar://problem/24083886
892 */
893 if (!mp_tp)
894 return;
895
896 NTOHL(dss_info->mdss_dsn);
897 NTOHL(dss_info->mdss_subflow_seqn);
898 NTOHS(dss_info->mdss_data_len);
899 MPT_LOCK(mp_tp);
900 MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
901 MPT_UNLOCK(mp_tp);
902 mptcp_update_rcv_state_meat(mp_tp, tp,
903 full_dsn,
904 dss_info->mdss_subflow_seqn,
905 dss_info->mdss_data_len,
906 csum);
907 }
908
909 void
910 mptcp_update_rcv_state_g(struct mptcp_dss64_ack32_opt *dss_info,
911 struct tcpcb *tp, uint16_t csum)
912 {
913 u_int64_t dsn = mptcp_ntoh64(dss_info->mdss_dsn);
914 struct mptcb *mp_tp = tptomptp(tp);
915
916 /*
917 * May happen, because the caller of this function does an soevent.
918 * Review after rdar://problem/24083886
919 */
920 if (!mp_tp)
921 return;
922
923 NTOHL(dss_info->mdss_subflow_seqn);
924 NTOHS(dss_info->mdss_data_len);
925 mptcp_update_rcv_state_meat(mp_tp, tp,
926 dsn,
927 dss_info->mdss_subflow_seqn,
928 dss_info->mdss_data_len,
929 csum);
930 }
931
/*
 * Sanity-check a segment against its DSS mapping: if the mbuf holds
 * more payload than the mapping claims, the mapping is unacceptable —
 * request MP_FAIL, drop the segment and return -1.  Returns 0 when the
 * segment is acceptable (or carries no mapping at all).
 */
static int
mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
    int hdrlen)
{
	u_int32_t sseq, datalen;

	if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP))
		return 0;

	/* sseq is only consumed by the #if 0 test path below */
	sseq = m->m_pkthdr.mp_rseq + tp->irs;
	datalen = m->m_pkthdr.mp_rlen;

#if 0
	/* enable this to test TCP fallback post connection establishment */
	if (SEQ_GT(sseq, (tp->irs+1)))
		datalen = m->m_pkthdr.len - hdrlen - 1;
#endif

	/* unacceptable DSS option, fallback to TCP */
	if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
		mptcplog((LOG_ERR, "MPTCP Receiver: "
		    "%s: mbuf len %d, MPTCP expected %d",
		    __func__, m->m_pkthdr.len, datalen),
		    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
	} else {
		return 0;
	}
	tp->t_mpflags |= TMPF_SND_MPFAIL;
	mptcp_notify_mpfail(so);
	m_freem(m);
	return -1;
}
964
965 int
966 mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, int drop_hdrlen)
967 {
968 if (mptcp_validate_csum(tp, m, drop_hdrlen) != 0)
969 return -1;
970
971 mptcp_insert_rmap(tp, m);
972 if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
973 drop_hdrlen) != 0)
974 return -1;
975 return 0;
976 }
977
978 /*
979 * MPTCP Checksum support
980 * The checksum is calculated whenever the MPTCP DSS option is included
981 * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
982 * header and the actual data indicated by the length specified in the
983 * DSS option.
984 */
985
986 static int
987 mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, int drop_hdrlen)
988 {
989 uint16_t mptcp_csum = 0;
990 mptcp_csum = mptcp_input_csum(tp, m, drop_hdrlen);
991 if (mptcp_csum) {
992 tp->t_mpflags |= TMPF_SND_MPFAIL;
993 tp->t_mpflags &= ~TMPF_EMBED_DSN;
994 mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
995 m_freem(m);
996 tcpstat.tcps_mp_badcsum++;
997 return -1;
998 }
999 return 0;
1000 }
1001
/*
 * Compute the DSS checksum over the received mapping: 16-bit one's
 * complement sum of the payload (mpt_len bytes at `off') plus the
 * MPTCP pseudo-header (DSN, subflow seq, length, received checksum),
 * all in network byte order.  Returns 0 when the checksum verifies or
 * does not apply; non-zero indicates failure.
 */
static uint16_t
mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, int off)
{
	struct mptcb *mp_tp = tptomptp(tp);
	uint32_t sum = 0;
	uint64_t dsn;
	uint32_t sseq;
	uint16_t len;
	uint16_t csum;

	/* checksum only applies on an MPTCP session with DSS-csum on */
	if (mp_tp == NULL)
		return (0);

	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
		return (0);

	/* no mapping recorded for this segment */
	if (!(tp->t_mpflags & TMPF_EMBED_DSN))
		return (0);

	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
		return (0);

	/*
	 * The remote side may send a packet with fewer bytes than the
	 * claimed DSS checksum length.
	 */
	if ((int)m_length2(m, NULL) < (off + tp->t_rcv_map.mpt_len))
		return (0xffff);

	if (tp->t_rcv_map.mpt_len != 0)
		sum = m_sum16(m, off, tp->t_rcv_map.mpt_len);

	/* fold in the pseudo-header, fields in network byte order */
	dsn = mptcp_hton64(tp->t_rcv_map.mpt_dsn);
	sseq = htonl(tp->t_rcv_map.mpt_sseq);
	len = htons(tp->t_rcv_map.mpt_len);
	csum = tp->t_rcv_map.mpt_csum;
	sum += in_pseudo64(dsn, sseq, (len + csum));
	ADDCARRY(sum);
	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
	    uint32_t, sum);
	mptcplog((LOG_DEBUG, "MPTCP Receiver: sum = %x \n", sum),
	    MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
	/* one's complement: 0 iff payload+pseudo-header+csum verify */
	return (~sum & 0xffff);
}
1046
/*
 * Compute and store the DSS checksum for an outbound mapping.
 * `sseqp' points at the subflow sequence field inside the DSS option
 * already written into the packet; the 16-bit length follows it, and
 * the checksum slot follows the length — the result is written there
 * in place.  No-op unless DSS checksums were negotiated.
 */
void
mptcp_output_csum(struct tcpcb *tp, struct mbuf *m, int32_t len,
    unsigned hdrlen, u_int64_t dss_val, u_int32_t *sseqp)
{
	struct mptcb *mp_tp = tptomptp(tp);
	u_int32_t sum = 0;
	uint32_t sseq;
	uint16_t dss_len;
	uint16_t csum = 0;	/* checksum field itself sums as zero */
	uint16_t *csump = NULL;

	if (mp_tp == NULL)
		return;

	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
		return;

	if (sseqp == NULL)
		return;

	/* sum the payload bytes that this mapping covers */
	if (len)
		sum = m_sum16(m, hdrlen, len);

	/* fold in the pseudo-header; option fields are already net order */
	dss_val = mptcp_hton64(dss_val);
	sseq = *sseqp;
	dss_len = *(uint16_t *)(void *)((u_char*)sseqp + sizeof (u_int32_t));
	sum += in_pseudo64(dss_val, sseq, (dss_len + csum));

	ADDCARRY(sum);
	sum = ~sum & 0xffff;
	/* checksum slot sits right after the 16-bit length field */
	csump = (uint16_t *)(void *)((u_char*)sseqp + sizeof (u_int32_t) +
	    sizeof (uint16_t));
	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
	    uint32_t, sum);
	*csump = sum;
	mptcplog((LOG_DEBUG, "MPTCP Sender: sum = %x \n", sum),
	    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
}
1085
1086 /*
1087 * When WiFi signal starts fading, there's more loss and RTT spikes.
1088 * Check if there has been a large spike by comparing against
1089 * a tolerable RTT spike threshold.
1090 */
1091 boolean_t
1092 mptcp_no_rto_spike(struct socket *so)
1093 {
1094 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1095 int32_t spike = 0;
1096
1097 if (tp->t_rxtcur > MAX(mptcp_rtothresh, MPTCP_RTO_MINTHRESH)) {
1098 spike = tp->t_rxtcur - mptcp_rtothresh;
1099
1100 mptcplog((LOG_DEBUG, "MPTCP Socket: %s: spike = %d rto = %d"
1101 "best = %d cur = %d\n", __func__, spike,
1102 tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1103 tp->t_rttcur),
1104 (MPTCP_SOCKET_DBG|MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1105
1106 }
1107
1108 if (spike > 0 ) {
1109 return (FALSE);
1110 } else {
1111 return (TRUE);
1112 }
1113 }