]> git.saurik.com Git - apple/xnu.git/blob - bsd/kern/vsock_domain.c
xnu-7195.81.3.tar.gz
[apple/xnu.git] / bsd / kern / vsock_domain.c
1 /*
2 * Copyright (c) 2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/domain.h>
30 #include <sys/socket.h>
31 #include <sys/protosw.h>
32 #include <sys/mcache.h>
33 #include <sys/systm.h>
34 #include <sys/sysctl.h>
35 #include <sys/random.h>
36 #include <sys/mbuf.h>
37 #include <sys/vsock_domain.h>
38 #include <sys/vsock_transport.h>
39 #include <kern/task.h>
40 #include <kern/zalloc.h>
41 #include <kern/locks.h>
42 #include <machine/atomic.h>
43
44 #define sotovsockpcb(so) ((struct vsockpcb *)(so)->so_pcb)
45
46 #define VSOCK_PORT_RESERVED 1024
47
48 /* VSock Protocol Globals */
49
50 static struct vsock_transport * _Atomic the_vsock_transport = NULL;
51 static ZONE_DECLARE(vsockpcb_zone, "vsockpcbzone",
52 sizeof(struct vsockpcb), ZC_NONE);
53 static struct vsockpcbinfo vsockinfo;
54
55 static uint32_t vsock_sendspace = VSOCK_MAX_PACKET_SIZE * 8;
56 static uint32_t vsock_recvspace = VSOCK_MAX_PACKET_SIZE * 8;
57
58 /* VSock PCB Helpers */
59
60 static uint32_t
61 vsock_get_peer_space(struct vsockpcb *pcb)
62 {
63 return pcb->peer_buf_alloc - (pcb->tx_cnt - pcb->peer_fwd_cnt);
64 }
65
66 static struct vsockpcb *
67 vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst)
68 {
69 struct vsockpcb *preferred = NULL;
70 struct vsockpcb *match = NULL;
71 struct vsockpcb *pcb = NULL;
72
73 lck_rw_lock_shared(vsockinfo.bound_lock);
74 LIST_FOREACH(pcb, &vsockinfo.bound, bound) {
75 // Source cid and port must match. Only destination port must match. (Allows for a changing CID during migration)
76 socket_lock(pcb->so, 1);
77 if ((pcb->so->so_state & SS_ISCONNECTED || pcb->so->so_state & SS_ISCONNECTING) &&
78 pcb->local_address.cid == src.cid && pcb->local_address.port == src.port &&
79 pcb->remote_address.port == dst.port) {
80 preferred = pcb;
81 break;
82 } else if ((pcb->local_address.cid == src.cid || pcb->local_address.cid == VMADDR_CID_ANY) &&
83 pcb->local_address.port == src.port) {
84 match = pcb;
85 }
86 socket_unlock(pcb->so, 1);
87 }
88 if (!preferred && match) {
89 socket_lock(match->so, 1);
90 preferred = match;
91 }
92 lck_rw_done(vsockinfo.bound_lock);
93
94 return preferred;
95 }
96
97 static errno_t
98 vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t local_port, uint32_t remote_cid, uint32_t remote_port)
99 {
100 socket_lock_assert_owned(pcb->so);
101
102 // Privileged ports.
103 if (local_port != VMADDR_PORT_ANY && local_port < VSOCK_PORT_RESERVED &&
104 current_task() != kernel_task && proc_suser(current_proc()) != 0) {
105 return EACCES;
106 }
107
108 bool taken = false;
109 const bool check_remote = (remote_cid != VMADDR_CID_ANY && remote_port != VMADDR_PORT_ANY);
110
111 struct vsockpcb *pcb_match = NULL;
112
113 socket_unlock(pcb->so, 0);
114 lck_rw_lock_exclusive(vsockinfo.bound_lock);
115 LIST_FOREACH(pcb_match, &vsockinfo.bound, bound) {
116 socket_lock(pcb_match->so, 1);
117 if (pcb == pcb_match ||
118 (!check_remote && pcb_match->local_address.port == local_port) ||
119 (check_remote && pcb_match->local_address.port == local_port &&
120 pcb_match->remote_address.cid == remote_cid && pcb_match->remote_address.port == remote_port)) {
121 socket_unlock(pcb_match->so, 1);
122 taken = true;
123 break;
124 }
125 socket_unlock(pcb_match->so, 1);
126 }
127 socket_lock(pcb->so, 0);
128 if (!taken) {
129 pcb->local_address = (struct vsock_address) { .cid = local_cid, .port = local_port };
130 pcb->remote_address = (struct vsock_address) { .cid = remote_cid, .port = remote_port };
131 LIST_INSERT_HEAD(&vsockinfo.bound, pcb, bound);
132 }
133 lck_rw_done(vsockinfo.bound_lock);
134
135 return taken ? EADDRINUSE : 0;
136 }
137
138 static errno_t
139 vsock_bind_address(struct vsockpcb *pcb, struct vsock_address laddr, struct vsock_address raddr)
140 {
141 if (!pcb) {
142 return EINVAL;
143 }
144
145 socket_lock_assert_owned(pcb->so);
146
147 // Certain CIDs are reserved.
148 if (laddr.cid == VMADDR_CID_HYPERVISOR || laddr.cid == VMADDR_CID_RESERVED || laddr.cid == VMADDR_CID_HOST) {
149 return EADDRNOTAVAIL;
150 }
151
152 // Remote address must be fully specified or not specified at all.
153 if ((raddr.cid == VMADDR_CID_ANY) ^ (raddr.port == VMADDR_PORT_ANY)) {
154 return EINVAL;
155 }
156
157 // Cannot bind if already bound.
158 if (pcb->local_address.port != VMADDR_PORT_ANY) {
159 return EINVAL;
160 }
161
162 uint32_t transport_cid;
163 struct vsock_transport *transport = pcb->transport;
164 errno_t error = transport->get_cid(transport->provider, &transport_cid);
165 if (error) {
166 return error;
167 }
168
169 // Local CID must be this transport's CID or any.
170 if (laddr.cid != transport_cid && laddr.cid != VMADDR_CID_ANY) {
171 return EINVAL;
172 }
173
174 if (laddr.port != VMADDR_PORT_ANY) {
175 error = vsock_bind_address_if_free(pcb, laddr.cid, laddr.port, raddr.cid, raddr.port);
176 } else {
177 lck_mtx_lock(&vsockinfo.port_lock);
178
179 const uint32_t first = VSOCK_PORT_RESERVED;
180 const uint32_t last = VMADDR_PORT_ANY - 1;
181 uint32_t count = last - first + 1;
182 uint32_t *last_port = &vsockinfo.last_port;
183
184 if (pcb->so->so_flags & SOF_BINDRANDOMPORT) {
185 uint32_t random = 0;
186 read_frandom(&random, sizeof(random));
187 *last_port = first + (random % count);
188 }
189
190 do {
191 if (count == 0) {
192 lck_mtx_unlock(&vsockinfo.port_lock);
193 return EADDRNOTAVAIL;
194 }
195 count--;
196
197 ++*last_port;
198 if (*last_port < first || *last_port > last) {
199 *last_port = first;
200 }
201
202 error = vsock_bind_address_if_free(pcb, laddr.cid, *last_port, raddr.cid, raddr.port);
203 } while (error);
204
205 lck_mtx_unlock(&vsockinfo.port_lock);
206 }
207
208 return error;
209 }
210
211 static void
212 vsock_unbind_pcb(struct vsockpcb *pcb, bool is_locked)
213 {
214 if (!pcb) {
215 return;
216 }
217
218 socket_lock_assert_owned(pcb->so);
219
220 soisdisconnected(pcb->so);
221
222 if (!pcb->bound.le_prev) {
223 return;
224 }
225
226 if (!is_locked) {
227 socket_unlock(pcb->so, 0);
228 lck_rw_lock_exclusive(vsockinfo.bound_lock);
229 socket_lock(pcb->so, 0);
230 if (!pcb->bound.le_prev) {
231 lck_rw_done(vsockinfo.bound_lock);
232 return;
233 }
234 }
235
236 LIST_REMOVE(pcb, bound);
237 pcb->bound.le_next = NULL;
238 pcb->bound.le_prev = NULL;
239
240 if (!is_locked) {
241 lck_rw_done(vsockinfo.bound_lock);
242 }
243 }
244
245 static struct sockaddr *
246 vsock_new_sockaddr(struct vsock_address *address)
247 {
248 if (!address) {
249 return NULL;
250 }
251
252 struct sockaddr_vm *addr;
253 MALLOC(addr, struct sockaddr_vm *, sizeof(*addr), M_SONAME, M_WAITOK);
254 if (!addr) {
255 return NULL;
256 }
257
258 bzero(addr, sizeof(*addr));
259 addr->svm_len = sizeof(*addr);
260 addr->svm_family = AF_VSOCK;
261 addr->svm_port = address->port;
262 addr->svm_cid = address->cid;
263
264 return (struct sockaddr *)addr;
265 }
266
267 static errno_t
268 vsock_pcb_send_message(struct vsockpcb *pcb, enum vsock_operation operation, mbuf_t m)
269 {
270 if (!pcb) {
271 if (m != NULL) {
272 mbuf_freem_list(m);
273 }
274 return EINVAL;
275 }
276
277 socket_lock_assert_owned(pcb->so);
278
279 errno_t error;
280
281 struct vsock_address dst = pcb->remote_address;
282 if (dst.cid == VMADDR_CID_ANY || dst.port == VMADDR_PORT_ANY) {
283 if (m != NULL) {
284 mbuf_freem_list(m);
285 }
286 return EINVAL;
287 }
288
289 struct vsock_address src = pcb->local_address;
290 if (src.cid == VMADDR_CID_ANY) {
291 uint32_t transport_cid;
292 struct vsock_transport *transport = pcb->transport;
293 error = transport->get_cid(transport->provider, &transport_cid);
294 if (error) {
295 if (m != NULL) {
296 mbuf_freem_list(m);
297 }
298 return error;
299 }
300 src.cid = transport_cid;
301 }
302
303 uint32_t buf_alloc = pcb->so->so_rcv.sb_hiwat;
304 uint32_t fwd_cnt = pcb->fwd_cnt;
305
306 if (src.cid == dst.cid) {
307 pcb->last_buf_alloc = buf_alloc;
308 pcb->last_fwd_cnt = fwd_cnt;
309
310 socket_unlock(pcb->so, 0);
311 error = vsock_put_message(src, dst, operation, buf_alloc, fwd_cnt, m);
312 socket_lock(pcb->so, 0);
313 } else {
314 struct vsock_transport *transport = pcb->transport;
315 error = transport->put_message(transport->provider, src, dst, operation, buf_alloc, fwd_cnt, m);
316
317 if (!error) {
318 pcb->last_buf_alloc = buf_alloc;
319 pcb->last_fwd_cnt = fwd_cnt;
320 }
321 }
322
323 return error;
324 }
325
326 static errno_t
327 vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst)
328 {
329 if (dst.cid == VMADDR_CID_ANY || dst.port == VMADDR_PORT_ANY) {
330 return EINVAL;
331 }
332
333 errno_t error;
334 struct vsock_transport *transport = NULL;
335
336 if (src.cid == VMADDR_CID_ANY) {
337 transport = os_atomic_load(&the_vsock_transport, relaxed);
338 if (transport == NULL) {
339 return ENODEV;
340 }
341
342 uint32_t transport_cid;
343 error = transport->get_cid(transport->provider, &transport_cid);
344 if (error) {
345 return error;
346 }
347 src.cid = transport_cid;
348 }
349
350 if (src.cid == dst.cid) {
351 error = vsock_put_message(src, dst, VSOCK_RESET, 0, 0, NULL);
352 } else {
353 if (!transport) {
354 transport = os_atomic_load(&the_vsock_transport, relaxed);
355 if (transport == NULL) {
356 return ENODEV;
357 }
358 }
359 error = transport->put_message(transport->provider, src, dst, VSOCK_RESET, 0, 0, NULL);
360 }
361
362 return error;
363 }
364
365 static errno_t
366 vsock_pcb_safe_reset_address(struct vsockpcb *pcb, struct vsock_address src, struct vsock_address dst)
367 {
368 if (pcb) {
369 socket_lock_assert_owned(pcb->so);
370 socket_unlock(pcb->so, 0);
371 }
372 errno_t error = vsock_pcb_reset_address(src, dst);
373 if (pcb) {
374 socket_lock(pcb->so, 0);
375 }
376 return error;
377 }
378
379 static errno_t
380 vsock_pcb_connect(struct vsockpcb *pcb)
381 {
382 return vsock_pcb_send_message(pcb, VSOCK_REQUEST, NULL);
383 }
384
385 static errno_t
386 vsock_pcb_respond(struct vsockpcb *pcb)
387 {
388 return vsock_pcb_send_message(pcb, VSOCK_RESPONSE, NULL);
389 }
390
391 static errno_t
392 vsock_pcb_send(struct vsockpcb *pcb, mbuf_t m)
393 {
394 return vsock_pcb_send_message(pcb, VSOCK_PAYLOAD, m);
395 }
396
397 static errno_t
398 vsock_pcb_shutdown_send(struct vsockpcb *pcb)
399 {
400 return vsock_pcb_send_message(pcb, VSOCK_SHUTDOWN_SEND, NULL);
401 }
402
403 static errno_t
404 vsock_pcb_reset(struct vsockpcb *pcb)
405 {
406 return vsock_pcb_send_message(pcb, VSOCK_RESET, NULL);
407 }
408
409 static errno_t
410 vsock_pcb_credit_update(struct vsockpcb *pcb)
411 {
412 return vsock_pcb_send_message(pcb, VSOCK_CREDIT_UPDATE, NULL);
413 }
414
415 static errno_t
416 vsock_pcb_credit_request(struct vsockpcb *pcb)
417 {
418 return vsock_pcb_send_message(pcb, VSOCK_CREDIT_REQUEST, NULL);
419 }
420
421 static errno_t
422 vsock_disconnect_pcb_common(struct vsockpcb *pcb, bool is_locked)
423 {
424 socket_lock_assert_owned(pcb->so);
425 vsock_unbind_pcb(pcb, is_locked);
426 return vsock_pcb_reset(pcb);
427 }
428
429 static errno_t
430 vsock_disconnect_pcb_locked(struct vsockpcb *pcb)
431 {
432 return vsock_disconnect_pcb_common(pcb, true);
433 }
434
435 static errno_t
436 vsock_disconnect_pcb(struct vsockpcb *pcb)
437 {
438 return vsock_disconnect_pcb_common(pcb, false);
439 }
440
441 static errno_t
442 vsock_sockaddr_vm_validate(struct vsockpcb *pcb, struct sockaddr_vm *addr)
443 {
444 if (!pcb || !pcb->so || !addr) {
445 return EINVAL;
446 }
447
448 // Validate address length.
449 if (addr->svm_len < sizeof(struct sockaddr_vm)) {
450 return EINVAL;
451 }
452
453 // Validate address family.
454 if (addr->svm_family != AF_UNSPEC && addr->svm_family != AF_VSOCK) {
455 return EAFNOSUPPORT;
456 }
457
458 // Only stream is supported currently.
459 if (pcb->so->so_type != SOCK_STREAM) {
460 return EAFNOSUPPORT;
461 }
462
463 return 0;
464 }
465 /* VSock Receive Handlers */
466
467 static errno_t
468 vsock_put_message_connected(struct vsockpcb *pcb, enum vsock_operation op, mbuf_t m)
469 {
470 socket_lock_assert_owned(pcb->so);
471
472 errno_t error = 0;
473
474 switch (op) {
475 case VSOCK_SHUTDOWN:
476 error = vsock_disconnect_pcb(pcb);
477 break;
478 case VSOCK_SHUTDOWN_RECEIVE:
479 socantsendmore(pcb->so);
480 break;
481 case VSOCK_SHUTDOWN_SEND:
482 socantrcvmore(pcb->so);
483 break;
484 case VSOCK_PAYLOAD:
485 // Add data to the receive queue then wakeup any reading threads.
486 error = !sbappendstream(&pcb->so->so_rcv, m);
487 if (!error) {
488 sorwakeup(pcb->so);
489 }
490 break;
491 case VSOCK_RESET:
492 vsock_unbind_pcb(pcb, false);
493 break;
494 default:
495 error = ENOTSUP;
496 break;
497 }
498
499 return error;
500 }
501
502 static errno_t
503 vsock_put_message_connecting(struct vsockpcb *pcb, enum vsock_operation op)
504 {
505 socket_lock_assert_owned(pcb->so);
506
507 errno_t error = 0;
508
509 switch (op) {
510 case VSOCK_RESPONSE:
511 soisconnected(pcb->so);
512 break;
513 case VSOCK_RESET:
514 pcb->so->so_error = EAGAIN;
515 error = vsock_disconnect_pcb(pcb);
516 break;
517 default:
518 vsock_disconnect_pcb(pcb);
519 error = ENOTSUP;
520 break;
521 }
522
523 return error;
524 }
525
/*
 * Handle an incoming operation for a listening socket.
 *
 * VSOCK_REQUEST creates a child socket with sonewconn(), binds it to
 * (dst, src), answers with VSOCK_RESPONSE and marks it connected. If any of
 * those steps fails the child is disconnected and unbound and a reset is sent
 * back to the requester. VSOCK_RESET and every other operation just send a
 * reset back (the latter also returning ENOTSUP).
 *
 * NOTE(review): after a successful sonewconn() this path unlocks the child
 * socket and re-locks pcb->so; the lock state sonewconn() leaves both sockets
 * in is not visible in this file -- confirm against sonewconn().
 */
static errno_t
vsock_put_message_listening(struct vsockpcb *pcb, enum vsock_operation op, struct vsock_address src, struct vsock_address dst)
{
	socket_lock_assert_owned(pcb->so);

	if (op == VSOCK_RESET) {
		return vsock_pcb_safe_reset_address(pcb, dst, src);
	}
	if (op != VSOCK_REQUEST) {
		vsock_pcb_safe_reset_address(pcb, dst, src);
		return ENOTSUP;
	}

	// VSOCK_REQUEST: spawn a child socket for the incoming connection.
	struct sockaddr_vm listen_addr = {
		.svm_len = sizeof(listen_addr),
		.svm_family = AF_VSOCK,
		.svm_reserved1 = 0,
		.svm_port = pcb->local_address.port,
		.svm_cid = pcb->local_address.cid
	};

	struct socket *child = sonewconn(pcb->so, 0, (struct sockaddr *)&listen_addr);
	if (child == NULL) {
		// It is likely that the backlog is full. Deny this request.
		vsock_pcb_safe_reset_address(pcb, dst, src);
		return ECONNREFUSED;
	}

	struct vsockpcb *child_pcb = sotovsockpcb(child);
	errno_t error = (child_pcb == NULL) ? EINVAL : 0;

	if (!error) {
		error = vsock_bind_address(child_pcb, dst, src);
	}
	if (!error) {
		error = vsock_pcb_respond(child_pcb);
	}

	if (!error) {
		soisconnected(child);
		socket_unlock(child, 0);
	} else {
		// Undo the partially set up child and tell the requester it was refused.
		soisdisconnected(child);
		if (child_pcb != NULL) {
			vsock_unbind_pcb(child_pcb, false);
		}
		socket_unlock(child, 1);
		vsock_pcb_reset_address(dst, src);
	}
	socket_lock(pcb->so, 0);

	return error;
}
597
598 /* VSock Transport */
599
600 errno_t
601 vsock_add_transport(struct vsock_transport *transport)
602 {
603 if (transport == NULL || transport->provider == NULL) {
604 return EINVAL;
605 }
606 if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, NULL, transport, acq_rel)) {
607 return EEXIST;
608 }
609 return 0;
610 }
611
612 errno_t
613 vsock_remove_transport(struct vsock_transport *transport)
614 {
615 if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, transport, NULL, acq_rel)) {
616 return ENODEV;
617 }
618 return 0;
619 }
620
621 errno_t
622 vsock_reset_transport(struct vsock_transport *transport)
623 {
624 if (transport == NULL) {
625 return EINVAL;
626 }
627
628 errno_t error = 0;
629 struct vsockpcb *pcb = NULL;
630 struct vsockpcb *tmp_pcb = NULL;
631
632 lck_rw_lock_exclusive(vsockinfo.bound_lock);
633 LIST_FOREACH_SAFE(pcb, &vsockinfo.bound, bound, tmp_pcb) {
634 // Disconnect this transport's sockets. Listen and bind sockets must stay alive.
635 socket_lock(pcb->so, 1);
636 if (pcb->transport == transport && pcb->so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) {
637 errno_t dc_error = vsock_disconnect_pcb_locked(pcb);
638 if (dc_error && !error) {
639 error = dc_error;
640 }
641 }
642 socket_unlock(pcb->so, 1);
643 }
644 lck_rw_done(vsockinfo.bound_lock);
645
646 return error;
647 }
648
649 errno_t
650 vsock_put_message(struct vsock_address src, struct vsock_address dst, enum vsock_operation op, uint32_t buf_alloc, uint32_t fwd_cnt, mbuf_t m)
651 {
652 struct vsockpcb *pcb = vsock_get_matching_pcb(dst, src);
653 if (!pcb) {
654 if (op != VSOCK_RESET) {
655 vsock_pcb_reset_address(dst, src);
656 }
657 if (m != NULL) {
658 mbuf_freem_list(m);
659 }
660 return EINVAL;
661 }
662
663 socket_lock_assert_owned(pcb->so);
664
665 struct socket *so = pcb->so;
666 errno_t error = 0;
667
668 // Check if the peer's buffer has changed. Update our view of the peer's forwarded bytes.
669 int buffers_changed = (pcb->peer_buf_alloc != buf_alloc) || (pcb->peer_fwd_cnt) != fwd_cnt;
670 pcb->peer_buf_alloc = buf_alloc;
671 pcb->peer_fwd_cnt = fwd_cnt;
672
673 // Peer's buffer has enough space for the next packet. Notify any threads waiting for space.
674 if (buffers_changed && vsock_get_peer_space(pcb) >= pcb->waiting_send_size) {
675 sowwakeup(so);
676 }
677
678 switch (op) {
679 case VSOCK_CREDIT_REQUEST:
680 error = vsock_pcb_credit_update(pcb);
681 break;
682 case VSOCK_CREDIT_UPDATE:
683 break;
684 default:
685 if (so->so_state & SS_ISCONNECTED) {
686 error = vsock_put_message_connected(pcb, op, m);
687 m = NULL;
688 } else if (so->so_state & SS_ISCONNECTING) {
689 error = vsock_put_message_connecting(pcb, op);
690 } else if (so->so_options & SO_ACCEPTCONN) {
691 error = vsock_put_message_listening(pcb, op, src, dst);
692 } else {
693 // Reset the connection for other states such as 'disconnecting'.
694 error = vsock_disconnect_pcb(pcb);
695 if (!error) {
696 error = ENODEV;
697 }
698 }
699 break;
700 }
701 socket_unlock(so, 1);
702
703 if (m != NULL) {
704 mbuf_freem_list(m);
705 }
706
707 return error;
708 }
709
710 /* VSock Sysctl */
711
712 static int
713 vsock_pcblist SYSCTL_HANDLER_ARGS
714 {
715 #pragma unused(oidp,arg2)
716
717 int error;
718
719 // Only stream is supported.
720 if ((intptr_t)arg1 != SOCK_STREAM) {
721 return EINVAL;
722 }
723
724 // Get the generation count and the count of all vsock sockets.
725 lck_rw_lock_shared(vsockinfo.all_lock);
726 uint64_t n = vsockinfo.all_pcb_count;
727 vsock_gen_t gen_count = vsockinfo.vsock_gencnt;
728 lck_rw_done(vsockinfo.all_lock);
729
730 const size_t xpcb_len = sizeof(struct xvsockpcb);
731 struct xvsockpgen xvg;
732
733 /*
734 * The process of preparing the PCB list is too time-consuming and
735 * resource-intensive to repeat twice on every request.
736 */
737 if (req->oldptr == USER_ADDR_NULL) {
738 req->oldidx = (size_t)(2 * sizeof(xvg) + (n + n / 8) * xpcb_len);
739 return 0;
740 }
741
742 if (req->newptr != USER_ADDR_NULL) {
743 return EPERM;
744 }
745
746 bzero(&xvg, sizeof(xvg));
747 xvg.xvg_len = sizeof(xvg);
748 xvg.xvg_count = n;
749 xvg.xvg_gen = gen_count;
750 xvg.xvg_sogen = so_gencnt;
751 error = SYSCTL_OUT(req, &xvg, sizeof(xvg));
752 if (error) {
753 return error;
754 }
755
756 // Return if no sockets exist.
757 if (n == 0) {
758 return 0;
759 }
760
761 lck_rw_lock_shared(vsockinfo.all_lock);
762
763 n = 0;
764 struct vsockpcb *pcb = NULL;
765 TAILQ_FOREACH(pcb, &vsockinfo.all, all) {
766 // Bail if there is not enough user buffer for this next socket.
767 if (req->oldlen - req->oldidx - sizeof(xvg) < xpcb_len) {
768 break;
769 }
770
771 // Populate the socket structure.
772 socket_lock(pcb->so, 1);
773 if (pcb->vsock_gencnt <= gen_count) {
774 struct xvsockpcb xpcb;
775 bzero(&xpcb, xpcb_len);
776 xpcb.xv_len = xpcb_len;
777 xpcb.xv_vsockpp = (uint64_t)VM_KERNEL_ADDRHASH(pcb);
778 xpcb.xvp_local_cid = pcb->local_address.cid;
779 xpcb.xvp_local_port = pcb->local_address.port;
780 xpcb.xvp_remote_cid = pcb->remote_address.cid;
781 xpcb.xvp_remote_port = pcb->remote_address.port;
782 xpcb.xvp_rxcnt = pcb->fwd_cnt;
783 xpcb.xvp_txcnt = pcb->tx_cnt;
784 xpcb.xvp_peer_rxhiwat = pcb->peer_buf_alloc;
785 xpcb.xvp_peer_rxcnt = pcb->peer_fwd_cnt;
786 xpcb.xvp_last_pid = pcb->so->last_pid;
787 xpcb.xvp_gencnt = pcb->vsock_gencnt;
788 if (pcb->so) {
789 sotoxsocket(pcb->so, &xpcb.xv_socket);
790 }
791 socket_unlock(pcb->so, 1);
792
793 error = SYSCTL_OUT(req, &xpcb, xpcb_len);
794 if (error != 0) {
795 break;
796 }
797 n++;
798 } else {
799 socket_unlock(pcb->so, 1);
800 }
801 }
802
803 // Update the generation count to match the sockets being returned.
804 gen_count = vsockinfo.vsock_gencnt;
805
806 lck_rw_done(vsockinfo.all_lock);
807
808 if (!error) {
809 /*
810 * Give the user an updated idea of our state.
811 * If the generation differs from what we told
812 * her before, she knows that something happened
813 * while we were processing this request, and it
814 * might be necessary to retry.
815 */
816 bzero(&xvg, sizeof(xvg));
817 xvg.xvg_len = sizeof(xvg);
818 xvg.xvg_count = n;
819 xvg.xvg_gen = gen_count;
820 xvg.xvg_sogen = so_gencnt;
821 error = SYSCTL_OUT(req, &xvg, sizeof(xvg));
822 }
823
824 return error;
825 }
826
#ifdef SYSCTL_DECL
/* net.vsock: parent node for the vsock sysctls below. */
SYSCTL_NODE(_net, OID_AUTO, vsock, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "vsock");

/* net.vsock.sendspace / net.vsock.recvspace: read-write views of the buffer sizes used at attach time. */
SYSCTL_UINT(_net_vsock, OID_AUTO, sendspace,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &vsock_sendspace, 0,
    "Maximum outgoing vsock datagram size");
SYSCTL_UINT(_net_vsock, OID_AUTO, recvspace,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &vsock_recvspace, 0,
    "Maximum incoming vsock datagram size");

/* net.vsock.pcblist: read-only list of sockets, served by vsock_pcblist() with arg1 = SOCK_STREAM. */
SYSCTL_PROC(_net_vsock, OID_AUTO, pcblist,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    (caddr_t)(long)SOCK_STREAM, 0,
    vsock_pcblist, "S,xvsockpcb",
    "List of active vsock sockets");
#endif
838
839 /* VSock Protocol */
840
841 static int
842 vsock_attach(struct socket *so, int proto, struct proc *p)
843 {
844 #pragma unused(proto, p)
845
846 // Attach should only be run once per socket.
847 struct vsockpcb *pcb = sotovsockpcb(so);
848 if (pcb) {
849 return EINVAL;
850 }
851
852 // Get the transport for this socket.
853 struct vsock_transport *transport = os_atomic_load(&the_vsock_transport, relaxed);
854 if (transport == NULL) {
855 return ENODEV;
856 }
857
858 // Reserve send and receive buffers.
859 errno_t error = soreserve(so, vsock_sendspace, vsock_recvspace);
860 if (error) {
861 return error;
862 }
863
864 // Initialize the vsock protocol control block.
865 pcb = zalloc(vsockpcb_zone);
866 if (pcb == NULL) {
867 return ENOBUFS;
868 }
869 bzero(pcb, sizeof(*pcb));
870 pcb->so = so;
871 pcb->transport = transport;
872 pcb->local_address = (struct vsock_address) {
873 .cid = VMADDR_CID_ANY,
874 .port = VMADDR_PORT_ANY
875 };
876 pcb->remote_address = (struct vsock_address) {
877 .cid = VMADDR_CID_ANY,
878 .port = VMADDR_PORT_ANY
879 };
880 so->so_pcb = pcb;
881
882 // Tell the transport that this socket has attached.
883 error = transport->attach_socket(transport->provider);
884 if (error) {
885 return error;
886 }
887
888 // Add to the list of all vsock sockets.
889 lck_rw_lock_exclusive(vsockinfo.all_lock);
890 TAILQ_INSERT_TAIL(&vsockinfo.all, pcb, all);
891 vsockinfo.all_pcb_count++;
892 pcb->vsock_gencnt = ++vsockinfo.vsock_gencnt;
893 lck_rw_done(vsockinfo.all_lock);
894
895 return 0;
896 }
897
898 static int
899 vsock_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct proc *p)
900 {
901 #pragma unused(ifp)
902
903 VERIFY(so != NULL || p == kernproc);
904
905 if (cmd != IOCTL_VM_SOCKETS_GET_LOCAL_CID) {
906 return EINVAL;
907 }
908
909 struct vsock_transport *transport;
910 if (so) {
911 struct vsockpcb *pcb = sotovsockpcb(so);
912 if (pcb == NULL) {
913 return EINVAL;
914 }
915 transport = pcb->transport;
916 } else {
917 transport = os_atomic_load(&the_vsock_transport, relaxed);
918 }
919
920 if (transport == NULL) {
921 return ENODEV;
922 }
923
924 uint32_t transport_cid;
925 errno_t error = transport->get_cid(transport->provider, &transport_cid);
926 if (error) {
927 return error;
928 }
929
930 memcpy(data, &transport_cid, sizeof(transport_cid));
931
932 return 0;
933 }
934
935 static int
936 vsock_detach(struct socket *so)
937 {
938 struct vsockpcb *pcb = sotovsockpcb(so);
939 if (pcb == NULL) {
940 return EINVAL;
941 }
942
943 vsock_unbind_pcb(pcb, false);
944
945 // Tell the transport that this socket has detached.
946 struct vsock_transport *transport = pcb->transport;
947 errno_t error = transport->detach_socket(transport->provider);
948 if (error) {
949 return error;
950 }
951
952 // Remove from the list of all vsock sockets.
953 lck_rw_lock_exclusive(vsockinfo.all_lock);
954 TAILQ_REMOVE(&vsockinfo.all, pcb, all);
955 pcb->all.tqe_next = NULL;
956 pcb->all.tqe_prev = NULL;
957 vsockinfo.all_pcb_count--;
958 vsockinfo.vsock_gencnt++;
959 lck_rw_done(vsockinfo.all_lock);
960
961 // Deallocate any resources.
962 zfree(vsockpcb_zone, pcb);
963 so->so_pcb = 0;
964 so->so_flags |= SOF_PCBCLEARING;
965 sofree(so);
966
967 return 0;
968 }
969
/* Protocol abort: mark the socket disconnected, then detach it. */
static int
vsock_abort(struct socket *so)
{
	soisdisconnected(so);

	const int detach_status = vsock_detach(so);
	return detach_status;
}
976
977 static int
978 vsock_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
979 {
980 #pragma unused(p)
981
982 struct vsockpcb *pcb = sotovsockpcb(so);
983 if (pcb == NULL) {
984 return EINVAL;
985 }
986
987 struct sockaddr_vm *addr = (struct sockaddr_vm *)nam;
988
989 errno_t error = vsock_sockaddr_vm_validate(pcb, addr);
990 if (error) {
991 return error;
992 }
993
994 struct vsock_address laddr = (struct vsock_address) {
995 .cid = addr->svm_cid,
996 .port = addr->svm_port,
997 };
998
999 struct vsock_address raddr = (struct vsock_address) {
1000 .cid = VMADDR_CID_ANY,
1001 .port = VMADDR_PORT_ANY,
1002 };
1003
1004 error = vsock_bind_address(pcb, laddr, raddr);
1005 if (error) {
1006 return error;
1007 }
1008
1009 return 0;
1010 }
1011
1012 static int
1013 vsock_listen(struct socket *so, struct proc *p)
1014 {
1015 #pragma unused(p)
1016
1017 struct vsockpcb *pcb = sotovsockpcb(so);
1018 if (pcb == NULL) {
1019 return EINVAL;
1020 }
1021
1022 // Only stream is supported currently.
1023 if (so->so_type != SOCK_STREAM) {
1024 return EAFNOSUPPORT;
1025 }
1026
1027 struct vsock_address *addr = &pcb->local_address;
1028
1029 if (addr->port == VMADDR_CID_ANY) {
1030 return EFAULT;
1031 }
1032
1033 struct vsock_transport *transport = pcb->transport;
1034 uint32_t transport_cid;
1035 errno_t error = transport->get_cid(transport->provider, &transport_cid);
1036 if (error) {
1037 return error;
1038 }
1039
1040 // Can listen on the transport's cid or any.
1041 if (addr->cid != transport_cid && addr->cid != VMADDR_CID_ANY) {
1042 return EFAULT;
1043 }
1044
1045 return 0;
1046 }
1047
1048 static int
1049 vsock_accept(struct socket *so, struct sockaddr **nam)
1050 {
1051 struct vsockpcb *pcb = sotovsockpcb(so);
1052 if (pcb == NULL) {
1053 return EINVAL;
1054 }
1055
1056 // Do not accept disconnected sockets.
1057 if (so->so_state & SS_ISDISCONNECTED) {
1058 return ECONNABORTED;
1059 }
1060
1061 *nam = vsock_new_sockaddr(&pcb->remote_address);
1062
1063 return 0;
1064 }
1065
1066 static int
1067 vsock_connect(struct socket *so, struct sockaddr *nam, struct proc *p)
1068 {
1069 #pragma unused(p)
1070
1071 struct vsockpcb *pcb = sotovsockpcb(so);
1072 if (pcb == NULL) {
1073 return EINVAL;
1074 }
1075
1076 struct sockaddr_vm *addr = (struct sockaddr_vm *)nam;
1077
1078 errno_t error = vsock_sockaddr_vm_validate(pcb, addr);
1079 if (error) {
1080 return error;
1081 }
1082
1083 uint32_t transport_cid;
1084 struct vsock_transport *transport = pcb->transport;
1085 error = transport->get_cid(transport->provider, &transport_cid);
1086 if (error) {
1087 return error;
1088 }
1089
1090 // Only supporting connections to the host, hypervisor, or self for now.
1091 if (addr->svm_cid != VMADDR_CID_HOST &&
1092 addr->svm_cid != VMADDR_CID_HYPERVISOR &&
1093 addr->svm_cid != transport_cid) {
1094 return EFAULT;
1095 }
1096
1097 soisconnecting(so);
1098
1099 // Set the remote and local address.
1100 struct vsock_address remote_addr = (struct vsock_address) {
1101 .cid = addr->svm_cid,
1102 .port = addr->svm_port,
1103 };
1104
1105 struct vsock_address local_addr = (struct vsock_address) {
1106 .cid = transport_cid,
1107 .port = VMADDR_PORT_ANY,
1108 };
1109
1110 // Bind to the address.
1111 error = vsock_bind_address(pcb, local_addr, remote_addr);
1112 if (error) {
1113 goto cleanup;
1114 }
1115
1116 // Attempt a connection using the socket's transport.
1117 error = vsock_pcb_connect(pcb);
1118 if (error) {
1119 goto cleanup;
1120 }
1121
1122 if ((so->so_state & SS_ISCONNECTED) == 0) {
1123 // Don't wait for peer's response if non-blocking.
1124 if (so->so_state & SS_NBIO) {
1125 error = EINPROGRESS;
1126 goto done;
1127 }
1128
1129 struct timespec ts = (struct timespec) {
1130 .tv_sec = so->so_snd.sb_timeo.tv_sec,
1131 .tv_nsec = so->so_snd.sb_timeo.tv_usec * 1000,
1132 };
1133
1134 lck_mtx_t *mutex_held;
1135 if (so->so_proto->pr_getlock != NULL) {
1136 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1137 } else {
1138 mutex_held = so->so_proto->pr_domain->dom_mtx;
1139 }
1140
1141 // Wait until we receive a response to the connect request.
1142 error = msleep((caddr_t)&so->so_timeo, mutex_held, PSOCK | PCATCH, "vsock_connect", &ts);
1143 if (error) {
1144 if (error == EAGAIN) {
1145 error = ETIMEDOUT;
1146 }
1147 goto cleanup;
1148 }
1149 }
1150
1151 cleanup:
1152 if (so->so_error && !error) {
1153 error = so->so_error;
1154 so->so_error = 0;
1155 }
1156 if (!error) {
1157 error = !(so->so_state & SS_ISCONNECTED);
1158 }
1159 if (error) {
1160 vsock_unbind_pcb(pcb, false);
1161 }
1162
1163 done:
1164 return error;
1165 }
1166
1167 static int
1168 vsock_disconnect(struct socket *so)
1169 {
1170 struct vsockpcb *pcb = sotovsockpcb(so);
1171 if (pcb == NULL) {
1172 return EINVAL;
1173 }
1174
1175 return vsock_disconnect_pcb(pcb);
1176 }
1177
1178 static int
1179 vsock_sockaddr(struct socket *so, struct sockaddr **nam)
1180 {
1181 struct vsockpcb *pcb = sotovsockpcb(so);
1182 if (pcb == NULL) {
1183 return EINVAL;
1184 }
1185
1186 *nam = vsock_new_sockaddr(&pcb->local_address);
1187
1188 return 0;
1189 }
1190
1191 static int
1192 vsock_peeraddr(struct socket *so, struct sockaddr **nam)
1193 {
1194 struct vsockpcb *pcb = sotovsockpcb(so);
1195 if (pcb == NULL) {
1196 return EINVAL;
1197 }
1198
1199 *nam = vsock_new_sockaddr(&pcb->remote_address);
1200
1201 return 0;
1202 }
1203
1204 static int
1205 vsock_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, proc_t p)
1206 {
1207 #pragma unused(flags, nam, p)
1208
1209 struct vsockpcb *pcb = sotovsockpcb(so);
1210 if (pcb == NULL || m == NULL) {
1211 return EINVAL;
1212 }
1213
1214 if (control != NULL) {
1215 m_freem(control);
1216 return EOPNOTSUPP;
1217 }
1218
1219 // Ensure this socket is connected.
1220 if ((so->so_state & SS_ISCONNECTED) == 0) {
1221 if (m != NULL) {
1222 mbuf_freem_list(m);
1223 }
1224 return EPERM;
1225 }
1226
1227 errno_t error;
1228
1229 const size_t len = mbuf_pkthdr_len(m);
1230 uint32_t free_space = vsock_get_peer_space(pcb);
1231
1232 // Ensure the peer has enough space in their receive buffer.
1233 while (len > free_space) {
1234 // Record the number of free peer bytes necessary before we can send.
1235 if (len > pcb->waiting_send_size) {
1236 pcb->waiting_send_size = len;
1237 }
1238
1239 // Send a credit request.
1240 error = vsock_pcb_credit_request(pcb);
1241 if (error) {
1242 if (m != NULL) {
1243 mbuf_freem_list(m);
1244 }
1245 return error;
1246 }
1247
1248 // Check again in case free space was automatically updated in loopback case.
1249 free_space = vsock_get_peer_space(pcb);
1250 if (len <= free_space) {
1251 pcb->waiting_send_size = 0;
1252 break;
1253 }
1254
1255 // Bail if this is a non-blocking socket.
1256 if (so->so_state & SS_NBIO) {
1257 if (m != NULL) {
1258 mbuf_freem_list(m);
1259 }
1260 return EWOULDBLOCK;
1261 }
1262
1263 // Wait until our peer has enough free space in their receive buffer.
1264 error = sbwait(&so->so_snd);
1265 pcb->waiting_send_size = 0;
1266 if (error) {
1267 if (m != NULL) {
1268 mbuf_freem_list(m);
1269 }
1270 return error;
1271 }
1272
1273 // Bail if an error occured or we can't send more.
1274 if (so->so_state & SS_CANTSENDMORE) {
1275 if (m != NULL) {
1276 mbuf_freem_list(m);
1277 }
1278 return EPIPE;
1279 } else if (so->so_error) {
1280 error = so->so_error;
1281 so->so_error = 0;
1282 if (m != NULL) {
1283 mbuf_freem_list(m);
1284 }
1285 return error;
1286 }
1287
1288 free_space = vsock_get_peer_space(pcb);
1289 }
1290
1291 // Send a payload over the transport.
1292 error = vsock_pcb_send(pcb, m);
1293 if (error) {
1294 return error;
1295 }
1296
1297 pcb->tx_cnt += len;
1298
1299 return 0;
1300 }
1301
1302 static int
1303 vsock_shutdown(struct socket *so)
1304 {
1305 struct vsockpcb *pcb = sotovsockpcb(so);
1306 if (pcb == NULL) {
1307 return EINVAL;
1308 }
1309
1310 socantsendmore(so);
1311
1312 // Tell peer we will no longer send.
1313 errno_t error = vsock_pcb_shutdown_send(pcb);
1314 if (error) {
1315 return error;
1316 }
1317
1318 return 0;
1319 }
1320
1321 static int
1322 vsock_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1323 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1324 {
1325 struct vsockpcb *pcb = sotovsockpcb(so);
1326 if (pcb == NULL) {
1327 return EINVAL;
1328 }
1329
1330 user_ssize_t length = uio_resid(uio);
1331 int result = soreceive(so, psa, uio, mp0, controlp, flagsp);
1332 length -= uio_resid(uio);
1333
1334 socket_lock(so, 1);
1335
1336 pcb->fwd_cnt += length;
1337
1338 const uint32_t threshold = VSOCK_MAX_PACKET_SIZE;
1339
1340 // Send a credit update if is possible that the peer will no longer send.
1341 if ((pcb->fwd_cnt - pcb->last_fwd_cnt + threshold) >= pcb->last_buf_alloc) {
1342 errno_t error = vsock_pcb_credit_update(pcb);
1343 if (!result && error) {
1344 result = error;
1345 }
1346 }
1347
1348 socket_unlock(so, 1);
1349
1350 return result;
1351 }
1352
1353 static struct pr_usrreqs vsock_usrreqs = {
1354 .pru_abort = vsock_abort,
1355 .pru_attach = vsock_attach,
1356 .pru_control = vsock_control,
1357 .pru_detach = vsock_detach,
1358 .pru_bind = vsock_bind,
1359 .pru_listen = vsock_listen,
1360 .pru_accept = vsock_accept,
1361 .pru_connect = vsock_connect,
1362 .pru_disconnect = vsock_disconnect,
1363 .pru_send = vsock_send,
1364 .pru_shutdown = vsock_shutdown,
1365 .pru_sockaddr = vsock_sockaddr,
1366 .pru_peeraddr = vsock_peeraddr,
1367 .pru_sosend = sosend,
1368 .pru_soreceive = vsock_soreceive,
1369 };
1370
1371 static void
1372 vsock_init(struct protosw *pp, struct domain *dp)
1373 {
1374 #pragma unused(dp)
1375
1376 static int vsock_initialized = 0;
1377 VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
1378 if (!os_atomic_cmpxchg((volatile int *)&vsock_initialized, 0, 1, acq_rel)) {
1379 return;
1380 }
1381
1382 // Setup VSock protocol info struct.
1383 vsockinfo.vsock_lock_grp_attr = lck_grp_attr_alloc_init();
1384 vsockinfo.vsock_lock_grp = lck_grp_alloc_init("vsock", vsockinfo.vsock_lock_grp_attr);
1385 vsockinfo.vsock_lock_attr = lck_attr_alloc_init();
1386 if ((vsockinfo.all_lock = lck_rw_alloc_init(vsockinfo.vsock_lock_grp, vsockinfo.vsock_lock_attr)) == NULL ||
1387 (vsockinfo.bound_lock = lck_rw_alloc_init(vsockinfo.vsock_lock_grp, vsockinfo.vsock_lock_attr)) == NULL) {
1388 panic("%s: unable to allocate PCB lock\n", __func__);
1389 /* NOTREACHED */
1390 }
1391 lck_mtx_init(&vsockinfo.port_lock, vsockinfo.vsock_lock_grp, vsockinfo.vsock_lock_attr);
1392 TAILQ_INIT(&vsockinfo.all);
1393 LIST_INIT(&vsockinfo.bound);
1394 vsockinfo.last_port = VMADDR_PORT_ANY;
1395 }
1396
1397 static struct protosw vsocksw[] = {
1398 {
1399 .pr_type = SOCK_STREAM,
1400 .pr_protocol = 0,
1401 .pr_flags = PR_CONNREQUIRED | PR_WANTRCVD,
1402 .pr_init = vsock_init,
1403 .pr_usrreqs = &vsock_usrreqs,
1404 }
1405 };
1406
1407 static const int vsock_proto_count = (sizeof(vsocksw) / sizeof(struct protosw));
1408
1409 /* VSock Domain */
1410
1411 static struct domain *vsock_domain = NULL;
1412
1413 static void
1414 vsock_dinit(struct domain *dp)
1415 {
1416 // The VSock domain is initialized with a singleton pattern.
1417 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
1418 VERIFY(vsock_domain == NULL);
1419 vsock_domain = dp;
1420
1421 // Add protocols and initialize.
1422 for (int i = 0; i < vsock_proto_count; i++) {
1423 net_add_proto((struct protosw *)&vsocksw[i], dp, 1);
1424 }
1425 }
1426
1427 struct domain vsockdomain_s = {
1428 .dom_family = PF_VSOCK,
1429 .dom_name = "vsock",
1430 .dom_init = vsock_dinit,
1431 .dom_maxrtkey = sizeof(struct sockaddr_vm),
1432 .dom_protohdrlen = sizeof(struct sockaddr_vm),
1433 };