bsd/kern/uipc_syscalls.c

   1 /*
   2  * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
   3  *
   4  * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License.  The rights granted to you under the
  10  * License may not be used to create, or enable the creation or
  11  * redistribution of, unlawful or unlicensed copies of an Apple operating
  12  * system, or to circumvent, violate, or enable the circumvention or
  13  * violation of, any terms of an Apple operating system software license
  14  * agreement.
  15  *
  16  * Please obtain a copy of the License at
  17  * http://www.opensource.apple.com/apsl/ and read it before using this
  18  * file.
  19  *
  20  * The Original Code and all software distributed under the License are
  21  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  22  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  23  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  24  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  25  * Please see the License for the specific language governing rights and
  26  * limitations under the License.
  27  *
  28  * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  29  */
  30 /*
  31  * Copyright (c) 1982, 1986, 1989, 1990, 1993
  32  *      The Regents of the University of California.  All rights reserved.
  33  *
  34  * sendfile(2) and related extensions:
  35  * Copyright (c) 1998, David Greenman. All rights reserved.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  * 1. Redistributions of source code must retain the above copyright
  41  *    notice, this list of conditions and the following disclaimer.
  42  * 2. Redistributions in binary form must reproduce the above copyright
  43  *    notice, this list of conditions and the following disclaimer in the
  44  *    documentation and/or other materials provided with the distribution.
  45  * 3. All advertising materials mentioning features or use of this software
  46  *    must display the following acknowledgement:
  47  *      This product includes software developed by the University of
  48  *      California, Berkeley and its contributors.
  49  * 4. Neither the name of the University nor the names of its contributors
  50  *    may be used to endorse or promote products derived from this software
  51  *    without specific prior written permission.
  52  *
  53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  63  * SUCH DAMAGE.
  64  *
  65  *      @(#)uipc_syscalls.c     8.4 (Berkeley) 2/21/94
  66  */
  67
  68
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/filedesc.h>
  73 #include <sys/proc_internal.h>
  74 #include <sys/file_internal.h>
  75 #include <sys/malloc.h>
  76 #include <sys/mbuf.h>
  77 #include <kern/lock.h>
  78 #include <sys/domain.h>
  79 #include <sys/protosw.h>
  80 #include <sys/signalvar.h>
  81 #include <sys/socket.h>
  82 #include <sys/socketvar.h>
  83 #if KTRACE
  84 #include <sys/ktrace.h>
  85 #endif
  86 #include <sys/kernel.h>
  87 #include <sys/uio_internal.h>
  88
  89 #include <bsm/audit_kernel.h>
  90
  91 #include <sys/kdebug.h>
  92 #include <sys/sysproto.h>
  93
   /*
    * Shorthand accessors: each f_xxx name expands to the matching fg_xxx
    * member reached through f_fglob, so that an expression such as
    * fp->f_flag reads fp->f_fglob->fg_flag.
    */
   94 #define f_flag f_fglob->fg_flag
   95 #define f_type f_fglob->fg_type
   96 #define f_msgcount f_fglob->fg_msgcount
   97 #define f_cred f_fglob->fg_cred
   98 #define f_ops f_fglob->fg_ops
   99 #define f_offset f_fglob->fg_offset
  100 #define f_data f_fglob->fg_data
  /*
   * Trace codes built with NETDBG_CODE(DBG_NETSOCK, ...).  The DBG_FNC_*
   * values are handed to KERNEL_DEBUG() together with DBG_FUNC_START or
   * DBG_FUNC_END by the send/receive routines in this file.
   * NOTE(review): the codes exist only when KDEBUG is set while the
   * KERNEL_DEBUG() call sites are unconditional -- presumably KERNEL_DEBUG
   * discards its arguments when KDEBUG is off; confirm in <sys/kdebug.h>.
   */
  101 #if KDEBUG
  102
  103 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
  104 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
  105 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
  106 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
  107 #define DBG_FNC_SENDMSG         NETDBG_CODE(DBG_NETSOCK, (1 << 8) | 1)
  108 #define DBG_FNC_SENDTO          NETDBG_CODE(DBG_NETSOCK, (2 << 8) | 1)
  109 #define DBG_FNC_SENDIT          NETDBG_CODE(DBG_NETSOCK, (3 << 8) | 1)
  110 #define DBG_FNC_RECVFROM        NETDBG_CODE(DBG_NETSOCK, (5 << 8))
  111 #define DBG_FNC_RECVMSG         NETDBG_CODE(DBG_NETSOCK, (6 << 8))
  112 #define DBG_FNC_RECVIT          NETDBG_CODE(DBG_NETSOCK, (7 << 8))
  113
  114 #endif
 115
 116
  /*
   * Switch for the Radar 4056224 workaround in sendto(), which falls back
   * to write() when the descriptor turns out to be a pipe.
   * last_pid_4056224 holds the highest pid for which the fallback has
   * already printed its console message; sendto() prints again only for
   * a larger pid.
   */
  117 #define HACK_FOR_4056224 1
  118 #if HACK_FOR_4056224
  119 static pid_t last_pid_4056224 = 0;
  120 #endif /* HACK_FOR_4056224 */
 121
 122
  /*
   * Declarations for the sf_buf helpers and their file-scope state,
   * compiled only when SENDFILE is set.  sf_buf_init is registered through
   * SYSINIT.
   * NOTE(review): the definitions are not in this part of the file;
   * presumably they back sendfile(2) -- confirm further down.
   */
  123 #if SENDFILE
  124 static void sf_buf_init(void *arg);
  125 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
  126 static struct sf_buf *sf_buf_alloc(void);
  127 static void sf_buf_ref(caddr_t addr, u_int size);
  128 static void sf_buf_free(caddr_t addr, u_int size);
  129
  130 static SLIST_HEAD(, sf_buf) sf_freelist;
  131 static vm_offset_t sf_base;
  132 static struct sf_buf *sf_bufs;
  133 static int sf_buf_alloc_want;
  134 #endif
 135
  /*
   * Forward declarations of the file-local workers.  sendit() and recvit()
   * take an already widened struct user_msghdr plus a prepared uio_t; the
   * *1() routines take a trailing "compat" flag that the public entry
   * points pass as 0 and the o*() compatibility entry points pass as 1.
   */
  136 static int sendit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop,
  137                                         int flags, register_t *retval);
  138 static int recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop,
  139                                         user_addr_t namelenp, register_t *retval);
  140
  141 static int accept1(struct proc *p, struct accept_args *uap, register_t *retval, int compat);
  142 static int getsockname1(struct proc *p, struct getsockname_args *uap,
  143                              register_t *retval, int compat);
  144 static int getpeername1(struct proc *p, struct getpeername_args *uap,
  145                              register_t *retval, int compat);
 146
 147
  /*
   * Argument structures and prototypes for the old-style (o*) socket
   * entry points, present only when COMPAT_43_SOCKET is set.
   */
  148 #if COMPAT_43_SOCKET
  149 struct orecvmsg_args  {
  150         int     s;
  151         struct  omsghdr *msg;
  152         int     flags;
  153 };
  154 struct osendmsg_args {
  155         int s;
  156         caddr_t msg;
  157         int flags;
  158 };
  159 struct osend_args {
  160         int s;
  161         caddr_t buf;
  162         int len;
  163         int flags;
  164 };
  165 struct  orecv_args {
  166         int     s;
  167         caddr_t buf;
  168         int     len;
  169         int     flags;
  170 };
  171
  172 int oaccept(struct proc *p, struct accept_args *uap, register_t *retval);
  173 int ogetpeername(struct proc *p, struct getpeername_args *uap, register_t *retval);
  174 int ogetsockname(struct proc *p, struct getsockname_args *uap, register_t *retval);
  175 int orecv(struct proc *p, struct orecv_args     *uap, register_t *retval);
  176 int orecvfrom(struct proc *p, struct recvfrom_args *uap, register_t *retval);
  177 int orecvmsg(struct proc *p, struct orecvmsg_args *uap, register_t *retval);
  178 int     osend(struct proc *p, struct osend_args *uap, register_t *retval);
  179 int osendmsg(struct proc *p, struct osendmsg_args *uap, register_t *retval);
  180 #endif // COMPAT_43_SOCKET
 181
 182 /*
 183  * System call interface to the socket abstraction.
 184  */
 185
  /* File operations vector stored in f_ops of every socket descriptor created in this file. */
  186 extern  struct fileops socketops;
 187
 188 int
 189 socket(p, uap, retval)
 190         struct proc *p;
 191         register struct socket_args *uap;
 192         register_t *retval;
 193 {
 194         struct socket *so;
 195         struct fileproc *fp;
 196         int fd, error;
 197
 198         AUDIT_ARG(socket, uap->domain, uap->type, uap->protocol);
 199
 200         error = falloc(p, &fp, &fd);
 201         if (error) {
 202                 return (error);
 203         }
 204         fp->f_flag = FREAD|FWRITE;
 205         fp->f_type = DTYPE_SOCKET;
 206         fp->f_ops = &socketops;
 207
 208         error = socreate(uap->domain, &so, uap->type, uap->protocol);
 209         if (error) {
 210                 fp_free(p, fd, fp);
 211         } else {
 212                 fp->f_data = (caddr_t)so;
 213
 214                 proc_fdlock(p);
 215                 *fdflags(p, fd) &= ~UF_RESERVED;
 216
 217                 fp_drop(p, fd, fp, 1);
 218                 proc_fdunlock(p);
 219
 220                 *retval = fd;
 221         }
 222         return (error);
 223 }
 224
 225 /* ARGSUSED */
 226 int
 227 bind(struct proc *p, struct bind_args *uap, __unused register_t *retval)
 228 {
 229         struct sockaddr *sa;
 230         struct socket *so;
 231         int error;
 232
 233         AUDIT_ARG(fd, uap->s);
 234         error = file_socket(uap->s, &so);
 235         if (error)
 236                 return (error);
 237         error = getsockaddr(&sa, uap->name, uap->namelen);
 238         if (error)
 239                 goto out;
 240         AUDIT_ARG(sockaddr, p, sa);
 241         if (so != NULL)
 242                 error = sobind(so, sa);
 243         else
 244                 error = EBADF;
 245         FREE(sa, M_SONAME);
 246 out:
 247         file_drop(uap->s);
 248         return (error);
 249 }
 250
 251
 252 int
 253 listen(__unused struct proc *p, register struct listen_args *uap,
 254                 __unused register_t *retval)
 255 {
 256         int error;
 257         struct socket * so;
 258
 259         AUDIT_ARG(fd, uap->s);
 260         error = file_socket(uap->s, &so);
 261         if (error)
 262                 return (error);
 263         if (so != NULL)
 264                 error =  solisten(so, uap->backlog);
 265         else
 266                 error = EBADF;
 267         file_drop(uap->s);
 268         return (error);
 269 }
 270
  /*
   * Without COMPAT_43_SOCKET the worker accept1() is renamed to accept().
   * NOTE(review): accept() is also defined unconditionally further down as
   * a wrapper around accept1(); with COMPAT_43_SOCKET off that looks like
   * two definitions of accept() -- confirm this configuration still builds.
   */
  271 #if !COMPAT_43_SOCKET
  272 #define accept1 accept
  273 #endif
 274
 275
 276
 277 int
 278 accept1(struct proc *p, struct accept_args *uap, register_t *retval, int compat)
 279 {
 280         struct fileproc *fp;
 281         struct sockaddr *sa;
 282         socklen_t namelen;
 283         int error;
 284         struct socket *head, *so = NULL;
 285         lck_mtx_t *mutex_held;
 286         int fd = uap->s;
 287         int newfd;;
 288         short fflag;            /* type must match fp->f_flag */
 289         int dosocklock = 0;
 290
 291         AUDIT_ARG(fd, uap->s);
 292         if (uap->name) {
 293                 error = copyin(uap->anamelen, (caddr_t)&namelen,
 294                         sizeof(socklen_t));
 295                 if(error)
 296                         return (error);
 297         }
 298         error = fp_getfsock(p, fd, &fp, &head);
 299         if (error) {
 300                 if (error == EOPNOTSUPP)
 301                         error = ENOTSOCK;
 302                 return (error);
 303         }
 304         if (head == NULL) {
 305                 error = EBADF;
 306                 goto out;
 307         }
 308
 309         socket_lock(head, 1);
 310
 311         if (head->so_proto->pr_getlock != NULL)  {
 312                 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
 313                 dosocklock = 1;
 314         }
 315         else {
 316                 mutex_held = head->so_proto->pr_domain->dom_mtx;
 317                 dosocklock = 0;
 318         }
 319
 320
 321         if ((head->so_options & SO_ACCEPTCONN) == 0) {
 322                 socket_unlock(head, 1);
 323                 error = EINVAL;
 324                 goto out;
 325         }
 326         if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) {
 327                 socket_unlock(head, 1);
 328                 error = EWOULDBLOCK;
 329                 goto out;
 330         }
 331         while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
 332                 if (head->so_state & SS_CANTRCVMORE) {
 333                         head->so_error = ECONNABORTED;
 334                         break;
 335                 }
 336                 if (head->so_usecount < 1)
 337                         panic("accept1: head=%x refcount=%d\n", head, head->so_usecount);
 338                 error = msleep((caddr_t)&head->so_timeo, mutex_held, PSOCK | PCATCH,
 339                     "accept", 0);
 340                 if (head->so_usecount < 1)
 341                         panic("accept1: 2 head=%x refcount=%d\n", head, head->so_usecount);
 342                 if ((head->so_state & SS_DRAINING)) {
 343                         error = ECONNABORTED;
 344                 }
 345                 if (error) {
 346                         socket_unlock(head, 1);
 347                         goto out;
 348                 }
 349         }
 350         if (head->so_error) {
 351                 error = head->so_error;
 352                 head->so_error = 0;
 353                 socket_unlock(head, 1);
 354                 goto out;
 355         }
 356
 357
 358         /*
 359          * At this point we know that there is at least one connection
 360          * ready to be accepted. Remove it from the queue prior to
 361          * allocating the file descriptor for it since falloc() may
 362          * block allowing another process to accept the connection
 363          * instead.
 364          */
 365         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
 366         so = TAILQ_FIRST(&head->so_comp);
 367         TAILQ_REMOVE(&head->so_comp, so, so_list);
 368         head->so_qlen--;
 369         socket_unlock(head, 0); /* unlock head to avoid deadlock with select, keep a ref on head */
 370         fflag = fp->f_flag;
 371         proc_fdlock(p);
 372         error = falloc_locked(p, &fp, &newfd, 1);
 373         if (error) {
 374                 /*
 375                  * Probably ran out of file descriptors. Put the
 376                  * unaccepted connection back onto the queue and
 377                  * do another wakeup so some other process might
 378                  * have a chance at it.
 379                  */
 380                 proc_fdunlock(p);
 381                 socket_lock(head, 0);
 382                 TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
 383                 head->so_qlen++;
 384                 wakeup_one((caddr_t)&head->so_timeo);
 385                 socket_unlock(head, 1);
 386                 goto out;
 387         }
 388         *fdflags(p, newfd) &= ~UF_RESERVED;
 389         *retval = newfd;
 390         fp->f_type = DTYPE_SOCKET;
 391         fp->f_flag = fflag;
 392         fp->f_ops = &socketops;
 393         fp->f_data = (caddr_t)so;
 394         fp_drop(p, newfd, fp, 1);
 395         proc_fdunlock(p);
 396         socket_lock(head, 0);
 397         if (dosocklock)
 398                 socket_lock(so, 1);
 399         so->so_state &= ~SS_COMP;
 400         so->so_head = NULL;
 401         sa = 0;
 402         (void) soacceptlock(so, &sa, 0);
 403         socket_unlock(head, 1);
 404         if (sa == 0) {
 405                 namelen = 0;
 406                 if (uap->name)
 407                         goto gotnoname;
 408                 if (dosocklock)
 409                         socket_unlock(so, 1);
 410                 error = 0;
 411                 goto out;
 412         }
 413         AUDIT_ARG(sockaddr, p, sa);
 414         if (uap->name) {
 415                 /* check sa_len before it is destroyed */
 416                 if (namelen > sa->sa_len)
 417                         namelen = sa->sa_len;
 418 #if COMPAT_43_SOCKET
 419                 if (compat)
 420                         ((struct osockaddr *)sa)->sa_family =
 421                             sa->sa_family;
 422 #endif
 423                 error = copyout(sa, uap->name, namelen);
 424                 if (!error)
 425 gotnoname:
 426                         error = copyout((caddr_t)&namelen, uap->anamelen,
 427                                                 sizeof(socklen_t));
 428         }
 429         FREE(sa, M_SONAME);
 430         if (dosocklock)
 431                 socket_unlock(so, 1);
 432 out:
 433         file_drop(fd);
 434         return (error);
 435 }
 436
  /* accept() system call: accept1() with the compat flag cleared. */
  437 int
  438 accept(struct proc *p, struct accept_args *uap, register_t *retval)
  439 {
  440
  441         return (accept1(p, uap, retval, 0));
  442 }
 443
  /* Old-style accept entry point: accept1() with the compat flag set. */
  444 #if COMPAT_43_SOCKET
  445 int
  446 oaccept(struct proc *p, struct accept_args *uap, register_t *retval)
  447 {
  448
  449         return (accept1(p, uap, retval, 1));
  450 }
  451 #endif /* COMPAT_43_SOCKET */
 452
 453 /* ARGSUSED */
 454 int
 455 connect(struct proc *p, struct connect_args *uap, __unused register_t *retval)
 456 {
 457         struct socket *so;
 458         struct sockaddr *sa;
 459         lck_mtx_t *mutex_held;
 460         int error;
 461         int fd = uap->s;
 462
 463         AUDIT_ARG(fd, uap->s);
 464         error = file_socket( fd, &so);
 465         if (error)
 466                 return (error);
 467         if (so == NULL) {
 468                 error = EBADF;
 469                 goto out;
 470         }
 471
 472         socket_lock(so, 1);
 473
 474         if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
 475                 socket_unlock(so, 1);
 476                 error = EALREADY;
 477                 goto out;
 478         }
 479         error = getsockaddr(&sa, uap->name, uap->namelen);
 480         if (error)  {
 481                 socket_unlock(so, 1);
 482                 goto out;
 483         }
 484         AUDIT_ARG(sockaddr, p, sa);
 485         error = soconnectlock(so, sa, 0);
 486         if (error)
 487                 goto bad;
 488         if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
 489                 FREE(sa, M_SONAME);
 490                 socket_unlock(so, 1);
 491                 error = EINPROGRESS;
 492                 goto out;
 493         }
 494         while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 495                 if (so->so_proto->pr_getlock != NULL)
 496                         mutex_held = (*so->so_proto->pr_getlock)(so, 0);
 497                 else
 498                         mutex_held = so->so_proto->pr_domain->dom_mtx;
 499                 error = msleep((caddr_t)&so->so_timeo, mutex_held, PSOCK | PCATCH,
 500                     "connec", 0);
 501                 if ((so->so_state & SS_DRAINING)) {
 502                         error = ECONNABORTED;
 503                 }
 504                 if (error)
 505                         break;
 506         }
 507         if (error == 0) {
 508                 error = so->so_error;
 509                 so->so_error = 0;
 510         }
 511 bad:
 512         so->so_state &= ~SS_ISCONNECTING;
 513         socket_unlock(so, 1);
 514         FREE(sa, M_SONAME);
 515         if (error == ERESTART)
 516                 error = EINTR;
 517 out:
 518         file_drop(fd);
 519         return (error);
 520 }
 521
 522 int
 523 socketpair(struct proc *p, struct socketpair_args *uap, __unused register_t *retval)
 524 {
 525         struct fileproc *fp1, *fp2;
 526         struct socket *so1, *so2;
 527         int fd, error, sv[2];
 528
 529         AUDIT_ARG(socket, uap->domain, uap->type, uap->protocol);
 530         error = socreate(uap->domain, &so1, uap->type, uap->protocol);
 531         if (error)
 532                 return (error);
 533         error = socreate(uap->domain, &so2, uap->type, uap->protocol);
 534         if (error)
 535                 goto free1;
 536
 537         error = falloc(p, &fp1, &fd);
 538         if (error) {
 539                 goto free2;
 540         }
 541         fp1->f_flag = FREAD|FWRITE;
 542         fp1->f_type = DTYPE_SOCKET;
 543         fp1->f_ops = &socketops;
 544         fp1->f_data = (caddr_t)so1;
 545         sv[0] = fd;
 546
 547         error = falloc(p, &fp2, &fd);
 548         if (error) {
 549                 goto free3;
 550         }
 551         fp2->f_flag = FREAD|FWRITE;
 552         fp2->f_type = DTYPE_SOCKET;
 553         fp2->f_ops = &socketops;
 554         fp2->f_data = (caddr_t)so2;
 555         sv[1] = fd;
 556
 557         error = soconnect2(so1, so2);
 558         if (error) {
 559                 goto free4;
 560         }
 561         if (uap->type == SOCK_DGRAM) {
 562                 /*
 563                  * Datagram socket connection is asymmetric.
 564                  */
 565                  error = soconnect2(so2, so1);
 566                  if (error) {
 567                          goto free4;
 568                  }
 569         }
 570
 571         proc_fdlock(p);
 572         *fdflags(p, sv[0]) &= ~UF_RESERVED;
 573         *fdflags(p, sv[1]) &= ~UF_RESERVED;
 574         fp_drop(p, sv[0], fp1, 1);
 575         fp_drop(p, sv[1], fp2, 1);
 576         proc_fdunlock(p);
 577
 578         error = copyout((caddr_t)sv, uap->rsv, 2 * sizeof(int));
 579 #if 0   /* old pipe(2) syscall compatability, unused these days */
 580         retval[0] = sv[0];              /* XXX ??? */
 581         retval[1] = sv[1];              /* XXX ??? */
 582 #endif /* 0 */
 583         return (error);
 584 free4:
 585         fp_free(p, sv[1], fp2);
 586 free3:
 587         fp_free(p, sv[0], fp1);
 588 free2:
 589         (void)soclose(so2);
 590 free1:
 591         (void)soclose(so1);
 592         return (error);
 593 }
 594
 595 static int
 596 sendit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop,
 597                 int flags, register_t *retval)
 598 {
 599         struct mbuf *control;
 600         struct sockaddr *to;
 601         int error;
 602         struct socket *so;
 603         user_ssize_t len;
 604 #if KTRACE
 605         uio_t ktruio = NULL;
 606 #endif
 607
 608         KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_START, 0,0,0,0,0);
 609
 610         error = file_socket(s, &so);
 611         if (error )
 612         {
 613             KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, error,0,0,0,0);
 614             return (error);
 615         }
 616
 617         if (mp->msg_name) {
 618                 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
 619                 if (error) {
 620                     KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, error,0,0,0,0);
 621                         goto out;
 622                 }
 623                 AUDIT_ARG(sockaddr, p, to);
 624         } else {
 625                 to = 0;
 626         }
 627         if (mp->msg_control) {
 628                 if (mp->msg_controllen < ((socklen_t)sizeof(struct cmsghdr))
 629 #if COMPAT_43_SOCKET
 630                     && !(mp->msg_flags & MSG_COMPAT)
 631 #endif
 632                 ) {
 633                         error = EINVAL;
 634                         goto bad;
 635                 }
 636                 error = sockargs(&control, mp->msg_control,
 637                     mp->msg_controllen, MT_CONTROL);
 638                 if (error)
 639                         goto bad;
 640 #if COMPAT_43_SOCKET
 641                 if (mp->msg_flags & MSG_COMPAT) {
 642                         register struct cmsghdr *cm;
 643
 644                         M_PREPEND(control, sizeof(*cm), M_WAIT);
 645                         if (control == 0) {
 646                                 error = ENOBUFS;
 647                                 goto bad;
 648                         } else {
 649                                 cm = mtod(control, struct cmsghdr *);
 650                                 cm->cmsg_len = control->m_len;
 651                                 cm->cmsg_level = SOL_SOCKET;
 652                                 cm->cmsg_type = SCM_RIGHTS;
 653                         }
 654                 }
 655 #endif
 656         } else {
 657                 control = 0;
 658         }
 659
 660 #if KTRACE
 661         if (KTRPOINT(p, KTR_GENIO)) {
 662                 ktruio = uio_duplicate(uiop);
 663         }
 664 #endif
 665
 666         len = uio_resid(uiop);
 667         if (so == NULL)
 668                 error = EBADF;
 669         else
 670                 error = so->so_proto->pr_usrreqs->pru_sosend(so, to, uiop, 0, control,
 671                                                              flags);
 672         if (error) {
 673                 if (uio_resid(uiop) != len && (error == ERESTART ||
 674                     error == EINTR || error == EWOULDBLOCK))
 675                         error = 0;
 676                 /* Generation of SIGPIPE can be controlled per socket */
 677                 if (error == EPIPE && !(so->so_flags & SOF_NOSIGPIPE))
 678                         psignal(p, SIGPIPE);
 679         }
 680         if (error == 0)
 681                 *retval = (int)(len - uio_resid(uiop));
 682 bad:
 683 #if KTRACE
 684         if (ktruio != NULL) {
 685                 if (error == 0) {
 686                         uio_setresid(ktruio, retval[0]);
 687                         ktrgenio(p->p_tracep, s, UIO_WRITE, ktruio, error);
 688                 }
 689                 uio_free(ktruio);
 690         }
 691 #endif
 692         if (to)
 693                 FREE(to, M_SONAME);
 694         KERNEL_DEBUG(DBG_FNC_SENDIT | DBG_FUNC_END, error,0,0,0,0);
 695 out:
 696         file_drop(s);
 697         return (error);
 698 }
 699
 700
 701 int
 702 sendto(struct proc *p, struct sendto_args *uap, register_t *retval)
 703 {
 704         struct user_msghdr msg;
 705         int error;
 706         uio_t auio = NULL;
 707
 708         KERNEL_DEBUG(DBG_FNC_SENDTO | DBG_FUNC_START, 0,0,0,0,0);
 709         AUDIT_ARG(fd, uap->s);
 710
 711         auio = uio_create(1, 0,
 712                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 713                                   UIO_WRITE);
 714         if (auio == NULL) {
 715                 return (ENOMEM);
 716         }
 717         uio_addiov(auio, uap->buf, uap->len);
 718
 719         msg.msg_name = uap->to;
 720         msg.msg_namelen = uap->tolen;
 721         /* no need to set up msg_iov.  sendit uses uio_t we send it */
 722         msg.msg_iov = 0;
 723         msg.msg_iovlen = 0;
 724         msg.msg_control = 0;
 725         msg.msg_flags = 0;
 726
 727         error = sendit(p, uap->s, &msg, auio, uap->flags, retval);
 728
 729         if (auio != NULL) {
 730                 uio_free(auio);
 731         }
 732
 733 #if HACK_FOR_4056224
 734         /*
 735          * Radar 4056224
 736          * Temporary workaround to let send() and recv() work over a pipe for binary compatibility
 737          * This will be removed in the release following Tiger
 738          */
 739         if (error == ENOTSOCK) {
 740                 struct fileproc *fp;
 741
 742         if (fp_lookup(p, uap->s, &fp, 0) == 0) {
 743                         (void) fp_drop(p, uap->s, fp,0);
 744
 745                         if (fp->f_type == DTYPE_PIPE) {
 746                                 struct write_args write_uap;
 747                                 user_ssize_t write_retval;
 748
 749                                 if (p->p_pid > last_pid_4056224) {
 750                                         last_pid_4056224 = p->p_pid;
 751
 752                                         printf("%s[%d] uses send/recv on a pipe\n",
 753                                                 p->p_comm, p->p_pid);
 754                                 }
 755
 756                                 bzero(&write_uap, sizeof(struct write_args));
 757                                 write_uap.fd = uap->s;
 758                                 write_uap.cbuf = uap->buf;
 759                                 write_uap.nbyte = uap->len;
 760
 761                                 error = write(p, &write_uap, &write_retval);
 762                                 *retval = (int)write_retval;
 763                         }
 764                 }
 765         }
 766 #endif /* HACK_FOR_4056224 */
 767
 768         KERNEL_DEBUG(DBG_FNC_SENDTO | DBG_FUNC_END, error, *retval,0,0,0);
 769
 770         return(error);
 771 }
 772
  /*
   * Stubs for the old-style send entry points.  Both ignore their
   * arguments and return ENOTSUP.
   */
  773 #if COMPAT_43_SOCKET
  774 int
  775 osend(__unused struct proc *p,
  776           __unused struct osend_args *uap,
  777           __unused register_t *retval)
  778 {
  779         /* these are no longer supported and in fact
  780          * there is no way to call it directly.
  781          * LP64todo - remove this once we're sure there are no clients
  782          */
  783         return (ENOTSUP);
  784 }
  785
  786 int
  787 osendmsg(__unused struct proc *p,
  788                  __unused struct osendmsg_args *uap,
  789                  __unused register_t *retval)
  790 {
  791         /* these are no longer supported and in fact
  792          * there is no way to call it directly.
  793          * LP64todo - remove this once we're sure there are no clients
  794          */
  795         return (ENOTSUP);
  796 }
  797 #endif
 798
 799
 800 int
 801 sendmsg(struct proc *p, register struct sendmsg_args *uap, register_t *retval)
 802 {
 803         struct msghdr msg;
 804         struct user_msghdr user_msg;
 805         caddr_t msghdrp;
 806         int     size_of_msghdr;
 807         int error;
 808         int size_of_iovec;
 809         uio_t auio = NULL;
 810         struct user_iovec *iovp;
 811
 812         KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_START, 0,0,0,0,0);
 813         AUDIT_ARG(fd, uap->s);
 814         if (IS_64BIT_PROCESS(p)) {
 815                 msghdrp = (caddr_t) &user_msg;
 816                 size_of_msghdr = sizeof(user_msg);
 817                 size_of_iovec = sizeof(struct user_iovec);
 818         }
 819         else {
 820                 msghdrp = (caddr_t) &msg;
 821                 size_of_msghdr = sizeof(msg);
 822                 size_of_iovec = sizeof(struct iovec);
 823         }
 824         error = copyin(uap->msg, msghdrp, size_of_msghdr);
 825         if (error)
 826         {
 827             KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, error,0,0,0,0);
 828             return (error);
 829         }
 830
 831         /* only need to copy if user process is not 64-bit */
 832         if (!IS_64BIT_PROCESS(p)) {
 833                 user_msg.msg_flags = msg.msg_flags;
 834                 user_msg.msg_controllen = msg.msg_controllen;
 835                 user_msg.msg_control = CAST_USER_ADDR_T(msg.msg_control);
 836                 user_msg.msg_iovlen = msg.msg_iovlen;
 837                 user_msg.msg_iov = CAST_USER_ADDR_T(msg.msg_iov);
 838                 user_msg.msg_namelen = msg.msg_namelen;
 839                 user_msg.msg_name = CAST_USER_ADDR_T(msg.msg_name);
 840         }
 841
 842         if (user_msg.msg_iovlen <= 0 || user_msg.msg_iovlen > UIO_MAXIOV) {
 843                 KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, EMSGSIZE,0,0,0,0);
 844                 return (EMSGSIZE);
 845         }
 846
 847         /* allocate a uio large enough to hold the number of iovecs passed */
 848         auio = uio_create(user_msg.msg_iovlen, 0,
 849                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 850                                   UIO_WRITE);
 851         if (auio == NULL) {
 852                 error = ENOBUFS;
 853                 goto done;
 854         }
 855
 856         if (user_msg.msg_iovlen) {
 857                 /* get location of iovecs within the uio.  then copyin the iovecs from
 858                  * user space.
 859                  */
 860                 iovp = uio_iovsaddr(auio);
 861                 if (iovp == NULL) {
 862                         error = ENOBUFS;
 863                         goto done;
 864                 }
 865                 error = copyin(user_msg.msg_iov, (caddr_t)iovp, (user_msg.msg_iovlen * size_of_iovec));
 866                 if (error)
 867                         goto done;
 868                 user_msg.msg_iov = CAST_USER_ADDR_T(iovp);
 869
 870                 /* finish setup of uio_t */
 871                 uio_calculateresid(auio);
 872         }
 873         else {
 874                 user_msg.msg_iov = 0;
 875         }
 876
 877 #if COMPAT_43_SOCKET
 878         user_msg.msg_flags = 0;
 879 #endif
 880         error = sendit(p, uap->s, &user_msg, auio, uap->flags, retval);
 881 done:
 882         if (auio != NULL) {
 883                 uio_free(auio);
 884         }
 885         KERNEL_DEBUG(DBG_FNC_SENDMSG | DBG_FUNC_END, error,0,0,0,0);
 886
 887         return (error);
 888 }
 889
 890 static int
 891 recvit(p, s, mp, uiop, namelenp, retval)
 892         register struct proc *p;
 893         int s;
 894         register struct user_msghdr *mp;
 895         uio_t uiop;
 896         user_addr_t namelenp;
 897         register_t *retval;
 898 {
 899         int len, error;
 900         struct mbuf *m, *control = 0;
 901         user_addr_t ctlbuf;
 902         struct socket *so;
 903         struct sockaddr *fromsa = 0;
 904         struct fileproc *fp;
 905 #if KTRACE
 906         uio_t ktruio = NULL;
 907 #endif
 908
 909         KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_START, 0,0,0,0,0);
 910         proc_fdlock(p);
 911         if ( (error = fp_lookup(p, s, &fp, 1)) ) {
 912             KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, error,0,0,0,0);
 913                 proc_fdunlock(p);
 914             return (error);
 915         }
 916         if (fp->f_type != DTYPE_SOCKET) {
 917                 fp_drop(p, s, fp,1);
 918                 proc_fdunlock(p);
 919                 return(ENOTSOCK);
 920         }
 921
 922         so = (struct socket *)fp->f_data;
 923
 924         proc_fdunlock(p);
 925         if (uio_resid(uiop) < 0) {
 926                 KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, EINVAL,0,0,0,0);
 927                 error = EINVAL;
 928                 goto out1;
 929         }
 930 #if KTRACE
 931         if (KTRPOINT(p, KTR_GENIO)) {
 932                 ktruio = uio_duplicate(uiop);
 933         }
 934 #endif
 935
 936         len = uio_resid(uiop);
 937         if (so == NULL)
 938                 error = EBADF;
 939         else {
 940                 error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, uiop,
 941                         (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
 942                         &mp->msg_flags);
 943         }
 944         AUDIT_ARG(sockaddr, p, fromsa);
 945         if (error) {
 946                 if (uio_resid(uiop) != len && (error == ERESTART ||
 947                     error == EINTR || error == EWOULDBLOCK))
 948                         error = 0;
 949         }
 950 #if KTRACE
 951         if (ktruio != NULL) {
 952                 if (error == 0) {
 953                         uio_setresid(ktruio, len - uio_resid(uiop));
 954                         ktrgenio(p->p_tracep, s, UIO_WRITE, ktruio, error);
 955                 }
 956                 uio_free(ktruio);
 957         }
 958 #endif
 959         if (error)
 960                 goto out;
 961         *retval = len - uio_resid(uiop);
 962         if (mp->msg_name) {
 963                 len = mp->msg_namelen;
 964                 if (len <= 0 || fromsa == 0)
 965                         len = 0;
 966                 else {
 967 #ifndef MIN
 968 #define MIN(a,b) ((a)>(b)?(b):(a))
 969 #endif
 970                         /* save sa_len before it is destroyed by MSG_COMPAT */
 971                         len = MIN(len, fromsa->sa_len);
 972 #if COMPAT_43_SOCKET
 973                         if (mp->msg_flags & MSG_COMPAT)
 974                                 ((struct osockaddr *)fromsa)->sa_family =
 975                                     fromsa->sa_family;
 976 #endif
 977                         error = copyout(fromsa, mp->msg_name, (unsigned)len);
 978                         if (error)
 979                                 goto out;
 980                 }
 981                 mp->msg_namelen = len;
 982                 if (namelenp &&
 983                     (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
 984 #if COMPAT_43_SOCKET
 985                         if (mp->msg_flags & MSG_COMPAT)
 986                                 error = 0;      /* old recvfrom didn't check */
 987                         else
 988 #endif
 989                         goto out;
 990                 }
 991         }
 992         if (mp->msg_control) {
 993 #if COMPAT_43_SOCKET
 994                 /*
 995                  * We assume that old recvmsg calls won't receive access
 996                  * rights and other control info, esp. as control info
 997                  * is always optional and those options didn't exist in 4.3.
 998                  * If we receive rights, trim the cmsghdr; anything else
 999                  * is tossed.
1000                  */
1001                 if (control && mp->msg_flags & MSG_COMPAT) {
1002                         if (mtod(control, struct cmsghdr *)->cmsg_level !=
1003                             SOL_SOCKET ||
1004                             mtod(control, struct cmsghdr *)->cmsg_type !=
1005                             SCM_RIGHTS) {
1006                                 mp->msg_controllen = 0;
1007                                 goto out;
1008                         }
1009                         control->m_len -= sizeof (struct cmsghdr);
1010                         control->m_data += sizeof (struct cmsghdr);
1011                 }
1012 #endif
1013                 len = mp->msg_controllen;
1014                 m = control;
1015                 mp->msg_controllen = 0;
1016                 ctlbuf = mp->msg_control;
1017
1018                 while (m && len > 0) {
1019                         unsigned int tocopy;
1020
1021                         if (len >= m->m_len)
1022                                 tocopy = m->m_len;
1023                         else {
1024                                 mp->msg_flags |= MSG_CTRUNC;
1025                                 tocopy = len;
1026                         }
1027
1028                         error = copyout((caddr_t)mtod(m, caddr_t), ctlbuf, tocopy);
1029                         if (error)
1030                                 goto out;
1031
1032                         ctlbuf += tocopy;
1033                         len -= tocopy;
1034                         m = m->m_next;
1035                 }
1036                 mp->msg_controllen = ctlbuf - mp->msg_control;
1037         }
1038 out:
1039         if (fromsa)
1040                 FREE(fromsa, M_SONAME);
1041         if (control)
1042                 m_freem(control);
1043         KERNEL_DEBUG(DBG_FNC_RECVIT | DBG_FUNC_END, error,0,0,0,0);
1044 out1:
1045         fp_drop(p, s, fp, 0);
1046         return (error);
1047 }
1048
1049
1050 int
1051 recvfrom(p, uap, retval)
1052         struct proc *p;
1053         register struct recvfrom_args /* {
1054                 int     s;
1055                 caddr_t buf;
1056                 size_t  len;
1057                 int     flags;
1058                 caddr_t from;
1059                 int     *fromlenaddr;
1060         } */ *uap;
1061         register_t *retval;
1062 {
1063         struct user_msghdr msg;
1064         int error;
1065         uio_t auio = NULL;
1066
1067         KERNEL_DEBUG(DBG_FNC_RECVFROM | DBG_FUNC_START, 0,0,0,0,0);
1068         AUDIT_ARG(fd, uap->s);
1069
1070         if (uap->fromlenaddr) {
1071                 error = copyin(uap->fromlenaddr,
1072                     (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
1073                 if (error)
1074                         return (error);
1075         } else
1076                 msg.msg_namelen = 0;
1077         msg.msg_name = uap->from;
1078         auio = uio_create(1, 0,
1079                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
1080                                   UIO_READ);
1081         if (auio == NULL) {
1082                 return (ENOMEM);
1083         }
1084
1085         uio_addiov(auio, uap->buf, uap->len);
1086         /* no need to set up msg_iov.  recvit uses uio_t we send it */
1087         msg.msg_iov = 0;
1088         msg.msg_iovlen = 0;
1089         msg.msg_control = 0;
1090         msg.msg_controllen = 0;
1091         msg.msg_flags = uap->flags;
1092         error = recvit(p, uap->s, &msg, auio, uap->fromlenaddr, retval);
1093         if (auio != NULL) {
1094                 uio_free(auio);
1095         }
1096
1097 #if HACK_FOR_4056224
1098         /*
1099          * Radar 4056224
1100          * Temporary workaround to let send() and recv() work over a pipe for binary compatibility
1101          * This will be removed in the release following Tiger
1102          */
1103         if (error == ENOTSOCK && proc_is64bit(p) == 0) {
1104                 struct fileproc *fp;
1105
1106         if (fp_lookup(p, uap->s, &fp, 0) == 0) {
1107                         (void) fp_drop(p, uap->s, fp,0);
1108
1109                         if (fp->f_type == DTYPE_PIPE) {
1110                                 struct read_args read_uap;
1111                                 user_ssize_t read_retval;
1112
1113                                 if (p->p_pid > last_pid_4056224) {
1114                                         last_pid_4056224 = p->p_pid;
1115
1116                                         printf("%s[%d] uses send/recv on a pipe\n",
1117                                                 p->p_comm, p->p_pid);
1118                                 }
1119
1120                                 bzero(&read_uap, sizeof(struct read_args));
1121                                 read_uap.fd = uap->s;
1122                                 read_uap.cbuf = uap->buf;
1123                                 read_uap.nbyte = uap->len;
1124
1125                                 error = read(p, &read_uap, &read_retval);
1126                                 *retval = (int)read_retval;
1127                         }
1128                 }
1129         }
1130 #endif /* HACK_FOR_4056224 */
1131
1132         KERNEL_DEBUG(DBG_FNC_RECVFROM | DBG_FUNC_END, error,0,0,0,0);
1133
1134         return (error);
1135 }
1136
1137 #if COMPAT_43_SOCKET
1138 int
1139 orecvfrom(struct proc *p, struct recvfrom_args *uap, register_t *retval)
1140 {
1141
1142         uap->flags |= MSG_COMPAT;
1143         return (recvfrom(p, uap, retval));
1144 }
1145 #endif
1146
1147
1148 #if COMPAT_43_SOCKET
1149 int
1150 orecv(__unused struct proc *p, __unused struct orecv_args       *uap,
1151                 __unused register_t *retval)
1152 {
1153         /* these are no longer supported and in fact
1154          * there is no way to call it directly.
1155          * LP64todo - remove this once we're sure there are no clients
1156          */
1157
1158         return (ENOTSUP);
1159 }
1160
1161 /*
1162  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1163  * overlays the new one, missing only the flags, and with the (old) access
1164  * rights where the control fields are now.
1165  */
1166 int
1167 orecvmsg(__unused struct proc *p, __unused struct orecvmsg_args *uap,
1168                 __unused register_t *retval)
1169 {
1170         /* these are no longer supported and in fact
1171          * there is no way to call it directly.
1172          * LP64todo - remove this once we're sure there are no clients
1173          */
1174
1175         return (ENOTSUP);
1176
1177 }
1178 #endif
1179
1180 int
1181 recvmsg(p, uap, retval)
1182         struct proc *p;
1183         struct recvmsg_args *uap;
1184         register_t *retval;
1185 {
1186         struct msghdr msg;
1187         struct user_msghdr user_msg;
1188         caddr_t msghdrp;
1189         int     size_of_msghdr;
1190         user_addr_t uiov;
1191         register int error;
1192         int size_of_iovec;
1193         uio_t auio = NULL;
1194         struct user_iovec *iovp;
1195
1196         KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_START, 0,0,0,0,0);
1197         AUDIT_ARG(fd, uap->s);
1198         if (IS_64BIT_PROCESS(p)) {
1199                 msghdrp = (caddr_t) &user_msg;
1200                 size_of_msghdr = sizeof(user_msg);
1201                 size_of_iovec = sizeof(struct user_iovec);
1202         }
1203         else {
1204                 msghdrp = (caddr_t) &msg;
1205                 size_of_msghdr = sizeof(msg);
1206                 size_of_iovec = sizeof(struct iovec);
1207         }
1208         error = copyin(uap->msg, msghdrp, size_of_msghdr);
1209         if (error)
1210         {
1211                 KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, error,0,0,0,0);
1212                 return (error);
1213         }
1214
1215         /* only need to copy if user process is not 64-bit */
1216         if (!IS_64BIT_PROCESS(p)) {
1217                 user_msg.msg_flags = msg.msg_flags;
1218                 user_msg.msg_controllen = msg.msg_controllen;
1219                 user_msg.msg_control = CAST_USER_ADDR_T(msg.msg_control);
1220                 user_msg.msg_iovlen = msg.msg_iovlen;
1221                 user_msg.msg_iov = CAST_USER_ADDR_T(msg.msg_iov);
1222                 user_msg.msg_namelen = msg.msg_namelen;
1223                 user_msg.msg_name = CAST_USER_ADDR_T(msg.msg_name);
1224         }
1225
1226         if (user_msg.msg_iovlen <= 0 || user_msg.msg_iovlen > UIO_MAXIOV) {
1227                 KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, EMSGSIZE,0,0,0,0);
1228                 return (EMSGSIZE);
1229         }
1230
1231 #if COMPAT_43_SOCKET
1232         user_msg.msg_flags = uap->flags &~ MSG_COMPAT;
1233 #else
1234         user_msg.msg_flags = uap->flags;
1235 #endif
1236
1237         /* allocate a uio large enough to hold the number of iovecs passed */
1238         auio = uio_create(user_msg.msg_iovlen, 0,
1239                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
1240                                   UIO_READ);
1241         if (auio == NULL) {
1242                 error = ENOMEM;
1243                 goto done;
1244         }
1245
1246         /* get location of iovecs within the uio.  then copyin the iovecs from
1247          * user space.
1248          */
1249         iovp = uio_iovsaddr(auio);
1250         if (iovp == NULL) {
1251                 error = ENOMEM;
1252                 goto done;
1253         }
1254         uiov = user_msg.msg_iov;
1255         user_msg.msg_iov = CAST_USER_ADDR_T(iovp);
1256         error = copyin(uiov, (caddr_t)iovp, (user_msg.msg_iovlen * size_of_iovec));
1257         if (error)
1258                 goto done;
1259
1260         /* finish setup of uio_t */
1261         uio_calculateresid(auio);
1262
1263         error = recvit(p, uap->s, &user_msg, auio, 0, retval);
1264         if (!error) {
1265                 user_msg.msg_iov = uiov;
1266                 /* only need to copy if user process is not 64-bit */
1267                 if (!IS_64BIT_PROCESS(p)) {
1268                         // LP64todo - do all these change?  if not, then no need to copy all of them!
1269                         msg.msg_flags = user_msg.msg_flags;
1270                         msg.msg_controllen = user_msg.msg_controllen;
1271                         msg.msg_control = CAST_DOWN(caddr_t, user_msg.msg_control);
1272                         msg.msg_iovlen = user_msg.msg_iovlen;
1273                         msg.msg_iov = (struct iovec *) CAST_DOWN(caddr_t, user_msg.msg_iov);
1274                         msg.msg_namelen = user_msg.msg_namelen;
1275                         msg.msg_name = CAST_DOWN(caddr_t, user_msg.msg_name);
1276                 }
1277                 error = copyout(msghdrp, uap->msg, size_of_msghdr);
1278         }
1279 done:
1280         if (auio != NULL) {
1281                 uio_free(auio);
1282         }
1283         KERNEL_DEBUG(DBG_FNC_RECVMSG | DBG_FUNC_END, error,0,0,0,0);
1284         return (error);
1285 }
1286
1287 /* ARGSUSED */
1288 int
1289 shutdown(__unused struct proc *p, struct shutdown_args *uap, __unused register_t *retval)
1290 {
1291         struct socket * so;
1292         int error;
1293
1294         AUDIT_ARG(fd, uap->s);
1295         error = file_socket(uap->s, &so);
1296         if (error)
1297                 return (error);
1298         if (so == NULL) {
1299                 error = EBADF;
1300                 goto out;
1301         }
1302         error =  soshutdown((struct socket *)so, uap->how);
1303 out:
1304         file_drop(uap->s);
1305         return(error);
1306 }
1307
1308
1309
1310
1311
1312 /* ARGSUSED */
1313 int
1314 setsockopt(struct proc *p, struct setsockopt_args *uap, __unused register_t *retval)
1315 {
1316         struct socket * so;
1317         struct sockopt sopt;
1318         int error;
1319
1320         AUDIT_ARG(fd, uap->s);
1321         if (uap->val == 0 && uap->valsize != 0)
1322                 return (EFAULT);
1323         if (uap->valsize < 0)
1324                 return (EINVAL);
1325
1326         error = file_socket(uap->s, &so);
1327         if (error)
1328                 return (error);
1329
1330         sopt.sopt_dir = SOPT_SET;
1331         sopt.sopt_level = uap->level;
1332         sopt.sopt_name = uap->name;
1333         sopt.sopt_val = uap->val;
1334         sopt.sopt_valsize = uap->valsize;
1335         sopt.sopt_p = p;
1336
1337         if (so == NULL) {
1338                 error = EINVAL;
1339                 goto out;
1340         }
1341         error = sosetopt(so, &sopt);
1342 out:
1343         file_drop(uap->s);
1344         return(error);
1345 }
1346
1347
1348
1349 int
1350 getsockopt(struct proc *p, struct getsockopt_args  *uap, __unused register_t *retval)
1351 {
1352         int             error;
1353         socklen_t       valsize;
1354         struct sockopt  sopt;
1355         struct socket * so;
1356
1357         error = file_socket(uap->s, &so);
1358         if (error)
1359                 return (error);
1360         if (uap->val) {
1361                 error = copyin(uap->avalsize, (caddr_t)&valsize, sizeof (valsize));
1362                 if (error)
1363                         goto out;
1364                 if (valsize < 0) {
1365                         error = EINVAL;
1366                         goto out;
1367                 }
1368         } else
1369                 valsize = 0;
1370
1371         sopt.sopt_dir = SOPT_GET;
1372         sopt.sopt_level = uap->level;
1373         sopt.sopt_name = uap->name;
1374         sopt.sopt_val = uap->val;
1375         sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1376         sopt.sopt_p = p;
1377
1378         if (so == NULL) {
1379                 error = EBADF;
1380                 goto out;
1381         }
1382         error = sogetopt((struct socket *)so, &sopt);
1383         if (error == 0) {
1384                 valsize = sopt.sopt_valsize;
1385                 error = copyout((caddr_t)&valsize, uap->avalsize, sizeof (valsize));
1386         }
1387 out:
1388         file_drop(uap->s);
1389         return (error);
1390 }
1391
1392
1393 /*
1394  * Get socket name.
1395  */
1396 /* ARGSUSED */
1397 static int
1398 getsockname1(__unused struct proc *p, struct getsockname_args *uap, __unused register_t *retval,
1399         int compat)
1400 {
1401         struct socket *so;
1402         struct sockaddr *sa;
1403         socklen_t len;
1404         int error;
1405
1406         error = file_socket(uap->fdes, &so);
1407         if (error)
1408                 return (error);
1409         error = copyin(uap->alen, (caddr_t)&len, sizeof(socklen_t));
1410         if (error)
1411                 goto out;
1412         if (so == NULL) {
1413                 error = EBADF;
1414                 goto out;
1415         }
1416         sa = 0;
1417         socket_lock(so, 1);
1418         error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1419         if (error == 0)
1420         {
1421                 struct socket_filter_entry *filter;
1422                 int     filtered = 0;
1423                 for (filter = so->so_filt; filter && error == 0;
1424                          filter = filter->sfe_next_onsocket) {
1425                         if (filter->sfe_filter->sf_filter.sf_getsockname) {
1426                                 if (!filtered) {
1427                                         filtered = 1;
1428                                         sflt_use(so);
1429                                         socket_unlock(so, 0);
1430                                 }
1431                                 error = filter->sfe_filter->sf_filter.sf_getsockname(filter->sfe_cookie,
1432                                                         so, &sa);
1433                         }
1434                 }
1435
1436                 if (error == EJUSTRETURN)
1437                         error = 0;
1438
1439                 if (filtered) {
1440                         socket_lock(so, 0);
1441                         sflt_unuse(so);
1442                 }
1443         }
1444         socket_unlock(so, 1);
1445         if (error)
1446                 goto bad;
1447         if (sa == 0) {
1448                 len = 0;
1449                 goto gotnothing;
1450         }
1451
1452         len = MIN(len, sa->sa_len);
1453 #if COMPAT_43_SOCKET
1454         if (compat)
1455                 ((struct osockaddr *)sa)->sa_family = sa->sa_family;
1456 #endif
1457         error = copyout((caddr_t)sa, uap->asa, len);
1458         if (error == 0)
1459 gotnothing:
1460                 error = copyout((caddr_t)&len, uap->alen, sizeof(socklen_t));
1461 bad:
1462         if (sa)
1463                 FREE(sa, M_SONAME);
1464 out:
1465         file_drop(uap->fdes);
1466         return (error);
1467 }
1468
1469 int
1470 getsockname(struct proc *p, struct getsockname_args *uap, register_t *retval)
1471 {
1472         return (getsockname1(p, uap, retval, 0));
1473 }
1474
1475 #if COMPAT_43_SOCKET
1476 int
1477 ogetsockname(struct proc *p, struct getsockname_args *uap, register_t *retval)
1478 {
1479         return (getsockname1(p, uap, retval, 1));
1480 }
1481 #endif /* COMPAT_43_SOCKET */
1482
1483 /*
1484  * Get name of peer for connected socket.
1485  */
1486 /* ARGSUSED */
1487 int
1488 getpeername1(__unused struct proc *p, struct getpeername_args *uap, __unused register_t *retval,
1489         int compat)
1490 {
1491         struct socket *so;
1492         struct sockaddr *sa;
1493         socklen_t len;
1494         int error;
1495
1496         error = file_socket(uap->fdes, &so);
1497         if (error)
1498                 return (error);
1499         if (so == NULL) {
1500                 error = EBADF;
1501                 goto out;
1502         }
1503
1504         socket_lock(so, 1);
1505
1506         if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1507                 socket_unlock(so, 1);
1508                 error = ENOTCONN;
1509                 goto out;
1510         }
1511         error = copyin(uap->alen, (caddr_t)&len, sizeof(socklen_t));
1512         if (error) {
1513                 socket_unlock(so, 1);
1514                 goto out;
1515         }
1516         sa = 0;
1517         error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1518         if (error == 0)
1519         {
1520                 struct socket_filter_entry *filter;
1521                 int     filtered = 0;
1522                 for (filter = so->so_filt; filter && error == 0;
1523                          filter = filter->sfe_next_onsocket) {
1524                         if (filter->sfe_filter->sf_filter.sf_getpeername) {
1525                                 if (!filtered) {
1526                                         filtered = 1;
1527                                         sflt_use(so);
1528                                         socket_unlock(so, 0);
1529                                 }
1530                                 error = filter->sfe_filter->sf_filter.sf_getpeername(filter->sfe_cookie,
1531                                                         so, &sa);
1532                         }
1533                 }
1534
1535                 if (error == EJUSTRETURN)
1536                         error = 0;
1537
1538                 if (filtered) {
1539                         socket_lock(so, 0);
1540                         sflt_unuse(so);
1541                 }
1542         }
1543         socket_unlock(so, 1);
1544         if (error)
1545                 goto bad;
1546         if (sa == 0) {
1547                 len = 0;
1548                 goto gotnothing;
1549         }
1550         len = MIN(len, sa->sa_len);
1551 #if COMPAT_43_SOCKET
1552         if (compat)
1553                 ((struct osockaddr *)sa)->sa_family =
1554                     sa->sa_family;
1555 #endif
1556         error = copyout(sa, uap->asa, len);
1557         if (error)
1558                 goto bad;
1559 gotnothing:
1560         error = copyout((caddr_t)&len, uap->alen, sizeof(socklen_t));
1561 bad:
1562         if (sa) FREE(sa, M_SONAME);
1563 out:
1564         file_drop(uap->fdes);
1565         return (error);
1566 }
1567
1568 int
1569 getpeername(struct proc *p, struct getpeername_args *uap, register_t *retval)
1570 {
1571
1572         return (getpeername1(p, uap, retval, 0));
1573 }
1574
1575 #if COMPAT_43_SOCKET
1576 int
1577 ogetpeername(struct proc *p, struct getpeername_args *uap, register_t *retval)
1578 {
1579
1580         return (getpeername1(p, uap, retval, 1));
1581 }
1582 #endif /* COMPAT_43_SOCKET */
1583
1584 int
1585 sockargs(mp, data, buflen, type)
1586         struct mbuf **mp;
1587         user_addr_t data;
1588         int buflen, type;
1589 {
1590         register struct sockaddr *sa;
1591         register struct mbuf *m;
1592         int error;
1593
1594         if ((u_int)buflen > MLEN) {
1595 #if COMPAT_43_SOCKET
1596                 if (type == MT_SONAME && (u_int)buflen <= 112)
1597                         buflen = MLEN;          /* unix domain compat. hack */
1598                 else
1599 #endif
1600                 if ((u_int)buflen > MCLBYTES)
1601                         return (EINVAL);
1602         }
1603         m = m_get(M_WAIT, type);
1604         if (m == NULL)
1605                 return (ENOBUFS);
1606         if ((u_int)buflen > MLEN) {
1607                 MCLGET(m, M_WAIT);
1608                 if ((m->m_flags & M_EXT) == 0) {
1609                         m_free(m);
1610                         return ENOBUFS;
1611                 }
1612         }
1613         m->m_len = buflen;
1614         error = copyin(data, mtod(m, caddr_t), (u_int)buflen);
1615         if (error)
1616                 (void) m_free(m);
1617         else {
1618                 *mp = m;
1619                 if (type == MT_SONAME) {
1620                         sa = mtod(m, struct sockaddr *);
1621
1622 #if COMPAT_43_SOCKET && BYTE_ORDER != BIG_ENDIAN
1623                         if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1624                                 sa->sa_family = sa->sa_len;
1625 #endif
1626                         sa->sa_len = buflen;
1627                 }
1628         }
1629         return (error);
1630 }
1631
1632 /*
1633  * Given a user_addr_t of length len, allocate and fill out a *sa.
1634  */
1635 int
1636 getsockaddr(struct sockaddr **namp, user_addr_t uaddr, size_t len)
1637 {
1638         struct sockaddr *sa;
1639         int error;
1640
1641         if (len > SOCK_MAXADDRLEN)
1642                 return ENAMETOOLONG;
1643
1644         if (len == 0)
1645              return EINVAL;
1646
1647         MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1648         if (sa == NULL) {
1649                 return ENOMEM;
1650         }
1651         error = copyin(uaddr, (caddr_t)sa, len);
1652         if (error) {
1653                 FREE(sa, M_SONAME);
1654         } else {
1655 #if COMPAT_43_SOCKET && BYTE_ORDER != BIG_ENDIAN
1656                 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1657                         sa->sa_family = sa->sa_len;
1658 #endif
1659                 sa->sa_len = len;
1660                 *namp = sa;
1661         }
1662         return error;
1663 }
1664
1665
1666 #if SENDFILE
1667 /*
1668  * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1669  * XXX - The sf_buf functions are currently private to sendfile(2), so have
1670  * been made static, but may be useful in the future for doing zero-copy in
1671  * other parts of the networking code.
1672  */
1673 static void
1674 sf_buf_init(void *arg)
1675 {
1676         int i;
1677
1678         SLIST_INIT(&sf_freelist);
1679         kmem_alloc_pageable(kernel_map, &sf_base, nsfbufs * PAGE_SIZE);
1680         MALLOC(sf_bufs, struct sf_buf *, nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT|M_ZERO);
1681         if (sf_bufs == NULL)
1682                 return;         /* XXX silently fail leaving sf_bufs NULL */
1683
1684         for (i = 0; i < nsfbufs; i++) {
1685                 sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1686                 SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
1687         }
1688 }
1689
1690 /*
1691  * Get an sf_buf from the freelist. Will block if none are available.
1692  */
1693 static struct sf_buf *
1694 sf_buf_alloc()
1695 {
1696         struct sf_buf *sf;
1697
1698         while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
1699                 sf_buf_alloc_want = 1;
1700                 tsleep(&sf_freelist, PVM, "sfbufa", 0);
1701         }
1702         SLIST_REMOVE_HEAD(&sf_freelist, free_list);
1703         sf->refcnt = 1;
1704         return (sf);
1705 }
1706
1707 #define dtosf(x)        (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1708 static void
1709 sf_buf_ref(caddr_t addr, u_int size)
1710 {
1711         struct sf_buf *sf;
1712
1713         sf = dtosf(addr);
1714         if (sf->refcnt == 0)
1715                 panic("sf_buf_ref: referencing a free sf_buf");
1716         sf->refcnt++;
1717 }
1718
1719 /*
1720  * Lose a reference to an sf_buf. When none left, detach mapped page
1721  * and release resources back to the system.
1722  *
1723  * Must be called at splimp.
1724  */
1725 static void
1726 sf_buf_free(caddr_t addr, u_int size)
1727 {
1728         struct sf_buf *sf;
1729         struct vm_page *m;
1730
1731         sf = dtosf(addr);
1732         if (sf->refcnt == 0)
1733                 panic("sf_buf_free: freeing free sf_buf");
1734         sf->refcnt--;
1735         if (sf->refcnt == 0) {
1736                 pmap_qremove((vm_offset_t)addr, 1);
1737                 m = sf->m;
1738                 vm_page_unwire(m, 0);
1739                 /*
1740                  * Check for the object going away on us. This can
1741                  * happen since we don't hold a reference to it.
1742                  * If so, we're responsible for freeing the page.
1743                  */
1744                 if (m->wire_count == 0 && m->object == NULL)
1745                         vm_page_lock_queues();
1746                         vm_page_free(m);
1747                         vm_page_unlock_queues();
1748                 sf->m = NULL;
1749                 SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
1750                 if (sf_buf_alloc_want) {
1751                         sf_buf_alloc_want = 0;
1752                         wakeup(&sf_freelist);
1753                 }
1754         }
1755 }
1756
1757 /*
1758  * sendfile(2).
1759  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1760  *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1761  *
1762  * Send a file specified by 'fd' and starting at 'offset' to a socket
1763  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1764  * nbytes == 0. Optionally add a header and/or trailer to the socket
1765  * output. If specified, write the total number of bytes sent into *sbytes.
1766  */
1767 int
1768 sendfile(struct proc *p, struct sendfile_args *uap)
1769 {
1770         struct fileproc *fp;
1771         struct vnode *vp;
1772         struct vm_object *obj;
1773         struct socket *so;
1774         struct mbuf *m;
1775         struct sf_buf *sf;
1776         struct vm_page *pg;
1777         struct writev_args nuap;
1778         struct sf_hdtr hdtr;
1779         off_t off, xfsize, sbytes = 0;
1780         int error = 0, s;
1781
1782         if (sf_bufs == NULL) {
1783                 /* Fail if initialization failed */
1784                 return ENOSYS;
1785         }
1786
1787         /*
1788          * Do argument checking. Must be a regular file in, stream
1789          * type and connected socket out, positive offset.
1790          */
1791         if (error = fp_getfvp(p, uap->fd, &fp, &vp))
1792                 goto done;
1793         if (fp->f_flag & FREAD) == 0) {
1794                 error = EBADF;
1795                 goto done1;
1796         }
1797         obj = vp->v_object;
1798         if (vp->v_type != VREG || obj == NULL) {
1799                 error = EINVAL;
1800                 goto done1;
1801         }
1802         error = file_socket(uap->s, &so);
1803         if (error)
1804                 goto done1;
1805         if (so == NULL) {
1806                 error = EBADF;
1807                 goto done2;
1808         }
1809
1810         socket_lock(so, 1);
1811
1812         if (so->so_type != SOCK_STREAM) {
1813                 error = EINVAL;
1814                 goto done3;
1815         }
1816         if ((so->so_state & SS_ISCONNECTED) == 0) {
1817                 error = ENOTCONN;
1818                 goto done3;
1819         }
1820         if (uap->offset < 0) {
1821                 error = EINVAL;
1822                 goto done3;
1823         }
1824
1825         /*
1826          * If specified, get the pointer to the sf_hdtr struct for
1827          * any headers/trailers.
1828          */
1829         if (uap->hdtr != NULL) {
1830                 error = copyin(CAST_USER_ADDR_T(uap->hdtr), &hdtr, sizeof(hdtr));
1831                 if (error)
1832                         goto done3;
1833                 /*
1834                  * Send any headers. Wimp out and use writev(2).
1835                  */
1836                 if (hdtr.headers != NULL) {
1837                         nuap.fd = uap->s;
1838                         nuap.iovp = hdtr.headers;
1839                         nuap.iovcnt = hdtr.hdr_cnt;
1840                         error = writev(p, &nuap);
1841                         if (error)
1842                                 goto done3;
1843                         sbytes += p->p_retval[0];
1844                 }
1845         }
1846
1847         /*
1848          * Protect against multiple writers to the socket.
1849          */
1850         (void) sblock(&so->so_snd, M_WAIT);
1851
1852         /*
1853          * Loop through the pages in the file, starting with the requested
1854          * offset. Get a file page (do I/O if necessary), map the file page
1855          * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1856          * it on the socket.
1857          */
1858         for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1859                 vm_object_offset_t pindex;
1860                 vm_object_offset_t pgoff;
1861
1862                 pindex = OFF_TO_IDX(off);
1863 retry_lookup:
1864                 /*
1865                  * Calculate the amount to transfer. Not to exceed a page,
1866                  * the EOF, or the passed in nbytes.
1867                  */
1868                 xfsize = obj->un_pager.vnp.vnp_size - off;
1869                 if (xfsize > PAGE_SIZE_64)
1870                         xfsize = PAGE_SIZE;
1871                 pgoff = (vm_object_offset_t)(off & PAGE_MASK_64);
1872                 if (PAGE_SIZE - pgoff < xfsize)
1873                         xfsize = PAGE_SIZE_64 - pgoff;
1874                 if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1875                         xfsize = uap->nbytes - sbytes;
1876                 if (xfsize <= 0)
1877                         break;
1878                 /*
1879                  * Optimize the non-blocking case by looking at the socket space
1880                  * before going to the extra work of constituting the sf_buf.
1881                  */
1882                 if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1883                         if (so->so_state & SS_CANTSENDMORE)
1884                                 error = EPIPE;
1885                         else
1886                                 error = EAGAIN;
1887                         sbunlock(&so->so_snd, 0); /* will release lock */
1888                         goto done2;
1889                 }
1890                 /*
1891                  * Attempt to look up the page. If the page doesn't exist or the
1892                  * part we're interested in isn't valid, then read it from disk.
1893                  * If some other part of the kernel has this page (i.e. it's busy),
1894                  * then disk I/O may be occuring on it, so wait and retry.
1895                  */
1896                 pg = vm_page_lookup(obj, pindex);
1897                 if (pg == NULL || (!(pg->flags & PG_BUSY) && !pg->busy &&
1898                     !vm_page_is_valid(pg, pgoff, xfsize))) {
1899                         struct uio auio;
1900                         struct iovec aiov;
1901                         int bsize;
1902
1903                         if (pg == NULL) {
1904                                 pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1905                                 if (pg == NULL) {
1906                                         VM_WAIT;
1907                                         goto retry_lookup;
1908                                 }
1909                                 /*
1910                                  * don't just clear PG_BUSY manually -
1911                                  * vm_page_alloc() should be considered opaque,
1912                                  * use the VM routine provided to clear
1913                                  * PG_BUSY.
1914                                  */
1915                                 vm_page_wakeup(pg);
1916
1917                         }
1918                         /*
1919                          * Ensure that our page is still around when the I/O completes.
1920                          */
1921                         vm_page_io_start(pg);
1922                         vm_page_wire(pg);
1923                         /*
1924                          * Get the page from backing store.
1925                          */
1926                         bsize = vp->v_mount->mnt_vfsstat.f_iosize;
1927                         auio.uio_iov = &aiov;
1928                         auio.uio_iovcnt = 1;
1929                         aiov.iov_base = 0;
1930                         aiov.iov_len = MAXBSIZE;
1931                         auio.uio_offset = trunc_page(off);
1932                         auio.uio_segflg = UIO_NOCOPY;
1933                         auio.uio_rw = UIO_READ;
1934                         uio_setresid(&auio, MAXBSIZE);
1935                         error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1936                                 p->p_ucred);
1937                         vm_page_flag_clear(pg, PG_ZERO);
1938                         vm_page_io_finish(pg);
1939                         if (error) {
1940                                 vm_page_unwire(pg, 0);
1941                                 /*
1942                                  * See if anyone else might know about this page.
1943                                  * If not and it is not valid, then free it.
1944                                  */
1945                                 if (pg->wire_count == 0 && pg->valid == 0 &&
1946                                     pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1947                                     pg->hold_count == 0)
1948                                         vm_page_lock_queues();
1949                                         vm_page_free(pg);
1950                                         vm_page_unlock_queues();
1951                                 sbunlock(&so->so_snd, 0); /* will release socket lock */
1952                                 goto done2;
1953                         }
1954                 } else {
1955                         if ((pg->flags & PG_BUSY) || pg->busy)  {
1956                                 s = splvm();
1957                                 if ((pg->flags & PG_BUSY) || pg->busy) {
1958                                         /*
1959                                          * Page is busy. Wait and retry.
1960                                          */
1961                                         vm_page_flag_set(pg, PG_WANTED);
1962                                         tsleep(pg, PVM, "sfpbsy", 0);
1963                                         goto retry_lookup;
1964                                 }
1965                         }
1966                         /*
1967                          * Protect from having the page ripped out from beneath us.
1968                          */
1969                         vm_page_wire(pg);
1970                 }
1971                 /*
1972                  * Allocate a kernel virtual page and insert the physical page
1973                  * into it.
1974                  */
1975                 sf = sf_buf_alloc();
1976                 sf->m = pg;
1977                 pmap_qenter(sf->kva, &pg, 1);
1978                 /*
1979                  * Get an mbuf header and set it up as having external storage.
1980                  */
1981                 MGETHDR(m, M_WAIT, MT_DATA);
1982                 if (m == NULL) {
1983                         error = ENOBUFS;
1984                         sbunlock(&so->so_snd, 0); /* will release socket lock */
1985                         goto done2;
1986                 }
1987                 m->m_ext.ext_free = sf_buf_free;
1988                 m->m_ext.ext_ref = sf_buf_ref;
1989                 m->m_ext.ext_buf = (void *)sf->kva;
1990                 m->m_ext.ext_size = PAGE_SIZE;
1991                 m->m_data = (char *) sf->kva + pgoff;
1992                 m->m_flags |= M_EXT;
1993                 m->m_pkthdr.len = m->m_len = xfsize;
1994                 /*
1995                  * Add the buffer to the socket buffer chain.
1996                  */
1997 retry_space:
1998                 /*
1999                  * Make sure that the socket is still able to take more data.
2000                  * CANTSENDMORE being true usually means that the connection
2001                  * was closed. so_error is true when an error was sensed after
2002                  * a previous send.
2003                  * The state is checked after the page mapping and buffer
2004                  * allocation above since those operations may block and make
2005                  * any socket checks stale. From this point forward, nothing
2006                  * blocks before the pru_send (or more accurately, any blocking
2007                  * results in a loop back to here to re-check).
2008                  */
2009                 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
2010                         if (so->so_state & SS_CANTSENDMORE) {
2011                                 error = EPIPE;
2012                         } else {
2013                                 error = so->so_error;
2014                                 so->so_error = 0;
2015                         }
2016                         m_freem(m);
2017                         sbunlock(&so->so_snd, 0); /* will release socket lock */
2018                         goto done2;
2019                 }
2020                 /*
2021                  * Wait for socket space to become available. We do this just
2022                  * after checking the connection state above in order to avoid
2023                  * a race condition with sbwait().
2024                  */
2025                 if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
2026                         if (so->so_state & SS_NBIO) {
2027                                 m_freem(m);
2028                                 sbunlock(&so->so_snd, 0); /* will release socket lock */
2029                                 error = EAGAIN;
2030                                 goto done2;
2031                         }
2032                         error = sbwait(&so->so_snd);
2033                         /*
2034                          * An error from sbwait usually indicates that we've
2035                          * been interrupted by a signal. If we've sent anything
2036                          * then return bytes sent, otherwise return the error.
2037                          */
2038                         if (error) {
2039                                 m_freem(m);
2040                                 sbunlock(&so->so_snd, 0);
2041                                 goto done2;
2042                         }
2043                         goto retry_space;
2044                 }
2045                 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
2046                 splx(s);
2047                 if (error) {
2048                         sbunlock(&so->so_snd, 0); /* will release socket lock */
2049                         goto done2;
2050                 }
2051         }
2052         sbunlock(&so->so_snd, 0); /* will release socket lock */
2053
2054         /*
2055          * Send trailers. Wimp out and use writev(2).
2056          */
2057         if (uap->hdtr != NULL && hdtr.trailers != NULL) {
2058                         nuap.fd = uap->s;
2059                         nuap.iovp = hdtr.trailers;
2060                         nuap.iovcnt = hdtr.trl_cnt;
2061                         error = writev(p, &nuap);
2062                         if (error)
2063                                 goto done2;
2064                         sbytes += p->p_retval[0];
2065         }
2066 done2:
2067         file_drop(uap->s);
2068 done1:
2069         file_drop(uap->fd);
2070 done:
2071         if (uap->sbytes != NULL) {
2072                 /* XXX this appears bogus for some early failure conditions */
2073                 copyout(&sbytes, CAST_USER_ADDR_T(uap->sbytes), sizeof(off_t));
2074         }
2075         return (error);
2076 done3:
2077         socket_unlock(so, 1);
2078         goto done2;
2079 }
2080
2081 #endif