bsd/kern/kern_resource.c

   1 /*
   2  * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995, 1997 Apple Computer, Inc. All Rights Reserved */
  29 /*-
  30  * Copyright (c) 1982, 1986, 1991, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  * (c) UNIX System Laboratories, Inc.
  33  * All or some portions of this file are derived from material licensed
  34  * to the University of California by American Telephone and Telegraph
  35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  36  * the permission of UNIX System Laboratories, Inc.
  37  *
  38  * Redistribution and use in source and binary forms, with or without
  39  * modification, are permitted provided that the following conditions
  40  * are met:
  41  * 1. Redistributions of source code must retain the above copyright
  42  *    notice, this list of conditions and the following disclaimer.
  43  * 2. Redistributions in binary form must reproduce the above copyright
  44  *    notice, this list of conditions and the following disclaimer in the
  45  *    documentation and/or other materials provided with the distribution.
  46  * 3. All advertising materials mentioning features or use of this software
  47  *    must display the following acknowledgement:
  48  *      This product includes software developed by the University of
  49  *      California, Berkeley and its contributors.
  50  * 4. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  *      @(#)kern_resource.c     8.5 (Berkeley) 1/21/94
  67  */
  68 /*
  69  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  70  * support for mandatory and extensible security protections.  This notice
  71  * is included in support of clause 2.2 (b) of the Apple Public License,
  72  * Version 2.0.
  73  */
  74
  75 #include <sys/param.h>
  76 #include <sys/systm.h>
  77 #include <sys/sysctl.h>
  78 #include <sys/kernel.h>
  79 #include <sys/file_internal.h>
  80 #include <sys/resourcevar.h>
  81 #include <sys/malloc.h>
  82 #include <sys/proc_internal.h>
  83 #include <sys/kauth.h>
  84 #include <machine/spl.h>
  85
  86 #include <sys/mount_internal.h>
  87 #include <sys/sysproto.h>
  88
  89 #include <security/audit/audit.h>
  90
  91 #include <machine/vmparam.h>
  92
  93 #include <mach/mach_types.h>
  94 #include <mach/time_value.h>
  95 #include <mach/task.h>
  96 #include <mach/task_info.h>
  97 #include <mach/vm_map.h>
  98 #include <mach/mach_vm.h>
  99 #include <mach/thread_act.h>  /* for thread_policy_set( ) */
 100 #include <kern/lock.h>
 101 #include <kern/thread.h>
 102
 103 #include <kern/task.h>
 104 #include <kern/clock.h>         /* for absolutetime_to_microtime() */
 105 #include <netinet/in.h>         /* for TRAFFIC_MGT_SO_BACKGROUND */
 106 #include <sys/socketvar.h>      /* for struct socket */
 107
 108 #include <vm/vm_map.h>
 109
 110 int     donice(struct proc *curp, struct proc *chgp, int n);
 111 int     dosetrlimit(struct proc *p, u_int which, struct rlimit *limp);
 112 static int do_background_thread(struct proc *curp, int priority);
 113
 114 rlim_t maxdmap = MAXDSIZ;       /* XXX */
 115 rlim_t maxsmap = MAXSSIZ - PAGE_SIZE;   /* XXX */
 116
 117 /*
 118  * Limits on the number of open files per process, and the number
 119  * of child processes per process.
 120  *
 121  * Note: would be in kern/subr_param.c in FreeBSD.
 122  */
 123 __private_extern__ int maxfilesperproc = OPEN_MAX;              /* per-proc open files limit */
 124
 125 SYSCTL_INT( _kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW,
 126                 &maxprocperuid, 0, "Maximum processes allowed per userid" );
 127
 128 SYSCTL_INT( _kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
 129                 &maxfilesperproc, 0, "Maximum files allowed open per process" );
 130
 131 /* Args and fn for proc_iteration callback used in setpriority */
 132 struct puser_nice_args {
 133         proc_t curp;
 134         int     prio;
 135         id_t    who;
 136         int *   foundp;
 137         int *   errorp;
 138 };
 139 static int puser_donice_callback(proc_t p, void * arg);
 140
 141
 142 /* Args and fn for proc_iteration callback used in setpriority */
 143 struct ppgrp_nice_args {
 144         proc_t curp;
 145         int     prio;
 146         int *   foundp;
 147         int *   errorp;
 148 };
 149 static int ppgrp_donice_callback(proc_t p, void * arg);
 150
 151 /*
 152  * Resource controls and accounting.
 153  */
 154 int
 155 getpriority(struct proc *curp, struct getpriority_args *uap, int32_t *retval)
 156 {
 157         struct proc *p;
 158         int low = PRIO_MAX + 1;
 159         kauth_cred_t my_cred;
 160
 161         /* would also test (uap->who < 0), but id_t is unsigned */
 162         if (uap->who > 0x7fffffff)
 163                 return (EINVAL);
 164
 165         switch (uap->which) {
 166
 167         case PRIO_PROCESS:
 168                 if (uap->who == 0) {
 169                         p = curp;
 170                         low = p->p_nice;
 171                 } else {
 172                         p = proc_find(uap->who);
 173                         if (p == 0)
 174                                 break;
 175                         low = p->p_nice;
 176                         proc_rele(p);
 177
 178                 }
 179                 break;
 180
 181         case PRIO_PGRP: {
 182                 struct pgrp *pg = PGRP_NULL;
 183
 184                 if (uap->who == 0) {
 185                         /* returns the pgrp to ref */
 186                         pg = proc_pgrp(curp);
 187                  } else if ((pg = pgfind(uap->who)) == PGRP_NULL) {
 188                         break;
 189                 }
 190                 /* No need for iteration as it is a simple scan */
 191                 pgrp_lock(pg);
 192                 for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) {
 193                         if (p->p_nice < low)
 194                                 low = p->p_nice;
 195                 }
 196                 pgrp_unlock(pg);
 197                 pg_rele(pg);
 198                 break;
 199         }
 200
 201         case PRIO_USER:
 202                 if (uap->who == 0)
 203                         uap->who = kauth_cred_getuid(kauth_cred_get());
 204
 205                 proc_list_lock();
 206
 207                 for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
 208                         my_cred = kauth_cred_proc_ref(p);
 209                         if (kauth_cred_getuid(my_cred) == uap->who &&
 210                             p->p_nice < low)
 211                                 low = p->p_nice;
 212                         kauth_cred_unref(&my_cred);
 213                 }
 214
 215                 proc_list_unlock();
 216
 217                 break;
 218
 219         case PRIO_DARWIN_THREAD: {
 220                 thread_t                        thread;
 221                 struct uthread          *ut;
 222
 223                 /* we currently only support the current thread */
 224                 if (uap->who != 0) {
 225                         return (EINVAL);
 226                 }
 227
 228                 thread = current_thread();
 229                 ut = get_bsdthread_info(thread);
 230
 231                 low = 0;
 232                 if ( (ut->uu_flag & UT_BACKGROUND) != 0 ) {
 233                         low = 1;
 234                 }
 235                 break;
 236         }
 237
 238         default:
 239                 return (EINVAL);
 240         }
 241         if (low == PRIO_MAX + 1)
 242                 return (ESRCH);
 243         *retval = low;
 244         return (0);
 245 }
 246
 247 /* call back function used for proc iteration in PRIO_USER */
 248 static int
 249 puser_donice_callback(proc_t p, void * arg)
 250 {
 251         int error, n;
 252         struct puser_nice_args * pun = (struct puser_nice_args *)arg;
 253         kauth_cred_t my_cred;
 254
 255         my_cred = kauth_cred_proc_ref(p);
 256         if (kauth_cred_getuid(my_cred) == pun->who) {
 257                 error = donice(pun->curp, p, pun->prio);
 258                 if (pun->errorp != NULL)
 259                         *pun->errorp = error;
 260                 if (pun->foundp != NULL) {
 261                         n = *pun->foundp;
 262                         *pun->foundp = n+1;
 263                 }
 264         }
 265         kauth_cred_unref(&my_cred);
 266
 267         return(PROC_RETURNED);
 268 }
 269
 270 /* call back function used for proc iteration in PRIO_PGRP */
 271 static int
 272 ppgrp_donice_callback(proc_t p, void * arg)
 273 {
 274         int error;
 275         struct ppgrp_nice_args * pun = (struct ppgrp_nice_args *)arg;
 276         int n;
 277
 278         error = donice(pun->curp, p, pun->prio);
 279         if (pun->errorp != NULL)
 280                 *pun->errorp = error;
 281         if (pun->foundp!= NULL) {
 282                 n = *pun->foundp;
 283                 *pun->foundp = n+1;
 284         }
 285
 286         return(PROC_RETURNED);
 287 }
 288
 289 /*
 290  * Returns:     0                       Success
 291  *              EINVAL
 292  *              ESRCH
 293  *      donice:EPERM
 294  *      donice:EACCES
 295  */
 296 /* ARGSUSED */
 297 int
 298 setpriority(struct proc *curp, struct setpriority_args *uap, __unused int32_t *retval)
 299 {
 300         struct proc *p;
 301         int found = 0, error = 0;
 302         int refheld = 0;
 303
 304         AUDIT_ARG(cmd, uap->which);
 305         AUDIT_ARG(owner, uap->who, 0);
 306         AUDIT_ARG(value32, uap->prio);
 307
 308         /* would also test (uap->who < 0), but id_t is unsigned */
 309         if (uap->who > 0x7fffffff)
 310                 return (EINVAL);
 311
 312         switch (uap->which) {
 313
 314         case PRIO_PROCESS:
 315                 if (uap->who == 0)
 316                         p = curp;
 317                 else {
 318                         p = proc_find(uap->who);
 319                         if (p == 0)
 320                                 break;
 321                         refheld = 1;
 322                 }
 323                 error = donice(curp, p, uap->prio);
 324                 found++;
 325                 if (refheld != 0)
 326                         proc_rele(p);
 327                 break;
 328
 329         case PRIO_PGRP: {
 330                 struct pgrp *pg = PGRP_NULL;
 331                 struct ppgrp_nice_args ppgrp;
 332
 333                 if (uap->who == 0) {
 334                         pg = proc_pgrp(curp);
 335                  } else if ((pg = pgfind(uap->who)) == PGRP_NULL)
 336                         break;
 337
 338                 ppgrp.curp = curp;
 339                 ppgrp.prio = uap->prio;
 340                 ppgrp.foundp = &found;
 341                 ppgrp.errorp = &error;
 342
 343                 /* PGRP_DROPREF drops the reference on process group */
 344                 pgrp_iterate(pg, PGRP_DROPREF, ppgrp_donice_callback, (void *)&ppgrp, NULL, NULL);
 345
 346                 break;
 347         }
 348
 349         case PRIO_USER: {
 350                 struct puser_nice_args punice;
 351
 352                 if (uap->who == 0)
 353                         uap->who = kauth_cred_getuid(kauth_cred_get());
 354
 355                 punice.curp = curp;
 356                 punice.prio = uap->prio;
 357                 punice.who = uap->who;
 358                 punice.foundp = &found;
 359                 error = 0;
 360                 punice.errorp = &error;
 361                 proc_iterate(PROC_ALLPROCLIST, puser_donice_callback, (void *)&punice, NULL, NULL);
 362
 363                 break;
 364         }
 365
 366         case PRIO_DARWIN_THREAD: {
 367                 /* we currently only support the current thread */
 368                 if (uap->who != 0) {
 369                         return (EINVAL);
 370                 }
 371                 error = do_background_thread(curp, uap->prio);
 372                 found++;
 373                 break;
 374         }
 375
 376         default:
 377                 return (EINVAL);
 378         }
 379         if (found == 0)
 380                 return (ESRCH);
 381         return (error);
 382 }
 383
 384
 385 /*
 386  * Returns:     0                       Success
 387  *              EPERM
 388  *              EACCES
 389  *      mac_check_proc_sched:???
 390  */
 391 int
 392 donice(struct proc *curp, struct proc *chgp, int n)
 393 {
 394         int error = 0;
 395         kauth_cred_t ucred;
 396         kauth_cred_t my_cred;
 397
 398         ucred = kauth_cred_proc_ref(curp);
 399         my_cred = kauth_cred_proc_ref(chgp);
 400
 401         if (suser(ucred, NULL) && ucred->cr_ruid &&
 402             kauth_cred_getuid(ucred) != kauth_cred_getuid(my_cred) &&
 403             ucred->cr_ruid != kauth_cred_getuid(my_cred)) {
 404                 error = EPERM;
 405                 goto out;
 406         }
 407         if (n > PRIO_MAX)
 408                 n = PRIO_MAX;
 409         if (n < PRIO_MIN)
 410                 n = PRIO_MIN;
 411         if (n < chgp->p_nice && suser(ucred, &curp->p_acflag)) {
 412                 error = EACCES;
 413                 goto out;
 414         }
 415 #if CONFIG_MACF
 416         error = mac_proc_check_sched(curp, chgp);
 417         if (error)
 418                 goto out;
 419 #endif
 420         proc_lock(chgp);
 421         chgp->p_nice = n;
 422         proc_unlock(chgp);
 423         (void)resetpriority(chgp);
 424 out:
 425         kauth_cred_unref(&ucred);
 426         kauth_cred_unref(&my_cred);
 427         return (error);
 428 }
 429
 430 /*
 431  * do_background_thread
 432  * Returns:     0                       Success
 433  * XXX - todo - does this need a MACF hook?
 434  */
 435 static int
 436 do_background_thread(struct proc *curp, int priority)
 437 {
 438         int                                                                     i;
 439         thread_t                                                        thread;
 440         struct uthread                                          *ut;
 441         thread_precedence_policy_data_t         policy;
 442         struct filedesc                                         *fdp;
 443         struct fileproc                                         *fp;
 444
 445         thread = current_thread();
 446         ut = get_bsdthread_info(thread);
 447
 448         if ( (priority & PRIO_DARWIN_BG) == 0 ) {
 449                 /* turn off backgrounding of thread */
 450                 if ( (ut->uu_flag & UT_BACKGROUND) == 0 ) {
 451                         /* already off */
 452                         return(0);
 453                 }
 454
 455                 /* clear background bit in thread and disable disk IO throttle */
 456                 ut->uu_flag &= ~UT_BACKGROUND;
 457                 ut->uu_iopol_disk = IOPOL_NORMAL;
 458
 459                 /* reset thread priority (we did not save previous value) */
 460                 policy.importance = 0;
 461                 thread_policy_set( thread, THREAD_PRECEDENCE_POLICY,
 462                                                    (thread_policy_t)&policy,
 463                                                    THREAD_PRECEDENCE_POLICY_COUNT );
 464
 465                 /* disable networking IO throttle.
 466                  * NOTE - It is a known limitation of the current design that we
 467                  * could potentially clear TRAFFIC_MGT_SO_BACKGROUND bit for
 468                  * sockets created by other threads within this process.
 469                  */
 470                 proc_fdlock(curp);
 471                 fdp = curp->p_fd;
 472                 for ( i = 0; i < fdp->fd_nfiles; i++ ) {
 473                         struct socket           *sockp;
 474
 475                         fp = fdp->fd_ofiles[ i ];
 476                         if ( fp == NULL || (fdp->fd_ofileflags[ i ] & UF_RESERVED) != 0 ||
 477                                  fp->f_fglob->fg_type != DTYPE_SOCKET ) {
 478                                 continue;
 479                         }
 480                         sockp = (struct socket *)fp->f_fglob->fg_data;
 481                         if ( sockp->so_background_thread != thread ) {
 482                                 continue;
 483                         }
 484                         sockp->so_traffic_mgt_flags &= ~TRAFFIC_MGT_SO_BACKGROUND;
 485                         sockp->so_background_thread = NULL;
 486                 }
 487                 proc_fdunlock(curp);
 488
 489                 return(0);
 490         }
 491
 492         /* background this thread */
 493         if ( (ut->uu_flag & UT_BACKGROUND) != 0 ) {
 494                 /* already backgrounded */
 495                 return(0);
 496         }
 497
 498         /* tag thread as background and throttle disk IO */
 499         ut->uu_flag |= UT_BACKGROUND;
 500         ut->uu_iopol_disk = IOPOL_THROTTLE;
 501
 502         policy.importance = INT_MIN;
 503         thread_policy_set( thread, THREAD_PRECEDENCE_POLICY,
 504                                            (thread_policy_t)&policy,
 505                                            THREAD_PRECEDENCE_POLICY_COUNT );
 506
 507         /* throttle networking IO happens in socket( ) syscall.
 508          * If UT_BACKGROUND is set in the current thread then
 509          * TRAFFIC_MGT_SO_BACKGROUND socket option is set.
 510          */
 511         return(0);
 512 }
 513
 514
 515 /*
 516  * Returns:     0                       Success
 517  *      copyin:EFAULT
 518  *      dosetrlimit:
 519  */
 520 /* ARGSUSED */
 521 int
 522 setrlimit(struct proc *p, struct setrlimit_args *uap, __unused int32_t *retval)
 523 {
 524         struct rlimit alim;
 525         int error;
 526
 527         if ((error = copyin(uap->rlp, (caddr_t)&alim,
 528             sizeof (struct rlimit))))
 529                 return (error);
 530
 531         return (dosetrlimit(p, uap->which, &alim));
 532 }
 533
 534 /*
 535  * Returns:     0                       Success
 536  *              EINVAL
 537  *              ENOMEM                  Cannot copy limit structure
 538  *      suser:EPERM
 539  *
 540  * Notes:       EINVAL is returned both for invalid arguments, and in the
 541  *              case that the current usage (e.g. RLIMIT_STACK) is already
 542  *              in excess of the requested limit.
 543  */
 544 int
 545 dosetrlimit(struct proc *p, u_int which, struct rlimit *limp)
 546 {
 547         struct rlimit *alimp;
 548         int error;
 549         kern_return_t   kr;
 550         int posix = (which & _RLIMIT_POSIX_FLAG) ? 1 : 0;
 551
 552         /* Mask out POSIX flag, saved above */
 553         which &= ~_RLIMIT_POSIX_FLAG;
 554
 555         if (which >= RLIM_NLIMITS)
 556                 return (EINVAL);
 557
 558         alimp = &p->p_rlimit[which];
 559         if (limp->rlim_cur > limp->rlim_max)
 560                 return EINVAL;
 561
 562         if (limp->rlim_cur > alimp->rlim_max ||
 563             limp->rlim_max > alimp->rlim_max)
 564                 if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
 565                         return (error);
 566         }
 567
 568         proc_limitblock(p);
 569
 570         if ((error = proc_limitreplace(p)) != 0) {
 571                 proc_limitunblock(p);
 572                 return(error);
 573         }
 574
 575         alimp = &p->p_rlimit[which];
 576
 577         switch (which) {
 578
 579         case RLIMIT_CPU:
 580                 if (limp->rlim_cur == RLIM_INFINITY) {
 581                         task_vtimer_clear(p->task, TASK_VTIMER_RLIM);
 582                         timerclear(&p->p_rlim_cpu);
 583                 }
 584                 else {
 585                         task_absolutetime_info_data_t   tinfo;
 586                         mach_msg_type_number_t                  count;
 587                         struct timeval                                  ttv, tv;
 588                         clock_sec_t                                             tv_sec;
 589                         clock_usec_t                                    tv_usec;
 590
 591                         count = TASK_ABSOLUTETIME_INFO_COUNT;
 592                         task_info(p->task, TASK_ABSOLUTETIME_INFO,
 593                                                                 (task_info_t)&tinfo, &count);
 594                         absolutetime_to_microtime(tinfo.total_user + tinfo.total_system,
 595                                                                           &tv_sec, &tv_usec);
 596                         ttv.tv_sec = tv_sec;
 597                         ttv.tv_usec = tv_usec;
 598
 599                         tv.tv_sec = (limp->rlim_cur > __INT_MAX__ ? __INT_MAX__ : limp->rlim_cur);
 600                         tv.tv_usec = 0;
 601                         timersub(&tv, &ttv, &p->p_rlim_cpu);
 602
 603                         timerclear(&tv);
 604                         if (timercmp(&p->p_rlim_cpu, &tv, >))
 605                                 task_vtimer_set(p->task, TASK_VTIMER_RLIM);
 606                         else {
 607                                 task_vtimer_clear(p->task, TASK_VTIMER_RLIM);
 608
 609                                 timerclear(&p->p_rlim_cpu);
 610
 611                                 psignal(p, SIGXCPU);
 612                         }
 613                 }
 614                 break;
 615
 616         case RLIMIT_DATA:
 617                 if (limp->rlim_cur > maxdmap)
 618                         limp->rlim_cur = maxdmap;
 619                 if (limp->rlim_max > maxdmap)
 620                         limp->rlim_max = maxdmap;
 621                 break;
 622
 623         case RLIMIT_STACK:
 624                 /* Disallow illegal stack size instead of clipping */
 625                 if (limp->rlim_cur > maxsmap ||
 626                     limp->rlim_max > maxsmap) {
 627                         if (posix) {
 628                                 error = EINVAL;
 629                                 goto out;
 630                         }
 631                         else {
 632                                 /*
 633                                  * 4797860 - workaround poorly written installers by
 634                                  * doing previous implementation (< 10.5) when caller
 635                                  * is non-POSIX conforming.
 636                                  */
 637                                 if (limp->rlim_cur > maxsmap)
 638                                         limp->rlim_cur = maxsmap;
 639                                 if (limp->rlim_max > maxsmap)
 640                                         limp->rlim_max = maxsmap;
 641                         }
 642                 }
 643
 644                 /*
 645                  * Stack is allocated to the max at exec time with only
 646                  * "rlim_cur" bytes accessible.  If stack limit is going
 647                  * up make more accessible, if going down make inaccessible.
 648                  */
 649                 if (limp->rlim_cur > alimp->rlim_cur) {
 650                         user_addr_t addr;
 651                         user_size_t size;
 652
 653                                 /* grow stack */
 654                                 size = round_page_64(limp->rlim_cur);
 655                                 size -= round_page_64(alimp->rlim_cur);
 656
 657 #if STACK_GROWTH_UP
 658                                 /* go to top of current stack */
 659                         addr = p->user_stack + round_page_64(alimp->rlim_cur);
 660 #else   /* STACK_GROWTH_UP */
 661                         addr = p->user_stack - round_page_64(limp->rlim_cur);
 662 #endif /* STACK_GROWTH_UP */
 663                         kr = mach_vm_protect(current_map(),
 664                                              addr, size,
 665                                              FALSE, VM_PROT_DEFAULT);
 666                         if (kr != KERN_SUCCESS) {
 667                                 error =  EINVAL;
 668                                 goto out;
 669                         }
 670                 } else if (limp->rlim_cur < alimp->rlim_cur) {
 671                         user_addr_t addr;
 672                         user_size_t size;
 673                         user_addr_t cur_sp;
 674
 675                                 /* shrink stack */
 676
 677                         /*
 678                          * First check if new stack limit would agree
 679                          * with current stack usage.
 680                          * Get the current thread's stack pointer...
 681                          */
 682                         cur_sp = thread_adjuserstack(current_thread(),
 683                                                      0);
 684 #if STACK_GROWTH_UP
 685                         if (cur_sp >= p->user_stack &&
 686                             cur_sp < (p->user_stack +
 687                                       round_page_64(alimp->rlim_cur))) {
 688                                 /* current stack pointer is in main stack */
 689                                 if (cur_sp >= (p->user_stack +
 690                                                round_page_64(limp->rlim_cur))) {
 691                                         /*
 692                                          * New limit would cause
 693                                          * current usage to be invalid:
 694                                          * reject new limit.
 695                                          */
 696                                         error =  EINVAL;
 697                                         goto out;
 698                         }
 699                         } else {
 700                                 /* not on the main stack: reject */
 701                                 error =  EINVAL;
 702                                 goto out;
 703                 }
 704
 705 #else   /* STACK_GROWTH_UP */
 706                         if (cur_sp <= p->user_stack &&
 707                             cur_sp > (p->user_stack -
 708                                       round_page_64(alimp->rlim_cur))) {
 709                                 /* stack pointer is in main stack */
 710                                 if (cur_sp <= (p->user_stack -
 711                                                round_page_64(limp->rlim_cur))) {
 712                                         /*
 713                                          * New limit would cause
 714                                          * current usage to be invalid:
 715                                          * reject new limit.
 716                                          */
 717                                         error =  EINVAL;
 718                                         goto out;
 719                                 }
 720                         } else {
 721                                 /* not on the main stack: reject */
 722                                 error =  EINVAL;
 723                                 goto out;
 724                         }
 725 #endif  /* STACK_GROWTH_UP */
 726
 727                         size = round_page_64(alimp->rlim_cur);
 728                         size -= round_page_64(limp->rlim_cur);
 729
 730 #if STACK_GROWTH_UP
 731                         addr = p->user_stack + round_page_64(limp->rlim_cur);
 732 #else   /* STACK_GROWTH_UP */
 733                         addr = p->user_stack - round_page_64(alimp->rlim_cur);
 734 #endif /* STACK_GROWTH_UP */
 735
 736                         kr = mach_vm_protect(current_map(),
 737                                              addr, size,
 738                                              FALSE, VM_PROT_NONE);
 739                         if (kr != KERN_SUCCESS) {
 740                                 error =  EINVAL;
 741                                 goto out;
 742                         }
 743                 } else {
 744                         /* no change ... */
 745                 }
 746                 break;
 747
 748         case RLIMIT_NOFILE:
 749                 /*
 750                  * Only root can set the maxfiles limits, as it is
 751                  * systemwide resource.  If we are expecting POSIX behavior,
 752                  * instead of clamping the value, return EINVAL.  We do this
 753                  * because historically, people have been able to attempt to
 754                  * set RLIM_INFINITY to get "whatever the maximum is".
 755                 */
 756                 if ( is_suser() ) {
 757                         if (limp->rlim_cur != alimp->rlim_cur &&
 758                             limp->rlim_cur > (rlim_t)maxfiles) {
 759                                 if (posix) {
 760                                         error =  EINVAL;
 761                                         goto out;
 762                                 }
 763                                 limp->rlim_cur = maxfiles;
 764                         }
 765                         if (limp->rlim_max != alimp->rlim_max &&
 766                             limp->rlim_max > (rlim_t)maxfiles)
 767                                 limp->rlim_max = maxfiles;
 768                 }
 769                 else {
 770                         if (limp->rlim_cur != alimp->rlim_cur &&
 771                             limp->rlim_cur > (rlim_t)maxfilesperproc) {
 772                                 if (posix) {
 773                                         error =  EINVAL;
 774                                         goto out;
 775                                 }
 776                                 limp->rlim_cur = maxfilesperproc;
 777                         }
 778                         if (limp->rlim_max != alimp->rlim_max &&
 779                             limp->rlim_max > (rlim_t)maxfilesperproc)
 780                                 limp->rlim_max = maxfilesperproc;
 781                 }
 782                 break;
 783
 784         case RLIMIT_NPROC:
 785                 /*
 786                  * Only root can set to the maxproc limits, as it is
 787                  * systemwide resource; all others are limited to
 788                  * maxprocperuid (presumably less than maxproc).
 789                  */
 790                 if ( is_suser() ) {
 791                         if (limp->rlim_cur > (rlim_t)maxproc)
 792                                 limp->rlim_cur = maxproc;
 793                         if (limp->rlim_max > (rlim_t)maxproc)
 794                                 limp->rlim_max = maxproc;
 795                 }
 796                 else {
 797                         if (limp->rlim_cur > (rlim_t)maxprocperuid)
 798                                 limp->rlim_cur = maxprocperuid;
 799                         if (limp->rlim_max > (rlim_t)maxprocperuid)
 800                                 limp->rlim_max = maxprocperuid;
 801                 }
 802                 break;
 803
 804         case RLIMIT_MEMLOCK:
 805                 /*
 806                  * Tell the Mach VM layer about the new limit value.
 807                  */
 808
 809                 vm_map_set_user_wire_limit(current_map(), limp->rlim_cur);
 810                 break;
 811
 812         } /* switch... */
 813         proc_lock(p);
 814         *alimp = *limp;
 815         proc_unlock(p);
 816         error = 0;
 817 out:
 818         proc_limitunblock(p);
 819         return (error);
 820 }
 821
 822 /* ARGSUSED */
 823 int
 824 getrlimit(struct proc *p, struct getrlimit_args *uap, __unused int32_t *retval)
 825 {
 826         struct rlimit lim;
 827
 828         /*
 829          * Take out flag now in case we need to use it to trigger variant
 830          * behaviour later.
 831          */
 832         uap->which &= ~_RLIMIT_POSIX_FLAG;
 833
 834         if (uap->which >= RLIM_NLIMITS)
 835                 return (EINVAL);
 836         proc_limitget(p, uap->which, &lim);
 837         return (copyout((caddr_t)&lim,
 838                         uap->rlp, sizeof (struct rlimit)));
 839 }
 840
 841 /*
 842  * Transform the running time and tick information in proc p into user,
 843  * system, and interrupt time usage.
 844  */
 845 /* No lock on proc is held for this.. */
 846 void
 847 calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *ip)
 848 {
 849         task_t                  task;
 850
 851         timerclear(up);
 852         timerclear(sp);
 853         if (ip != NULL)
 854                 timerclear(ip);
 855
 856         task = p->task;
 857         if (task) {
 858                 task_basic_info_32_data_t tinfo;
 859                 task_thread_times_info_data_t ttimesinfo;
 860                 task_events_info_data_t teventsinfo;
 861                 mach_msg_type_number_t task_info_count, task_ttimes_count;
 862                 mach_msg_type_number_t task_events_count;
 863                 struct timeval ut,st;
 864
 865                 task_info_count = TASK_BASIC_INFO_32_COUNT;
 866                 task_info(task, TASK_BASIC2_INFO_32,
 867                           (task_info_t)&tinfo, &task_info_count);
 868                 ut.tv_sec = tinfo.user_time.seconds;
 869                 ut.tv_usec = tinfo.user_time.microseconds;
 870                 st.tv_sec = tinfo.system_time.seconds;
 871                 st.tv_usec = tinfo.system_time.microseconds;
 872                 timeradd(&ut, up, up);
 873                 timeradd(&st, sp, sp);
 874
 875                 task_ttimes_count = TASK_THREAD_TIMES_INFO_COUNT;
 876                 task_info(task, TASK_THREAD_TIMES_INFO,
 877                           (task_info_t)&ttimesinfo, &task_ttimes_count);
 878
 879                 ut.tv_sec = ttimesinfo.user_time.seconds;
 880                 ut.tv_usec = ttimesinfo.user_time.microseconds;
 881                 st.tv_sec = ttimesinfo.system_time.seconds;
 882                 st.tv_usec = ttimesinfo.system_time.microseconds;
 883                 timeradd(&ut, up, up);
 884                 timeradd(&st, sp, sp);
 885
 886                 task_events_count = TASK_EVENTS_INFO_COUNT;
 887                 task_info(task, TASK_EVENTS_INFO,
 888                           (task_info_t)&teventsinfo, &task_events_count);
 889
 890                 /*
 891                  * No need to lock "p":  this does not need to be
 892                  * completely consistent, right ?
 893                  */
 894                 p->p_stats->p_ru.ru_minflt = (teventsinfo.faults -
 895                                               teventsinfo.pageins);
 896                 p->p_stats->p_ru.ru_majflt = teventsinfo.pageins;
 897                 p->p_stats->p_ru.ru_nivcsw = (teventsinfo.csw -
 898                                               p->p_stats->p_ru.ru_nvcsw);
 899                 if (p->p_stats->p_ru.ru_nivcsw < 0)
 900                         p->p_stats->p_ru.ru_nivcsw = 0;
 901
 902                 p->p_stats->p_ru.ru_maxrss = tinfo.resident_size;
 903         }
 904 }
 905
 906 __private_extern__ void munge_user64_rusage(struct rusage *a_rusage_p, struct user64_rusage *a_user_rusage_p);
 907 __private_extern__ void munge_user32_rusage(struct rusage *a_rusage_p, struct user32_rusage *a_user_rusage_p);
 908
 909 /* ARGSUSED */
 910 int
 911 getrusage(struct proc *p, struct getrusage_args *uap, __unused int32_t *retval)
 912 {
 913         struct rusage *rup, rubuf;
 914         struct user64_rusage rubuf64;
 915         struct user32_rusage rubuf32;
 916         size_t retsize = sizeof(rubuf);                 /* default: 32 bits */
 917         caddr_t retbuf = (caddr_t)&rubuf;               /* default: 32 bits */
 918         struct timeval utime;
 919         struct timeval stime;
 920
 921
 922         switch (uap->who) {
 923         case RUSAGE_SELF:
 924                 calcru(p, &utime, &stime, NULL);
 925                 proc_lock(p);
 926                 rup = &p->p_stats->p_ru;
 927                 rup->ru_utime = utime;
 928                 rup->ru_stime = stime;
 929
 930                 rubuf = *rup;
 931                 proc_unlock(p);
 932
 933                 break;
 934
 935         case RUSAGE_CHILDREN:
 936                 proc_lock(p);
 937                 rup = &p->p_stats->p_cru;
 938                 rubuf = *rup;
 939                 proc_unlock(p);
 940                 break;
 941
 942         default:
 943                 return (EINVAL);
 944         }
 945         if (IS_64BIT_PROCESS(p)) {
 946                 retsize = sizeof(rubuf64);
 947                 retbuf = (caddr_t)&rubuf64;
 948                 munge_user64_rusage(&rubuf, &rubuf64);
 949         } else {
 950                 retsize = sizeof(rubuf32);
 951                 retbuf = (caddr_t)&rubuf32;
 952                 munge_user32_rusage(&rubuf, &rubuf32);
 953         }
 954
 955         return (copyout(retbuf, uap->rusage, retsize));
 956 }
 957
 958 void
 959 ruadd(struct rusage *ru, struct rusage *ru2)
 960 {
 961         long *ip, *ip2;
 962         long i;
 963
 964         timeradd(&ru->ru_utime, &ru2->ru_utime, &ru->ru_utime);
 965         timeradd(&ru->ru_stime, &ru2->ru_stime, &ru->ru_stime);
 966         if (ru->ru_maxrss < ru2->ru_maxrss)
 967                 ru->ru_maxrss = ru2->ru_maxrss;
 968         ip = &ru->ru_first; ip2 = &ru2->ru_first;
 969         for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
 970                 *ip++ += *ip2++;
 971 }
 972
 973 void
 974 proc_limitget(proc_t p, int which, struct rlimit * limp)
 975 {
 976         proc_list_lock();
 977         limp->rlim_cur = p->p_rlimit[which].rlim_cur;
 978         limp->rlim_max = p->p_rlimit[which].rlim_max;
 979         proc_list_unlock();
 980 }
 981
 982
 983 void
 984 proc_limitdrop(proc_t p, int exiting)
 985 {
 986         struct  plimit * freelim = NULL;
 987         struct  plimit * freeoldlim = NULL;
 988
 989         proc_list_lock();
 990
 991         if (--p->p_limit->pl_refcnt == 0) {
 992                 freelim = p->p_limit;
 993                 p->p_limit = NULL;
 994         }
 995         if ((exiting != 0) && (p->p_olimit != NULL) && (--p->p_olimit->pl_refcnt == 0)) {
 996                 freeoldlim =  p->p_olimit;
 997                 p->p_olimit = NULL;
 998         }
 999
1000         proc_list_unlock();
1001         if (freelim != NULL)
1002                 FREE_ZONE(freelim, sizeof *p->p_limit, M_PLIMIT);
1003         if (freeoldlim != NULL)
1004                 FREE_ZONE(freeoldlim, sizeof *p->p_olimit, M_PLIMIT);
1005 }
1006
1007
1008 void
1009 proc_limitfork(proc_t parent, proc_t child)
1010 {
1011         proc_list_lock();
1012         child->p_limit = parent->p_limit;
1013         child->p_limit->pl_refcnt++;
1014         child->p_olimit = NULL;
1015         proc_list_unlock();
1016 }
1017
1018 void
1019 proc_limitblock(proc_t p)
1020 {
1021         proc_lock(p);
1022         while (p->p_lflag & P_LLIMCHANGE) {
1023                 p->p_lflag |= P_LLIMWAIT;
1024                 msleep(&p->p_olimit, &p->p_mlock, 0, "proc_limitblock", NULL);
1025         }
1026         p->p_lflag |= P_LLIMCHANGE;
1027         proc_unlock(p);
1028
1029 }
1030
1031
1032 void
1033 proc_limitunblock(proc_t p)
1034 {
1035         proc_lock(p);
1036         p->p_lflag &= ~P_LLIMCHANGE;
1037         if (p->p_lflag & P_LLIMWAIT) {
1038                 p->p_lflag &= ~P_LLIMWAIT;
1039                 wakeup(&p->p_olimit);
1040         }
1041         proc_unlock(p);
1042 }
1043
1044 /* This is called behind serialization provided by proc_limitblock/unlbock */
1045 int
1046 proc_limitreplace(proc_t p)
1047 {
1048         struct plimit *copy;
1049
1050
1051         proc_list_lock();
1052
1053         if (p->p_limit->pl_refcnt == 1) {
1054                 proc_list_unlock();
1055                 return(0);
1056         }
1057
1058         proc_list_unlock();
1059
1060         MALLOC_ZONE(copy, struct plimit *,
1061                         sizeof(struct plimit), M_PLIMIT, M_WAITOK);
1062         if (copy == NULL) {
1063                 return(ENOMEM);
1064         }
1065
1066         proc_list_lock();
1067         bcopy(p->p_limit->pl_rlimit, copy->pl_rlimit,
1068             sizeof(struct rlimit) * RLIM_NLIMITS);
1069         copy->pl_refcnt = 1;
1070         /* hang on to reference to old till process exits */
1071         p->p_olimit = p->p_limit;
1072         p->p_limit = copy;
1073         proc_list_unlock();
1074
1075         return(0);
1076 }
1077
1078
1079 /*
1080  * iopolicysys
1081  *
1082  * Description: System call MUX for use in manipulating I/O policy attributes of the current process or thread
1083  *
1084  * Parameters:  cmd                             Policy command
1085  *              arg                             Pointer to policy arguments
1086  *
1087  * Returns:     0                               Success
1088  *              EINVAL                          Invalid command or invalid policy arguments
1089  *
1090  */
1091 int
1092 iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __unused int32_t *retval)
1093 {
1094         int     error = 0;
1095         thread_t thread = THREAD_NULL;
1096         int *policy;
1097         struct uthread  *ut = NULL;
1098         struct _iopol_param_t iop_param;
1099
1100         if ((error = copyin(uap->arg, &iop_param, sizeof(iop_param))) != 0)
1101                 goto exit;
1102
1103         if (iop_param.iop_iotype != IOPOL_TYPE_DISK) {
1104                 error = EINVAL;
1105                 goto exit;
1106         }
1107
1108         switch (iop_param.iop_scope) {
1109         case IOPOL_SCOPE_PROCESS:
1110                 policy = &p->p_iopol_disk;
1111                 break;
1112         case IOPOL_SCOPE_THREAD:
1113                 thread = current_thread();
1114                 ut = get_bsdthread_info(thread);
1115                 policy = &ut->uu_iopol_disk;
1116                 break;
1117         default:
1118                 error = EINVAL;
1119                 goto exit;
1120         }
1121
1122         switch(uap->cmd) {
1123         case IOPOL_CMD_SET:
1124                 switch (iop_param.iop_policy) {
1125                 case IOPOL_DEFAULT:
1126                 case IOPOL_NORMAL:
1127                 case IOPOL_THROTTLE:
1128                 case IOPOL_PASSIVE:
1129                         proc_lock(p);
1130                         *policy = iop_param.iop_policy;
1131                         proc_unlock(p);
1132                         break;
1133                 default:
1134                         error = EINVAL;
1135                         goto exit;
1136                 }
1137                 break;
1138         case IOPOL_CMD_GET:
1139                 switch (*policy) {
1140                 case IOPOL_DEFAULT:
1141                 case IOPOL_NORMAL:
1142                 case IOPOL_THROTTLE:
1143                 case IOPOL_PASSIVE:
1144                         iop_param.iop_policy = *policy;
1145                         break;
1146                 default: // in-kernel
1147                         // this should never happen
1148                         printf("%s: unknown I/O policy %d\n", __func__, *policy);
1149                         // restore to default value
1150                         *policy = IOPOL_DEFAULT;
1151                         iop_param.iop_policy = *policy;
1152                 }
1153
1154                 error = copyout((caddr_t)&iop_param, uap->arg, sizeof(iop_param));
1155                 break;
1156         default:
1157                 error = EINVAL; // unknown command
1158                 break;
1159         }
1160
1161   exit:
1162         *retval = error;
1163         return (error);
1164 }
1165
1166
1167 boolean_t thread_is_io_throttled(void);
1168
1169 boolean_t
1170 thread_is_io_throttled(void) {
1171
1172         int     policy;
1173         struct uthread  *ut;
1174
1175         policy = current_proc()->p_iopol_disk;
1176
1177         ut = get_bsdthread_info(current_thread());
1178
1179         if (ut->uu_iopol_disk != IOPOL_DEFAULT)
1180                 policy = ut->uu_iopol_disk;
1181
1182         if (policy == IOPOL_THROTTLE)
1183                 return TRUE;
1184
1185         return FALSE;
1186 }