]> git.saurik.com Git - apple/xnu.git/blob - bsd/kern/kern_resource.c
02b61872ab463d2ebf642bd66fac6081f56f90c7
[apple/xnu.git] / bsd / kern / kern_resource.c
1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995, 1997 Apple Computer, Inc. All Rights Reserved */
29 /*-
30 * Copyright (c) 1982, 1986, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
67 */
68 /*
69 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/sysctl.h>
78 #include <sys/kernel.h>
79 #include <sys/file_internal.h>
80 #include <sys/resourcevar.h>
81 #include <sys/malloc.h>
82 #include <sys/proc_internal.h>
83 #include <sys/kauth.h>
84 #include <machine/spl.h>
85
86 #include <sys/mount_internal.h>
87 #include <sys/sysproto.h>
88
89 #include <security/audit/audit.h>
90
91 #include <machine/vmparam.h>
92
93 #include <mach/mach_types.h>
94 #include <mach/time_value.h>
95 #include <mach/task.h>
96 #include <mach/task_info.h>
97 #include <mach/vm_map.h>
98 #include <mach/mach_vm.h>
99 #include <mach/thread_act.h> /* for thread_policy_set( ) */
100 #include <kern/lock.h>
101 #include <kern/thread.h>
102
103 #include <kern/task.h>
104 #include <kern/clock.h> /* for absolutetime_to_microtime() */
105 #include <netinet/in.h> /* for TRAFFIC_MGT_SO_BACKGROUND */
106 #include <sys/socketvar.h> /* for struct socket */
107
108 #include <vm/vm_map.h>
109
/* Forward declarations for the nice/rlimit helpers defined below. */
int     donice(struct proc *curp, struct proc *chgp, int n);
int     dosetrlimit(struct proc *p, u_int which, struct rlimit *limp);
static void do_background_socket(struct proc *curp, thread_t thread, int priority);
static int do_background_thread(struct proc *curp, int priority);
static int do_background_task(struct proc *curp, int priority);

/* Hard ceilings for RLIMIT_DATA and RLIMIT_STACK (stack reserves one guard page). */
rlim_t maxdmap = MAXDSIZ;       /* XXX */
rlim_t maxsmap = MAXSSIZ - PAGE_SIZE;   /* XXX */

/*
 * Limits on the number of open files per process, and the number
 * of child processes per process.
 *
 * Note:    would be in kern/subr_param.c in FreeBSD.
 */
__private_extern__ int maxfilesperproc = OPEN_MAX;      /* per-proc open files limit */

/* Tunable via sysctl kern.maxprocperuid (variable defined elsewhere). */
SYSCTL_INT( _kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW,
        &maxprocperuid, 0, "Maximum processes allowed per userid" );

SYSCTL_INT( _kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
        &maxfilesperproc, 0, "Maximum files allowed open per process" );
132
/* Args and fn for proc_iteration callback used in setpriority(PRIO_USER) */
struct puser_nice_args {
	proc_t curp;    /* calling process (permission checks in donice) */
	int prio;       /* new nice value to apply */
	id_t who;       /* target uid to match */
	int * foundp;   /* out: count of matching processes (may be NULL) */
	int * errorp;   /* out: last donice() error (may be NULL) */
};
static int puser_donice_callback(proc_t p, void * arg);
142
143
/* Args and fn for pgrp_iterate callback used in setpriority(PRIO_PGRP) */
struct ppgrp_nice_args {
	proc_t curp;    /* calling process (permission checks in donice) */
	int prio;       /* new nice value to apply */
	int * foundp;   /* out: count of processes visited (may be NULL) */
	int * errorp;   /* out: last donice() error (may be NULL) */
};
static int ppgrp_donice_callback(proc_t p, void * arg);
152
153 /*
154 * Resource controls and accounting.
155 */
/*
 * getpriority system call: return the lowest (most favorable) nice value
 * among the processes selected by (uap->which, uap->who), or for
 * PRIO_DARWIN_THREAD report whether the current thread is backgrounded.
 * Returns EINVAL for a bad selector, ESRCH when nothing matched.
 */
int
getpriority(struct proc *curp, struct getpriority_args *uap, int32_t *retval)
{
	struct proc *p;
	int low = PRIO_MAX + 1;     /* sentinel: no matching process seen yet */
	kauth_cred_t my_cred;

	/* would also test (uap->who < 0), but id_t is unsigned */
	if (uap->who > 0x7fffffff)
		return (EINVAL);

	switch (uap->which) {

	case PRIO_PROCESS:
		/* who == 0 selects the calling process itself */
		if (uap->who == 0) {
			p = curp;
			low = p->p_nice;
		} else {
			p = proc_find(uap->who);
			if (p == 0)
				break;
			low = p->p_nice;
			proc_rele(p);

		}
		break;

	case PRIO_PGRP: {
		struct pgrp *pg = PGRP_NULL;

		if (uap->who == 0) {
			/* returns the pgrp to ref */
			pg = proc_pgrp(curp);
		} else if ((pg = pgfind(uap->who)) == PGRP_NULL) {
			break;
		}
		/* No need for iteration as it is a simple scan */
		pgrp_lock(pg);
		for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) {
			if (p->p_nice < low)
				low = p->p_nice;
		}
		pgrp_unlock(pg);
		pg_rele(pg);
		break;
	}

	case PRIO_USER:
		/* who == 0 means the caller's effective uid */
		if (uap->who == 0)
			uap->who = kauth_cred_getuid(kauth_cred_get());

		proc_list_lock();

		/* scan every process; take a cred ref to read its uid safely */
		for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
			my_cred = kauth_cred_proc_ref(p);
			if (kauth_cred_getuid(my_cred) == uap->who &&
			    p->p_nice < low)
				low = p->p_nice;
			kauth_cred_unref(&my_cred);
		}

		proc_list_unlock();

		break;

	case PRIO_DARWIN_THREAD: {
		thread_t thread;
		struct uthread *ut;

		/* we currently only support the current thread */
		if (uap->who != 0) {
			return (EINVAL);
		}

		thread = current_thread();
		ut = get_bsdthread_info(thread);

		/* 1 if the current thread is tagged background, else 0 */
		low = 0;
		if ( (ut->uu_flag & UT_BACKGROUND) != 0 ) {
			low = 1;
		}
		break;
	}

	default:
		return (EINVAL);
	}
	/* sentinel never lowered -> selector matched no process */
	if (low == PRIO_MAX + 1)
		return (ESRCH);
	*retval = low;
	return (0);
}
248
249 /* call back function used for proc iteration in PRIO_USER */
250 static int
251 puser_donice_callback(proc_t p, void * arg)
252 {
253 int error, n;
254 struct puser_nice_args * pun = (struct puser_nice_args *)arg;
255 kauth_cred_t my_cred;
256
257 my_cred = kauth_cred_proc_ref(p);
258 if (kauth_cred_getuid(my_cred) == pun->who) {
259 error = donice(pun->curp, p, pun->prio);
260 if (pun->errorp != NULL)
261 *pun->errorp = error;
262 if (pun->foundp != NULL) {
263 n = *pun->foundp;
264 *pun->foundp = n+1;
265 }
266 }
267 kauth_cred_unref(&my_cred);
268
269 return(PROC_RETURNED);
270 }
271
272 /* call back function used for proc iteration in PRIO_PGRP */
273 static int
274 ppgrp_donice_callback(proc_t p, void * arg)
275 {
276 int error;
277 struct ppgrp_nice_args * pun = (struct ppgrp_nice_args *)arg;
278 int n;
279
280 error = donice(pun->curp, p, pun->prio);
281 if (pun->errorp != NULL)
282 *pun->errorp = error;
283 if (pun->foundp!= NULL) {
284 n = *pun->foundp;
285 *pun->foundp = n+1;
286 }
287
288 return(PROC_RETURNED);
289 }
290
291 /*
292 * Returns: 0 Success
293 * EINVAL
294 * ESRCH
295 * donice:EPERM
296 * donice:EACCES
297 */
298 /* ARGSUSED */
/*
 * setpriority system call: set the nice value (or Darwin background
 * state) of the processes/threads selected by (uap->which, uap->who).
 *
 * Returns:	0			Success
 *		EINVAL
 *		ESRCH			selector matched nothing
 *	donice:EPERM
 *	donice:EACCES
 */
/* ARGSUSED */
int
setpriority(struct proc *curp, struct setpriority_args *uap, __unused int32_t *retval)
{
	struct proc *p;
	int found = 0, error = 0;
	int refheld = 0;    /* nonzero when p carries a proc_find() reference */

	AUDIT_ARG(cmd, uap->which);
	AUDIT_ARG(owner, uap->who, 0);
	AUDIT_ARG(value32, uap->prio);

	/* would also test (uap->who < 0), but id_t is unsigned */
	if (uap->who > 0x7fffffff)
		return (EINVAL);

	switch (uap->which) {

	case PRIO_PROCESS:
		/* who == 0 selects the calling process itself */
		if (uap->who == 0)
			p = curp;
		else {
			p = proc_find(uap->who);
			if (p == 0)
				break;
			refheld = 1;
		}
		error = donice(curp, p, uap->prio);
		found++;
		if (refheld != 0)
			proc_rele(p);
		break;

	case PRIO_PGRP: {
		struct pgrp *pg = PGRP_NULL;
		struct ppgrp_nice_args ppgrp;

		if (uap->who == 0) {
			pg = proc_pgrp(curp);
		 } else if ((pg = pgfind(uap->who)) == PGRP_NULL)
			break;

		ppgrp.curp = curp;
		ppgrp.prio = uap->prio;
		ppgrp.foundp = &found;
		ppgrp.errorp = &error;

		/* PGRP_DROPREF drops the reference on process group */
		pgrp_iterate(pg, PGRP_DROPREF, ppgrp_donice_callback, (void *)&ppgrp, NULL, NULL);

		break;
	}

	case PRIO_USER: {
		struct puser_nice_args punice;

		/* who == 0 means the caller's effective uid */
		if (uap->who == 0)
			uap->who = kauth_cred_getuid(kauth_cred_get());

		punice.curp = curp;
		punice.prio = uap->prio;
		punice.who = uap->who;
		punice.foundp = &found;
		error = 0;
		punice.errorp = &error;
		proc_iterate(PROC_ALLPROCLIST, puser_donice_callback, (void *)&punice, NULL, NULL);

		break;
	}

	case PRIO_DARWIN_THREAD: {
		/* we currently only support the current thread */
		if (uap->who != 0) {
			return (EINVAL);
		}
		/* toggle thread backgrounding, then its sockets to match */
		error = do_background_thread(curp, uap->prio);
		(void) do_background_socket(curp, current_thread(), uap->prio);
		found++;
		break;
	}

	case PRIO_DARWIN_PROCESS: {
		if (uap->who == 0)
			p = curp;
		else {
			p = proc_find(uap->who);
			if (p == 0)
				break;
			refheld = 1;
		}

		/* background the whole task, all its sockets, and disk I/O */
		error = do_background_task(p, uap->prio);
		(void) do_background_socket(p, NULL, uap->prio);

		proc_lock(p);
		p->p_iopol_disk = (uap->prio == PRIO_DARWIN_BG ?
		                       IOPOL_THROTTLE : IOPOL_DEFAULT);
		proc_unlock(p);

		found++;
		if (refheld != 0)
			proc_rele(p);
		break;
	}

	default:
		return (EINVAL);
	}
	if (found == 0)
		return (ESRCH);
	return (error);
}
410
411
412 /*
413 * Returns: 0 Success
414 * EPERM
415 * EACCES
416 * mac_check_proc_sched:???
417 */
/*
 * Apply nice value n (clamped to [PRIO_MIN, PRIO_MAX]) to process chgp
 * on behalf of curp, enforcing the usual permission rules: a non-root
 * caller may only renice processes it owns, and only root may lower a
 * nice value.
 *
 * Returns:	0			Success
 *		EPERM
 *		EACCES
 *	mac_check_proc_sched:???
 */
int
donice(struct proc *curp, struct proc *chgp, int n)
{
	int error = 0;
	kauth_cred_t ucred;
	kauth_cred_t my_cred;

	/* hold refs on both creds so the uid checks are stable */
	ucred = kauth_cred_proc_ref(curp);
	my_cred = kauth_cred_proc_ref(chgp);

	/* non-root with a real uid must match target's uid one way or another */
	if (suser(ucred, NULL) && ucred->cr_ruid &&
	    kauth_cred_getuid(ucred) != kauth_cred_getuid(my_cred) &&
	    ucred->cr_ruid != kauth_cred_getuid(my_cred)) {
		error = EPERM;
		goto out;
	}
	if (n > PRIO_MAX)
		n = PRIO_MAX;
	if (n < PRIO_MIN)
		n = PRIO_MIN;
	/* only the superuser may improve (lower) a nice value */
	if (n < chgp->p_nice && suser(ucred, &curp->p_acflag)) {
		error = EACCES;
		goto out;
	}
#if CONFIG_MACF
	error = mac_proc_check_sched(curp, chgp);
	if (error)
		goto out;
#endif
	proc_lock(chgp);
	chgp->p_nice = n;
	proc_unlock(chgp);
	(void)resetpriority(chgp);
out:
	/* drop both cred references on every path */
	kauth_cred_unref(&ucred);
	kauth_cred_unref(&my_cred);
	return (error);
}
456
457 static int
458 do_background_task(struct proc *p, int priority)
459 {
460 int error = 0;
461 task_category_policy_data_t info;
462
463 if (priority & PRIO_DARWIN_BG) {
464 info.role = TASK_THROTTLE_APPLICATION;
465 } else {
466 info.role = TASK_DEFAULT_APPLICATION;
467 }
468
469 error = task_policy_set(p->task,
470 TASK_CATEGORY_POLICY,
471 (task_policy_t) &info,
472 TASK_CATEGORY_POLICY_COUNT);
473 return (error);
474 }
475
/*
 * Mark (or unmark) every open socket of curp with the background
 * traffic-management flag.  When setting, a non-NULL thread means the
 * tagging is per-thread and handled elsewhere, so the process-wide scan
 * is skipped.  When clearing with a non-NULL thread, only sockets last
 * backgrounded by that thread are cleared.
 */
static void
do_background_socket(struct proc *curp, thread_t thread, int priority)
{
	struct filedesc     *fdp;
	struct fileproc     *fp;
	int                 i;

	if (priority & PRIO_DARWIN_BG) {
		/* enable network throttle process-wide (if no thread is specified) */
		if (thread == NULL) {
			proc_fdlock(curp);
			fdp = curp->p_fd;

			/* walk the fd table under the fd lock, sockets only */
			for (i = 0; i < fdp->fd_nfiles; i++) {
				struct socket       *sockp;

				fp = fdp->fd_ofiles[i];
				if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
				        fp->f_fglob->fg_type != DTYPE_SOCKET) {
					continue;
				}
				sockp = (struct socket *)fp->f_fglob->fg_data;
				sockp->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BACKGROUND;
				sockp->so_background_thread = NULL;
			}
			proc_fdunlock(curp);
		}

	} else {
		/* disable networking IO throttle.
		 * NOTE - It is a known limitation of the current design that we
		 * could potentially clear TRAFFIC_MGT_SO_BACKGROUND bit for
		 * sockets created by other threads within this process.
		 */
		proc_fdlock(curp);
		fdp = curp->p_fd;
		for ( i = 0; i < fdp->fd_nfiles; i++ ) {
			struct socket       *sockp;

			fp = fdp->fd_ofiles[ i ];
			if ( fp == NULL || (fdp->fd_ofileflags[ i ] & UF_RESERVED) != 0 ||
			        fp->f_fglob->fg_type != DTYPE_SOCKET ) {
				continue;
			}
			sockp = (struct socket *)fp->f_fglob->fg_data;
			/* skip if only clearing this thread's sockets */
			if ((thread) && (sockp->so_background_thread != thread)) {
				continue;
			}
			sockp->so_traffic_mgt_flags &= ~TRAFFIC_MGT_SO_BACKGROUND;
			sockp->so_background_thread = NULL;
		}
		proc_fdunlock(curp);
	}
}
531
532
533 /*
534 * do_background_thread
535 * Returns: 0 Success
536 * XXX - todo - does this need a MACF hook?
537 */
/*
 * do_background_thread
 *
 * Tag or untag the CURRENT thread as background depending on
 * PRIO_DARWIN_BG in priority: toggles UT_BACKGROUND, switches the
 * per-thread disk I/O policy, and adjusts scheduling importance.
 *
 * Returns: 0		Success
 * XXX - todo - does this need a MACF hook?
 */
static int
do_background_thread(struct proc *curp __unused, int priority)
{
	thread_t                            thread;
	struct uthread                      *ut;
	thread_precedence_policy_data_t     policy;

	thread = current_thread();
	ut = get_bsdthread_info(thread);

	if ( (priority & PRIO_DARWIN_BG) == 0 ) {
		/* turn off backgrounding of thread */
		if ( (ut->uu_flag & UT_BACKGROUND) == 0 ) {
			/* already off */
			return(0);
		}

		/* clear background bit in thread and disable disk IO throttle */
		ut->uu_flag &= ~UT_BACKGROUND;
		ut->uu_iopol_disk = IOPOL_NORMAL;

		/* reset thread priority (we did not save previous value) */
		policy.importance = 0;
		thread_policy_set( thread, THREAD_PRECEDENCE_POLICY,
		                   (thread_policy_t)&policy,
		                   THREAD_PRECEDENCE_POLICY_COUNT );
		return(0);
	}

	/* background this thread */
	if ( (ut->uu_flag & UT_BACKGROUND) != 0 ) {
		/* already backgrounded */
		return(0);
	}

	/* tag thread as background and throttle disk IO */
	ut->uu_flag |= UT_BACKGROUND;
	ut->uu_iopol_disk = IOPOL_THROTTLE;

	/* lowest possible importance within the precedence policy */
	policy.importance = INT_MIN;
	thread_policy_set( thread, THREAD_PRECEDENCE_POLICY,
	                   (thread_policy_t)&policy,
	                   THREAD_PRECEDENCE_POLICY_COUNT );

	/* throttle networking IO happens in socket( ) syscall.
	 * If UT_BACKGROUND is set in the current thread then
	 * TRAFFIC_MGT_SO_BACKGROUND socket option is set.
	 */
	return(0);
}
588
589
590 /*
591 * Returns: 0 Success
592 * copyin:EFAULT
593 * dosetrlimit:
594 */
595 /* ARGSUSED */
596 int
597 setrlimit(struct proc *p, struct setrlimit_args *uap, __unused int32_t *retval)
598 {
599 struct rlimit alim;
600 int error;
601
602 if ((error = copyin(uap->rlp, (caddr_t)&alim,
603 sizeof (struct rlimit))))
604 return (error);
605
606 return (dosetrlimit(p, uap->which, &alim));
607 }
608
609 /*
610 * Returns: 0 Success
611 * EINVAL
612 * ENOMEM Cannot copy limit structure
613 * suser:EPERM
614 *
615 * Notes: EINVAL is returned both for invalid arguments, and in the
616 * case that the current usage (e.g. RLIMIT_STACK) is already
617 * in excess of the requested limit.
618 */
/*
 * Validate and apply a new resource limit to process p.  `which` may
 * carry _RLIMIT_POSIX_FLAG to request strict POSIX behavior (EINVAL on
 * out-of-range values instead of silent clamping).
 *
 * Returns:	0			Success
 *		EINVAL
 *		ENOMEM			Cannot copy limit structure
 *	suser:EPERM
 *
 * Notes:	EINVAL is returned both for invalid arguments, and in the
 *		case that the current usage (e.g. RLIMIT_STACK) is already
 *		in excess of the requested limit.
 */
int
dosetrlimit(struct proc *p, u_int which, struct rlimit *limp)
{
	struct rlimit *alimp;
	int error;
	kern_return_t	kr;
	int posix = (which & _RLIMIT_POSIX_FLAG) ? 1 : 0;

	/* Mask out POSIX flag, saved above */
	which &= ~_RLIMIT_POSIX_FLAG;

	if (which >= RLIM_NLIMITS)
		return (EINVAL);

	alimp = &p->p_rlimit[which];
	if (limp->rlim_cur > limp->rlim_max)
		return EINVAL;

	/* raising either value above the current hard limit needs root */
	if (limp->rlim_cur > alimp->rlim_max ||
	    limp->rlim_max > alimp->rlim_max)
		if ((error = suser(kauth_cred_get(), &p->p_acflag))) {
			return (error);
	}

	/* serialize limit changes against other threads of this process */
	proc_limitblock(p);

	/* give p a private copy of its limits if they are still shared */
	if ((error = proc_limitreplace(p)) != 0) {
		proc_limitunblock(p);
		return(error);
	}

	/* re-fetch: proc_limitreplace() may have swapped in a new plimit */
	alimp = &p->p_rlimit[which];

	switch (which) {

	case RLIMIT_CPU:
		if (limp->rlim_cur == RLIM_INFINITY) {
			/* no CPU limit: stop the virtual timer */
			task_vtimer_clear(p->task, TASK_VTIMER_RLIM);
			timerclear(&p->p_rlim_cpu);
		}
		else {
			task_absolutetime_info_data_t	tinfo;
			mach_msg_type_number_t		count;
			struct timeval			ttv, tv;
			clock_sec_t			tv_sec;
			clock_usec_t			tv_usec;

			/* compute CPU already consumed (user + system) */
			count = TASK_ABSOLUTETIME_INFO_COUNT;
			task_info(p->task, TASK_ABSOLUTETIME_INFO,
					  	(task_info_t)&tinfo, &count);
			absolutetime_to_microtime(tinfo.total_user + tinfo.total_system,
									  &tv_sec, &tv_usec);
			ttv.tv_sec = tv_sec;
			ttv.tv_usec = tv_usec;

			/* remaining budget = new limit - already consumed */
			tv.tv_sec = (limp->rlim_cur > __INT_MAX__ ? __INT_MAX__ : limp->rlim_cur);
			tv.tv_usec = 0;
			timersub(&tv, &ttv, &p->p_rlim_cpu);

			timerclear(&tv);
			if (timercmp(&p->p_rlim_cpu, &tv, >))
				task_vtimer_set(p->task, TASK_VTIMER_RLIM);
			else {
				/* limit already exceeded: signal immediately */
				task_vtimer_clear(p->task, TASK_VTIMER_RLIM);

				timerclear(&p->p_rlim_cpu);

				psignal(p, SIGXCPU);
			}
		}
		break;

	case RLIMIT_DATA:
		/* clamp both values to the system-wide data ceiling */
		if (limp->rlim_cur > maxdmap)
			limp->rlim_cur = maxdmap;
		if (limp->rlim_max > maxdmap)
			limp->rlim_max = maxdmap;
		break;

	case RLIMIT_STACK:
		/* Disallow illegal stack size instead of clipping */
		if (limp->rlim_cur > maxsmap ||
		    limp->rlim_max > maxsmap) {
			if (posix) {
				error = EINVAL;
				goto out;
			}
			else {
				/* 
				 * 4797860 - workaround poorly written installers by 
				 * doing previous implementation (< 10.5) when caller 
				 * is non-POSIX conforming.
				 */
				if (limp->rlim_cur > maxsmap) 
					limp->rlim_cur = maxsmap;
				if (limp->rlim_max > maxsmap) 
					limp->rlim_max = maxsmap;
			}
		}

		/*
		 * Stack is allocated to the max at exec time with only
		 * "rlim_cur" bytes accessible.  If stack limit is going
		 * up make more accessible, if going down make inaccessible.
		 */
		if (limp->rlim_cur > alimp->rlim_cur) {
			user_addr_t addr;
			user_size_t size;
			
				/* grow stack */
				size = round_page_64(limp->rlim_cur);
				size -= round_page_64(alimp->rlim_cur);

#if STACK_GROWTH_UP
				/* go to top of current stack */
				addr = p->user_stack + round_page_64(alimp->rlim_cur);
#else	/* STACK_GROWTH_UP */
				addr = p->user_stack - round_page_64(limp->rlim_cur);
#endif /* STACK_GROWTH_UP */
				/* make the newly covered range accessible */
				kr = mach_vm_protect(current_map(), 
						     addr, size, 
						     FALSE, VM_PROT_DEFAULT);
				if (kr != KERN_SUCCESS) {
					error = EINVAL;
					goto out;
				}
		} else if (limp->rlim_cur < alimp->rlim_cur) {
			user_addr_t addr;
			user_size_t size;
			user_addr_t cur_sp;

				/* shrink stack */

				/*
				 * First check if new stack limit would agree
				 * with current stack usage.
				 * Get the current thread's stack pointer...
				 */
				cur_sp = thread_adjuserstack(current_thread(),
							     0);
#if STACK_GROWTH_UP
				if (cur_sp >= p->user_stack &&
				    cur_sp < (p->user_stack +
					      round_page_64(alimp->rlim_cur))) {
					/* current stack pointer is in main stack */
					if (cur_sp >= (p->user_stack +
						       round_page_64(limp->rlim_cur))) {
						/*
						 * New limit would cause
						 * current usage to be invalid:
						 * reject new limit.
						 */
						error =  EINVAL;
						goto out;
					}
				} else {
					/* not on the main stack: reject */
					error =  EINVAL;
					goto out;
				}
				 
#else	/* STACK_GROWTH_UP */
				if (cur_sp <= p->user_stack &&
				    cur_sp > (p->user_stack -
					      round_page_64(alimp->rlim_cur))) {
					/* stack pointer is in main stack */
					if (cur_sp <= (p->user_stack -
						       round_page_64(limp->rlim_cur))) {
						/*
						 * New limit would cause
						 * current usage to be invalid:
						 * reject new limit.
						 */
						error =  EINVAL;
						goto out;
					}
				} else {
					/* not on the main stack: reject */
					error =  EINVAL;
					goto out;
				}
#endif /* STACK_GROWTH_UP */
				
				size = round_page_64(alimp->rlim_cur);
				size -= round_page_64(limp->rlim_cur);

#if STACK_GROWTH_UP
				addr = p->user_stack + round_page_64(limp->rlim_cur);
#else	/* STACK_GROWTH_UP */
				addr = p->user_stack - round_page_64(alimp->rlim_cur);
#endif /* STACK_GROWTH_UP */

				/* make the newly uncovered range inaccessible */
				kr = mach_vm_protect(current_map(),
						     addr, size,
						     FALSE, VM_PROT_NONE);
				if (kr != KERN_SUCCESS) {
					error = EINVAL;
					goto out;
				}
		} else {
			/* no change ... */
		}
		break;

	case RLIMIT_NOFILE:
		/* 
		 * Only root can set the maxfiles limits, as it is
		 * systemwide resource. If we are expecting POSIX behavior,
		 * instead of clamping the value, return EINVAL.  We do this
		 * because historically, people have been able to attempt to
		 * set RLIM_INFINITY to get "whatever the maximum is".
		*/
		if ( is_suser() ) {
			if (limp->rlim_cur != alimp->rlim_cur &&
			    limp->rlim_cur > (rlim_t)maxfiles) {
			    	if (posix) {
					error =  EINVAL;
					goto out;
				}
				limp->rlim_cur = maxfiles;
			}
			if (limp->rlim_max != alimp->rlim_max &&
			    limp->rlim_max > (rlim_t)maxfiles)
				limp->rlim_max = maxfiles;
		}
		else {
			if (limp->rlim_cur != alimp->rlim_cur &&
			    limp->rlim_cur > (rlim_t)maxfilesperproc) {
			    	if (posix) {
					error =  EINVAL;
					goto out;
				}
				limp->rlim_cur = maxfilesperproc;
			}
			if (limp->rlim_max != alimp->rlim_max &&
			    limp->rlim_max > (rlim_t)maxfilesperproc)
				limp->rlim_max = maxfilesperproc;
		}
		break;

	case RLIMIT_NPROC:
		/* 
		 * Only root can set to the maxproc limits, as it is
		 * systemwide resource; all others are limited to
		 * maxprocperuid (presumably less than maxproc).
		 */
		if ( is_suser() ) {
			if (limp->rlim_cur > (rlim_t)maxproc)
				limp->rlim_cur = maxproc;
			if (limp->rlim_max > (rlim_t)maxproc)
				limp->rlim_max = maxproc;
		} 
		else {
			if (limp->rlim_cur > (rlim_t)maxprocperuid)
				limp->rlim_cur = maxprocperuid;
			if (limp->rlim_max > (rlim_t)maxprocperuid)
				limp->rlim_max = maxprocperuid;
		}
		break;

	case RLIMIT_MEMLOCK:
		/*
		 * Tell the Mach VM layer about the new limit value.
		 */

		vm_map_set_user_wire_limit(current_map(), limp->rlim_cur);
		break;
		
	} /* switch... */
	/* commit the (possibly clamped) new values */
	proc_lock(p);
	*alimp = *limp;
	proc_unlock(p);
	error = 0;
out:
	proc_limitunblock(p);
	return (error);
}
896
897 /* ARGSUSED */
898 int
899 getrlimit(struct proc *p, struct getrlimit_args *uap, __unused int32_t *retval)
900 {
901 struct rlimit lim;
902
903 /*
904 * Take out flag now in case we need to use it to trigger variant
905 * behaviour later.
906 */
907 uap->which &= ~_RLIMIT_POSIX_FLAG;
908
909 if (uap->which >= RLIM_NLIMITS)
910 return (EINVAL);
911 proc_limitget(p, uap->which, &lim);
912 return (copyout((caddr_t)&lim,
913 uap->rlp, sizeof (struct rlimit)));
914 }
915
916 /*
917 * Transform the running time and tick information in proc p into user,
918 * system, and interrupt time usage.
919 */
920 /* No lock on proc is held for this.. */
/*
 * Transform the running time and tick information in proc p into user,
 * system, and interrupt time usage.  Accumulates both terminated-thread
 * times (task basic info) and live-thread times (thread times info)
 * into *up / *sp, and refreshes selected p_stats->p_ru fields as a
 * side effect.  ip, when non-NULL, is only cleared (no interrupt time
 * accounting here).
 */
/* No lock on proc is held for this.. */
void
calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *ip)
{
	task_t			task;

	timerclear(up);
	timerclear(sp);
	if (ip != NULL)
		timerclear(ip);

	task = p->task;
	if (task) {
		task_basic_info_32_data_t tinfo;
		task_thread_times_info_data_t ttimesinfo;
		task_events_info_data_t teventsinfo;
		mach_msg_type_number_t task_info_count, task_ttimes_count;
		mach_msg_type_number_t task_events_count;
		struct timeval ut,st;

		/* times accumulated from already-terminated threads */
		task_info_count	= TASK_BASIC_INFO_32_COUNT;
		task_info(task, TASK_BASIC2_INFO_32,
			  (task_info_t)&tinfo, &task_info_count);
		ut.tv_sec = tinfo.user_time.seconds;
		ut.tv_usec = tinfo.user_time.microseconds;
		st.tv_sec = tinfo.system_time.seconds;
		st.tv_usec = tinfo.system_time.microseconds;
		timeradd(&ut, up, up);
		timeradd(&st, sp, sp);

		/* times from the task's currently-live threads */
		task_ttimes_count = TASK_THREAD_TIMES_INFO_COUNT;
		task_info(task, TASK_THREAD_TIMES_INFO,
			  (task_info_t)&ttimesinfo, &task_ttimes_count);

		ut.tv_sec = ttimesinfo.user_time.seconds;
		ut.tv_usec = ttimesinfo.user_time.microseconds;
		st.tv_sec = ttimesinfo.system_time.seconds;
		st.tv_usec = ttimesinfo.system_time.microseconds;
		timeradd(&ut, up, up);
		timeradd(&st, sp, sp);

		/* fault / context-switch counters for rusage */
		task_events_count = TASK_EVENTS_INFO_COUNT;
		task_info(task, TASK_EVENTS_INFO,
			  (task_info_t)&teventsinfo, &task_events_count);

		/*
		 * No need to lock "p":  this does not need to be
		 * completely consistent, right ?
		 */
		p->p_stats->p_ru.ru_minflt = (teventsinfo.faults -
					      teventsinfo.pageins);
		p->p_stats->p_ru.ru_majflt = teventsinfo.pageins;
		p->p_stats->p_ru.ru_nivcsw = (teventsinfo.csw -
					      p->p_stats->p_ru.ru_nvcsw);
		/* clamp: csw can lag behind voluntary switch count */
		if (p->p_stats->p_ru.ru_nivcsw < 0)
			p->p_stats->p_ru.ru_nivcsw = 0;

		p->p_stats->p_ru.ru_maxrss = tinfo.resident_size;
	}
}
980
981 __private_extern__ void munge_user64_rusage(struct rusage *a_rusage_p, struct user64_rusage *a_user_rusage_p);
982 __private_extern__ void munge_user32_rusage(struct rusage *a_rusage_p, struct user32_rusage *a_user_rusage_p);
983
984 /* ARGSUSED */
/*
 * getrusage system call: snapshot resource usage for the calling
 * process (RUSAGE_SELF, times recomputed via calcru) or its reaped
 * children (RUSAGE_CHILDREN), then copy it out in the 32- or 64-bit
 * user layout depending on the process ABI.
 */
/* ARGSUSED */
int
getrusage(struct proc *p, struct getrusage_args *uap, __unused int32_t *retval)
{
	struct rusage *rup, rubuf;
	struct user64_rusage rubuf64;
	struct user32_rusage rubuf32;
	size_t retsize = sizeof(rubuf);			/* default: 32 bits */
	caddr_t retbuf = (caddr_t)&rubuf;		/* default: 32 bits */
	struct timeval utime;
	struct timeval stime;


	switch (uap->who) {
	case RUSAGE_SELF:
		/* refresh times before taking the snapshot under the lock */
		calcru(p, &utime, &stime, NULL);
		proc_lock(p);
		rup = &p->p_stats->p_ru;
		rup->ru_utime = utime;
		rup->ru_stime = stime;

		rubuf = *rup;
		proc_unlock(p);

		break;

	case RUSAGE_CHILDREN:
		proc_lock(p);
		rup = &p->p_stats->p_cru;
		rubuf = *rup;
		proc_unlock(p);
		break;

	default:
		return (EINVAL);
	}
	/* marshal into the layout matching the caller's ABI */
	if (IS_64BIT_PROCESS(p)) {
		retsize = sizeof(rubuf64);
		retbuf = (caddr_t)&rubuf64;
		munge_user64_rusage(&rubuf, &rubuf64);
	} else {
		retsize = sizeof(rubuf32);
		retbuf = (caddr_t)&rubuf32;
		munge_user32_rusage(&rubuf, &rubuf32);
	}

	return (copyout(retbuf, uap->rusage, retsize));
}
1032
1033 void
1034 ruadd(struct rusage *ru, struct rusage *ru2)
1035 {
1036 long *ip, *ip2;
1037 long i;
1038
1039 timeradd(&ru->ru_utime, &ru2->ru_utime, &ru->ru_utime);
1040 timeradd(&ru->ru_stime, &ru2->ru_stime, &ru->ru_stime);
1041 if (ru->ru_maxrss < ru2->ru_maxrss)
1042 ru->ru_maxrss = ru2->ru_maxrss;
1043 ip = &ru->ru_first; ip2 = &ru2->ru_first;
1044 for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
1045 *ip++ += *ip2++;
1046 }
1047
1048 void
1049 proc_limitget(proc_t p, int which, struct rlimit * limp)
1050 {
1051 proc_list_lock();
1052 limp->rlim_cur = p->p_rlimit[which].rlim_cur;
1053 limp->rlim_max = p->p_rlimit[which].rlim_max;
1054 proc_list_unlock();
1055 }
1056
1057
/*
 * Drop p's references on its limit structures.  The current plimit
 * always loses one reference; the "old" plimit (kept alive by
 * proc_limitreplace) is only dropped when the process is exiting.
 * Structures whose refcount hits zero are freed after the lock is
 * released.
 */
void
proc_limitdrop(proc_t p, int exiting)
{
	struct plimit * freelim = NULL;
	struct plimit * freeoldlim = NULL;

	proc_list_lock();

	if (--p->p_limit->pl_refcnt == 0) { 
		freelim = p->p_limit;
		p->p_limit = NULL;
	}
	if ((exiting != 0) && (p->p_olimit != NULL) && (--p->p_olimit->pl_refcnt  == 0)) {
		freeoldlim =  p->p_olimit;
		p->p_olimit = NULL;
	}

	proc_list_unlock();
	/* free outside the lock to avoid holding it across the zone free */
	if (freelim != NULL)
		FREE_ZONE(freelim, sizeof *p->p_limit, M_PLIMIT);
	if (freeoldlim != NULL)
		FREE_ZONE(freeoldlim,  sizeof *p->p_olimit, M_PLIMIT);
}
1081
1082
1083 void
1084 proc_limitfork(proc_t parent, proc_t child)
1085 {
1086 proc_list_lock();
1087 child->p_limit = parent->p_limit;
1088 child->p_limit->pl_refcnt++;
1089 child->p_olimit = NULL;
1090 proc_list_unlock();
1091 }
1092
/*
 * Serialize limit changes for p: sleep until no other thread has a
 * limit change in progress, then claim the P_LLIMCHANGE "token".
 * Paired with proc_limitunblock(); sleepers wait on &p->p_olimit.
 */
void
proc_limitblock(proc_t p)
{
	proc_lock(p);
	while (p->p_lflag & P_LLIMCHANGE) {
		p->p_lflag |= P_LLIMWAIT;
		msleep(&p->p_olimit, &p->p_mlock, 0, "proc_limitblock", NULL);
	}
	p->p_lflag |= P_LLIMCHANGE;
	proc_unlock(p);

}
1105
1106
/*
 * Release the limit-change "token" taken by proc_limitblock() and wake
 * any threads that were waiting for it.
 */
void
proc_limitunblock(proc_t p)
{
	proc_lock(p);
	p->p_lflag &= ~P_LLIMCHANGE;
	if (p->p_lflag & P_LLIMWAIT) {
		p->p_lflag &= ~P_LLIMWAIT;
		wakeup(&p->p_olimit);
	}
	proc_unlock(p);
}
1118
/* This is called behind serialization provided by proc_limitblock/unblock */
/*
 * Give p a private copy of its (possibly shared) plimit before a limit
 * is modified — the copy-on-write step for limits shared across fork.
 * Returns 0 on success or ENOMEM if the copy cannot be allocated.
 * The old structure is parked in p_olimit until the process exits.
 */
int
proc_limitreplace(proc_t p)
{
	struct plimit *copy;


	proc_list_lock();

	/* sole owner already: nothing to copy */
	if (p->p_limit->pl_refcnt == 1) {
		proc_list_unlock();
		return(0);
	}
		
	proc_list_unlock();

	/* allocate outside the lock; M_WAITOK may block */
	MALLOC_ZONE(copy, struct plimit *,
			sizeof(struct plimit), M_PLIMIT, M_WAITOK);
	if (copy == NULL) {
		return(ENOMEM);
	}

	proc_list_lock();
	bcopy(p->p_limit->pl_rlimit, copy->pl_rlimit,
	    sizeof(struct rlimit) * RLIM_NLIMITS);
	copy->pl_refcnt = 1;
	/* hang on to reference to old till process exits */
	p->p_olimit = p->p_limit;
	p->p_limit = copy;
	proc_list_unlock();

	return(0);
}
1152
1153
1154 /*
1155 * iopolicysys
1156 *
1157 * Description: System call MUX for use in manipulating I/O policy attributes of the current process or thread
1158 *
1159 * Parameters: cmd Policy command
1160 * arg Pointer to policy arguments
1161 *
1162 * Returns: 0 Success
1163 * EINVAL Invalid command or invalid policy arguments
1164 *
1165 */
/*
 * iopolicysys
 *
 * Description:	System call MUX for use in manipulating I/O policy attributes of the current process or thread
 *
 * Parameters:	cmd				Policy command
 *		arg				Pointer to policy arguments
 *
 * Returns:	0				Success
 *		EINVAL				Invalid command or invalid policy arguments
 *
 */
int
iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __unused int32_t *retval)
{
	int	error = 0;
	thread_t thread = THREAD_NULL;
	int *policy;    /* points at either the proc- or thread-level policy slot */
	struct uthread	*ut = NULL;
	struct _iopol_param_t iop_param;

	if ((error = copyin(uap->arg, &iop_param, sizeof(iop_param))) != 0)
		goto exit;

	/* only the disk I/O policy type is implemented here */
	if (iop_param.iop_iotype != IOPOL_TYPE_DISK) {
		error = EINVAL;
		goto exit;
	}

	switch (iop_param.iop_scope) {
	case IOPOL_SCOPE_PROCESS:
		policy = &p->p_iopol_disk;
		break;
	case IOPOL_SCOPE_THREAD:
		thread = current_thread();
		ut = get_bsdthread_info(thread);
		policy = &ut->uu_iopol_disk;
		break;
	default:
		error = EINVAL;
		goto exit;
	}
	
	switch(uap->cmd) {
	case IOPOL_CMD_SET:
		switch (iop_param.iop_policy) {
		case IOPOL_DEFAULT:
		case IOPOL_NORMAL:
		case IOPOL_THROTTLE:
		case IOPOL_PASSIVE:
			/* proc lock taken even for thread scope; guards the write */
			proc_lock(p);
			*policy = iop_param.iop_policy;
			proc_unlock(p);
			break;
		default:
			error = EINVAL;
			goto exit;
		}
		break;
	case IOPOL_CMD_GET:
		switch (*policy) {
		case IOPOL_DEFAULT:
		case IOPOL_NORMAL:
		case IOPOL_THROTTLE:
		case IOPOL_PASSIVE:
			iop_param.iop_policy = *policy;
			break;
		default: // in-kernel 
			// this should never happen
			printf("%s: unknown I/O policy %d\n", __func__, *policy);
			// restore to default value
			*policy = IOPOL_DEFAULT;
			iop_param.iop_policy = *policy;
		}
		
		error = copyout((caddr_t)&iop_param, uap->arg, sizeof(iop_param));
		break;
	default:
		error = EINVAL; // unknown command
		break;
	}

  exit:
	/* NOTE(review): retval mirrors error even on failure — longstanding quirk */
	*retval = error;
	return (error);
}
1240
1241
1242 boolean_t thread_is_io_throttled(void);
1243
1244 boolean_t
1245 thread_is_io_throttled(void) {
1246
1247 int policy;
1248 struct uthread *ut;
1249
1250 policy = current_proc()->p_iopol_disk;
1251
1252 ut = get_bsdthread_info(current_thread());
1253
1254 if (ut->uu_iopol_disk != IOPOL_DEFAULT)
1255 policy = ut->uu_iopol_disk;
1256
1257 if (policy == IOPOL_THROTTLE)
1258 return TRUE;
1259
1260 return FALSE;
1261 }