bsd/kern/kern_lockf.c

   1 /*
   2  * Copyright (c) 2015 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1989, 1993
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * This code is derived from software contributed to Berkeley by
  33  * Scooter Morris at Genentech Inc.
  34  *
  35  * Redistribution and use in source and binary forms, with or without
  36  * modification, are permitted provided that the following conditions
  37  * are met:
  38  * 1. Redistributions of source code must retain the above copyright
  39  *    notice, this list of conditions and the following disclaimer.
  40  * 2. Redistributions in binary form must reproduce the above copyright
  41  *    notice, this list of conditions and the following disclaimer in the
  42  *    documentation and/or other materials provided with the distribution.
  43  * 4. Neither the name of the University nor the names of its contributors
  44  *    may be used to endorse or promote products derived from this software
  45  *    without specific prior written permission.
  46  *
  47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  57  * SUCH DAMAGE.
  58  *
  59  *      @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
  60  */
  61
  62 #include <sys/cdefs.h>
  63 #include <sys/param.h>
  64 #include <sys/systm.h>
  65 #include <sys/kernel.h>
  66 #include <sys/lock.h>
  67 #include <sys/mount.h>
  68 #include <sys/proc.h>
  69 #include <sys/signalvar.h>
  70 #include <sys/unistd.h>
  71 #include <sys/user.h>
  72 #include <sys/vnode.h>
  73 #include <sys/vnode_internal.h>
  74 #include <sys/vnode_if.h>
  75 #include <sys/malloc.h>
  76 #include <sys/fcntl.h>
  77 #include <sys/lockf.h>
  78 #include <sys/sdt.h>
  79 #include <kern/policy_internal.h>
  80
  81 #include <sys/file_internal.h>
  82
   83 /*
   84  * Upper bound on the number of wait-chain steps lf_setlock() will follow
   85  * while looking for a deadlock cycle among POSIX locks (starts at MAXDEPTH).
   86  */
   87 static int maxlockdepth = MAXDEPTH;
  88
   89 #if (DEVELOPMENT || DEBUG)      /* diagnostics are compiled in only for these build types */
   90 #define LOCKF_DEBUGGING 1
   91 #endif
   92
   93 #ifdef LOCKF_DEBUGGING
   94 #include <sys/sysctl.h>
   95 void lf_print(const char *tag, struct lockf *lock);
   96 void lf_printlist(const char *tag, struct lockf *lock);
   97
   98 #define LF_DBG_LOCKOP   (1 << 0)        /* setlk, getlk, clearlk */
   99 #define LF_DBG_LIST     (1 << 1)        /* split, coalesce */
  100 #define LF_DBG_IMPINH   (1 << 2)        /* importance inheritance */
  101 #define LF_DBG_TRACE    (1 << 3)        /* errors, exit */
  102
  103 static int      lockf_debug = 0;        /* bitmask of LF_DBG_* selectors; 0 silences all masked output */
  104 SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, "");    /* read-write sysctl backing lockf_debug */
  105
  106 /*
  107  * Emit a printf()-style diagnostic when 'mask' is 0 (no selector, so always
  108  * printed) or when at least one bit of 'mask' is currently set in lockf_debug.
  109  */
  110 #define LOCKF_DEBUG(mask, ...)                                  \
  111         do {                                                    \
  112                 if( !(mask) || ((mask) & lockf_debug)) {        \
  113                         printf(__VA_ARGS__);                    \
  114                 }                                               \
  115         } while(0)
  116 #else   /* !LOCKF_DEBUGGING */
  117 #define LOCKF_DEBUG(mask, ...)          /* expands to nothing; arguments are discarded */
  118 #endif  /* !LOCKF_DEBUGGING */
 119
  120 MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");      /* type tag passed to every MALLOC/FREE of a struct lockf below */
  121
  122 #define NOLOCKF (struct lockf *)0       /* null lockf pointer; used as the end-of-list test */
  123 #define SELF    0x1                     /* lf_findoverlap() selector used by lf_setlock() for the requester's own locks */
  124 #define OTHERS  0x2                     /* presumably the complementary selector (locks of other owners) - not used in this part of the file */
  125 #define OFF_MAX 0x7fffffffffffffffULL   /* max off_t */
 126
  127 /*
  128  * Result of lf_findoverlap(): how an existing lock ("overlap") relates to the requested range ("lock")
  129  */
  130 typedef enum {
  131         OVERLAP_NONE = 0,               /* 0) no overlap */
  132         OVERLAP_EQUALS_LOCK,            /* 1) overlap == lock */
  133         OVERLAP_CONTAINS_LOCK,          /* 2) overlap contains lock */
  134         OVERLAP_CONTAINED_BY_LOCK,      /* 3) lock contains overlap */
  135         OVERLAP_STARTS_BEFORE_LOCK,     /* 4) overlap starts before lock */
  136         OVERLAP_ENDS_AFTER_LOCK         /* 5) overlap ends after lock */
  137 } overlap_t;
 138
  139 static int       lf_clearlock(struct lockf *);
  140 static overlap_t lf_findoverlap(struct lockf *,
  141             struct lockf *, int, struct lockf ***, struct lockf **);
  142 static struct lockf *lf_getblock(struct lockf *, pid_t);
  143 static int       lf_getlock(struct lockf *, struct flock *, pid_t);
  144 static int       lf_setlock(struct lockf *, struct timespec *);       /* links the lock into the list on success, frees it on failure */
  145 static int       lf_split(struct lockf *, struct lockf *);
  146 static void      lf_wakelock(struct lockf *, boolean_t);
  147 #if IMPORTANCE_INHERITANCE      /* the helpers below are declared only when this option is enabled */
  148 static void      lf_hold_assertion(task_t, struct lockf *);
  149 static void      lf_jump_to_queue_head(struct lockf *, struct lockf *);
  150 static void      lf_drop_assertion(struct lockf *);
  151 static void      lf_boost_blocking_proc(struct lockf *, struct lockf *);
  152 #endif /* IMPORTANCE_INHERITANCE */
 153
  154 /*
  155  * lf_advlock
  156  *
  157  * Description: Advisory record locking support
  158  *
  159  * Parameters:  ap                      Argument pointer to a vnop_advlock_args
  160  *                                      argument descriptor structure for the
  161  *                                      lock operation to be attempted.
  162  *
  163  * Returns:     0                       Success
  164  *              EOVERFLOW               Computed start or end would exceed OFF_MAX
  165  *              EINVAL                  Unknown l_whence or a_op, or a negative start
  166  *              ENOLCK                  Number of locked regions exceeds limit
  167  *      lf_setlock:EAGAIN
  168  *      lf_setlock:EDEADLK
  169  *      lf_setlock:EINTR
  170  *      lf_setlock:ENOLCK
  171  *      lf_setlock:ETIMEDOUT
  172  *      lf_clearlock:ENOLCK
  173  *      vnode_size:errno                Error obtaining the file size (SEEK_END only)
  174  *
  175  * Notes:       We return ENOLCK when we run out of memory to support locks; as
  176  *              such, there is no specific expectation limit other than the
  177  *              amount of available resources.
  178  */
  179 int
  180 lf_advlock(struct vnop_advlock_args *ap)
  181 {
  182         struct vnode *vp = ap->a_vp;
  183         struct flock *fl = ap->a_fl;
  184         vfs_context_t context = ap->a_context;
  185         struct lockf *lock;
  186         off_t start, end, oadd;
  187         u_quad_t size;
  188         int error;
  189         struct lockf **head = &vp->v_lockf;     /* head of this vnode's lock list */
  190
  191         /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */
  192
  193         /*
  194          * Fast path: with no locks on the vnode, any op other than F_SETLK trivially succeeds.
  195          */
  196         if (*head == (struct lockf *)0) {       /* NOTE: read before vp->v_lock is taken below */
  197                 if (ap->a_op != F_SETLK) {
  198                         fl->l_type = F_UNLCK;   /* report the range as unlocked to the caller */
  199                         LOCKF_DEBUG(LF_DBG_TRACE,
  200                             "lf_advlock: '%s' unlock without lock\n",
  201                             vfs_context_proc(context)->p_comm);
  202                         return (0);
  203                 }
  204         }
  205
  206         /*
  207          * Convert the flock structure into a start and end.
  208          */
  209         switch (fl->l_whence) {
  210
  211         case SEEK_SET:
  212         case SEEK_CUR:
  213                 /*
  214                  * Caller is responsible for adding any necessary offset
  215                  * when SEEK_CUR is used.
  216                  */
  217                 start = fl->l_start;
  218                 break;
  219
  220         case SEEK_END:
  221
  222                 /*
  223                  * It's OK to cast the u_quad_t to an off_t here, since they
  224                  * are the same storage size, and the value of the returned
  225                  * contents will never overflow into the sign bit.  We need to
  226                  * do this because we will use size to force range checks.
  227                  */
  228                 if ((error = vnode_size(vp, (off_t *)&size, context))) {
  229                         LOCKF_DEBUG(LF_DBG_TRACE,
  230                             "lf_advlock: vnode_getattr failed: %d\n", error);
  231                         return (error);
  232                 }
  233
  234                 if (size > OFF_MAX ||
  235                     (fl->l_start > 0 &&
  236                      size > (u_quad_t)(OFF_MAX - fl->l_start)))        /* size + l_start would exceed OFF_MAX */
  237                         return (EOVERFLOW);
  238                 start = size + fl->l_start;
  239                 break;
  240
  241         default:
  242                 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: unknown whence %d\n",
  243                     fl->l_whence);
  244                 return (EINVAL);
  245         }
  246         if (start < 0) {
  247                 LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: start < 0 (%qd)\n",
  248                     start);
  249                 return (EINVAL);
  250         }
  251         if (fl->l_len < 0) {    /* negative length: the range is the |l_len| bytes preceding 'start' */
  252                 if (start == 0) {
  253                         LOCKF_DEBUG(LF_DBG_TRACE,
  254                             "lf_advlock: len < 0 & start == 0\n");
  255                         return (EINVAL);
  256                 }
  257                 end = start - 1;        /* last byte covered is the one just before the original start */
  258                 start += fl->l_len;     /* move start back by |l_len| */
  259                 if (start < 0) {
  260                         LOCKF_DEBUG(LF_DBG_TRACE,
  261                             "lf_advlock: start < 0 (%qd)\n", start);
  262                         return (EINVAL);
  263                 }
  264         } else if (fl->l_len == 0)
  265                 end = -1;               /* zero length is recorded as the sentinel end -1 (POSIX: through end of file) */
  266         else {
  267                 oadd = fl->l_len - 1;   /* distance from start to the inclusive end offset */
  268                 if (oadd > (off_t)(OFF_MAX - start)) {
  269                         LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: overflow\n");
  270                         return (EOVERFLOW);
  271                 }
  272                 end = start + oadd;
  273         }
  274         /*
  275          * Create the lockf structure
  276          */
  277         MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
  278         if (lock == NULL)
  279                 return (ENOLCK);
  280         lock->lf_start = start;
  281         lock->lf_end = end;
  282         lock->lf_id = ap->a_id;
  283         lock->lf_vnode = vp;
  284         lock->lf_type = fl->l_type;
  285         lock->lf_head = head;
  286         lock->lf_next = (struct lockf *)0;
  287         TAILQ_INIT(&lock->lf_blkhd);    /* no requests are blocked on this lock yet */
  288         lock->lf_flags = ap->a_flags;
  289 #if IMPORTANCE_INHERITANCE
  290         lock->lf_boosted = LF_NOT_BOOSTED;
  291 #endif
  292         if (ap->a_flags & F_POSIX)
  293                 lock->lf_owner = (struct proc *)lock->lf_id;    /* for POSIX locks the id is used as the owning proc */
  294         else
  295                 lock->lf_owner = NULL;
  296
  297         if (ap->a_flags & F_FLOCK)
  298                 lock->lf_flags |= F_WAKE1_SAFE;
  299
  300         lck_mtx_lock(&vp->v_lock);      /* protect the lockf list */
  301         /*
  302          * Do the requested operation.
  303          */
  304         switch(ap->a_op) {
  305         case F_SETLK:
  306                 /*
  307                  * For F_OFD_* locks, lf_id is the fileglob.
  308                  * Record an "lf_owner" iff this is a confined fd
  309                  * i.e. it cannot escape this process and will be
  310                  * F_UNLCKed before the owner exits.  (This is
  311                  * the implicit guarantee needed to ensure lf_owner
  312                  * remains a valid reference here.)
  313                  */
  314                 if (ap->a_flags & F_OFD_LOCK) {
  315                         struct fileglob *fg = (void *)lock->lf_id;
  316                         if (fg->fg_lflags & FG_CONFINED)
  317                                 lock->lf_owner = current_proc();
  318                 }
  319                 error = lf_setlock(lock, ap->a_timeout);        /* lf_setlock() links 'lock' into the list or frees it */
  320                 break;
  321
  322         case F_UNLCK:
  323                 error = lf_clearlock(lock);
  324                 FREE(lock, M_LOCKF);    /* 'lock' only described the range to clear */
  325                 break;
  326
  327         case F_GETLK:
  328                 error = lf_getlock(lock, fl, -1);
  329                 FREE(lock, M_LOCKF);    /* 'lock' only described the range to query */
  330                 break;
  331
  332
  333         default:
  334                 FREE(lock, M_LOCKF);
  335                 error = EINVAL;
  336                 break;
  337         }
  338         lck_mtx_unlock(&vp->v_lock);    /* done manipulating the list */
  339
  340         LOCKF_DEBUG(LF_DBG_TRACE, "lf_advlock: normal exit: %d\n", error);
  341         return (error);
  342 }
 343
  344 /*
  345  * Abort the msleeping requests queued on the first lock of the given vnode:
  346  * each is flagged F_ABORT and then woken.  Called with the vnode already
  347  * locked.  Used for forced unmount, where a flock(2) invoker sleeping on a blocked
  348  * lock holds an iocount reference that prevents the vnode from draining.  Force unmounting wins.
  349  */
  350 void
  351 lf_abort_advlocks(vnode_t vp)
  352 {
  353         struct lockf *lock;
  354
  355         if ((lock = vp->v_lockf) == NULL)       /* no locks on this vnode, so no waiters either */
  356                 return;
  357
  358         lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);     /* caller must already hold the vnode lock */
  359
  360         if (!TAILQ_EMPTY(&lock->lf_blkhd)) {    /* only the head lock's blocked queue is examined */
  361                 struct lockf *tlock;
  362
  363                 TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
  364                         /*
  365                          * Setting this flag should cause all
  366                          * currently blocked F_SETLK requests to return to userland
  367                          * with EBADF (see the F_ABORT check after msleep() in lf_setlock()).
  368                          */
  369                         tlock->lf_flags |= F_ABORT;
  370                 }
  371                 lf_wakelock(lock, TRUE);        /* wake the sleepers so they observe F_ABORT */
  372         }
  373 }
 374
  375 /*
  376  * Take any lock attempts which are currently blocked by a given lock ("from")
  377  * and mark them as blocked by a different lock ("to").  Used in the case
  378  * where a byte range currently occupied by "from" is to be occupied by "to".
  379  */
  380 static void
  381 lf_move_blocked(struct lockf *to, struct lockf *from)
  382 {
  383         struct lockf *tlock;
  384
  385         TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) {
  386                 tlock->lf_next = to;    /* a waiter's lf_next names the lock blocking it */
  387         }
  388
  389         TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block);        /* append from's waiters to to's queue; from's queue is left empty */
  390 }
 391
  392 /*
  393  * lf_coalesce_adjacent
  394  *
  395  * Description: Helper function: when setting a lock, coalesce adjacent
  396  *              locks.  Needed because adjacent locks are not overlapping,
  397  *              but POSIX requires that they be coalesced.  Only locks with
  398  *              the same lf_id and lf_type as 'lock' are merged into it.
  399  * Parameters:  lock                    The new lock which may be adjacent
  400  *                                      to already locked regions, and which
  401  *                                      should therefore be coalesced with them
  402  *
  403  * Returns:     <void>                  'lock' is widened in place; each absorbed lock is freed
  404  */
  405 static void
  406 lf_coalesce_adjacent(struct lockf *lock)
  407 {
  408         struct lockf **lf = lock->lf_head;      /* address of the list link currently being examined */
  409
  410         while (*lf != NOLOCKF) {
  411                 /* reject locks that obviously could not be coalesced */
  412                 if ((*lf == lock) ||
  413                     ((*lf)->lf_id != lock->lf_id) ||
  414                     ((*lf)->lf_type != lock->lf_type)) {
  415                         lf = &(*lf)->lf_next;
  416                         continue;
  417                 }
  418
  419                 /*
  420                  * NOTE: Assumes that if two locks are adjacent on the number line
  421                  * and belong to the same owner, then they are adjacent on the list.
  422                  */
  423                 if ((*lf)->lf_end != -1 &&      /* an end of -1 (zero-length request) has no "end + 1" to abut */
  424                     ((*lf)->lf_end + 1) == lock->lf_start) {
  425                         struct lockf *adjacent = *lf;
  426
  427                         LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent previous\n");
  428                         lock->lf_start = (*lf)->lf_start;       /* grow 'lock' backwards over the neighbour */
  429                         *lf = lock;                             /* neighbour's list slot now points at 'lock', unlinking the neighbour */
  430                         lf = &(*lf)->lf_next;                   /* continue the scan after 'lock' */
  431
  432                         lf_move_blocked(lock, adjacent);        /* its waiters are now blocked by 'lock' */
  433
  434                         FREE(adjacent, M_LOCKF);
  435                         continue;
  436                 }
  437                 /* If the lock starts adjacent to us, we can coalesce it */
  438                 if (lock->lf_end != -1 &&
  439                     (lock->lf_end + 1) == (*lf)->lf_start) {
  440                         struct lockf *adjacent = *lf;
  441
  442                         LOCKF_DEBUG(LF_DBG_LIST, "lf_coalesce_adjacent: coalesce adjacent following\n");
  443                         lock->lf_end = (*lf)->lf_end;           /* grow 'lock' forwards over the neighbour */
  444                         lock->lf_next = (*lf)->lf_next;         /* unlink the neighbour, taken to be lock's immediate successor (see NOTE) */
  445                         lf = &lock->lf_next;                    /* continue the scan from lock's new successor */
  446
  447                         lf_move_blocked(lock, adjacent);        /* its waiters are now blocked by 'lock' */
  448
  449                         FREE(adjacent, M_LOCKF);
  450                         continue;
  451                 }
  452
  453                 /* no matching conditions; go on to next lock */
  454                 lf = &(*lf)->lf_next;
  455         }
  456 }
 457
 458 /*
 459  * lf_setlock
 460  *
 461  * Description: Set a byte-range lock.
 462  *
 463  * Parameters:  lock                    The lock structure describing the lock
 464  *                                      to be set; allocated by the caller, it
 465  *                                      will be linked into the lock list if
 466  *                                      the set is successful, and freed if the
 467  *                                      set is unsuccessful.
 468  *
 469  *              timeout                 Timeout specified in the case of
 470  *                                      SETLKWTIMEOUT.
 471  *
 472  * Returns:     0                       Success
 473  *              EAGAIN
 474  *              EDEADLK
 475  *      lf_split:ENOLCK
 476  *      lf_clearlock:ENOLCK
 477  *      msleep:EINTR
 478  *      msleep:ETIMEDOUT
 479  *
 480  * Notes:       We add the lock to the provisional lock list.  We do not
 481  *              coalesce at this time; this has implications for other lock
 482  *              requestors in the blocker search mechanism.
 483  */
 484 static int
 485 lf_setlock(struct lockf *lock, struct timespec *timeout)
 486 {
 487         struct lockf *block;
 488         struct lockf **head = lock->lf_head;
 489         struct lockf **prev, *overlap, *ltmp;
 490         static char lockstr[] = "lockf";
 491         int priority, needtolink, error;
 492         struct vnode *vp = lock->lf_vnode;
 493         overlap_t ovcase;
 494
 495 #ifdef LOCKF_DEBUGGING
 496         if (lockf_debug & LF_DBG_LOCKOP) {
 497                 lf_print("lf_setlock", lock);
 498                 lf_printlist("lf_setlock(in)", lock);
 499         }
 500 #endif /* LOCKF_DEBUGGING */
 501
 502         /*
 503          * Set the priority
 504          */
 505         priority = PLOCK;
 506         if (lock->lf_type == F_WRLCK)
 507                 priority += 4;
 508         priority |= PCATCH;
 509         /*
 510          * Scan lock list for this file looking for locks that would block us.
 511          */
 512         while ((block = lf_getblock(lock, -1))) {
 513                 /*
 514                  * Free the structure and return if nonblocking.
 515                  */
 516                 if ((lock->lf_flags & F_WAIT) == 0) {
 517                         DTRACE_FSINFO(advlock__nowait, vnode_t, vp);
 518                         FREE(lock, M_LOCKF);
 519                         return (EAGAIN);
 520                 }
 521
 522                 /*
 523                  * We are blocked. Since flock style locks cover
 524                  * the whole file, there is no chance for deadlock.
 525                  *
 526                  * OFD byte-range locks currently do NOT support
 527                  * deadlock detection.
 528                  *
 529                  * For POSIX byte-range locks we must check for deadlock.
 530                  *
 531                  * Deadlock detection is done by looking through the
 532                  * wait channels to see if there are any cycles that
 533                  * involve us. MAXDEPTH is set just to make sure we
 534                  * do not go off into neverland.
 535                  */
 536                 if ((lock->lf_flags & F_POSIX) &&
 537                     (block->lf_flags & F_POSIX)) {
 538                         struct proc *wproc, *bproc;
 539                         struct uthread *ut;
 540                         struct lockf *waitblock;
 541                         int i = 0;
 542
 543                         /* The block is waiting on something */
 544                         wproc = block->lf_owner;
 545                         proc_lock(wproc);
 546                         TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
 547                                 /*
 548                                  * While the thread is asleep (uu_wchan != 0)
 549                                  * in this code (uu_wmesg == lockstr)
 550                                  * and we have not exceeded the maximum cycle
 551                                  * depth (i < maxlockdepth), then check for a
 552                                  * cycle to see if the lock is blocked behind
 553                                  * someone blocked behind us.
 554                                  */
 555                                 while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) &&
 556                                     ut->uu_wmesg == lockstr &&
 557                                     (i++ < maxlockdepth)) {
 558                                         waitblock = (struct lockf *)ut->uu_wchan;
 559                                         /*
 560                                          * Get the lock blocking the lock
 561                                          * which would block us, and make
 562                                          * certain it hasn't come unblocked
 563                                          * (been granted, e.g. between the time
 564                                          * we called lf_getblock, and the time
 565                                          * we successfully acquired the
 566                                          * proc_lock).
 567                                          */
 568                                         waitblock = waitblock->lf_next;
 569                                         if (waitblock == NULL)
 570                                                 break;
 571
 572                                         /*
 573                                          * Make sure it's an advisory range
 574                                          * lock and not any other kind of lock;
 575                                          * if we mix lock types, it's our own
 576                                          * fault.
 577                                          */
 578                                         if ((waitblock->lf_flags & F_POSIX) == 0)
 579                                                 break;
 580
 581                                         /*
 582                                          * If the owner of the lock that's
 583                                          * blocking a lock that's blocking us
 584                                          * getting the requested lock, then we
 585                                          * would deadlock, so error out.
 586                                          */
 587                                         bproc = waitblock->lf_owner;
 588                                         if (bproc == lock->lf_owner) {
 589                                                 proc_unlock(wproc);
 590                                                 FREE(lock, M_LOCKF);
 591                                                 return (EDEADLK);
 592                                         }
 593                                 }
 594                         }
 595                         proc_unlock(wproc);
 596                 }
 597
 598                 /*
 599                  * For flock type locks, we must first remove
 600                  * any shared locks that we hold before we sleep
 601                  * waiting for an exclusive lock.
 602                  */
 603                 if ((lock->lf_flags & F_FLOCK) &&
 604                     lock->lf_type == F_WRLCK) {
 605                         lock->lf_type = F_UNLCK;
 606                         if ((error = lf_clearlock(lock)) != 0) {
 607                                 FREE(lock, M_LOCKF);
 608                                 return (error);
 609                         }
 610                         lock->lf_type = F_WRLCK;
 611                 }
 612                 /*
 613                  * Add our lock to the blocked list and sleep until we're free.
 614                  * Remember who blocked us (for deadlock detection).
 615                  */
 616                 lock->lf_next = block;
 617                 TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
 618
 619                 if ( !(lock->lf_flags & F_FLOCK))
 620                         block->lf_flags &= ~F_WAKE1_SAFE;
 621
 622 #if IMPORTANCE_INHERITANCE
 623                 /*
 624                  * Importance donation is done only for cases where the
 625                  * owning task can be unambiguously determined.
 626                  *
 627                  * POSIX type locks are not inherited by child processes;
 628                  * we maintain a 1:1 mapping between a lock and its owning
 629                  * process.
 630                  *
 631                  * Flock type locks are inherited across fork() and there is
 632                  * no 1:1 mapping in the general case.  However, the fileglobs
 633                  * used by OFD locks *may* be confined to the process that
 634                  * created them, and thus have an "owner", in which case
 635                  * we also attempt importance donation.
 636                  */
 637                 if ((lock->lf_flags & block->lf_flags & F_POSIX) != 0)
 638                         lf_boost_blocking_proc(lock, block);
 639                 else if ((lock->lf_flags & block->lf_flags & F_OFD_LOCK) &&
 640                     lock->lf_owner != block->lf_owner &&
 641                     NULL != lock->lf_owner && NULL != block->lf_owner)
 642                         lf_boost_blocking_proc(lock, block);
 643 #endif /* IMPORTANCE_INHERITANCE */
 644
 645 #ifdef LOCKF_DEBUGGING
 646                 if (lockf_debug & LF_DBG_LOCKOP) {
 647                         lf_print("lf_setlock: blocking on", block);
 648                         lf_printlist("lf_setlock(block)", block);
 649                 }
 650 #endif /* LOCKF_DEBUGGING */
 651                 DTRACE_FSINFO(advlock__wait, vnode_t, vp);
 652
 653                 error = msleep(lock, &vp->v_lock, priority, lockstr, timeout);
 654
 655                 if (error == 0 && (lock->lf_flags & F_ABORT) != 0)
 656                         error = EBADF;
 657
 658                 if (lock->lf_next) {
 659                         /*
 660                          * lf_wakelock() always sets wakelock->lf_next to
 661                          * NULL before a wakeup; so we've been woken early
 662                          * - perhaps by a debugger, signal or other event.
 663                          *
 664                          * Remove 'lock' from the block list (avoids double-add
 665                          * in the spurious case, which would create a cycle)
 666                          */
 667                         TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
 668                         lock->lf_next = NULL;
 669
 670                         if (error == 0) {
 671                                 /*
 672                                  * If this was a spurious wakeup, retry
 673                                  */
 674                                 printf("%s: spurious wakeup, retrying lock\n",
 675                                     __func__);
 676                                 continue;
 677                         }
 678                 }
 679
 680                 if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
 681                         if ((block = lf_getblock(lock, -1)) != NULL)
 682                                 lf_move_blocked(block, lock);
 683                 }
 684
 685                 if (error) {
 686                         if (!TAILQ_EMPTY(&lock->lf_blkhd))
 687                                 lf_wakelock(lock, TRUE);
 688                         FREE(lock, M_LOCKF);
  689                         /* Return ETIMEDOUT if timeout occurred. */
 690                         if (error == EWOULDBLOCK) {
 691                                 error = ETIMEDOUT;
 692                         }
 693                         return (error);
 694                 }
 695         }
 696
 697         /*
 698          * No blocks!!  Add the lock.  Note that we will
 699          * downgrade or upgrade any overlapping locks this
 700          * process already owns.
 701          *
 702          * Skip over locks owned by other processes.
 703          * Handle any locks that overlap and are owned by ourselves.
 704          */
 705         prev = head;
 706         block = *head;
 707         needtolink = 1;
 708         for (;;) {
 709                 ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
 710                 if (ovcase)
 711                         block = overlap->lf_next;
 712                 /*
 713                  * Six cases:
 714                  *      0) no overlap
 715                  *      1) overlap == lock
 716                  *      2) overlap contains lock
 717                  *      3) lock contains overlap
 718                  *      4) overlap starts before lock
 719                  *      5) overlap ends after lock
 720                  */
 721                 switch (ovcase) {
 722                 case OVERLAP_NONE:
 723                         if (needtolink) {
 724                                 *prev = lock;
 725                                 lock->lf_next = overlap;
 726                         }
 727                         break;
 728
 729                 case OVERLAP_EQUALS_LOCK:
 730                         /*
 731                          * If downgrading lock, others may be
 732                          * able to acquire it.
 733                          */
 734                         if (lock->lf_type == F_RDLCK &&
 735                             overlap->lf_type == F_WRLCK)
 736                                 lf_wakelock(overlap, TRUE);
 737                         overlap->lf_type = lock->lf_type;
 738                         FREE(lock, M_LOCKF);
 739                         lock = overlap; /* for lf_coalesce_adjacent() */
 740                         break;
 741
 742                 case OVERLAP_CONTAINS_LOCK:
 743                         /*
 744                          * Check for common starting point and different types.
 745                          */
 746                         if (overlap->lf_type == lock->lf_type) {
 747                                 FREE(lock, M_LOCKF);
 748                                 lock = overlap; /* for lf_coalesce_adjacent() */
 749                                 break;
 750                         }
 751                         if (overlap->lf_start == lock->lf_start) {
 752                                 *prev = lock;
 753                                 lock->lf_next = overlap;
 754                                 overlap->lf_start = lock->lf_end + 1;
 755                         } else {
 756                                 /*
 757                                  * If we can't split the lock, we can't
 758                                  * grant it.  Claim a system limit for the
 759                                  * resource shortage.
 760                                  */
 761                                 if (lf_split(overlap, lock)) {
 762                                         FREE(lock, M_LOCKF);
 763                                         return (ENOLCK);
 764                                 }
 765                         }
 766                         lf_wakelock(overlap, TRUE);
 767                         break;
 768
 769                 case OVERLAP_CONTAINED_BY_LOCK:
 770                         /*
 771                          * If downgrading lock, others may be able to
 772                          * acquire it, otherwise take the list.
 773                          */
 774                         if (lock->lf_type == F_RDLCK &&
 775                             overlap->lf_type == F_WRLCK) {
 776                                 lf_wakelock(overlap, TRUE);
 777                         } else {
 778                                 while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
 779                                         ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
 780                                         TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
 781                                             lf_block);
 782                                         TAILQ_INSERT_TAIL(&lock->lf_blkhd,
 783                                             ltmp, lf_block);
 784                                         ltmp->lf_next = lock;
 785                                 }
 786                         }
 787                         /*
 788                          * Add the new lock if necessary and delete the overlap.
 789                          */
 790                         if (needtolink) {
 791                                 *prev = lock;
 792                                 lock->lf_next = overlap->lf_next;
 793                                 prev = &lock->lf_next;
 794                                 needtolink = 0;
 795                         } else
 796                                 *prev = overlap->lf_next;
 797                         FREE(overlap, M_LOCKF);
 798                         continue;
 799
 800                 case OVERLAP_STARTS_BEFORE_LOCK:
 801                         /*
 802                          * Add lock after overlap on the list.
 803                          */
 804                         lock->lf_next = overlap->lf_next;
 805                         overlap->lf_next = lock;
 806                         overlap->lf_end = lock->lf_start - 1;
 807                         prev = &lock->lf_next;
 808                         lf_wakelock(overlap, TRUE);
 809                         needtolink = 0;
 810                         continue;
 811
 812                 case OVERLAP_ENDS_AFTER_LOCK:
 813                         /*
 814                          * Add the new lock before overlap.
 815                          */
 816                         if (needtolink) {
 817                                 *prev = lock;
 818                                 lock->lf_next = overlap;
 819                         }
 820                         overlap->lf_start = lock->lf_end + 1;
 821                         lf_wakelock(overlap, TRUE);
 822                         break;
 823                 }
 824                 break;
 825         }
 826         /* Coalesce adjacent locks with identical attributes */
 827         lf_coalesce_adjacent(lock);
 828 #ifdef LOCKF_DEBUGGING
 829         if (lockf_debug & LF_DBG_LOCKOP) {
 830                 lf_print("lf_setlock: got the lock", lock);
 831                 lf_printlist("lf_setlock(out)", lock);
 832         }
 833 #endif /* LOCKF_DEBUGGING */
 834         return (0);
 835 }
 836
 837
 838 /*
 839  * lf_clearlock
 840  *
 841  * Description: Remove a byte-range lock on an vnode.  Generally, find the
 842  *              lock (or an overlap to that lock) and remove it (or shrink
 843  *              it), then wakeup anyone we can.
 844  *
 845  * Parameters:  unlock                  The lock to clear
 846  *
 847  * Returns:     0                       Success
 848  *      lf_split:ENOLCK
 849  *
 850  * Notes:       A caller may unlock all the locks owned by the caller by
 851  *              specifying the entire file range; locks owned by other
 852  *              callers are not effected by this operation.
 853  */
 854 static int
 855 lf_clearlock(struct lockf *unlock)
 856 {
 857         struct lockf **head = unlock->lf_head;
 858         struct lockf *lf = *head;
 859         struct lockf *overlap, **prev;
 860         overlap_t ovcase;
 861
 862         if (lf == NOLOCKF)
 863                 return (0);
 864 #ifdef LOCKF_DEBUGGING
 865         if (unlock->lf_type != F_UNLCK)
 866                 panic("lf_clearlock: bad type");
 867         if (lockf_debug & LF_DBG_LOCKOP)
 868                 lf_print("lf_clearlock", unlock);
 869 #endif /* LOCKF_DEBUGGING */
 870         prev = head;
 871         while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
 872                 /*
 873                  * Wakeup the list of locks to be retried.
 874                  */
 875                 lf_wakelock(overlap, FALSE);
 876 #if IMPORTANCE_INHERITANCE
 877                 if (overlap->lf_boosted == LF_BOOSTED) {
 878                         lf_drop_assertion(overlap);
 879                 }
 880 #endif /* IMPORTANCE_INHERITANCE */
 881
 882                 switch (ovcase) {
 883                 case OVERLAP_NONE:      /* satisfy compiler enum/switch */
 884                         break;
 885
 886                 case OVERLAP_EQUALS_LOCK:
 887                         *prev = overlap->lf_next;
 888                         FREE(overlap, M_LOCKF);
 889                         break;
 890
 891                 case OVERLAP_CONTAINS_LOCK: /* split it */
 892                         if (overlap->lf_start == unlock->lf_start) {
 893                                 overlap->lf_start = unlock->lf_end + 1;
 894                                 break;
 895                         }
 896                         /*
 897                          * If we can't split the lock, we can't grant it.
 898                          * Claim a system limit for the resource shortage.
 899                          */
 900                         if (lf_split(overlap, unlock))
 901                                 return (ENOLCK);
 902                         overlap->lf_next = unlock->lf_next;
 903                         break;
 904
 905                 case OVERLAP_CONTAINED_BY_LOCK:
 906                         *prev = overlap->lf_next;
 907                         lf = overlap->lf_next;
 908                         FREE(overlap, M_LOCKF);
 909                         continue;
 910
 911                 case OVERLAP_STARTS_BEFORE_LOCK:
 912                         overlap->lf_end = unlock->lf_start - 1;
 913                         prev = &overlap->lf_next;
 914                         lf = overlap->lf_next;
 915                         continue;
 916
 917                 case OVERLAP_ENDS_AFTER_LOCK:
 918                         overlap->lf_start = unlock->lf_end + 1;
 919                         break;
 920                 }
 921                 break;
 922         }
 923 #ifdef LOCKF_DEBUGGING
 924         if (lockf_debug & LF_DBG_LOCKOP)
 925                 lf_printlist("lf_clearlock", unlock);
 926 #endif /* LOCKF_DEBUGGING */
 927         return (0);
 928 }
 929
 930
 931 /*
 932  * lf_getlock
 933  *
 934  * Description: Check whether there is a blocking lock, and if so return
 935  *              its process identifier into the lock being requested.
 936  *
 937  * Parameters:  lock                    Pointer to lock to test for blocks
 938  *              fl                      Pointer to flock structure to receive
 939  *                                      the blocking lock information, if a
 940  *                                      blocking lock is found.
 941  *              matchpid                -1, or pid value to match in lookup.
 942  *
 943  * Returns:     0                       Success
 944  *
 945  * Implicit Returns:
 946  *              *fl                     Contents modified to reflect the
 947  *                                      blocking lock, if one is found; not
 948  *                                      modified otherwise
 949  *
 950  * Notes:       fl->l_pid will be (-1) for file locks and will only be set to
 951  *              the blocking process ID for advisory record locks.
 952  */
 953 static int
 954 lf_getlock(struct lockf *lock, struct flock *fl, pid_t matchpid)
 955 {
 956         struct lockf *block;
 957
 958 #ifdef LOCKF_DEBUGGING
 959         if (lockf_debug & LF_DBG_LOCKOP)
 960                 lf_print("lf_getlock", lock);
 961 #endif /* LOCKF_DEBUGGING */
 962
 963         if ((block = lf_getblock(lock, matchpid))) {
 964                 fl->l_type = block->lf_type;
 965                 fl->l_whence = SEEK_SET;
 966                 fl->l_start = block->lf_start;
 967                 if (block->lf_end == -1)
 968                         fl->l_len = 0;
 969                 else
 970                         fl->l_len = block->lf_end - block->lf_start + 1;
 971                 if (NULL != block->lf_owner) {
 972                         /*
 973                          * lf_owner is only non-NULL when the lock
 974                          * "owner" can be unambiguously determined
 975                          */
 976                         fl->l_pid = proc_pid(block->lf_owner);
 977                 } else
 978                         fl->l_pid = -1;
 979         } else {
 980                 fl->l_type = F_UNLCK;
 981         }
 982         return (0);
 983 }
 984
 985 /*
 986  * lf_getblock
 987  *
 988  * Description: Walk the list of locks for an inode and return the first
 989  *              blocking lock.  A lock is considered blocking if we are not
 990  *              the lock owner; otherwise, we are permitted to upgrade or
 991  *              downgrade it, and it's not considered blocking.
 992  *
 993  * Parameters:  lock                    The lock for which we are interested
 994  *                                      in obtaining the blocking lock, if any
 995  *              matchpid                -1, or pid value to match in lookup.
 996  *
 997  * Returns:     NOLOCKF                 No blocking lock exists
 998  *              !NOLOCKF                The address of the blocking lock's
 999  *                                      struct lockf.
1000  */
1001 static struct lockf *
1002 lf_getblock(struct lockf *lock, pid_t matchpid)
1003 {
1004         struct lockf **prev, *overlap, *lf = *(lock->lf_head);
1005
1006         for (prev = lock->lf_head;
1007             lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != OVERLAP_NONE;
1008             lf = overlap->lf_next) {
1009                 /*
1010                  * Found an overlap.
1011                  *
1012                  * If we're matching pids, and it's a record lock,
1013                  * or it's an OFD lock on a process-confined fd,
1014                  * but the pid doesn't match, then keep on looking ..
1015                  */
1016                 if (matchpid != -1 &&
1017                     (overlap->lf_flags & (F_POSIX|F_OFD_LOCK)) != 0 &&
1018                     proc_pid(overlap->lf_owner) != matchpid)
1019                         continue;
1020
1021                 /*
1022                  * does it block us?
1023                  */
1024                 if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
1025                         return (overlap);
1026         }
1027         return (NOLOCKF);
1028 }
1029
1030
1031 /*
1032  * lf_findoverlap
1033  *
1034  * Description: Walk the list of locks to find an overlapping lock (if any).
1035  *
1036  * Parameters:  lf                      First lock on lock list
1037  *              lock                    The lock we are checking for an overlap
1038  *              check                   Check type
1039  *              prev                    pointer to pointer pointer to contain
1040  *                                      address of pointer to previous lock
1041  *                                      pointer to overlapping lock, if overlap
1042  *              overlap                 pointer to pointer to contain address
1043  *                                      of overlapping lock
1044  *
1045  * Returns:     OVERLAP_NONE
1046  *              OVERLAP_EQUALS_LOCK
1047  *              OVERLAP_CONTAINS_LOCK
1048  *              OVERLAP_CONTAINED_BY_LOCK
1049  *              OVERLAP_STARTS_BEFORE_LOCK
1050  *              OVERLAP_ENDS_AFTER_LOCK
1051  *
1052  * Implicit Returns:
1053  *              *prev                   The address of the next pointer in the
1054  *                                      lock previous to the overlapping lock;
1055  *                                      this is generally used to relink the
1056  *                                      lock list, avoiding a second iteration.
1057  *              *overlap                The pointer to the overlapping lock
1058  *                                      itself; this is used to return data in
1059  *                                      the check == OTHERS case, and for the
1060  *                                      caller to modify the overlapping lock,
1061  *                                      in the check == SELF case
1062  *
1063  * Note:        This returns only the FIRST overlapping lock.  There may be
1064  *              more than one.  lf_getlock will return the first blocking lock,
1065  *              while lf_setlock will iterate over all overlapping locks to
1066  *
1067  *              The check parameter can be SELF, meaning we are looking for
1068  *              overlapping locks owned by us, or it can be OTHERS, meaning
1069  *              we are looking for overlapping locks owned by someone else so
1070  *              we can report a blocking lock on an F_GETLK request.
1071  *
1072  *              The value of *overlap and *prev are modified, even if there is
1073  *              no overlapping lock found; always check the return code.
1074  */
1075 static overlap_t
1076 lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
1077                struct lockf ***prev, struct lockf **overlap)
1078 {
1079         off_t start, end;
1080         int found_self = 0;
1081
1082         *overlap = lf;
1083         if (lf == NOLOCKF)
1084                 return (0);
1085 #ifdef LOCKF_DEBUGGING
1086         if (lockf_debug & LF_DBG_LIST)
1087                 lf_print("lf_findoverlap: looking for overlap in", lock);
1088 #endif /* LOCKF_DEBUGGING */
1089         start = lock->lf_start;
1090         end = lock->lf_end;
1091         while (lf != NOLOCKF) {
1092                 if (((type & SELF) && lf->lf_id != lock->lf_id) ||
1093                     ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
1094                         /*
1095                          * Locks belonging to one process are adjacent on the
1096                          * list, so if we've found any locks belonging to us,
1097                          * and we're now seeing something else, then we've
1098                          * examined all "self" locks.  Note that bailing out
1099                          * here is quite important; for coalescing, we assume
1100                          * numerically adjacent locks from the same owner to
1101                          * be adjacent on the list.
1102                          */
1103                         if ((type & SELF) && found_self) {
1104                                 return OVERLAP_NONE;
1105                         }
1106
1107                         *prev = &lf->lf_next;
1108                         *overlap = lf = lf->lf_next;
1109                         continue;
1110                 }
1111
1112                 if ((type & SELF)) {
1113                         found_self = 1;
1114                 }
1115
1116 #ifdef LOCKF_DEBUGGING
1117                 if (lockf_debug & LF_DBG_LIST)
1118                         lf_print("\tchecking", lf);
1119 #endif /* LOCKF_DEBUGGING */
1120                 /*
1121                  * OK, check for overlap
1122                  */
1123                 if ((lf->lf_end != -1 && start > lf->lf_end) ||
1124                     (end != -1 && lf->lf_start > end)) {
1125                         /* Case 0 */
1126                         LOCKF_DEBUG(LF_DBG_LIST, "no overlap\n");
1127
1128                         /*
1129                          * NOTE: assumes that locks for the same process are
1130                          * nonintersecting and ordered.
1131                          */
1132                         if ((type & SELF) && end != -1 && lf->lf_start > end)
1133                                 return (OVERLAP_NONE);
1134                         *prev = &lf->lf_next;
1135                         *overlap = lf = lf->lf_next;
1136                         continue;
1137                 }
1138                 if ((lf->lf_start == start) && (lf->lf_end == end)) {
1139                         LOCKF_DEBUG(LF_DBG_LIST, "overlap == lock\n");
1140                         return (OVERLAP_EQUALS_LOCK);
1141                 }
1142                 if ((lf->lf_start <= start) &&
1143                     (end != -1) &&
1144                     ((lf->lf_end >= end) || (lf->lf_end == -1))) {
1145                         LOCKF_DEBUG(LF_DBG_LIST, "overlap contains lock\n");
1146                         return (OVERLAP_CONTAINS_LOCK);
1147                 }
1148                 if (start <= lf->lf_start &&
1149                            (end == -1 ||
1150                            (lf->lf_end != -1 && end >= lf->lf_end))) {
1151                         LOCKF_DEBUG(LF_DBG_LIST, "lock contains overlap\n");
1152                         return (OVERLAP_CONTAINED_BY_LOCK);
1153                 }
1154                 if ((lf->lf_start < start) &&
1155                         ((lf->lf_end >= start) || (lf->lf_end == -1))) {
1156                         LOCKF_DEBUG(LF_DBG_LIST, "overlap starts before lock\n");
1157                         return (OVERLAP_STARTS_BEFORE_LOCK);
1158                 }
1159                 if ((lf->lf_start > start) &&
1160                         (end != -1) &&
1161                         ((lf->lf_end > end) || (lf->lf_end == -1))) {
1162                         LOCKF_DEBUG(LF_DBG_LIST, "overlap ends after lock\n");
1163                         return (OVERLAP_ENDS_AFTER_LOCK);
1164                 }
1165                 panic("lf_findoverlap: default");
1166         }
1167         return (OVERLAP_NONE);
1168 }
1169
1170
1171 /*
1172  * lf_split
1173  *
1174  * Description: Split a lock and a contained region into two or three locks
1175  *              as necessary.
1176  *
1177  * Parameters:  lock1                   Lock to split
1178  *              lock2                   Overlapping lock region requiring the
1179  *                                      split (upgrade/downgrade/unlock)
1180  *
1181  * Returns:     0                       Success
1182  *              ENOLCK                  No memory for new lock
1183  *
1184  * Implicit Returns:
1185  *              *lock1                  Modified original lock
1186  *              *lock2                  Overlapping lock (inserted into list)
1187  *              (new lock)              Potential new lock inserted into list
1188  *                                      if split results in 3 locks
1189  *
1190  * Notes:       This operation can only fail if the split would result in three
1191  *              locks, and there is insufficient memory to allocate the third
1192  *              lock; in that case, neither of the locks will be modified.
1193  */
1194 static int
1195 lf_split(struct lockf *lock1, struct lockf *lock2)
1196 {
1197         struct lockf *splitlock;
1198
1199 #ifdef LOCKF_DEBUGGING
1200         if (lockf_debug & LF_DBG_LIST) {
1201                 lf_print("lf_split", lock1);
1202                 lf_print("splitting from", lock2);
1203         }
1204 #endif /* LOCKF_DEBUGGING */
1205         /*
1206          * Check to see if splitting into only two pieces.
1207          */
1208         if (lock1->lf_start == lock2->lf_start) {
1209                 lock1->lf_start = lock2->lf_end + 1;
1210                 lock2->lf_next = lock1;
1211                 return (0);
1212         }
1213         if (lock1->lf_end == lock2->lf_end) {
1214                 lock1->lf_end = lock2->lf_start - 1;
1215                 lock2->lf_next = lock1->lf_next;
1216                 lock1->lf_next = lock2;
1217                 return (0);
1218         }
1219         /*
1220          * Make a new lock consisting of the last part of
1221          * the encompassing lock
1222          */
1223         MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
1224         if (splitlock == NULL)
1225                 return (ENOLCK);
1226         bcopy(lock1, splitlock, sizeof *splitlock);
1227         splitlock->lf_start = lock2->lf_end + 1;
1228         TAILQ_INIT(&splitlock->lf_blkhd);
1229         lock1->lf_end = lock2->lf_start - 1;
1230         /*
1231          * OK, now link it in
1232          */
1233         splitlock->lf_next = lock1->lf_next;
1234         lock2->lf_next = splitlock;
1235         lock1->lf_next = lock2;
1236
1237         return (0);
1238 }
1239
1240
1241 /*
1242  * lf_wakelock
1243  *
1244  * Wakeup a blocklist in the case of a downgrade or unlock, since others
1245  * waiting on the lock may now be able to acquire it.
1246  *
1247  * Parameters:  listhead                Lock list head on which waiters may
1248  *                                      have pending locks
1249  *
1250  * Returns:     <void>
1251  *
1252  * Notes:       This function iterates a list of locks and wakes all waiters,
1253  *              rather than only waiters for the contended regions.  Because
1254  *              of this, for heavily contended files, this can result in a
1255  *              "thundering herd" situation.  Refactoring the code could make
1256  *              this operation more efficient, if heavy contention ever results
1257  *              in a real-world performance problem.
1258  */
1259 static void
1260 lf_wakelock(struct lockf *listhead, boolean_t force_all)
1261 {
1262         struct lockf *wakelock;
1263         boolean_t wake_all = TRUE;
1264
1265         if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE))
1266                 wake_all = FALSE;
1267
1268         while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1269                 wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
1270                 TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
1271
1272                 wakelock->lf_next = NOLOCKF;
1273 #ifdef LOCKF_DEBUGGING
1274                 if (lockf_debug & LF_DBG_LOCKOP)
1275                         lf_print("lf_wakelock: awakening", wakelock);
1276 #endif /* LOCKF_DEBUGGING */
1277                 if (wake_all == FALSE) {
1278                         /*
1279                          * If there are items on the list head block list,
1280                          * move them to the wakelock list instead, and then
1281                          * correct their lf_next pointers.
1282                          */
1283                         if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1284                                 TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
1285
1286                                 struct lockf *tlock;
1287
1288                                 TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
1289                                         if (TAILQ_NEXT(tlock, lf_block) == tlock) {
1290                                                 /* See rdar://10887303 */
1291                                                 panic("cycle in wakelock list");
1292                                         }
1293                                         tlock->lf_next = wakelock;
1294                                 }
1295                         }
1296                 }
1297                 wakeup(wakelock);
1298
1299                 if (wake_all == FALSE)
1300                         break;
1301         }
1302 }
1303
1304
1305 #ifdef LOCKF_DEBUGGING
1306 #define GET_LF_OWNER_PID(lf)    (proc_pid((lf)->lf_owner))
1307
1308 /*
1309  * lf_print DEBUG
1310  *
1311  * Print out a lock; lock information is prefixed by the string in 'tag'
1312  *
1313  * Parameters:  tag                     A string tag for debugging
1314  *              lock                    The lock whose information should be
1315  *                                      displayed
1316  *
1317  * Returns:     <void>
1318  */
1319 void
1320 lf_print(const char *tag, struct lockf *lock)
1321 {
1322         printf("%s: lock %p for ", tag, (void *)lock);
1323         if (lock->lf_flags & F_POSIX)
1324                 printf("proc %p (owner %d)",
1325                     lock->lf_id, GET_LF_OWNER_PID(lock));
1326         else if (lock->lf_flags & F_OFD_LOCK)
1327                 printf("fg %p (owner %d)",
1328                     lock->lf_id, GET_LF_OWNER_PID(lock));
1329         else
1330                 printf("id %p", (void *)lock->lf_id);
1331         if (lock->lf_vnode != 0)
1332                 printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
1333                     lock->lf_vnode,
1334                     lock->lf_type == F_RDLCK ? "shared" :
1335                     lock->lf_type == F_WRLCK ? "exclusive" :
1336                     lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1337                     (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1338         else
1339                 printf(" %s, start 0x%016llx, end 0x%016llx",
1340                     lock->lf_type == F_RDLCK ? "shared" :
1341                     lock->lf_type == F_WRLCK ? "exclusive" :
1342                     lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1343                     (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1344         if (!TAILQ_EMPTY(&lock->lf_blkhd))
1345                 printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
1346         else
1347                 printf("\n");
1348 }
1349
1350
1351 /*
1352  * lf_printlist DEBUG
1353  *
1354  * Print out a lock list for the vnode associated with 'lock'; lock information
1355  * is prefixed by the string in 'tag'
1356  *
1357  * Parameters:  tag                     A string tag for debugging
1358  *              lock                    The lock whose vnode's lock list should
1359  *                                      be displayed
1360  *
1361  * Returns:     <void>
1362  */
1363 void
1364 lf_printlist(const char *tag, struct lockf *lock)
1365 {
1366         struct lockf *lf, *blk;
1367
1368         if (lock->lf_vnode == 0)
1369                 return;
1370
1371         printf("%s: Lock list for vno %p:\n",
1372             tag, lock->lf_vnode);
1373         for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
1374                 printf("\tlock %p for ",(void *)lf);
1375                 if (lf->lf_flags & F_POSIX)
1376                         printf("proc %p (owner %d)",
1377                             lf->lf_id, GET_LF_OWNER_PID(lf));
1378                 else if (lf->lf_flags & F_OFD_LOCK)
1379                         printf("fg %p (owner %d)",
1380                             lf->lf_id, GET_LF_OWNER_PID(lf));
1381                 else
1382                         printf("id %p", (void *)lf->lf_id);
1383                 printf(", %s, start 0x%016llx, end 0x%016llx",
1384                     lf->lf_type == F_RDLCK ? "shared" :
1385                     lf->lf_type == F_WRLCK ? "exclusive" :
1386                     lf->lf_type == F_UNLCK ? "unlock" :
1387                     "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
1388                 TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
1389                         printf("\n\t\tlock request %p for ", (void *)blk);
1390                         if (blk->lf_flags & F_POSIX)
1391                                 printf("proc %p (owner %d)",
1392                                     blk->lf_id, GET_LF_OWNER_PID(blk));
1393                         else if (blk->lf_flags & F_OFD_LOCK)
1394                                 printf("fg %p (owner %d)",
1395                                     blk->lf_id, GET_LF_OWNER_PID(blk));
1396                         else
1397                                 printf("id %p", (void *)blk->lf_id);
1398                         printf(", %s, start 0x%016llx, end 0x%016llx",
1399                             blk->lf_type == F_RDLCK ? "shared" :
1400                             blk->lf_type == F_WRLCK ? "exclusive" :
1401                             blk->lf_type == F_UNLCK ? "unlock" :
1402                             "unknown", (intmax_t)blk->lf_start,
1403                             (intmax_t)blk->lf_end);
1404                         if (!TAILQ_EMPTY(&blk->lf_blkhd))
1405                                 panic("lf_printlist: bad list");
1406                 }
1407                 printf("\n");
1408         }
1409 }
1410 #endif /* LOCKF_DEBUGGING */
1411
1412 #if IMPORTANCE_INHERITANCE
1413
1414 /*
1415  * lf_hold_assertion
1416  *
1417  * Call task importance hold assertion on the owner of the lock.
1418  *
1419  * Parameters: block_task               Owner of the lock blocking
1420  *                                      current thread.
1421  *
1422  *             block                    lock on which the current thread
1423  *                                      is blocking on.
1424  *
1425  * Returns:    <void>
1426  *
1427  * Notes: The task reference on block_task is not needed to be hold since
1428  *        the current thread has vnode lock and block_task has a file
1429  *        lock, thus removing file lock in exit requires block_task to
1430  *        grab the vnode lock.
1431  */
1432 static void
1433 lf_hold_assertion(task_t block_task, struct lockf *block)
1434 {
1435         if (task_importance_hold_file_lock_assertion(block_task, 1) == 0) {
1436                 block->lf_boosted = LF_BOOSTED;
1437                 LOCKF_DEBUG(LF_DBG_IMPINH,
1438                     "lf: importance hold file lock assert on pid %d lock %p\n",
1439                     proc_pid(block->lf_owner), block);
1440         }
1441 }
1442
1443
1444 /*
1445  * lf_jump_to_queue_head
1446  *
1447  * Jump the lock from the tail of the block queue to the head of
1448  * the queue.
1449  *
1450  * Parameters: block                    lockf struct containing the
1451  *                                      block queue.
1452  *             lock                     lockf struct to be jumped to the
1453  *                                      front.
1454  *
1455  * Returns:    <void>
1456  */
1457 static void
1458 lf_jump_to_queue_head(struct lockf *block, struct lockf *lock)
1459 {
1460         /* Move the lock to the head of the block queue. */
1461         TAILQ_REMOVE(&block->lf_blkhd, lock, lf_block);
1462         TAILQ_INSERT_HEAD(&block->lf_blkhd, lock, lf_block);
1463 }
1464
1465
1466 /*
1467  * lf_drop_assertion
1468  *
1469  * Drops the task hold assertion.
1470  *
1471  * Parameters: block                    lockf struct holding the assertion.
1472  *
1473  * Returns:    <void>
1474  */
1475 static void
1476 lf_drop_assertion(struct lockf *block)
1477 {
1478         LOCKF_DEBUG(LF_DBG_IMPINH, "lf: %d: dropping assertion for lock %p\n",
1479             proc_pid(block->lf_owner), block);
1480
1481         task_t current_task = proc_task(block->lf_owner);
1482         task_importance_drop_file_lock_assertion(current_task, 1);
1483         block->lf_boosted = LF_NOT_BOOSTED;
1484 }
1485
1486 static void
1487 lf_boost_blocking_proc(struct lockf *lock, struct lockf *block)
1488 {
1489         task_t ltask = proc_task(lock->lf_owner);
1490         task_t btask = proc_task(block->lf_owner);
1491
1492         /*
1493          * Check if ltask can donate importance. The
1494          * check of imp_donor bit is done without holding
1495          * any lock. The value may change after you read it,
1496          * but it is ok to boost a task while someone else is
1497          * unboosting you.
1498          *
1499          * TODO: Support live inheritance on file locks.
1500          */
1501         if (task_is_importance_donor(ltask)) {
1502                 LOCKF_DEBUG(LF_DBG_IMPINH,
1503                     "lf: %d: attempt to boost pid %d that holds lock %p\n",
1504                     proc_pid(lock->lf_owner), proc_pid(block->lf_owner), block);
1505
1506                 if (block->lf_boosted != LF_BOOSTED &&
1507                     task_is_importance_receiver_type(btask)) {
1508                         lf_hold_assertion(btask, block);
1509                 }
1510                 lf_jump_to_queue_head(block, lock);
1511         }
1512 }
1513 #endif /* IMPORTANCE_INHERITANCE */