/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/vnode_internal.h>
#include <sys/vnode_if.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
/*
 * This variable controls the maximum number of processes that will
 * be checked in doing deadlock detection.
 */
static int maxlockdepth = MAXDEPTH;
#ifdef LOCKF_DEBUGGING
#include <sys/sysctl.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
void lf_print(const char *tag, struct lockf *lock);
void lf_printlist(const char *tag, struct lockf *lock);
static int	lockf_debug = 2;
SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
/*
 * If there is no mask bit selector, or there is one, and the selector is
 * set, then output the debugging diagnostic.
 */
#define LOCKF_DEBUG(mask, ...)					\
	do {							\
		if( !(mask) || ((mask) & lockf_debug)) {	\
			printf(__VA_ARGS__);			\
		}						\
	} while(0)
#else	/* !LOCKF_DEBUGGING */
#define LOCKF_DEBUG(mask, ...)		/* mask */
#endif	/* !LOCKF_DEBUGGING */
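/*
 * Example (illustrative, not part of the original sources): with the
 * default lockf_debug == 2, LOCKF_DEBUG(0, ...) always prints because
 * !(0) is true, LOCKF_DEBUG(2, ...) prints because (2 & lockf_debug)
 * is nonzero, and LOCKF_DEBUG(1, ...) is suppressed:
 *
 *	LOCKF_DEBUG(0, "printed whenever debugging is compiled in\n");
 *	LOCKF_DEBUG(2, "printed only when selector bit 0x2 is set\n");
 *	LOCKF_DEBUG(1, "suppressed until lockf_debug has bit 0x1 set\n");
 */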
MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

#define NOLOCKF (struct lockf *)0
#define SELF	0x1
#define OTHERS	0x2
#define OFF_MAX 0x7fffffffffffffffULL	/* max off_t */
/*
 * Overlapping lock states
 */
typedef enum {
	OVERLAP_NONE = 0,
	OVERLAP_EQUALS_LOCK,
	OVERLAP_CONTAINS_LOCK,
	OVERLAP_CONTAINED_BY_LOCK,
	OVERLAP_STARTS_BEFORE_LOCK,
	OVERLAP_ENDS_AFTER_LOCK
} overlap_t;
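/*
 * Illustration (added for clarity; not in the original sources): given an
 * existing lock covering bytes [20,39], a new request maps to the
 * overlap_t states like so (see lf_findoverlap() below):
 *
 *	request [20,39]  ->  OVERLAP_EQUALS_LOCK
 *	request [25,30]  ->  OVERLAP_CONTAINS_LOCK       (lock contains request)
 *	request [10,50]  ->  OVERLAP_CONTAINED_BY_LOCK   (request contains lock)
 *	request [30,50]  ->  OVERLAP_STARTS_BEFORE_LOCK  (lock starts before request)
 *	request [10,25]  ->  OVERLAP_ENDS_AFTER_LOCK     (lock ends after request)
 *	request [50,60]  ->  OVERLAP_NONE
 */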
static int	 lf_clearlock(struct lockf *);
static overlap_t lf_findoverlap(struct lockf *,
			struct lockf *, int, struct lockf ***, struct lockf **);
static struct lockf *lf_getblock(struct lockf *);
static int	 lf_getlock(struct lockf *, struct flock *);
static int	 lf_setlock(struct lockf *);
static int	 lf_split(struct lockf *, struct lockf *);
static void	 lf_wakelock(struct lockf *, boolean_t);
/*
 * lf_advlock
 *
 * Description:	Advisory record locking support
 *
 * Parameters:	ap			Argument pointer to a vnop_advlock_args
 *					argument descriptor structure for the
 *					lock operation to be attempted.
 *
 * Returns:	0			Success
 *		EOVERFLOW
 *		EINVAL
 *		ENOLCK			Number of locked regions exceeds limit
 *	lf_setlock:EAGAIN
 *	lf_setlock:EDEADLK
 *	lf_setlock:ENOLCK
 *	lf_clearlock:ENOLCK
 *
 * Notes:	We return ENOLCK when we run out of memory to support locks; as
 *		such, there is no specific expectation limit other than the
 *		amount of available resources.
 */
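/*
 * Usage sketch (hypothetical userspace caller, not part of this file):
 * lf_advlock is reached through the vnode operation layer when a process
 * issues an advisory-locking fcntl(2) or flock(2); a request like the
 * following arrives here as a vnop_advlock_args with a_op == F_SETLK:
 *
 *	struct flock fl;
 *	fl.l_start  = 0;
 *	fl.l_len    = 100;		// bytes [0,99]
 *	fl.l_pid    = 0;
 *	fl.l_type   = F_WRLCK;
 *	fl.l_whence = SEEK_SET;
 *	if (fcntl(fd, F_SETLK, &fl) == -1)
 *		;			// EAGAIN: a conflicting lock is held
 */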
int
lf_advlock(struct vnop_advlock_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct flock *fl = ap->a_fl;
	vfs_context_t context = ap->a_context;
	struct lockf *lock;
	off_t start, end, oadd;
	u_quad_t size;
	int error;
	struct lockf **head = &vp->v_lockf;

	/* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */

	/*
	 * Avoid the common case of unlocking when inode has no locks.
	 */
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
			LOCKF_DEBUG(0, "lf_advlock: '%s' unlock without lock\n", vfs_context_proc(context)->p_comm);
			return (0);
		}
	}
	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {

	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:

		/*
		 * It's OK to cast the u_quad_t to an off_t here, since they
		 * are the same storage size, and the value of the returned
		 * contents will never overflow into the sign bit.  We need to
		 * do this because we will use size to force range checks.
		 */
		if ((error = vnode_size(vp, (off_t *)&size, context))) {
			LOCKF_DEBUG(0, "lf_advlock: vnode_getattr failed: %d\n", error);
			return (error);
		}

		if (size > OFF_MAX ||
		    (fl->l_start > 0 &&
		     size > (u_quad_t)(OFF_MAX - fl->l_start)))
			return (EOVERFLOW);
		start = size + fl->l_start;
		break;

	default:
		LOCKF_DEBUG(0, "lf_advlock: unknown whence %d\n", fl->l_whence);
		return (EINVAL);
	}
	if (start < 0) {
		LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
		return (EINVAL);
	}
	if (fl->l_len < 0) {
		if (start == 0) {
			LOCKF_DEBUG(0, "lf_advlock: len < 0 & start == 0\n");
			return (EINVAL);
		}
		end = start - 1;
		start += fl->l_len;
		if (start < 0) {
			LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
			return (EINVAL);
		}
	} else if (fl->l_len == 0)
		end = -1;
	else {
		oadd = fl->l_len - 1;
		if (oadd > (off_t)(OFF_MAX - start)) {
			LOCKF_DEBUG(0, "lf_advlock: overflow\n");
			return (EOVERFLOW);
		}
		end = start + oadd;
	}
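	/*
	 * Worked example (added for clarity): with l_whence == SEEK_SET,
	 * l_start == 100 and l_len == 50, the conversion above yields
	 * start == 100 and end == 100 + (50 - 1) == 149.  A negative
	 * length locks the range *before* l_start: l_start == 100 and
	 * l_len == -50 yield end == 99 and start == 50.  l_len == 0
	 * locks from start to EOF and beyond (end == -1).
	 */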
	/*
	 * Create the lockf structure
	 */
	MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
	if (lock == NULL)
		return (ENOLCK);
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_id = ap->a_id;
	lock->lf_vnode = vp;
	lock->lf_type = fl->l_type;
	lock->lf_head = head;
	lock->lf_next = (struct lockf *)0;
	TAILQ_INIT(&lock->lf_blkhd);
	lock->lf_flags = ap->a_flags;

	if (ap->a_flags & F_FLOCK)
		lock->lf_flags |= F_WAKE1_SAFE;

	lck_mtx_lock(&vp->v_lock);	/* protect the lockf list */
	/*
	 * Do the requested operation.
	 */
	switch(ap->a_op) {
	case F_SETLK:
		error = lf_setlock(lock);
		break;

	case F_UNLCK:
		error = lf_clearlock(lock);
		FREE(lock, M_LOCKF);
		break;

	case F_GETLK:
		error = lf_getlock(lock, fl);
		FREE(lock, M_LOCKF);
		break;

	default:
		FREE(lock, M_LOCKF);
		error = EINVAL;
		break;
	}
	lck_mtx_unlock(&vp->v_lock);	/* done manipulating the list */

	LOCKF_DEBUG(0, "lf_advlock: normal exit: %d\n\n", error);
	return (error);
}
/*
 * lf_coelesce_adjacent
 *
 * Description:	Helper function: when setting a lock, coelesce adjacent
 *		locks.  Needed because adjacent locks are not overlapping,
 *		but POSIX requires that they be coelesced.
 *
 * Parameters:	lock			The new lock which may be adjacent
 *					to already locked regions, and which
 *					should therefore be coelesced with them
 *
 * Returns:	<void>
 */
static void
lf_coelesce_adjacent(struct lockf *lock)
{
	struct lockf **lf = lock->lf_head;

	while (*lf != NOLOCKF) {
		/* reject locks that obviously could not be coelesced */
		if ((*lf == lock) ||
		    ((*lf)->lf_id != lock->lf_id) ||
		    ((*lf)->lf_type != lock->lf_type)) {
			lf = &(*lf)->lf_next;
			continue;
		}

		/* If the lock ends adjacent to us, we can coelesce it */
		if ((*lf)->lf_end != -1 &&
		    ((*lf)->lf_end + 1) == lock->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent previous\n");
			lock->lf_start = (*lf)->lf_start;
			*lf = lock;
			lf = &(*lf)->lf_next;
			FREE(adjacent, M_LOCKF);
			continue;
		}

		/* If the lock starts adjacent to us, we can coelesce it */
		if (lock->lf_end != -1 &&
		    (lock->lf_end + 1) == (*lf)->lf_start) {
			struct lockf *adjacent = *lf;

			LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent following\n");
			lock->lf_end = (*lf)->lf_end;
			lock->lf_next = (*lf)->lf_next;
			lf = &lock->lf_next;
			FREE(adjacent, M_LOCKF);
			continue;
		}

		/* no matching conditions; go on to next lock */
		lf = &(*lf)->lf_next;
	}
}
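/*
 * Example (added for clarity): if this process already holds an
 * exclusive lock on bytes [0,99] and then sets another exclusive lock
 * on [100,199], the two regions are adjacent but not overlapping, so
 * lf_findoverlap() alone would leave two list entries; this helper
 * merges them into a single [0,199] lock, as POSIX requires.
 */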
/*
 * lf_setlock
 *
 * Description:	Set a byte-range lock.
 *
 * Parameters:	lock			The lock structure describing the lock
 *					to be set; allocated by the caller, it
 *					will be linked into the lock list if
 *					the set is successful, and freed if the
 *					set is unsuccessful.
 *
 * Returns:	0			Success
 *		EAGAIN
 *		EDEADLK
 *	lf_split:ENOLCK
 *	lf_clearlock:ENOLCK
 *	msleep:EINTR
 *
 * Notes:	We add the lock to the provisional lock list.  We do not
 *		coelesce at this time; this has implications for other lock
 *		requestors in the blocker search mechanism.
 */
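/*
 * Deadlock illustration (hypothetical scenario, added for clarity):
 *
 *	process A: holds [0,9],   sleeps in F_SETLKW waiting for [10,19]
 *	process B: holds [10,19], requests F_SETLKW for [0,9]
 *
 * B's request walks the blocked threads' wait channels: B would sleep
 * behind A's lock, but A is itself asleep behind a lock owned by B, so
 * the cycle check in the function below returns EDEADLK rather than
 * letting both processes sleep forever.
 */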
static int
lf_setlock(struct lockf *lock)
{
	struct lockf *block;
	struct lockf **head = lock->lf_head;
	struct lockf **prev, *overlap, *ltmp;
	static char lockstr[] = "lockf";
	int priority, needtolink, error;
	struct vnode *vp = lock->lf_vnode;
	overlap_t ovcase;

#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 1) {
		lf_print("lf_setlock", lock);
		lf_printlist("lf_setlock(in)", lock);
	}
#endif /* LOCKF_DEBUGGING */
	/*
	 * Set the priority
	 */
	priority = PZERO + 1;
	if (lock->lf_type == F_WRLCK)
		priority += 4;
	priority |= PCATCH;
	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	while ((block = lf_getblock(lock))) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0) {
			FREE(lock, M_LOCKF);
			return (EAGAIN);
		}
		/*
		 * We are blocked. Since flock style locks cover
		 * the whole file, there is no chance for deadlock.
		 * For byte-range locks we must check for deadlock.
		 *
		 * Deadlock detection is done by looking through the
		 * wait channels to see if there are any cycles that
		 * involve us. MAXDEPTH is set just to make sure we
		 * do not go off into neverland.
		 */
		if ((lock->lf_flags & F_POSIX) &&
		    (block->lf_flags & F_POSIX)) {
			struct proc *wproc, *bproc;
			struct uthread *ut;
			struct lockf *waitblock;
			int i = 0;

			/* The block is waiting on something */
			wproc = (struct proc *)block->lf_id;
			proc_lock(wproc);
			TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
				/*
				 * While the thread is asleep (uu_wchan != 0)
				 * in this code (uu_wmesg == lockstr)
				 * and we have not exceeded the maximum cycle
				 * depth (i < maxlockdepth), then check for a
				 * cycle to see if the lock is blocked behind
				 * someone blocked behind us.
				 */
				while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) &&
				    ut->uu_wmesg == lockstr &&
				    (i++ < maxlockdepth)) {
					waitblock = (struct lockf *)ut->uu_wchan;
					/*
					 * Get the lock blocking the lock
					 * which would block us, and make
					 * certain it hasn't come unblocked
					 * (been granted, e.g. between the time
					 * we called lf_getblock, and the time
					 * we successfully acquired the
					 * proc_lock).
					 */
					waitblock = waitblock->lf_next;
					if (waitblock == NULL)
						break;

					/*
					 * Make sure it's an advisory range
					 * lock and not an overall file lock;
					 * if we mix lock types, it's our own
					 * fault.
					 */
					if ((waitblock->lf_flags & F_POSIX) == 0)
						break;

					/*
					 * If the owner of the lock that's
					 * blocking a lock that's blocking us
					 * getting the requested lock, then we
					 * would deadlock, so error out.
					 */
					bproc = (struct proc *)waitblock->lf_id;
					if (bproc == (struct proc *)lock->lf_id) {
						proc_unlock(wproc);
						FREE(lock, M_LOCKF);
						return (EDEADLK);
					}
				}
			}
			proc_unlock(wproc);
		}
		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			if ((error = lf_clearlock(lock)) != 0) {
				FREE(lock, M_LOCKF);
				return (error);
			}
			lock->lf_type = F_WRLCK;
		}
		/*
		 * Add our lock to the blocked list and sleep until we're free.
		 * Remember who blocked us (for deadlock detection).
		 */
		lock->lf_next = block;
		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);

		if ( !(lock->lf_flags & F_FLOCK))
			block->lf_flags &= ~F_WAKE1_SAFE;

#ifdef LOCKF_DEBUGGING
		if (lockf_debug & 1) {
			lf_print("lf_setlock: blocking on", block);
			lf_printlist("lf_setlock(block)", block);
		}
#endif /* LOCKF_DEBUGGING */
		error = msleep(lock, &vp->v_lock, priority, lockstr, 0);
		if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
			struct lockf *tlock;

			if ((block = lf_getblock(lock))) {
				TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
					tlock->lf_next = block;
				}
				TAILQ_CONCAT(&block->lf_blkhd, &lock->lf_blkhd, lf_block);
			}
		}
		if (error) {	/* XXX */
			/*
			 * We may have been awakened by a signal and/or by a
			 * debugger continuing us (in which cases we must remove
			 * ourselves from the blocked list) and/or by another
			 * process releasing a lock (in which case we have
			 * already been removed from the blocked list and our
			 * lf_next field set to NOLOCKF).
			 */
			if (lock->lf_next) {
				TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
				lock->lf_next = NOLOCKF;
			}
			if (!TAILQ_EMPTY(&lock->lf_blkhd))
				lf_wakelock(lock, TRUE);

			FREE(lock, M_LOCKF);
			return (error);
		}	/* XXX */
	}
	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 *
	 * Skip over locks owned by other processes.
	 * Handle any locks that overlap and are owned by ourselves.
	 */
	prev = head;
	block = *head;
	needtolink = 1;
	for (;;) {
		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
		if (ovcase)
			block = overlap->lf_next;
		/*
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
		switch (ovcase) {
		case OVERLAP_NONE:
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			break;

		case OVERLAP_EQUALS_LOCK:
			/*
			 * If downgrading lock, others may be
			 * able to acquire it.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK)
				lf_wakelock(overlap, TRUE);
			overlap->lf_type = lock->lf_type;
			FREE(lock, M_LOCKF);
			lock = overlap; /* for lf_coelesce_adjacent() */
			break;
		case OVERLAP_CONTAINS_LOCK:
			/*
			 * Check for common starting point and different types.
			 */
			if (overlap->lf_type == lock->lf_type) {
				FREE(lock, M_LOCKF);
				lock = overlap; /* for lf_coelesce_adjacent() */
				break;
			}
			if (overlap->lf_start == lock->lf_start) {
				*prev = lock;
				lock->lf_next = overlap;
				overlap->lf_start = lock->lf_end + 1;
			} else {
				/*
				 * If we can't split the lock, we can't
				 * grant it.  Claim a system limit for the
				 * resource shortage.
				 */
				if (lf_split(overlap, lock)) {
					FREE(lock, M_LOCKF);
					return (ENOLCK);
				}
			}
			lf_wakelock(overlap, TRUE);
			break;
		case OVERLAP_CONTAINED_BY_LOCK:
			/*
			 * If downgrading lock, others may be able to
			 * acquire it, otherwise take the list.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap, TRUE);
			} else {
				while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
					ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
					TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
					    lf_block);
					TAILQ_INSERT_TAIL(&lock->lf_blkhd,
					    ltmp, lf_block);
					ltmp->lf_next = lock;
				}
			}
			/*
			 * Add the new lock if necessary and delete the overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap->lf_next;
				prev = &lock->lf_next;
				needtolink = 0;
			} else
				*prev = overlap->lf_next;
			FREE(overlap, M_LOCKF);
			continue;
		case OVERLAP_STARTS_BEFORE_LOCK:
			/*
			 * Add lock after overlap on the list.
			 */
			lock->lf_next = overlap->lf_next;
			overlap->lf_next = lock;
			overlap->lf_end = lock->lf_start - 1;
			prev = &lock->lf_next;
			lf_wakelock(overlap, TRUE);
			needtolink = 0;
			continue;
		case OVERLAP_ENDS_AFTER_LOCK:
			/*
			 * Add the new lock before overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			overlap->lf_start = lock->lf_end + 1;
			lf_wakelock(overlap, TRUE);
			break;
		}
		break;
	}
	/* Coelesce adjacent locks with identical attributes */
	lf_coelesce_adjacent(lock);
#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 1) {
		lf_print("lf_setlock: got the lock", lock);
		lf_printlist("lf_setlock(out)", lock);
	}
#endif /* LOCKF_DEBUGGING */
	return (0);
}
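/*
 * Example (added for clarity): if this process holds an exclusive lock
 * on [0,99] and now sets a shared lock on the same range, the
 * OVERLAP_EQUALS_LOCK arm above rewrites the existing entry in place
 * (a downgrade) and wakes all waiters, since readers blocked on the
 * old exclusive lock may now be able to proceed.
 */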
/*
 * lf_clearlock
 *
 * Description:	Remove a byte-range lock on a vnode.  Generally, find the
 *		lock (or an overlap to that lock) and remove it (or shrink
 *		it), then wakeup anyone we can.
 *
 * Parameters:	unlock			The lock to clear
 *
 * Returns:	0			Success
 *	lf_split:ENOLCK
 *
 * Notes:	A caller may unlock all the locks owned by the caller by
 *		specifying the entire file range; locks owned by other
 *		callers are not affected by this operation.
 */
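/*
 * Usage sketch (hypothetical userspace caller, added for clarity):
 *
 *	fl.l_whence = SEEK_SET; fl.l_start = 0; fl.l_len = 0;
 *	fl.l_type = F_UNLCK;
 *	fcntl(fd, F_SETLK, &fl);
 *
 * produces an unlock covering [0,-1] (the whole file), so the loop
 * below visits and removes every lock this caller owns on the vnode,
 * while locks owned by others are skipped by the SELF check in
 * lf_findoverlap().
 */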
static int
lf_clearlock(struct lockf *unlock)
{
	struct lockf **head = unlock->lf_head;
	struct lockf *lf = *head;
	struct lockf *overlap, **prev;
	overlap_t ovcase;

	if (lf == NOLOCKF)
		return (0);
#ifdef LOCKF_DEBUGGING
	if (unlock->lf_type != F_UNLCK)
		panic("lf_clearlock: bad type");
	if (lockf_debug & 1)
		lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUGGING */
	while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
		/*
		 * Wakeup the list of locks to be retried.
		 */
		lf_wakelock(overlap, FALSE);

		switch (ovcase) {
		case OVERLAP_NONE:	/* satisfy compiler enum/switch */
			break;

		case OVERLAP_EQUALS_LOCK:
			*prev = overlap->lf_next;
			FREE(overlap, M_LOCKF);
			break;

		case OVERLAP_CONTAINS_LOCK: /* split it */
			if (overlap->lf_start == unlock->lf_start) {
				overlap->lf_start = unlock->lf_end + 1;
				break;
			}
			/*
			 * If we can't split the lock, we can't grant it.
			 * Claim a system limit for the resource shortage.
			 */
			if (lf_split(overlap, unlock))
				return (ENOLCK);
			overlap->lf_next = unlock->lf_next;
			break;

		case OVERLAP_CONTAINED_BY_LOCK:
			*prev = overlap->lf_next;
			lf = overlap->lf_next;
			FREE(overlap, M_LOCKF);
			continue;

		case OVERLAP_STARTS_BEFORE_LOCK:
			overlap->lf_end = unlock->lf_start - 1;
			prev = &overlap->lf_next;
			lf = overlap->lf_next;
			continue;

		case OVERLAP_ENDS_AFTER_LOCK:
			overlap->lf_start = unlock->lf_end + 1;
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 1)
		lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUGGING */
	return (0);
}
/*
 * lf_getlock
 *
 * Description:	Check whether there is a blocking lock, and if so return
 *		its process identifier into the lock being requested.
 *
 * Parameters:	lock			Pointer to lock to test for blocks
 *		fl			Pointer to flock structure to receive
 *					the blocking lock information, if a
 *					blocking lock is found.
 *
 * Returns:	0			Success
 *
 * Implicit Returns:
 *		*fl			Contents modified to reflect the
 *					blocking lock, if one is found; not
 *					modified otherwise
 *
 * Notes:	fl->l_pid will be (-1) for file locks and will only be set to
 *		the blocking process ID for advisory record locks.
 */
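/*
 * Usage sketch (hypothetical userspace caller, not part of this file):
 *
 *	struct flock fl = { .l_start = 0, .l_len = 0,
 *	                    .l_type = F_WRLCK, .l_whence = SEEK_SET };
 *	fcntl(fd, F_GETLK, &fl);
 *	if (fl.l_type == F_UNLCK)
 *		;	// no conflicting lock; the request would succeed
 *	else
 *		;	// fl now describes the first blocking lock
 *			// (l_type, l_start, l_len, l_pid filled in below)
 */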
static int
lf_getlock(struct lockf *lock, struct flock *fl)
{
	struct lockf *block;

#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 1)
		lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUGGING */

	if ((block = lf_getblock(lock))) {
		fl->l_type = block->lf_type;
		fl->l_whence = SEEK_SET;
		fl->l_start = block->lf_start;
		if (block->lf_end == -1)
			fl->l_len = 0;
		else
			fl->l_len = block->lf_end - block->lf_start + 1;
		if (block->lf_flags & F_POSIX)
			fl->l_pid = proc_pid((struct proc *)(block->lf_id));
		else
			fl->l_pid = -1;
	} else {
		fl->l_type = F_UNLCK;
	}
	return (0);
}
/*
 * lf_getblock
 *
 * Description:	Walk the list of locks for an inode and return the first
 *		blocking lock.  A lock is considered blocking if we are not
 *		the lock owner; otherwise, we are permitted to upgrade or
 *		downgrade it, and it's not considered blocking.
 *
 * Parameters:	lock			The lock for which we are interested
 *					in obtaining the blocking lock, if any
 *
 * Returns:	NOLOCKF			No blocking lock exists
 *		!NOLOCKF		The address of the blocking lock's
 *					struct lockf
 */
static struct lockf *
lf_getblock(struct lockf *lock)
{
	struct lockf **prev, *overlap, *lf = *(lock->lf_head);
	int ovcase;

	prev = lock->lf_head;
	while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) != OVERLAP_NONE) {
		/*
		 * We've found an overlap, see if it blocks us
		 */
		if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
			return (overlap);
		/*
		 * Nope, point to the next one on the list and
		 * see if it blocks us
		 */
		lf = overlap->lf_next;
	}
	return (NOLOCKF);
}
/*
 * lf_findoverlap
 *
 * Description:	Walk the list of locks to find an overlapping lock (if any).
 *
 * Parameters:	lf			First lock on lock list
 *		lock			The lock we are checking for an overlap
 *		check			Check type (SELF or OTHERS)
 *		prev			pointer to pointer pointer to contain
 *					address of pointer to previous lock
 *					pointer to overlapping lock, if overlap
 *		overlap			pointer to pointer to contain address
 *					of overlapping lock
 *
 * Returns:	OVERLAP_NONE
 *		OVERLAP_EQUALS_LOCK
 *		OVERLAP_CONTAINS_LOCK
 *		OVERLAP_CONTAINED_BY_LOCK
 *		OVERLAP_STARTS_BEFORE_LOCK
 *		OVERLAP_ENDS_AFTER_LOCK
 *
 * Implicit Returns:
 *		*prev			The address of the next pointer in the
 *					lock previous to the overlapping lock;
 *					this is generally used to relink the
 *					lock list, avoiding a second iteration.
 *		*overlap		The pointer to the overlapping lock
 *					itself; this is used to return data in
 *					the check == OTHERS case, and for the
 *					caller to modify the overlapping lock,
 *					in the check == SELF case
 *
 * Note:	This returns only the FIRST overlapping lock.  There may be
 *		more than one.  lf_getlock will return the first blocking lock,
 *		while lf_setlock will iterate over all overlapping locks to
 *		handle each of them in turn.
 *
 *		The check parameter can be SELF, meaning we are looking for
 *		overlapping locks owned by us, or it can be OTHERS, meaning
 *		we are looking for overlapping locks owned by someone else so
 *		we can report a blocking lock on an F_GETLK request.
 *
 *		The value of *overlap and *prev are modified, even if there is
 *		no overlapping lock found; always check the return code.
 */
static overlap_t
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
	       struct lockf ***prev, struct lockf **overlap)
{
	off_t start, end;

	*overlap = lf;
	if (lf == NOLOCKF)
		return (OVERLAP_NONE);
#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 2)
		lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUGGING */
	start = lock->lf_start;
	end = lock->lf_end;
	while (lf != NOLOCKF) {
		if (((type & SELF) && lf->lf_id != lock->lf_id) ||
		    ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
#ifdef LOCKF_DEBUGGING
		if (lockf_debug & 2)
			lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUGGING */
		/*
		 * OK, check for overlap
		 */
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
			/* Case 0 */
			LOCKF_DEBUG(2, "no overlap\n");
			if ((type & SELF) && end != -1 && lf->lf_start > end)
				return (OVERLAP_NONE);
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
		if ((lf->lf_start == start) && (lf->lf_end == end)) {
			LOCKF_DEBUG(2, "overlap == lock\n");
			return (OVERLAP_EQUALS_LOCK);
		}
		if ((lf->lf_start <= start) &&
		    (end != -1) &&
		    ((lf->lf_end >= end) || (lf->lf_end == -1))) {
			LOCKF_DEBUG(2, "overlap contains lock\n");
			return (OVERLAP_CONTAINS_LOCK);
		}
		if (start <= lf->lf_start &&
		    ((end == -1) ||
		    (lf->lf_end != -1 && end >= lf->lf_end))) {
			LOCKF_DEBUG(2, "lock contains overlap\n");
			return (OVERLAP_CONTAINED_BY_LOCK);
		}
		if ((lf->lf_start < start) &&
		    ((lf->lf_end >= start) || (lf->lf_end == -1))) {
			LOCKF_DEBUG(2, "overlap starts before lock\n");
			return (OVERLAP_STARTS_BEFORE_LOCK);
		}
		if ((lf->lf_start > start) &&
		    (end != -1) &&
		    ((lf->lf_end > end) || (lf->lf_end == -1))) {
			LOCKF_DEBUG(2, "overlap ends after lock\n");
			return (OVERLAP_ENDS_AFTER_LOCK);
		}
		panic("lf_findoverlap: default");
	}
	return (OVERLAP_NONE);
}
/*
 * lf_split
 *
 * Description:	Split a lock and a contained region into two or three locks
 *		as necessary.
 *
 * Parameters:	lock1			Lock to split
 *		lock2			Overlapping lock region requiring the
 *					split (upgrade/downgrade/unlock)
 *
 * Returns:	0			Success
 *		ENOLCK			No memory for new lock
 *
 * Implicit Returns:
 *		*lock1			Modified original lock
 *		*lock2			Overlapping lock (inserted into list)
 *		(new lock)		Potential new lock inserted into list
 *					if split results in 3 locks
 *
 * Notes:	This operation can only fail if the split would result in three
 *		locks, and there is insufficient memory to allocate the third
 *		lock; in that case, neither of the locks will be modified.
 */
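/*
 * Worked example (added for clarity): unlocking [40,59] out of a held
 * lock on [0,99] requires three pieces, so lock1 is trimmed to [0,39],
 * a new lock is allocated for [60,99], and lock2 ([40,59]) is linked
 * between them.  Unlocking [0,59] or [40,99] instead shares an edge
 * with lock1, so only two pieces result and no allocation is needed.
 */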
static int
lf_split(struct lockf *lock1, struct lockf *lock2)
{
	struct lockf *splitlock;

#ifdef LOCKF_DEBUGGING
	if (lockf_debug & 2) {
		lf_print("lf_split", lock1);
		lf_print("splitting from", lock2);
	}
#endif /* LOCKF_DEBUGGING */
	/*
	 * Check to see if splitting into only two pieces.
	 */
	if (lock1->lf_start == lock2->lf_start) {
		lock1->lf_start = lock2->lf_end + 1;
		lock2->lf_next = lock1;
		return (0);
	}
	if (lock1->lf_end == lock2->lf_end) {
		lock1->lf_end = lock2->lf_start - 1;
		lock2->lf_next = lock1->lf_next;
		lock1->lf_next = lock2;
		return (0);
	}
	/*
	 * Make a new lock consisting of the last part of
	 * the encompassing lock
	 */
	MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
	if (splitlock == NULL)
		return (ENOLCK);
	bcopy(lock1, splitlock, sizeof *splitlock);
	splitlock->lf_start = lock2->lf_end + 1;
	TAILQ_INIT(&splitlock->lf_blkhd);
	lock1->lf_end = lock2->lf_start - 1;
	/*
	 * OK, now link it in
	 */
	splitlock->lf_next = lock1->lf_next;
	lock2->lf_next = splitlock;
	lock1->lf_next = lock2;

	return (0);
}
/*
 * lf_wakelock
 *
 * Wakeup a blocklist in the case of a downgrade or unlock, since others
 * waiting on the lock may now be able to acquire it.
 *
 * Parameters:	listhead		Lock list head on which waiters may
 *					have pending locks
 *
 * Returns:	<void>
 *
 * Notes:	This function iterates a list of locks and wakes all waiters,
 *		rather than only waiters for the contended regions.  Because
 *		of this, for heavily contended files, this can result in a
 *		"thundering herd" situation.  Refactoring the code could make
 *		this operation more efficient, if heavy contention ever results
 *		in a real-world performance problem.
 */
static void
lf_wakelock(struct lockf *listhead, boolean_t force_all)
{
	struct lockf *wakelock;
	boolean_t wake_all = TRUE;

	if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE))
		wake_all = FALSE;

	while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
		wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);

		wakelock->lf_next = NOLOCKF;
#ifdef LOCKF_DEBUGGING
		if (lockf_debug & 2)
			lf_print("lf_wakelock: awakening", wakelock);
#endif /* LOCKF_DEBUGGING */
		if (wake_all == FALSE) {
			/*
			 * If there are items on the list head block list,
			 * move them to the wakelock list instead, and then
			 * correct their lf_next pointers.
			 */
			if (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
				TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);

				struct lockf *tlock;

				TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
					tlock->lf_next = wakelock;
				}
			}
		}

		wakeup(wakelock);

		if (wake_all == FALSE)
			break;
	}
}
#ifdef LOCKF_DEBUGGING
/*
 * lf_print DEBUG
 *
 * Print out a lock; lock information is prefixed by the string in 'tag'
 *
 * Parameters:	tag			A string tag for debugging
 *		lock			The lock whose information should be
 *					displayed
 *
 * Returns:	<void>
 */
void
lf_print(const char *tag, struct lockf *lock)
{
	printf("%s: lock %p for ", tag, (void *)lock);
	if (lock->lf_flags & F_POSIX)
		printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid);
	else
		printf("id %p", (void *)lock->lf_id);
	if (lock->lf_vnode != 0)
		printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_vnode,
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
		    (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
	else
		printf(" %s, start 0x%016llx, end 0x%016llx",
		    lock->lf_type == F_RDLCK ? "shared" :
		    lock->lf_type == F_WRLCK ? "exclusive" :
		    lock->lf_type == F_UNLCK ? "unlock" : "unknown",
		    (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
	if (!TAILQ_EMPTY(&lock->lf_blkhd))
		printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
	else
		printf("\n");
}
/*
 * lf_printlist DEBUG
 *
 * Print out a lock list for the vnode associated with 'lock'; lock information
 * is prefixed by the string in 'tag'
 *
 * Parameters:	tag			A string tag for debugging
 *		lock			The lock whose vnode's lock list should
 *					be displayed
 *
 * Returns:	<void>
 */
void
lf_printlist(const char *tag, struct lockf *lock)
{
	struct lockf *lf, *blk;

	if (lock->lf_vnode == 0)
		return;

	printf("%s: Lock list for vno %p:\n",
	    tag, lock->lf_vnode);
	for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
		printf("\tlock %p for ", (void *)lf);
		if (lf->lf_flags & F_POSIX)
			printf("proc %ld",
			    (long)((struct proc *)lf->lf_id)->p_pid);
		else
			printf("id %p", (void *)lf->lf_id);
		printf(", %s, start 0x%016llx, end 0x%016llx",
		    lf->lf_type == F_RDLCK ? "shared" :
		    lf->lf_type == F_WRLCK ? "exclusive" :
		    lf->lf_type == F_UNLCK ? "unlock" :
		    "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
		TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
			printf("\n\t\tlock request %p for ", (void *)blk);
			if (blk->lf_flags & F_POSIX)
				printf("proc %ld",
				    (long)((struct proc *)blk->lf_id)->p_pid);
			else
				printf("id %p", (void *)blk->lf_id);
			printf(", %s, start 0x%016llx, end 0x%016llx",
			    blk->lf_type == F_RDLCK ? "shared" :
			    blk->lf_type == F_WRLCK ? "exclusive" :
			    blk->lf_type == F_UNLCK ? "unlock" :
			    "unknown", (intmax_t)blk->lf_start,
			    (intmax_t)blk->lf_end);
			if (!TAILQ_EMPTY(&blk->lf_blkhd))
				panic("lf_printlist: bad list");
		}
		printf("\n");
	}
}
#endif /* LOCKF_DEBUGGING */