[apple/xnu.git] bsd/kern/kern_lockf.c (xnu-1228.0.2)
1/*
2 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1989, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * This code is derived from software contributed to Berkeley by
33 * Scooter Morris at Genentech Inc.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 4. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
60 */
61
62#include <sys/cdefs.h>
63#include <sys/param.h>
64#include <sys/systm.h>
65#include <sys/kernel.h>
66#include <sys/lock.h>
67#include <sys/mount.h>
68#include <sys/proc.h>
69#include <sys/signalvar.h>
70#include <sys/unistd.h>
71#include <sys/user.h>
72#include <sys/vnode.h>
73#include <sys/vnode_internal.h>
74#include <sys/vnode_if.h>
75#include <sys/malloc.h>
76#include <sys/fcntl.h>
77#include <sys/lockf.h>
78
79/*
80 * This variable controls the maximum number of processes that will
81 * be checked in doing deadlock detection.
82 */
83static int maxlockdepth = MAXDEPTH;
84
85#ifdef LOCKF_DEBUGGING
86#include <sys/sysctl.h>
87#include <ufs/ufs/quota.h>
88#include <ufs/ufs/inode.h>
89void lf_print(const char *tag, struct lockf *lock);
90void lf_printlist(const char *tag, struct lockf *lock);
91static int lockf_debug = 2;
92SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
93
94/*
 95 * If there is no mask bit selector, or there is one and the selector is
 96 * set, then output the debugging diagnostic.
97 */
98#define LOCKF_DEBUG(mask, ...) \
99 do { \
100 if( !(mask) || ((mask) & lockf_debug)) { \
101 printf(__VA_ARGS__); \
102 } \
103 } while(0)
104#else /* !LOCKF_DEBUGGING */
105#define LOCKF_DEBUG(mask, ...) /* mask */
106#endif /* !LOCKF_DEBUGGING */
107
108MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
109
110#define NOLOCKF (struct lockf *)0
111#define SELF 0x1
112#define OTHERS 0x2
113#define OFF_MAX 0x7fffffffffffffffULL /* max off_t */
114
115/*
116 * Overlapping lock states
117 */
118typedef enum {
119 OVERLAP_NONE = 0,
120 OVERLAP_EQUALS_LOCK,
121 OVERLAP_CONTAINS_LOCK,
122 OVERLAP_CONTAINED_BY_LOCK,
123 OVERLAP_STARTS_BEFORE_LOCK,
124 OVERLAP_ENDS_AFTER_LOCK
125} overlap_t;
126
127static int lf_clearlock(struct lockf *);
128static overlap_t lf_findoverlap(struct lockf *,
129 struct lockf *, int, struct lockf ***, struct lockf **);
130static struct lockf *lf_getblock(struct lockf *);
131static int lf_getlock(struct lockf *, struct flock *);
132static int lf_setlock(struct lockf *);
133static int lf_split(struct lockf *, struct lockf *);
134static void lf_wakelock(struct lockf *);
135
136
137/*
138 * lf_advlock
139 *
140 * Description: Advisory record locking support
141 *
142 * Parameters: ap Argument pointer to a vnop_advlock_args
143 * argument descriptor structure for the
144 * lock operation to be attempted.
145 *
146 * Returns: 0 Success
147 * EOVERFLOW
148 * EINVAL
149 * ENOLCK Number of locked regions exceeds limit
150 * lf_setlock:EAGAIN
151 * lf_setlock:EDEADLK
152 * lf_setlock:EINTR
153 * lf_setlock:ENOLCK
154 * lf_clearlock:ENOLCK
155 * vnode_size:???
156 *
157 * Notes: We return ENOLCK when we run out of memory to support locks; as
158 * such, there is no specific expectation limit other than the
159 * amount of available resources.
160 */
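/*
 * Illustrative userland sketch (not part of the original file; the file
 * descriptor, path and ranges are hypothetical): an advisory byte-range
 * lock request normally reaches this code when fcntl(2) is translated by
 * the VFS layer into a VNOP_ADVLOCK call, which most filesystems forward
 * to lf_advlock().
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/tmp/example", O_RDWR | O_CREAT, 0644);
 *	struct flock fl = {
 *		.l_start  = 0,		// first byte to lock
 *		.l_len    = 100,	// lock bytes [0, 99]; 0 would mean "to EOF"
 *		.l_whence = SEEK_SET,
 *		.l_type   = F_WRLCK,	// exclusive (write) lock
 *	};
 *	if (fcntl(fd, F_SETLK, &fl) == -1) {
 *		// EAGAIN corresponds to lf_setlock() finding a blocker while
 *		// the request is non-blocking (F_WAIT not set).
 *	}
 */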
161int
162lf_advlock(struct vnop_advlock_args *ap)
163{
164 struct vnode *vp = ap->a_vp;
165 struct flock *fl = ap->a_fl;
166 vfs_context_t context = ap->a_context;
167 struct lockf *lock;
168 off_t start, end, oadd;
169 u_quad_t size;
170 int error;
171 struct lockf **head = &vp->v_lockf;
172
173 /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */
174
175 /*
176 * Avoid the common case of unlocking when inode has no locks.
177 */
178 if (*head == (struct lockf *)0) {
179 if (ap->a_op != F_SETLK) {
180 fl->l_type = F_UNLCK;
181 LOCKF_DEBUG(0, "lf_advlock: '%s' unlock without lock\n", vfs_context_proc(context)->p_comm);
182 return (0);
183 }
184 }
185
186 /*
187 * Convert the flock structure into a start and end.
188 */
189 switch (fl->l_whence) {
190
191 case SEEK_SET:
192 case SEEK_CUR:
193 /*
194 * Caller is responsible for adding any necessary offset
195 * when SEEK_CUR is used.
196 */
197 start = fl->l_start;
198 break;
199
200 case SEEK_END:
201
202 /*
 203 * It's OK to cast the u_quad_t to an off_t here, since they
204 * are the same storage size, and the value of the returned
205 * contents will never overflow into the sign bit. We need to
206 * do this because we will use size to force range checks.
207 */
208 if ((error = vnode_size(vp, (off_t *)&size, context))) {
209 LOCKF_DEBUG(0, "lf_advlock: vnode_getattr failed: %d\n", error);
210 return (error);
211 }
212
213 if (size > OFF_MAX ||
214 (fl->l_start > 0 &&
215 size > (u_quad_t)(OFF_MAX - fl->l_start)))
216 return (EOVERFLOW);
217 start = size + fl->l_start;
218 break;
219
220 default:
221 LOCKF_DEBUG(0, "lf_advlock: unknown whence %d\n", fl->l_whence);
222 return (EINVAL);
223 }
224 if (start < 0) {
225 LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
226 return (EINVAL);
227 }
228 if (fl->l_len < 0) {
229 if (start == 0) {
230 LOCKF_DEBUG(0, "lf_advlock: len < 0 & start == 0\n");
231 return (EINVAL);
232 }
233 end = start - 1;
234 start += fl->l_len;
235 if (start < 0) {
236 LOCKF_DEBUG(0, "lf_advlock: start < 0 (%qd)\n", start);
237 return (EINVAL);
238 }
239 } else if (fl->l_len == 0)
240 end = -1;
241 else {
242 oadd = fl->l_len - 1;
243 if (oadd > (off_t)(OFF_MAX - start)) {
244 LOCKF_DEBUG(0, "lf_advlock: overflow\n");
245 return (EOVERFLOW);
246 }
247 end = start + oadd;
248 }
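	/*
	 * Worked examples of the conversion above (illustrative, not part
	 * of the original file). With l_whence == SEEK_SET:
	 *
	 *	l_start = 100, l_len =  50  ->  start = 100, end = 149
	 *	l_start = 100, l_len =   0  ->  start = 100, end =  -1  (to EOF)
	 *	l_start = 100, l_len = -50  ->  start =  50, end =  99
	 *
	 * A negative l_len locks the |l_len| bytes immediately preceding
	 * l_start, and an end of -1 is the internal encoding for "no upper
	 * bound".
	 */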
249 /*
250 * Create the lockf structure
251 */
252 MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
253 if (lock == NULL)
254 return (ENOLCK);
255 lock->lf_start = start;
256 lock->lf_end = end;
257 lock->lf_id = ap->a_id;
258 lock->lf_vnode = vp;
259 lock->lf_type = fl->l_type;
260 lock->lf_head = head;
261 lock->lf_next = (struct lockf *)0;
262 TAILQ_INIT(&lock->lf_blkhd);
263 lock->lf_flags = ap->a_flags;
264
265 lck_mtx_lock(&vp->v_lock); /* protect the lockf list */
266 /*
267 * Do the requested operation.
268 */
269 switch(ap->a_op) {
270 case F_SETLK:
271 error = lf_setlock(lock);
272 break;
273
274 case F_UNLCK:
275 error = lf_clearlock(lock);
276 FREE(lock, M_LOCKF);
277 break;
278
279 case F_GETLK:
280 error = lf_getlock(lock, fl);
281 FREE(lock, M_LOCKF);
282 break;
283
284 default:
285 FREE(lock, M_LOCKF);
286 error = EINVAL;
287 break;
288 }
 289	lck_mtx_unlock(&vp->v_lock);	/* done manipulating the list */
290
291 LOCKF_DEBUG(0, "lf_advlock: normal exit: %d\n\n", error);
292 return (error);
293}
294
295
296/*
297 * lf_coelesce_adjacent
298 *
 299 * Description:	Helper function: when setting a lock, coalesce adjacent
 300 *			locks.  Needed because adjacent locks are not overlapping,
 301 *			but POSIX requires that they be coalesced.
302 *
303 * Parameters: lock The new lock which may be adjacent
 304 *				to already locked regions, and which
 305 *				should therefore be coalesced with them
306 *
307 * Returns: <void>
308 */
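/*
 * Illustrative example (not part of the original file): if this process
 * already holds an F_WRLCK over bytes [0, 99] and the new lock covers
 * [100, 199] with the same owner and type, the two regions do not overlap
 * but are adjacent (99 + 1 == 100), so this routine merges them into a
 * single lock covering [0, 199] and frees the now-redundant lockf
 * structure.
 */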
309static void
310lf_coelesce_adjacent(struct lockf *lock)
311{
312 struct lockf **lf = lock->lf_head;
313
314 while (*lf != NOLOCKF) {
 315		/* reject locks that obviously could not be coalesced */
316 if ((*lf == lock) ||
317 ((*lf)->lf_id != lock->lf_id) ||
318 ((*lf)->lf_type != lock->lf_type)) {
319 lf = &(*lf)->lf_next;
320 continue;
321 }
322
 323		/* If the lock ends adjacent to us, we can coalesce it */
324 if ((*lf)->lf_end != -1 &&
325 ((*lf)->lf_end + 1) == lock->lf_start) {
326 struct lockf *adjacent = *lf;
327
328 LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent previous\n");
329 lock->lf_start = (*lf)->lf_start;
330 *lf = lock;
331 lf = &(*lf)->lf_next;
332 FREE(adjacent, M_LOCKF);
333 continue;
334 }
 335		/* If the lock starts adjacent to us, we can coalesce it */
336 if (lock->lf_end != -1 &&
337 (lock->lf_end + 1) == (*lf)->lf_start) {
338 struct lockf *adjacent = *lf;
339
340 LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent following\n");
341 lock->lf_end = (*lf)->lf_end;
342 lock->lf_next = (*lf)->lf_next;
343 lf = &lock->lf_next;
344 FREE(adjacent, M_LOCKF);
345 continue;
346 }
347
348 /* no matching conditions; go on to next lock */
349 lf = &(*lf)->lf_next;
350 }
351}
352
353
354/*
355 * lf_setlock
356 *
357 * Description: Set a byte-range lock.
358 *
359 * Parameters: lock The lock structure describing the lock
360 * to be set; allocated by the caller, it
361 * will be linked into the lock list if
362 * the set is successful, and freed if the
363 * set is unsuccessful.
364 *
365 * Returns: 0 Success
366 * EAGAIN
367 * EDEADLK
368 * lf_split:ENOLCK
369 * lf_clearlock:ENOLCK
370 * msleep:EINTR
371 *
372 * Notes: We add the lock to the provisional lock list. We do not
 373 *		coalesce at this time; this has implications for other lock
374 * requestors in the blocker search mechanism.
375 */
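/*
 * Illustrative example (not part of the original file): if this process
 * holds an F_RDLCK over [0, 99] and now requests an F_WRLCK over exactly
 * the same range, lf_findoverlap() below reports OVERLAP_EQUALS_LOCK and
 * the existing lock is simply retyped in place (an upgrade).  The reverse
 * request is a downgrade and additionally wakes any waiters, since a
 * shared lock may now be grantable to them.
 */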
376static int
377lf_setlock(struct lockf *lock)
378{
379 struct lockf *block;
380 struct lockf **head = lock->lf_head;
381 struct lockf **prev, *overlap, *ltmp;
382 static char lockstr[] = "lockf";
383 int priority, needtolink, error;
384 struct vnode *vp = lock->lf_vnode;
385 overlap_t ovcase;
386
387#ifdef LOCKF_DEBUGGING
388 if (lockf_debug & 1) {
389 lf_print("lf_setlock", lock);
390 lf_printlist("lf_setlock(in)", lock);
391 }
392#endif /* LOCKF_DEBUGGING */
393
394 /*
395 * Set the priority
396 */
397 priority = PLOCK;
398 if (lock->lf_type == F_WRLCK)
399 priority += 4;
400 priority |= PCATCH;
401 /*
402 * Scan lock list for this file looking for locks that would block us.
403 */
404 while ((block = lf_getblock(lock))) {
405 /*
406 * Free the structure and return if nonblocking.
407 */
408 if ((lock->lf_flags & F_WAIT) == 0) {
409 FREE(lock, M_LOCKF);
410 return (EAGAIN);
411 }
412
413 /*
414 * We are blocked. Since flock style locks cover
415 * the whole file, there is no chance for deadlock.
416 * For byte-range locks we must check for deadlock.
417 *
418 * Deadlock detection is done by looking through the
419 * wait channels to see if there are any cycles that
420 * involve us. MAXDEPTH is set just to make sure we
421 * do not go off into neverland.
422 */
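		/*
		 * Concrete example of the cycle being detected (illustrative,
		 * not part of the original file): we are requesting a range
		 * that is blocked by a lock owned by process P, and P is
		 * itself asleep in this code waiting on a range that is
		 * blocked by a lock we already own.  Sleeping here would
		 * leave each process waiting on the other forever, so the
		 * request fails with EDEADLK instead.
		 */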
423 if ((lock->lf_flags & F_POSIX) &&
424 (block->lf_flags & F_POSIX)) {
425 struct proc *wproc, *bproc;
426 struct uthread *ut;
427 struct lockf *waitblock;
428 int i = 0;
429
430 /* The block is waiting on something */
431 wproc = (struct proc *)block->lf_id;
432 proc_lock(wproc);
433 TAILQ_FOREACH(ut, &wproc->p_uthlist, uu_list) {
434 /*
435 * While the thread is asleep (uu_wchan != 0)
436 * in this code (uu_wmesg == lockstr)
437 * and we have not exceeded the maximum cycle
438 * depth (i < maxlockdepth), then check for a
439 * cycle to see if the lock is blocked behind
440 * someone blocked behind us.
441 */
442 while (((waitblock = (struct lockf *)ut->uu_wchan) != NULL) &&
443 ut->uu_wmesg == lockstr &&
444 (i++ < maxlockdepth)) {
445 waitblock = (struct lockf *)ut->uu_wchan;
446 /*
447 * Get the lock blocking the lock
448 * which would block us, and make
449 * certain it hasn't come unblocked
450 * (been granted, e.g. between the time
451 * we called lf_getblock, and the time
452 * we successfully acquired the
453 * proc_lock).
454 */
455 waitblock = waitblock->lf_next;
456 if (waitblock == NULL)
457 break;
458
459 /*
460 * Make sure it's an advisory range
461 * lock and not an overall file lock;
462 * if we mix lock types, it's our own
463 * fault.
464 */
465 if ((waitblock->lf_flags & F_POSIX) == 0)
466 break;
467
468 /*
 469				 * If the owner of the lock that's
 470				 * blocking a lock that's blocking us
 471				 * is the process requesting this lock,
 472				 * then we would deadlock, so error out.
473 */
474 bproc = (struct proc *)waitblock->lf_id;
475 if (bproc == (struct proc *)lock->lf_id) {
476 proc_unlock(wproc);
477 FREE(lock, M_LOCKF);
478 return (EDEADLK);
479 }
480 }
481 }
482 proc_unlock(wproc);
483 }
484
485 /*
486 * For flock type locks, we must first remove
487 * any shared locks that we hold before we sleep
488 * waiting for an exclusive lock.
489 */
490 if ((lock->lf_flags & F_FLOCK) &&
491 lock->lf_type == F_WRLCK) {
492 lock->lf_type = F_UNLCK;
493 if ((error = lf_clearlock(lock)) != 0) {
494 FREE(lock, M_LOCKF);
495 return (error);
496 }
497 lock->lf_type = F_WRLCK;
498 }
499 /*
500 * Add our lock to the blocked list and sleep until we're free.
501 * Remember who blocked us (for deadlock detection).
502 */
503 lock->lf_next = block;
504 TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
505#ifdef LOCKF_DEBUGGING
506 if (lockf_debug & 1) {
507 lf_print("lf_setlock: blocking on", block);
508 lf_printlist("lf_setlock(block)", block);
509 }
510#endif /* LOCKF_DEBUGGING */
511 error = msleep(lock, &vp->v_lock, priority, lockstr, 0);
512 if (error) { /* XXX */
513 /*
514 * We may have been awakened by a signal and/or by a
515 * debugger continuing us (in which cases we must remove
516 * ourselves from the blocked list) and/or by another
517 * process releasing a lock (in which case we have
518 * already been removed from the blocked list and our
519 * lf_next field set to NOLOCKF).
520 */
521 if (lock->lf_next) {
522 TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
523 lock->lf_next = NOLOCKF;
524 }
525 FREE(lock, M_LOCKF);
526 return (error);
527 } /* XXX */
528 }
529 /*
530 * No blocks!! Add the lock. Note that we will
531 * downgrade or upgrade any overlapping locks this
532 * process already owns.
533 *
534 * Skip over locks owned by other processes.
535 * Handle any locks that overlap and are owned by ourselves.
536 */
537 prev = head;
538 block = *head;
539 needtolink = 1;
540 for (;;) {
541 ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
542 if (ovcase)
543 block = overlap->lf_next;
544 /*
545 * Six cases:
546 * 0) no overlap
547 * 1) overlap == lock
548 * 2) overlap contains lock
549 * 3) lock contains overlap
550 * 4) overlap starts before lock
551 * 5) overlap ends after lock
552 */
553 switch (ovcase) {
554 case OVERLAP_NONE:
555 if (needtolink) {
556 *prev = lock;
557 lock->lf_next = overlap;
558 }
559 break;
560
561 case OVERLAP_EQUALS_LOCK:
562 /*
563 * If downgrading lock, others may be
564 * able to acquire it.
565 */
566 if (lock->lf_type == F_RDLCK &&
567 overlap->lf_type == F_WRLCK)
568 lf_wakelock(overlap);
569 overlap->lf_type = lock->lf_type;
570 FREE(lock, M_LOCKF);
571 lock = overlap; /* for lf_coelesce_adjacent() */
572 break;
573
574 case OVERLAP_CONTAINS_LOCK:
575 /*
576 * Check for common starting point and different types.
577 */
578 if (overlap->lf_type == lock->lf_type) {
579 FREE(lock, M_LOCKF);
580 lock = overlap; /* for lf_coelesce_adjacent() */
581 break;
582 }
583 if (overlap->lf_start == lock->lf_start) {
584 *prev = lock;
585 lock->lf_next = overlap;
586 overlap->lf_start = lock->lf_end + 1;
587 } else {
588 /*
589 * If we can't split the lock, we can't
590 * grant it. Claim a system limit for the
591 * resource shortage.
592 */
593 if (lf_split(overlap, lock)) {
594 FREE(lock, M_LOCKF);
595 return (ENOLCK);
596 }
597 }
598 lf_wakelock(overlap);
599 break;
600
601 case OVERLAP_CONTAINED_BY_LOCK:
602 /*
603 * If downgrading lock, others may be able to
604 * acquire it, otherwise take the list.
605 */
606 if (lock->lf_type == F_RDLCK &&
607 overlap->lf_type == F_WRLCK) {
608 lf_wakelock(overlap);
609 } else {
610 while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
611 ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
612 TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
613 lf_block);
614 TAILQ_INSERT_TAIL(&lock->lf_blkhd,
615 ltmp, lf_block);
616 ltmp->lf_next = lock;
617 }
618 }
619 /*
620 * Add the new lock if necessary and delete the overlap.
621 */
622 if (needtolink) {
623 *prev = lock;
624 lock->lf_next = overlap->lf_next;
625 prev = &lock->lf_next;
626 needtolink = 0;
627 } else
628 *prev = overlap->lf_next;
629 FREE(overlap, M_LOCKF);
630 continue;
631
632 case OVERLAP_STARTS_BEFORE_LOCK:
633 /*
634 * Add lock after overlap on the list.
635 */
636 lock->lf_next = overlap->lf_next;
637 overlap->lf_next = lock;
638 overlap->lf_end = lock->lf_start - 1;
639 prev = &lock->lf_next;
640 lf_wakelock(overlap);
641 needtolink = 0;
642 continue;
643
644 case OVERLAP_ENDS_AFTER_LOCK:
645 /*
646 * Add the new lock before overlap.
647 */
648 if (needtolink) {
649 *prev = lock;
650 lock->lf_next = overlap;
651 }
652 overlap->lf_start = lock->lf_end + 1;
653 lf_wakelock(overlap);
654 break;
655 }
656 break;
657 }
 658	/* Coalesce adjacent locks with identical attributes */
659 lf_coelesce_adjacent(lock);
660#ifdef LOCKF_DEBUGGING
661 if (lockf_debug & 1) {
662 lf_print("lf_setlock: got the lock", lock);
663 lf_printlist("lf_setlock(out)", lock);
664 }
665#endif /* LOCKF_DEBUGGING */
666 return (0);
667}
668
669
670/*
671 * lf_clearlock
672 *
 673 * Description:	Remove a byte-range lock on a vnode.  Generally, find the
674 * lock (or an overlap to that lock) and remove it (or shrink
675 * it), then wakeup anyone we can.
676 *
677 * Parameters: unlock The lock to clear
678 *
679 * Returns: 0 Success
680 * lf_split:ENOLCK
681 *
682 * Notes: A caller may unlock all the locks owned by the caller by
683 * specifying the entire file range; locks owned by other
 684 *		callers are not affected by this operation.
685 */
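/*
 * Illustrative example (not part of the original file): unlocking the
 * caller's entire file range (lf_start == 0, lf_end == -1) removes every
 * lock the caller owns on the vnode.  A partial unlock in the middle of a
 * held range, e.g. unlocking [50, 149] out of a held [0, 199], takes the
 * OVERLAP_CONTAINS_LOCK path below and lf_split() leaves the holder with
 * two locks, [0, 49] and [150, 199].
 */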
686static int
687lf_clearlock(struct lockf *unlock)
688{
689 struct lockf **head = unlock->lf_head;
690 struct lockf *lf = *head;
691 struct lockf *overlap, **prev;
692 overlap_t ovcase;
693
694 if (lf == NOLOCKF)
695 return (0);
696#ifdef LOCKF_DEBUGGING
697 if (unlock->lf_type != F_UNLCK)
698 panic("lf_clearlock: bad type");
699 if (lockf_debug & 1)
700 lf_print("lf_clearlock", unlock);
701#endif /* LOCKF_DEBUGGING */
702 prev = head;
703 while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) != OVERLAP_NONE) {
704 /*
705 * Wakeup the list of locks to be retried.
706 */
707 lf_wakelock(overlap);
708
709 switch (ovcase) {
710 case OVERLAP_NONE: /* satisfy compiler enum/switch */
711 break;
712
713 case OVERLAP_EQUALS_LOCK:
714 *prev = overlap->lf_next;
715 FREE(overlap, M_LOCKF);
716 break;
717
718 case OVERLAP_CONTAINS_LOCK: /* split it */
719 if (overlap->lf_start == unlock->lf_start) {
720 overlap->lf_start = unlock->lf_end + 1;
721 break;
722 }
723 /*
724 * If we can't split the lock, we can't grant it.
725 * Claim a system limit for the resource shortage.
726 */
727 if (lf_split(overlap, unlock))
728 return (ENOLCK);
729 overlap->lf_next = unlock->lf_next;
730 break;
731
732 case OVERLAP_CONTAINED_BY_LOCK:
733 *prev = overlap->lf_next;
734 lf = overlap->lf_next;
735 FREE(overlap, M_LOCKF);
736 continue;
737
738 case OVERLAP_STARTS_BEFORE_LOCK:
739 overlap->lf_end = unlock->lf_start - 1;
740 prev = &overlap->lf_next;
741 lf = overlap->lf_next;
742 continue;
743
744 case OVERLAP_ENDS_AFTER_LOCK:
745 overlap->lf_start = unlock->lf_end + 1;
746 break;
747 }
748 break;
749 }
750#ifdef LOCKF_DEBUGGING
751 if (lockf_debug & 1)
752 lf_printlist("lf_clearlock", unlock);
753#endif /* LOCKF_DEBUGGING */
754 return (0);
755}
756
757
758/*
759 * lf_getlock
760 *
761 * Description: Check whether there is a blocking lock, and if so return
762 * its process identifier into the lock being requested.
763 *
764 * Parameters: lock Pointer to lock to test for blocks
765 * fl Pointer to flock structure to receive
766 * the blocking lock information, if a
767 * blocking lock is found.
768 *
769 * Returns: 0 Success
770 *
771 * Implicit Returns:
772 * *fl Contents modified to reflect the
773 * blocking lock, if one is found; not
774 * modified otherwise
775 *
776 * Notes: fl->l_pid will be (-1) for file locks and will only be set to
777 * the blocking process ID for advisory record locks.
778 */
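/*
 * Illustrative userland sketch (not part of the original file; 'fd' and
 * the ranges are hypothetical): an F_GETLK request arrives here with the
 * caller's prospective lock described by 'lock'; if another owner holds a
 * conflicting lock, the caller's flock structure is overwritten to
 * describe it.
 *
 *	struct flock fl = {
 *		.l_start  = 0,
 *		.l_len    = 0,		// whole file
 *		.l_whence = SEEK_SET,
 *		.l_type   = F_WRLCK,	// "could I get an exclusive lock?"
 *	};
 *	if (fcntl(fd, F_GETLK, &fl) == 0 && fl.l_type != F_UNLCK) {
 *		// fl now describes the first blocking lock; fl.l_pid is the
 *		// owning pid for POSIX locks, or -1 for flock-style locks.
 *	}
 */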
779static int
780lf_getlock(struct lockf *lock, struct flock *fl)
781{
782 struct lockf *block;
783
784#ifdef LOCKF_DEBUGGING
785 if (lockf_debug & 1)
786 lf_print("lf_getlock", lock);
787#endif /* LOCKF_DEBUGGING */
788
789 if ((block = lf_getblock(lock))) {
790 fl->l_type = block->lf_type;
791 fl->l_whence = SEEK_SET;
792 fl->l_start = block->lf_start;
793 if (block->lf_end == -1)
794 fl->l_len = 0;
795 else
796 fl->l_len = block->lf_end - block->lf_start + 1;
797 if (block->lf_flags & F_POSIX)
798 fl->l_pid = proc_pid((struct proc *)(block->lf_id));
799 else
800 fl->l_pid = -1;
801 } else {
802 fl->l_type = F_UNLCK;
803 }
804 return (0);
805}
806
807
808/*
809 * lf_getblock
810 *
811 * Description: Walk the list of locks for an inode and return the first
812 * blocking lock. A lock is considered blocking if we are not
813 * the lock owner; otherwise, we are permitted to upgrade or
814 * downgrade it, and it's not considered blocking.
815 *
816 * Parameters: lock The lock for which we are interested
817 * in obtaining the blocking lock, if any
818 *
819 * Returns: NOLOCKF No blocking lock exists
820 * !NOLOCKF The address of the blocking lock's
821 * struct lockf.
822 */
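/*
 * Illustrative examples (not part of the original file): an overlap found
 * with the OTHERS check only blocks us if at least one side wants
 * exclusivity.  Our F_RDLCK overlapping another owner's F_RDLCK is not a
 * conflict, so the scan continues; our F_RDLCK against their F_WRLCK, or
 * our F_WRLCK against any lock of theirs, is returned as the blocker.
 */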
823static struct lockf *
824lf_getblock(struct lockf *lock)
825{
826 struct lockf **prev, *overlap, *lf = *(lock->lf_head);
827 int ovcase;
828
829 prev = lock->lf_head;
830 while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) != OVERLAP_NONE) {
831 /*
832 * We've found an overlap, see if it blocks us
833 */
834 if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
835 return (overlap);
836 /*
837 * Nope, point to the next one on the list and
838 * see if it blocks us
839 */
840 lf = overlap->lf_next;
841 }
842 return (NOLOCKF);
843}
844
845
846/*
847 * lf_findoverlap
848 *
849 * Description: Walk the list of locks to find an overlapping lock (if any).
850 *
851 * Parameters: lf First lock on lock list
852 * lock The lock we are checking for an overlap
853 * check Check type
 854 * prev			pointer to pointer to pointer; receives the
 855 *				address of the next pointer in the lock
 856 *				preceding the overlapping lock, if any
857 * overlap pointer to pointer to contain address
858 * of overlapping lock
859 *
860 * Returns: OVERLAP_NONE
861 * OVERLAP_EQUALS_LOCK
862 * OVERLAP_CONTAINS_LOCK
863 * OVERLAP_CONTAINED_BY_LOCK
864 * OVERLAP_STARTS_BEFORE_LOCK
865 * OVERLAP_ENDS_AFTER_LOCK
866 *
867 * Implicit Returns:
868 * *prev The address of the next pointer in the
869 * lock previous to the overlapping lock;
870 * this is generally used to relink the
871 * lock list, avoiding a second iteration.
872 * *overlap The pointer to the overlapping lock
 873 *		itself; this is used to return data in
874 * the check == OTHERS case, and for the
875 * caller to modify the overlapping lock,
876 * in the check == SELF case
877 *
878 * Note: This returns only the FIRST overlapping lock. There may be
879 * more than one. lf_getlock will return the first blocking lock,
 880 *		while lf_setlock will iterate over all overlapping locks to handle them.
881 *
882 * The check parameter can be SELF, meaning we are looking for
 883 *		overlapping locks owned by us, or it can be OTHERS, meaning
884 * we are looking for overlapping locks owned by someone else so
885 * we can report a blocking lock on an F_GETLK request.
886 *
 887 *		The values of *overlap and *prev are modified, even if there is
888 * no overlapping lock found; always check the return code.
889 */
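/*
 * Range pictures for the overlap cases (illustrative, not part of the
 * original file); 'lock' is the request being checked, 'lf' is the list
 * entry under consideration:
 *
 *	OVERLAP_EQUALS_LOCK		lf:   |=======|
 *					lock: |=======|
 *
 *	OVERLAP_CONTAINS_LOCK		lf:   |===========|
 *					lock:    |=====|
 *
 *	OVERLAP_CONTAINED_BY_LOCK	lf:      |=====|
 *					lock: |===========|
 *
 *	OVERLAP_STARTS_BEFORE_LOCK	lf:   |=======|
 *					lock:     |=======|
 *
 *	OVERLAP_ENDS_AFTER_LOCK		lf:       |=======|
 *					lock: |=======|
 */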
890static overlap_t
891lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
892 struct lockf ***prev, struct lockf **overlap)
893{
894 off_t start, end;
895
896 *overlap = lf;
897 if (lf == NOLOCKF)
898 return (0);
899#ifdef LOCKF_DEBUGGING
900 if (lockf_debug & 2)
901 lf_print("lf_findoverlap: looking for overlap in", lock);
902#endif /* LOCKF_DEBUGGING */
903 start = lock->lf_start;
904 end = lock->lf_end;
905 while (lf != NOLOCKF) {
906 if (((type & SELF) && lf->lf_id != lock->lf_id) ||
907 ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
908 *prev = &lf->lf_next;
909 *overlap = lf = lf->lf_next;
910 continue;
911 }
912#ifdef LOCKF_DEBUGGING
913 if (lockf_debug & 2)
914 lf_print("\tchecking", lf);
915#endif /* LOCKF_DEBUGGING */
916 /*
917 * OK, check for overlap
918 */
919 if ((lf->lf_end != -1 && start > lf->lf_end) ||
920 (end != -1 && lf->lf_start > end)) {
921 /* Case 0 */
922 LOCKF_DEBUG(2, "no overlap\n");
923 if ((type & SELF) && end != -1 && lf->lf_start > end)
924 return (OVERLAP_NONE);
925 *prev = &lf->lf_next;
926 *overlap = lf = lf->lf_next;
927 continue;
928 }
929 if ((lf->lf_start == start) && (lf->lf_end == end)) {
930 LOCKF_DEBUG(2, "overlap == lock\n");
931 return (OVERLAP_EQUALS_LOCK);
932 }
933 if ((lf->lf_start <= start) &&
934 (end != -1) &&
935 ((lf->lf_end >= end) || (lf->lf_end == -1))) {
936 LOCKF_DEBUG(2, "overlap contains lock\n");
937 return (OVERLAP_CONTAINS_LOCK);
938 }
939 if (start <= lf->lf_start &&
940 (end == -1 ||
941 (lf->lf_end != -1 && end >= lf->lf_end))) {
942 LOCKF_DEBUG(2, "lock contains overlap\n");
943 return (OVERLAP_CONTAINED_BY_LOCK);
944 }
945 if ((lf->lf_start < start) &&
946 ((lf->lf_end >= start) || (lf->lf_end == -1))) {
947 LOCKF_DEBUG(2, "overlap starts before lock\n");
948 return (OVERLAP_STARTS_BEFORE_LOCK);
949 }
950 if ((lf->lf_start > start) &&
951 (end != -1) &&
952 ((lf->lf_end > end) || (lf->lf_end == -1))) {
953 LOCKF_DEBUG(2, "overlap ends after lock\n");
954 return (OVERLAP_ENDS_AFTER_LOCK);
955 }
956 panic("lf_findoverlap: default");
957 }
958 return (OVERLAP_NONE);
959}
960
961
962/*
963 * lf_split
964 *
965 * Description: Split a lock and a contained region into two or three locks
966 * as necessary.
967 *
968 * Parameters: lock1 Lock to split
969 * lock2 Overlapping lock region requiring the
970 * split (upgrade/downgrade/unlock)
971 *
972 * Returns: 0 Success
973 * ENOLCK No memory for new lock
974 *
975 * Implicit Returns:
976 * *lock1 Modified original lock
977 * *lock2 Overlapping lock (inserted into list)
978 * (new lock) Potential new lock inserted into list
979 * if split results in 3 locks
980 *
981 * Notes: This operation can only fail if the split would result in three
982 * locks, and there is insufficient memory to allocate the third
983 * lock; in that case, neither of the locks will be modified.
984 */
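/*
 * Worked example (illustrative, not part of the original file): with
 * lock1 covering [0, 199] and lock2 covering [50, 149], neither endpoint
 * matches, so a third lock is required: lock1 is trimmed to [0, 49], a
 * new 'splitlock' is created for [150, 199], and the list is relinked as
 * lock1 -> lock2 -> splitlock.  When lock2 shares either endpoint with
 * lock1, only two pieces result and the operation cannot fail.
 */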
985static int
986lf_split(struct lockf *lock1, struct lockf *lock2)
987{
988 struct lockf *splitlock;
989
990#ifdef LOCKF_DEBUGGING
991 if (lockf_debug & 2) {
992 lf_print("lf_split", lock1);
993 lf_print("splitting from", lock2);
994 }
995#endif /* LOCKF_DEBUGGING */
996 /*
 997	 * Check to see if splitting into only two pieces.
998 */
999 if (lock1->lf_start == lock2->lf_start) {
1000 lock1->lf_start = lock2->lf_end + 1;
1001 lock2->lf_next = lock1;
1002 return (0);
1003 }
1004 if (lock1->lf_end == lock2->lf_end) {
1005 lock1->lf_end = lock2->lf_start - 1;
1006 lock2->lf_next = lock1->lf_next;
1007 lock1->lf_next = lock2;
1008 return (0);
1009 }
1010 /*
1011 * Make a new lock consisting of the last part of
1012 * the encompassing lock
1013 */
1014 MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
1015 if (splitlock == NULL)
1016 return (ENOLCK);
1017 bcopy(lock1, splitlock, sizeof *splitlock);
1018 splitlock->lf_start = lock2->lf_end + 1;
1019 TAILQ_INIT(&splitlock->lf_blkhd);
1020 lock1->lf_end = lock2->lf_start - 1;
1021 /*
1022 * OK, now link it in
1023 */
1024 splitlock->lf_next = lock1->lf_next;
1025 lock2->lf_next = splitlock;
1026 lock1->lf_next = lock2;
1027
1028 return (0);
1029}
1030
1031
1032/*
1033 * lf_wakelock
1034 *
1035 * Wakeup a blocklist in the case of a downgrade or unlock, since others
1036 * waiting on the lock may now be able to acquire it.
1037 *
1038 * Parameters: listhead Lock list head on which waiters may
1039 * have pending locks
1040 *
1041 * Returns: <void>
1042 *
1043 * Notes: This function iterates a list of locks and wakes all waiters,
1044 * rather than only waiters for the contended regions. Because
1045 * of this, for heavily contended files, this can result in a
1046 * "thundering herd" situation. Refactoring the code could make
1047 * this operation more efficient, if heavy contention ever results
1048 * in a real-world performance problem.
1049 */
1050static void
1051lf_wakelock(struct lockf *listhead)
1052{
1053 struct lockf *wakelock;
1054
1055 while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
1056 wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
1057 TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
1058 wakelock->lf_next = NOLOCKF;
1059#ifdef LOCKF_DEBUGGING
1060 if (lockf_debug & 2)
1061 lf_print("lf_wakelock: awakening", wakelock);
1062#endif /* LOCKF_DEBUGGING */
1063 wakeup(wakelock);
1064 }
1065}
1066
1067
1068#ifdef LOCKF_DEBUGGING
1069/*
1070 * lf_print DEBUG
1071 *
1072 * Print out a lock; lock information is prefixed by the string in 'tag'
1073 *
1074 * Parameters: tag A string tag for debugging
1075 * lock The lock whose information should be
1076 * displayed
1077 *
1078 * Returns: <void>
1079 */
1080void
1081lf_print(const char *tag, struct lockf *lock)
1082{
1083 printf("%s: lock %p for ", tag, (void *)lock);
1084 if (lock->lf_flags & F_POSIX)
1085 printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid);
1086 else
1087 printf("id %p", (void *)lock->lf_id);
1088 if (lock->lf_vnode != 0)
1089 printf(" in vno %p, %s, start 0x%016llx, end 0x%016llx",
1090 lock->lf_vnode,
1091 lock->lf_type == F_RDLCK ? "shared" :
1092 lock->lf_type == F_WRLCK ? "exclusive" :
1093 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1094 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1095 else
1096 printf(" %s, start 0x%016llx, end 0x%016llx",
1097 lock->lf_type == F_RDLCK ? "shared" :
1098 lock->lf_type == F_WRLCK ? "exclusive" :
1099 lock->lf_type == F_UNLCK ? "unlock" : "unknown",
1100 (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
1101 if (!TAILQ_EMPTY(&lock->lf_blkhd))
1102 printf(" block %p\n", (void *)TAILQ_FIRST(&lock->lf_blkhd));
1103 else
1104 printf("\n");
1105}
1106
1107
1108/*
1109 * lf_printlist DEBUG
1110 *
1111 * Print out a lock list for the vnode associated with 'lock'; lock information
1112 * is prefixed by the string in 'tag'
1113 *
1114 * Parameters: tag A string tag for debugging
1115 * lock The lock whose vnode's lock list should
1116 * be displayed
1117 *
1118 * Returns: <void>
1119 */
1120void
1121lf_printlist(const char *tag, struct lockf *lock)
1122{
1123 struct lockf *lf, *blk;
1124
1125 if (lock->lf_vnode == 0)
1126 return;
1127
1128 printf("%s: Lock list for vno %p:\n",
1129 tag, lock->lf_vnode);
1130 for (lf = lock->lf_vnode->v_lockf; lf; lf = lf->lf_next) {
1131 printf("\tlock %p for ",(void *)lf);
1132 if (lf->lf_flags & F_POSIX)
1133 printf("proc %ld",
1134 (long)((struct proc *)lf->lf_id)->p_pid);
1135 else
1136 printf("id %p", (void *)lf->lf_id);
1137 printf(", %s, start 0x%016llx, end 0x%016llx",
1138 lf->lf_type == F_RDLCK ? "shared" :
1139 lf->lf_type == F_WRLCK ? "exclusive" :
1140 lf->lf_type == F_UNLCK ? "unlock" :
1141 "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
1142 TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
1143 printf("\n\t\tlock request %p for ", (void *)blk);
1144 if (blk->lf_flags & F_POSIX)
1145 printf("proc %ld",
1146 (long)((struct proc *)blk->lf_id)->p_pid);
1147 else
1148 printf("id %p", (void *)blk->lf_id);
1149 printf(", %s, start 0x%016llx, end 0x%016llx",
1150 blk->lf_type == F_RDLCK ? "shared" :
1151 blk->lf_type == F_WRLCK ? "exclusive" :
1152 blk->lf_type == F_UNLCK ? "unlock" :
1153 "unknown", (intmax_t)blk->lf_start,
1154 (intmax_t)blk->lf_end);
1155 if (!TAILQ_EMPTY(&blk->lf_blkhd))
1156 panic("lf_printlist: bad list");
1157 }
1158 printf("\n");
1159 }
1160}
1161#endif /* LOCKF_DEBUGGING */