/*
 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_alloc.c	8.18 (Berkeley) 5/26/95
 */
#include <rev_endian_fs.h>
#include <vm/vm_pager.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf_internal.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/vnode_internal.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/quota.h>

#include <sys/vm.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#if REV_ENDIAN_FS
#include <ufs/ufs/ufs_byte_order.h>
#include <architecture/byte_order.h>
#endif /* REV_ENDIAN_FS */

extern u_long nextgennumber;

static ufs_daddr_t ffs_alloccg(struct inode *, int, ufs_daddr_t, int);
static ufs_daddr_t ffs_alloccgblk(struct fs *, struct cg *, ufs_daddr_t);
static ufs_daddr_t ffs_clusteralloc(struct inode *, int, ufs_daddr_t, int);
static ino_t ffs_dirpref(struct inode *);
static ufs_daddr_t ffs_fragextend(struct inode *, int, long, int, int);
static void ffs_fserr(struct fs *, u_int, char *);
static u_long ffs_hashalloc
		(struct inode *, int, long, int, u_int32_t (*)());
static ino_t ffs_nodealloccg(struct inode *, int, ufs_daddr_t, int);
static ufs_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs_daddr_t, int);
static void ffs_clusteracct
		(struct fs *fs, struct cg *cgp, ufs_daddr_t blkno, int cnt);

/*
 * Allocate a block in the file system.
 *
 * The size of the requested block is given, which must be some
 * multiple of fs_fsize and <= fs_bsize.
 * A preference may be optionally specified. If a preference is given
 * the following hierarchy is used to allocate a block:
 * 1) allocate the requested block.
 * 2) allocate a rotationally optimal block in the same cylinder.
 * 3) allocate a block in the same cylinder group.
 * 4) quadratically rehash into other cylinder groups, until an
 *    available block is located.
 * If no block preference is given the following hierarchy is used
 * to allocate a block:
 * 1) allocate a block in the cylinder group that contains the
 *    inode for the file.
 * 2) quadratically rehash into other cylinder groups, until an
 *    available block is located.
 */
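/*
 * Example: on a hypothetical 8K/1K file system (fs_bsize = 8192,
 * fs_fsize = 1024, so fs_frag = 8), legal request sizes are the
 * multiples 1024, 2048, ..., 8192 bytes.
 */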
int
ffs_alloc(ip, lbn, bpref, size, cred, bnp)
	register struct inode *ip;
	ufs_daddr_t lbn, bpref;
	int size;
	kauth_cred_t cred;
	ufs_daddr_t *bnp;
{
	register struct fs *fs;
	ufs_daddr_t bno;
	int cg, error;
	int devBlockSize = 0;

	*bnp = 0;
	fs = ip->i_fs;
#if DIAGNOSTIC
	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
		panic("ffs_alloc: bad size");
	}
	if (cred == NOCRED)
		panic("ffs_alloc: missing credential\n");
#endif /* DIAGNOSTIC */
	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
		goto nospace;
	if (suser(cred, NULL) && freespace(fs, fs->fs_minfree) <= 0)
		goto nospace;
	devBlockSize = vfs_devblocksize(vnode_mount(ITOV(ip)));
#if QUOTA
	if (error = chkdq(ip, (int64_t)size, cred, 0))
		return (error);
#endif /* QUOTA */
	if (bpref >= fs->fs_size)
		bpref = 0;
	if (bpref == 0)
		cg = ino_to_cg(fs, ip->i_number);
	else
		cg = dtog(fs, bpref);
	bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, size,
	    (u_int32_t (*)())ffs_alloccg);
	if (bno > 0) {
		ip->i_blocks += btodb(size, devBlockSize);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bnp = bno;
		return (0);
	}
#if QUOTA
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, (int64_t)-size, cred, FORCE);
#endif /* QUOTA */
nospace:
	ffs_fserr(fs, kauth_cred_getuid(cred), "file system full");
	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Reallocate a fragment to a bigger size.
 *
 * The number and size of the old block are given, and a preference
 * and new size are also specified. The allocator attempts to extend
 * the original block. Failing that, the regular block allocator is
 * invoked to get an appropriate block.
 */
int
ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
	register struct inode *ip;
	ufs_daddr_t lbprev;
	ufs_daddr_t bpref;
	int osize, nsize;
	kauth_cred_t cred;
	struct buf **bpp;
{
	register struct fs *fs;
	struct buf *bp;
	int cg, request, error;
	ufs_daddr_t bprev, bno;
	int devBlockSize = 0;

	*bpp = 0;
	fs = ip->i_fs;
#if DIAGNOSTIC
	if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
	    (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
		printf(
		    "dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
		panic("ffs_realloccg: bad size");
	}
	if (cred == NOCRED)
		panic("ffs_realloccg: missing credential\n");
#endif /* DIAGNOSTIC */
	if (suser(cred, NULL) != 0 && freespace(fs, fs->fs_minfree) <= 0)
		goto nospace;
	if ((bprev = ip->i_db[lbprev]) == 0) {
		printf("dev = 0x%x, bsize = %d, bprev = %d, fs = %s\n",
		    ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt);
		panic("ffs_realloccg: bad bprev");
	}
	/*
	 * Allocate the extra space in the buffer.
	 */
	if (error = (int)buf_bread(ITOV(ip), (daddr64_t)((unsigned)lbprev), osize, NOCRED, &bp)) {
		buf_brelse(bp);
		return (error);
	}
	devBlockSize = vfs_devblocksize(vnode_mount(ITOV(ip)));

#if QUOTA
	if (error = chkdq(ip, (int64_t)(nsize - osize), cred, 0)) {
		buf_brelse(bp);
		return (error);
	}
#endif /* QUOTA */
	/*
	 * Check for extension in the existing location.
	 */
	cg = dtog(fs, bprev);
	if (bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) {
		if ((ufs_daddr_t)buf_blkno(bp) != fsbtodb(fs, bno))
			panic("bad blockno");
		ip->i_blocks += btodb(nsize - osize, devBlockSize);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		allocbuf(bp, nsize);
		buf_setflags(bp, B_DONE);
		bzero((char *)buf_dataptr(bp) + osize, (u_int)buf_size(bp) - osize);
		*bpp = bp;
		return (0);
	}
	/*
	 * Allocate a new disk location.
	 */
	if (bpref >= fs->fs_size)
		bpref = 0;
	switch ((int)fs->fs_optim) {
	case FS_OPTSPACE:
		/*
		 * Allocate an exact sized fragment. Although this makes
		 * best use of space, we will waste time relocating it if
		 * the file continues to grow. If the fragmentation is
		 * less than half of the minimum free reserve, we choose
		 * to begin optimizing for time.
		 */
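		/*
		 * Hypothetical example: with the historical default
		 * fs_minfree of 10%, the switch to FS_OPTTIME below
		 * happens once free fragments drop below 5% of
		 * fs_dsize; file systems with fs_minfree < 5% never
		 * switch.
		 */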
		request = nsize;
		if (fs->fs_minfree < 5 ||
		    fs->fs_cstotal.cs_nffree >
		    fs->fs_dsize * fs->fs_minfree / (2 * 100))
			break;
		log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
		    fs->fs_fsmnt);
		fs->fs_optim = FS_OPTTIME;
		break;
	case FS_OPTTIME:
		/*
		 * At this point we have discovered a file that is trying to
		 * grow a small fragment to a larger fragment. To save time,
		 * we allocate a full sized block, then free the unused portion.
		 * If the file continues to grow, the `ffs_fragextend' call
		 * above will be able to grow it in place without further
		 * copying. If aberrant programs cause disk fragmentation to
		 * grow within 2% of the free reserve, we choose to begin
		 * optimizing for space.
		 */
		request = fs->fs_bsize;
		if (fs->fs_cstotal.cs_nffree <
		    fs->fs_dsize * (fs->fs_minfree - 2) / 100)
			break;
		log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
		    fs->fs_fsmnt);
		fs->fs_optim = FS_OPTSPACE;
		break;
	default:
		printf("dev = 0x%x, optim = %d, fs = %s\n",
		    ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
		panic("ffs_realloccg: bad optim");
		/* NOTREACHED */
	}
	bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request,
	    (u_int32_t (*)())ffs_alloccg);
	if (bno > 0) {
		buf_setblkno(bp, (daddr64_t)((unsigned)fsbtodb(fs, bno)));
		ffs_blkfree(ip, bprev, (long)osize);
		if (nsize < request)
			ffs_blkfree(ip, bno + numfrags(fs, nsize),
			    (long)(request - nsize));
		ip->i_blocks += btodb(nsize - osize, devBlockSize);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		allocbuf(bp, nsize);
		buf_setflags(bp, B_DONE);
		bzero((char *)buf_dataptr(bp) + osize, (u_int)buf_size(bp) - osize);
		*bpp = bp;
		return (0);
	}
#if QUOTA
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, (int64_t)-(nsize - osize), cred, FORCE);
#endif /* QUOTA */
	buf_brelse(bp);
nospace:
	/*
	 * no space available
	 */
	ffs_fserr(fs, kauth_cred_getuid(cred), "file system full");
	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Reallocate a sequence of blocks into a contiguous sequence of blocks.
 *
 * The vnode and an array of buffer pointers for a range of sequential
 * logical blocks to be made contiguous are given. The allocator attempts
 * to find a range of sequential blocks starting as close as possible to
 * an fs_rotdelay offset from the end of the allocation for the logical
 * block immediately preceding the current range. If successful, the
 * physical block numbers in the buffer pointers and in the inode are
 * changed to reflect the new allocation. If unsuccessful, the allocation
 * is left unchanged. The success in doing the reallocation is returned.
 * Note that the error return is not reflected back to the user. Rather
 * the previous block allocation will be used.
 */
/* Note: This routine is unused in UBC cluster I/O */

int doasyncfree = 1;
int doreallocblks = 1;


/*
 * Allocate an inode in the file system.
 *
 * If allocating a directory, use ffs_dirpref to select the inode.
 * If allocating in a directory, the following hierarchy is followed:
 * 1) allocate the preferred inode.
 * 2) allocate an inode in the same cylinder group.
 * 3) quadratically rehash into other cylinder groups, until an
 *    available inode is located.
 * If no inode preference is given the following hierarchy is used
 * to allocate an inode:
 * 1) allocate an inode in cylinder group 0.
 * 2) quadratically rehash into other cylinder groups, until an
 *    available inode is located.
 */
int
ffs_valloc(
	struct vnode *pvp,
	mode_t mode,
	kauth_cred_t cred,
	struct vnode **vpp)
{
	register struct inode *pip;
	register struct fs *fs;
	register struct inode *ip;
	struct timeval tv;
	ino_t ino, ipref;
	int cg, error;

	*vpp = NULL;
	pip = VTOI(pvp);
	fs = pip->i_fs;
	if (fs->fs_cstotal.cs_nifree == 0)
		goto noinodes;

	if ((mode & IFMT) == IFDIR)
		ipref = ffs_dirpref(pip);
	else
		ipref = pip->i_number;
	if (ipref >= fs->fs_ncg * fs->fs_ipg)
		ipref = 0;
	cg = ino_to_cg(fs, ipref);
	/*
	 * Track the number of dirs created one after another
	 * in a cg without intervening files.
	 */
	if ((mode & IFMT) == IFDIR) {
		if (fs->fs_contigdirs[cg] < 255)
			fs->fs_contigdirs[cg]++;
	} else {
		if (fs->fs_contigdirs[cg] > 0)
			fs->fs_contigdirs[cg]--;
	}
	ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, ffs_nodealloccg);
	if (ino == 0)
		goto noinodes;

	error = ffs_vget_internal(pvp->v_mount, ino, vpp, NULL, NULL, mode, 0);
	if (error) {
		ffs_vfree(pvp, ino, mode);
		return (error);
	}
	ip = VTOI(*vpp);

	if (ip->i_mode) {
		printf("mode = 0%o, inum = %d, fs = %s\n",
		    ip->i_mode, ip->i_number, fs->fs_fsmnt);
		panic("ffs_valloc: dup alloc");
	}
	if (ip->i_blocks) {	/* XXX */
		printf("free inode %s/%d had %d blocks\n",
		    fs->fs_fsmnt, ino, ip->i_blocks);
		ip->i_blocks = 0;
	}
	ip->i_flags = 0;
	/*
	 * Set up a new generation number for this inode.
	 */
	microtime(&tv);
	if (++nextgennumber < (u_long)tv.tv_sec)
		nextgennumber = tv.tv_sec;
	ip->i_gen = nextgennumber;
	return (0);
noinodes:
	ffs_fserr(fs, kauth_cred_getuid(cred), "out of inodes");
	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Find a cylinder group to place a directory.
 *
 * The policy implemented by this algorithm is to allocate a
 * directory inode in the same cylinder group as its parent
 * directory, but also to reserve space for the inodes of its
 * files and for its data. Restrict the number of directories
 * which may be allocated one after another in the same cylinder
 * group without intervening allocation of files.
 */
static ino_t
ffs_dirpref(pip)
	struct inode *pip;
{
	register struct fs *fs;
	int cg, prefcg, dirsize, cgsize;
	int avgifree, avgbfree, avgndir, curdirsize;
	int minifree, minbfree, maxndir;
	int mincg, minndir;
	int maxcontigdirs;

	fs = pip->i_fs;
	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;

	/*
	 * Force allocation in another cg if creating a first level dir.
	 */
	if (ITOV(pip)->v_flag & VROOT) {
#ifdef __APPLE__
		prefcg = random() % fs->fs_ncg;
#else
		prefcg = arc4random() % fs->fs_ncg;
#endif
		mincg = prefcg;
		minndir = fs->fs_ipg;
		for (cg = prefcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		for (cg = 0; cg < prefcg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		return ((ino_t)(fs->fs_ipg * mincg));
	}

	/*
	 * Compute various limits used for the optimal allocation of
	 * a directory inode.
	 */
	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
	minifree = avgifree - fs->fs_ipg / 4;
	if (minifree < 0)
		minifree = 0;
	minbfree = avgbfree - fs->fs_fpg / fs->fs_frag / 4;
	if (minbfree < 0)
		minbfree = 0;
	cgsize = fs->fs_fsize * fs->fs_fpg;
	dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
	curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
	if (dirsize < curdirsize)
		dirsize = curdirsize;
	maxcontigdirs = min(cgsize / dirsize, 255);
	if (fs->fs_avgfpdir > 0)
		maxcontigdirs = min(maxcontigdirs,
		    fs->fs_ipg / fs->fs_avgfpdir);
	if (maxcontigdirs == 0)
		maxcontigdirs = 1;
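	/*
	 * Hypothetical example: with cgsize = 64 MB and an expected
	 * dirsize of 1 MB, maxcontigdirs allows up to 64 directories
	 * back to back in one cylinder group (further capped at 255
	 * and at fs_ipg / fs_avgfpdir) before allocation moves on.
	 */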

	/*
	 * Limit number of dirs in one cg and reserve space for
	 * regular files, but only if we have no deficit in
	 * inodes or space.
	 */
	prefcg = ino_to_cg(fs, pip->i_number);
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	/*
	 * This is a backstop when we have a deficit in space.
	 */
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			return ((ino_t)(fs->fs_ipg * cg));
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			break;
	return ((ino_t)(fs->fs_ipg * cg));
}

/*
 * Select the desired position for the next block in a file.  The file is
 * logically divided into sections. The first section is composed of the
 * direct blocks. Each additional section contains fs_maxbpg blocks.
 *
 * If no blocks have been allocated in the first section, the policy is to
 * request a block in the same cylinder group as the inode that describes
 * the file. If no blocks have been allocated in any other section, the
 * policy is to place the section in a cylinder group with a greater than
 * average number of free blocks.  An appropriate cylinder group is found
 * by using a rotor that sweeps the cylinder groups. When a new group of
 * blocks is needed, the sweep begins in the cylinder group following the
 * cylinder group from which the previous allocation was made. The sweep
 * continues until a cylinder group with greater than the average number
 * of free blocks is found. If the allocation is for the first block in an
 * indirect block, the information on the previous allocation is unavailable;
 * here a best guess is made based upon the logical block number being
 * allocated.
 *
 * If a section is already partially allocated, the policy is to
 * contiguously allocate fs_maxcontig blocks.  The end of one of these
 * contiguous blocks and the beginning of the next is physically separated
 * so that the disk head will be in transit between them for at least
 * fs_rotdelay milliseconds.  This is to allow time for the processor to
 * schedule another I/O transfer.
 */
ufs_daddr_t
ffs_blkpref(ip, lbn, indx, bap)
	struct inode *ip;
	ufs_daddr_t lbn;
	int indx;
	ufs_daddr_t *bap;
{
	register struct fs *fs;
	register int cg;
	int avgbfree, startcg;
	ufs_daddr_t nextblk;
#if REV_ENDIAN_FS
	daddr_t prev = 0;
	struct vnode *vp = ITOV(ip);
	struct mount *mp = vp->v_mount;
	int rev_endian = (mp->mnt_flag & MNT_REVEND);
#endif /* REV_ENDIAN_FS */

	fs = ip->i_fs;
#if REV_ENDIAN_FS
	if (indx && bap) {
		if (rev_endian) {
			if (bap != &ip->i_db[0])
				prev = NXSwapLong(bap[indx - 1]);
			else
				prev = bap[indx - 1];
		} else
			prev = bap[indx - 1];
	}
	if (indx % fs->fs_maxbpg == 0 || prev == 0)
#else /* REV_ENDIAN_FS */
	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0)
#endif /* REV_ENDIAN_FS */
	{
		if (lbn < NDADDR) {
			cg = ino_to_cg(fs, ip->i_number);
			return (fs->fs_fpg * cg + fs->fs_frag);
		}
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
#if REV_ENDIAN_FS
		if (indx == 0 || prev == 0)
#else /* REV_ENDIAN_FS */
		if (indx == 0 || bap[indx - 1] == 0)
#endif /* REV_ENDIAN_FS */
			startcg =
			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
		else
#if REV_ENDIAN_FS
			startcg = dtog(fs, prev) + 1;
#else /* REV_ENDIAN_FS */
			startcg = dtog(fs, bap[indx - 1]) + 1;
#endif /* REV_ENDIAN_FS */
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (fs->fs_fpg * cg + fs->fs_frag);
			}
		for (cg = 0; cg <= startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (fs->fs_fpg * cg + fs->fs_frag);
			}
		return (NULL);
	}
	/*
	 * One or more previous blocks have been laid out. If less
	 * than fs_maxcontig previous blocks are contiguous, the
	 * next block is requested contiguously, otherwise it is
	 * requested rotationally delayed by fs_rotdelay milliseconds.
	 */
#if REV_ENDIAN_FS
	if (rev_endian) {
		nextblk = prev + fs->fs_frag;
		if (indx < fs->fs_maxcontig) {
			return (nextblk);
		}
		if (bap != &ip->i_db[0])
			prev = NXSwapLong(bap[indx - fs->fs_maxcontig]);
		else
			prev = bap[indx - fs->fs_maxcontig];
		if (prev + blkstofrags(fs, fs->fs_maxcontig) != nextblk)
			return (nextblk);
	} else {
#endif /* REV_ENDIAN_FS */
	nextblk = bap[indx - 1] + fs->fs_frag;
	if (indx < fs->fs_maxcontig || bap[indx - fs->fs_maxcontig] +
	    blkstofrags(fs, fs->fs_maxcontig) != nextblk)
		return (nextblk);
#if REV_ENDIAN_FS
	}
#endif /* REV_ENDIAN_FS */
	if (fs->fs_rotdelay != 0)
		/*
		 * Here we convert ms of delay to frags as:
		 *	(frags) = (ms) * (rev/sec) * (sect/rev) /
		 *		((sect/frag) * (ms/sec))
		 * then round up to the next block.
		 */
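		/*
		 * Worked example (hypothetical geometry): with
		 * fs_rotdelay = 4 ms, fs_rps = 60 rev/s, fs_nsect = 32
		 * sectors/track and NSPF(fs) = 2, this is
		 * 4 * 60 * 32 / 2000 = 3 frags (integer division),
		 * rounded up to fs_frag = 8, i.e. one full block.
		 */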
		nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
		    (NSPF(fs) * 1000), fs->fs_frag);
	return (nextblk);
}

/*
 * Implement the cylinder overflow algorithm.
 *
 * The policy implemented by this algorithm is:
 * 1) allocate the block in its requested cylinder group.
 * 2) quadratically rehash on the cylinder group number.
 * 3) brute force search for a free block.
 */
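/*
 * With the i *= 2 rehash below, the groups probed for a starting
 * group icg are icg, icg+1, icg+3, icg+7, icg+15, ... (mod fs_ncg),
 * followed by the linear backstop scan.
 */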
/*VARARGS5*/
static u_long
ffs_hashalloc(ip, cg, pref, size, allocator)
	struct inode *ip;
	int cg;
	long pref;
	int size;	/* size for data blocks, mode for inodes */
	u_int32_t (*allocator)();
{
	register struct fs *fs;
	long result;
	int i, icg = cg;

	fs = ip->i_fs;
	/*
	 * 1: preferred cylinder group
	 */
	result = (*allocator)(ip, cg, pref, size);
	if (result)
		return (result);
	/*
	 * 2: quadratic rehash
	 */
	for (i = 1; i < fs->fs_ncg; i *= 2) {
		cg += i;
		if (cg >= fs->fs_ncg)
			cg -= fs->fs_ncg;
		result = (*allocator)(ip, cg, 0, size);
		if (result)
			return (result);
	}
	/*
	 * 3: brute force search
	 * Note that we start at i == 2, since 0 was checked initially,
	 * and 1 is always checked in the quadratic rehash.
	 */
	cg = (icg + 2) % fs->fs_ncg;
	for (i = 2; i < fs->fs_ncg; i++) {
		result = (*allocator)(ip, cg, 0, size);
		if (result)
			return (result);
		cg++;
		if (cg == fs->fs_ncg)
			cg = 0;
	}
	return (NULL);
}

/*
 * Determine whether a fragment can be extended.
 *
 * Check to see if the necessary fragments are available, and
 * if they are, allocate them.
 */
static ufs_daddr_t
ffs_fragextend(ip, cg, bprev, osize, nsize)
	struct inode *ip;
	int cg;
	long bprev;
	int osize, nsize;
{
	register struct fs *fs;
	register struct cg *cgp;
	struct buf *bp;
	struct timeval tv;
	long bno;
	int frags, bbase;
	int i, error;
#if REV_ENDIAN_FS
	struct vnode *vp = ITOV(ip);
	struct mount *mp = vp->v_mount;
	int rev_endian = (mp->mnt_flag & MNT_REVEND);
#endif /* REV_ENDIAN_FS */

	fs = ip->i_fs;
	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
		return (NULL);
	frags = numfrags(fs, nsize);	/* number of fragments needed */
	bbase = fragnum(fs, bprev);	/* offset in a frag (it is mod fragsize) */
	if (bbase > fragnum(fs, (bprev + frags - 1))) {
		/* cannot extend across a block boundary */
		return (NULL);
	}
	/* read corresponding cylinder group info */
	error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		buf_brelse(bp);
		return (NULL);
	}
	cgp = (struct cg *)buf_dataptr(bp);
#if REV_ENDIAN_FS
	if (rev_endian) {
		byte_swap_cgin(cgp, fs);
	}
#endif /* REV_ENDIAN_FS */

	if (!cg_chkmagic(cgp)) {
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		buf_brelse(bp);
		return (NULL);
	}
	microtime(&tv);
	cgp->cg_time = tv.tv_sec;
	bno = dtogd(fs, bprev);
	for (i = numfrags(fs, osize); i < frags; i++)
		if (isclr(cg_blksfree(cgp), bno + i)) {
#if REV_ENDIAN_FS
			if (rev_endian)
				byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
			buf_brelse(bp);
			return (NULL);
		}
	/*
	 * The current fragment can be extended:
	 * deduct the count on the fragment being extended into,
	 * increase the count on the remaining fragment (if any),
	 * and allocate the extended piece.
	 */
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(cg_blksfree(cgp), bno + i))
			break;
	cgp->cg_frsum[i - numfrags(fs, osize)]--;
	if (i != frags)
		cgp->cg_frsum[i - frags]++;
	for (i = numfrags(fs, osize); i < frags; i++) {
		clrbit(cg_blksfree(cgp), bno + i);
		cgp->cg_cs.cs_nffree--;
		fs->fs_cstotal.cs_nffree--;
		fs->fs_cs(fs, cg).cs_nffree--;
	}
	fs->fs_fmod = 1;
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
	buf_bdwrite(bp);
	return (bprev);
}

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the appropriate size is available,
 * and if it is, allocate it.
 */
static ufs_daddr_t
ffs_alloccg(ip, cg, bpref, size)
	struct inode *ip;
	int cg;
	ufs_daddr_t bpref;
	int size;
{
	register struct fs *fs;
	register struct cg *cgp;
	struct buf *bp;
	struct timeval tv;
	register int i;
	int error, bno, frags, allocsiz;
#if REV_ENDIAN_FS
	struct vnode *vp = ITOV(ip);
	struct mount *mp = vp->v_mount;
	int rev_endian = (mp->mnt_flag & MNT_REVEND);
#endif /* REV_ENDIAN_FS */

	fs = ip->i_fs;
	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
		return (NULL);
	error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		buf_brelse(bp);
		return (NULL);
	}
	cgp = (struct cg *)buf_dataptr(bp);
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgin(cgp, fs);
#endif /* REV_ENDIAN_FS */
	if (!cg_chkmagic(cgp) ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		buf_brelse(bp);
		return (NULL);
	}
	microtime(&tv);
	cgp->cg_time = tv.tv_sec;
	if (size == fs->fs_bsize) {
		bno = ffs_alloccgblk(fs, cgp, bpref);
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		buf_bdwrite(bp);
		return (bno);
	}
	/*
	 * check to see if any fragments are already available
	 * allocsiz is the size which will be allocated, hacking
	 * it down to a smaller size if necessary
	 */
	frags = numfrags(fs, size);
	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;
	if (allocsiz == fs->fs_frag) {
		/*
		 * no fragments were available, so a block will be
		 * allocated, and hacked up
		 */
		if (cgp->cg_cs.cs_nbfree == 0) {
#if REV_ENDIAN_FS
			if (rev_endian)
				byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
			buf_brelse(bp);
			return (NULL);
		}
		bno = ffs_alloccgblk(fs, cgp, bpref);
		bpref = dtogd(fs, bno);
		for (i = frags; i < fs->fs_frag; i++)
			setbit(cg_blksfree(cgp), bpref + i);
		i = fs->fs_frag - frags;
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		fs->fs_fmod = 1;
		cgp->cg_frsum[i]++;
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		buf_bdwrite(bp);
		return (bno);
	}
	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
	if (bno < 0) {
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		buf_brelse(bp);
		return (NULL);
	}
	for (i = 0; i < frags; i++)
		clrbit(cg_blksfree(cgp), bno + i);
	cgp->cg_cs.cs_nffree -= frags;
	fs->fs_cstotal.cs_nffree -= frags;
	fs->fs_cs(fs, cg).cs_nffree -= frags;
	fs->fs_fmod = 1;
	cgp->cg_frsum[allocsiz]--;
	if (frags != allocsiz)
		cgp->cg_frsum[allocsiz - frags]++;
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
	buf_bdwrite(bp);
	return (cg * fs->fs_fpg + bno);
}

/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 * 1) allocate the requested block.
 * 2) allocate a rotationally optimal block in the same cylinder.
 * 3) allocate the next available block on the block rotor for the
 *    specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 */
static ufs_daddr_t
ffs_alloccgblk(fs, cgp, bpref)
	register struct fs *fs;
	register struct cg *cgp;
	ufs_daddr_t bpref;
{
	ufs_daddr_t bno, blkno;
	int cylno, pos, delta;
	short *cylbp;
	register int i;

	if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) {
		bpref = cgp->cg_rotor;
		goto norot;
	}
	bpref = blknum(fs, bpref);
	bpref = dtogd(fs, bpref);
	/*
	 * if the requested block is available, use it
	 */
	if (ffs_isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bpref))) {
		bno = bpref;
		goto gotit;
	}
	if (fs->fs_nrpos <= 1 || fs->fs_cpc == 0) {
		/*
		 * Block layout information is not available.
		 * Leaving bpref unchanged means we take the
		 * next available free block following the one
		 * we just allocated. Hopefully this will at
		 * least hit a track cache on drives of unknown
		 * geometry (e.g. SCSI).
		 */
		goto norot;
	}
	/*
	 * check for a block available on the same cylinder
	 */
	cylno = cbtocylno(fs, bpref);
	if (cg_blktot(cgp)[cylno] == 0)
		goto norot;
	/*
	 * check the summary information to see if a block is
	 * available in the requested cylinder starting at the
	 * requested rotational position and proceeding around.
	 */
	cylbp = cg_blks(fs, cgp, cylno);
	pos = cbtorpos(fs, bpref);
	for (i = pos; i < fs->fs_nrpos; i++)
		if (cylbp[i] > 0)
			break;
	if (i == fs->fs_nrpos)
		for (i = 0; i < pos; i++)
			if (cylbp[i] > 0)
				break;
	if (cylbp[i] > 0) {
		/*
		 * Found a rotational position; now find the actual
		 * block. Panic if none is actually there.
		 */
		pos = cylno % fs->fs_cpc;
		bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
		if (fs_postbl(fs, pos)[i] == -1) {
			printf("pos = %d, i = %d, fs = %s\n",
			    pos, i, fs->fs_fsmnt);
			panic("ffs_alloccgblk: cyl groups corrupted");
		}
		for (i = fs_postbl(fs, pos)[i];; ) {
			if (ffs_isblock(fs, cg_blksfree(cgp), bno + i)) {
				bno = blkstofrags(fs, (bno + i));
				goto gotit;
			}
			delta = fs_rotbl(fs)[i];
			if (delta <= 0 ||
			    delta + i > fragstoblks(fs, fs->fs_fpg))
				break;
			i += delta;
		}
		printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
		panic("ffs_alloccgblk: can't find blk in cyl");
	}
norot:
	/*
	 * no blocks in the requested cylinder, so take next
	 * available one in this cylinder group.
	 */
	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
	if (bno < 0)
		return (NULL);
	cgp->cg_rotor = bno;
gotit:
	blkno = fragstoblks(fs, bno);
	ffs_clrblock(fs, cg_blksfree(cgp), (long)blkno);
	ffs_clusteracct(fs, cgp, blkno, -1);
	cgp->cg_cs.cs_nbfree--;
	fs->fs_cstotal.cs_nbfree--;
	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
	cylno = cbtocylno(fs, bno);
	cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--;
	cg_blktot(cgp)[cylno]--;
	fs->fs_fmod = 1;
	return (cgp->cg_cgx * fs->fs_fpg + bno);
}

/*
 * Determine whether a cluster can be allocated.
 *
 * We do not currently check for optimal rotational layout if there
 * are multiple choices in the same cylinder group. Instead we just
 * take the first one that we find following bpref.
 */
static ufs_daddr_t
ffs_clusteralloc(ip, cg, bpref, len)
	struct inode *ip;
	int cg;
	ufs_daddr_t bpref;
	int len;
{
	register struct fs *fs;
	register struct cg *cgp;
	struct buf *bp;
	int i, got, run, bno, bit, map;
	u_char *mapp;
	int32_t *lp;
#if REV_ENDIAN_FS
	struct vnode *vp = ITOV(ip);
	struct mount *mp = vp->v_mount;
	int rev_endian = (mp->mnt_flag & MNT_REVEND);
#endif /* REV_ENDIAN_FS */

	fs = ip->i_fs;
	if (fs->fs_maxcluster[cg] < len)
		return (NULL);
	if (buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))), (int)fs->fs_cgsize,
	    NOCRED, &bp))
		goto fail;
	cgp = (struct cg *)buf_dataptr(bp);
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgin(cgp, fs);
#endif /* REV_ENDIAN_FS */
	if (!cg_chkmagic(cgp)) {
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		goto fail;
	}
	/*
	 * Check to see if a cluster of the needed size (or bigger) is
	 * available in this cylinder group.
	 */
	lp = &cg_clustersum(cgp)[len];
	for (i = len; i <= fs->fs_contigsumsize; i++)
		if (*lp++ > 0)
			break;
	if (i > fs->fs_contigsumsize) {
		/*
		 * This is the first time looking for a cluster in this
		 * cylinder group. Update the cluster summary information
		 * to reflect the true maximum sized cluster so that
		 * future cluster allocation requests can avoid reading
		 * the cylinder group map only to find no clusters.
		 */
		lp = &cg_clustersum(cgp)[len - 1];
		for (i = len - 1; i > 0; i--)
			if (*lp-- > 0)
				break;
		fs->fs_maxcluster[cg] = i;
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		goto fail;
	}
	/*
	 * Search the cluster map to find a big enough cluster.
	 * We take the first one that we find, even if it is larger
	 * than we need as we prefer to get one close to the previous
	 * block allocation. We do not search before the current
	 * preference point as we do not want to allocate a block
	 * that is allocated before the previous one (as we will
	 * then have to wait for another pass of the elevator
	 * algorithm before it will be read). We prefer to fail and
	 * be recalled to try an allocation in the next cylinder group.
	 */
	if (dtog(fs, bpref) != cg)
		bpref = 0;
	else
		bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref)));
	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
	map = *mapp++;
	bit = 1 << (bpref % NBBY);
	for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
		if ((map & bit) == 0) {
			run = 0;
		} else {
			run++;
			if (run == len)
				break;
		}
		if ((got & (NBBY - 1)) != (NBBY - 1)) {
			bit <<= 1;
		} else {
			map = *mapp++;
			bit = 1;
		}
	}
	if (got == cgp->cg_nclusterblks) {
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		goto fail;
	}
	/*
	 * Allocate the cluster that we have found.
	 */
	for (i = 1; i <= len; i++)
		if (!ffs_isblock(fs, cg_blksfree(cgp), got - run + i))
			panic("ffs_clusteralloc: map mismatch");
	bno = cg * fs->fs_fpg + blkstofrags(fs, got - run + 1);
	if (dtog(fs, bno) != cg)
		panic("ffs_clusteralloc: allocated out of group");
	len = blkstofrags(fs, len);
	for (i = 0; i < len; i += fs->fs_frag)
		if ((got = ffs_alloccgblk(fs, cgp, bno + i)) != bno + i)
			panic("ffs_clusteralloc: lost block");
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
	buf_bdwrite(bp);
	return (bno);

fail:
	buf_brelse(bp);
	return (0);
}

/*
 * Determine whether an inode can be allocated.
 *
 * Check to see if an inode is available, and if it is,
 * allocate it using the following policy:
 * 1) allocate the requested inode.
 * 2) allocate the next available inode after the requested
 *    inode in the specified cylinder group.
 */
static ino_t
ffs_nodealloccg(ip, cg, ipref, mode)
	struct inode *ip;
	int cg;
	ufs_daddr_t ipref;
	int mode;
{
	register struct fs *fs;
	register struct cg *cgp;
	struct buf *bp;
	struct timeval tv;
	int error, start, len, loc, map, i;
#if REV_ENDIAN_FS
	struct vnode *vp = ITOV(ip);
	struct mount *mp = vp->v_mount;
	int rev_endian = (mp->mnt_flag & MNT_REVEND);
#endif /* REV_ENDIAN_FS */

	fs = ip->i_fs;
	if (fs->fs_cs(fs, cg).cs_nifree == 0)
		return (NULL);
	error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		buf_brelse(bp);
		return (NULL);
	}
	cgp = (struct cg *)buf_dataptr(bp);
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgin(cgp, fs);
#endif /* REV_ENDIAN_FS */
	if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		buf_brelse(bp);
		return (NULL);
	}

	microtime(&tv);
	cgp->cg_time = tv.tv_sec;
	if (ipref) {
		ipref %= fs->fs_ipg;
		if (isclr(cg_inosused(cgp), ipref))
			goto gotit;
	}
	start = cgp->cg_irotor / NBBY;
	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
	loc = skpc(0xff, len, &cg_inosused(cgp)[start]);
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = skpc(0xff, len, &cg_inosused(cgp)[0]);
		if (loc == 0) {
			printf("cg = %d, irotor = %d, fs = %s\n",
			    cg, cgp->cg_irotor, fs->fs_fsmnt);
			panic("ffs_nodealloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	i = start + len - loc;
	map = cg_inosused(cgp)[i];
	ipref = i * NBBY;
	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
		if ((map & i) == 0) {
			cgp->cg_irotor = ipref;
			goto gotit;
		}
	}
	printf("fs = %s\n", fs->fs_fsmnt);
	panic("ffs_nodealloccg: block not in map");
	/* NOTREACHED */
gotit:
	setbit(cg_inosused(cgp), ipref);
	cgp->cg_cs.cs_nifree--;
	fs->fs_cstotal.cs_nifree--;
	fs->fs_cs(fs, cg).cs_nifree--;
	fs->fs_fmod = 1;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir++;
		fs->fs_cstotal.cs_ndir++;
		fs->fs_cs(fs, cg).cs_ndir++;
	}
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
	buf_bdwrite(bp);
	return (cg * fs->fs_ipg + ipref);
}

/*
 * Free a block or fragment.
 *
 * The specified block or fragment is placed back in the
 * free map. If a fragment is deallocated, a possible
 * block reassembly is checked.
 */
void
ffs_blkfree(ip, bno, size)
	register struct inode *ip;
	ufs_daddr_t bno;
	long size;
{
	register struct fs *fs;
	register struct cg *cgp;
	struct buf *bp;
	struct timeval tv;
	ufs_daddr_t blkno;
	int i, error, cg, blk, frags, bbase;
#if REV_ENDIAN_FS
	struct vnode *vp = ITOV(ip);
	struct mount *mp = vp->v_mount;
	int rev_endian = (mp->mnt_flag & MNT_REVEND);
#endif /* REV_ENDIAN_FS */

	fs = ip->i_fs;
	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
		panic("blkfree: bad size");
	}
	cg = dtog(fs, bno);
	if ((u_int)bno >= fs->fs_size) {
		printf("bad block %d, ino %d\n", bno, ip->i_number);
		ffs_fserr(fs, ip->i_uid, "bad block");
		return;
	}
	error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		buf_brelse(bp);
		return;
	}
	cgp = (struct cg *)buf_dataptr(bp);
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgin(cgp, fs);
#endif /* REV_ENDIAN_FS */
	if (!cg_chkmagic(cgp)) {
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		buf_brelse(bp);
		return;
	}
	microtime(&tv);
	cgp->cg_time = tv.tv_sec;
	bno = dtogd(fs, bno);
	if (size == fs->fs_bsize) {
		blkno = fragstoblks(fs, bno);
		if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) {
			printf("dev = 0x%x, block = %d, fs = %s\n",
			    ip->i_dev, bno, fs->fs_fsmnt);
			panic("blkfree: freeing free block");
		}
		ffs_setblock(fs, cg_blksfree(cgp), blkno);
		ffs_clusteracct(fs, cgp, blkno, 1);
		cgp->cg_cs.cs_nbfree++;
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
		i = cbtocylno(fs, bno);
		cg_blks(fs, cgp, i)[cbtorpos(fs, bno)]++;
		cg_blktot(cgp)[i]++;
	} else {
		bbase = bno - fragnum(fs, bno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, cg_blksfree(cgp), bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
		/*
		 * deallocate the fragment
		 */
		frags = numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(cg_blksfree(cgp), bno + i)) {
				printf("dev = 0x%x, block = %d, fs = %s\n",
				    ip->i_dev, bno + i, fs->fs_fsmnt);
				panic("blkfree: freeing free frag");
			}
			setbit(cg_blksfree(cgp), bno + i);
		}
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, cg_blksfree(cgp), bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
		/*
		 * if a complete block has been reassembled, account for it
		 */
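		/*
		 * For example, freeing the last missing 1K fragment of
		 * a hypothetical 8K block turns fs_frag (8) fragment
		 * credits back into a single full-block credit below.
		 */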
		blkno = fragstoblks(fs, bbase);
		if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) {
			cgp->cg_cs.cs_nffree -= fs->fs_frag;
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(fs, cgp, blkno, 1);
			cgp->cg_cs.cs_nbfree++;
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
			i = cbtocylno(fs, bbase);
			cg_blks(fs, cgp, i)[cbtorpos(fs, bbase)]++;
			cg_blktot(cgp)[i]++;
		}
	}
	fs->fs_fmod = 1;
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
	buf_bdwrite(bp);
}

#if DIAGNOSTIC
/*
 * Verify allocation of a block or fragment. Returns true if block or
 * fragment is allocated, false if it is free.
 */
ffs_checkblk(ip, bno, size)
	struct inode *ip;
	ufs_daddr_t bno;
	long size;
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	int i, error, frags, free;
#if REV_ENDIAN_FS
	struct vnode *vp = ITOV(ip);
	struct mount *mp = vp->v_mount;
	int rev_endian = (mp->mnt_flag & MNT_REVEND);
#endif /* REV_ENDIAN_FS */

	fs = ip->i_fs;
	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("bsize = %d, size = %d, fs = %s\n",
		    fs->fs_bsize, size, fs->fs_fsmnt);
		panic("checkblk: bad size");
	}
	if ((u_int)bno >= fs->fs_size)
		panic("checkblk: bad block %d", bno);
	error = (int)buf_bread(ip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, dtog(fs, bno)))),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		buf_brelse(bp);
		return;
	}
	cgp = (struct cg *)buf_dataptr(bp);
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgin(cgp, fs);
#endif /* REV_ENDIAN_FS */
	if (!cg_chkmagic(cgp)) {
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		buf_brelse(bp);
		return;
	}
	bno = dtogd(fs, bno);
	if (size == fs->fs_bsize) {
		free = ffs_isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bno));
	} else {
		frags = numfrags(fs, size);
		for (free = 0, i = 0; i < frags; i++)
			if (isset(cg_blksfree(cgp), bno + i))
				free++;
		if (free != 0 && free != frags)
			panic("checkblk: partially free fragment");
	}
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
	buf_brelse(bp);
	return (!free);
}
#endif /* DIAGNOSTIC */

/*
 * Free an inode.
 *
 * The specified inode is placed back in the free map.
 */
int
ffs_vfree(struct vnode *vp, ino_t ino, int mode)
{
	register struct fs *fs;
	register struct cg *cgp;
	register struct inode *pip;
	struct buf *bp;
	struct timeval tv;
	int error, cg;
#if REV_ENDIAN_FS
	struct mount *mp = vp->v_mount;
	int rev_endian = (mp->mnt_flag & MNT_REVEND);
#endif /* REV_ENDIAN_FS */

	pip = VTOI(vp);
	fs = pip->i_fs;
	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		panic("ifree: range: dev = 0x%x, ino = %d, fs = %s\n",
		    pip->i_dev, ino, fs->fs_fsmnt);
	cg = ino_to_cg(fs, ino);
	error = (int)buf_bread(pip->i_devvp, (daddr64_t)((unsigned)fsbtodb(fs, cgtod(fs, cg))),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		buf_brelse(bp);
		return (0);
	}
	cgp = (struct cg *)buf_dataptr(bp);
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgin(cgp, fs);
#endif /* REV_ENDIAN_FS */
	if (!cg_chkmagic(cgp)) {
#if REV_ENDIAN_FS
		if (rev_endian)
			byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
		buf_brelse(bp);
		return (0);
	}
	microtime(&tv);
	cgp->cg_time = tv.tv_sec;
	ino %= fs->fs_ipg;
	if (isclr(cg_inosused(cgp), ino)) {
		printf("dev = 0x%x, ino = %d, fs = %s\n",
		    pip->i_dev, ino, fs->fs_fsmnt);
		if (fs->fs_ronly == 0)
			panic("ifree: freeing free inode");
	}
	clrbit(cg_inosused(cgp), ino);
	if (ino < cgp->cg_irotor)
		cgp->cg_irotor = ino;
	cgp->cg_cs.cs_nifree++;
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir--;
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
#if REV_ENDIAN_FS
	if (rev_endian)
		byte_swap_cgout(cgp, fs);
#endif /* REV_ENDIAN_FS */
	buf_bdwrite(bp);
	return (0);
}

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block and
 * none are available.
 */
static ufs_daddr_t
ffs_mapsearch(fs, cgp, bpref, allocsiz)
	register struct fs *fs;
	register struct cg *cgp;
	ufs_daddr_t bpref;
	int allocsiz;
{
	ufs_daddr_t bno;
	int start, len, loc, i;
	int blk, field, subfield, pos;

	/*
	 * find the fragment by searching through the free block
	 * map for an appropriate bit pattern
	 */
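	/*
	 * The scanc() search below relies on fragtbl[fs_frag]: for each
	 * possible free-map byte it encodes, as a bit set, which free
	 * fragment run lengths occur in that byte, so one table lookup
	 * per byte replaces a bit-by-bit scan (see ffs_tables.c).
	 */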
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = cgp->cg_frotor / NBBY;
	len = howmany(fs->fs_fpg, NBBY) - start;
	loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[start],
	    (u_char *)fragtbl[fs->fs_frag],
	    (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[0],
		    (u_char *)fragtbl[fs->fs_frag],
		    (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
		if (loc == 0) {
			printf("start = %d, len = %d, fs = %s\n",
			    start, len, fs->fs_fsmnt);
			panic("ffs_alloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	bno = (start + len - loc) * NBBY;
	cgp->cg_frotor = bno;
	/*
	 * found the byte in the map
	 * sift through the bits to find the selected frag
	 */
	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
		blk = blkmap(fs, cg_blksfree(cgp), bno);
		blk <<= 1;
		field = around[allocsiz];
		subfield = inside[allocsiz];
		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
			if ((blk & field) == subfield)
				return (bno + pos);
			field <<= 1;
			subfield <<= 1;
		}
	}
	printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt);
	panic("ffs_alloccg: block not in map");
	return (-1);
}

1641 * Update the cluster map because of an allocation or free.
1642 *
1643 * Cnt == 1 means free; cnt == -1 means allocating.
1644 */
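/*
 * Illustrative case: freeing one block that bridges a 2-block free
 * cluster behind it and a 1-block free cluster ahead of it yields a
 * 4-block cluster, so sump[4] is credited while sump[2] and sump[1]
 * are debited (the cnt arithmetic below handles both directions).
 */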
static void
ffs_clusteracct(struct fs *fs, struct cg *cgp, ufs_daddr_t blkno, int cnt)
{
	int32_t *sump;
	int32_t *lp;
	u_char *freemapp, *mapp;
	int i, start, end, forw, back, map, bit;

	if (fs->fs_contigsumsize <= 0)
		return;
	freemapp = cg_clustersfree(cgp);
	sump = cg_clustersum(cgp);
	/*
	 * Allocate or clear the actual block.
	 */
	if (cnt > 0)
		setbit(freemapp, blkno);
	else
		clrbit(freemapp, blkno);
	/*
	 * Find the size of the cluster going forward.
	 */
	start = blkno + 1;
	end = start + fs->fs_contigsumsize;
	if (end >= cgp->cg_nclusterblks)
		end = cgp->cg_nclusterblks;
	mapp = &freemapp[start / NBBY];
	map = *mapp++;
	bit = 1 << (start % NBBY);
	for (i = start; i < end; i++) {
		if ((map & bit) == 0)
			break;
		if ((i & (NBBY - 1)) != (NBBY - 1)) {
			bit <<= 1;
		} else {
			map = *mapp++;
			bit = 1;
		}
	}
	forw = i - start;
	/*
	 * Find the size of the cluster going backward.
	 */
	start = blkno - 1;
	end = start - fs->fs_contigsumsize;
	if (end < 0)
		end = -1;
	mapp = &freemapp[start / NBBY];
	map = *mapp--;
	bit = 1 << (start % NBBY);
	for (i = start; i > end; i--) {
		if ((map & bit) == 0)
			break;
		if ((i & (NBBY - 1)) != 0) {
			bit >>= 1;
		} else {
			map = *mapp--;
			bit = 1 << (NBBY - 1);
		}
	}
	back = start - i;
	/*
	 * Account for old cluster and the possibly new forward and
	 * back clusters.
	 */
	i = back + forw + 1;
	if (i > fs->fs_contigsumsize)
		i = fs->fs_contigsumsize;
	sump[i] += cnt;
	if (back > 0)
		sump[back] -= cnt;
	if (forw > 0)
		sump[forw] -= cnt;
	/*
	 * Update cluster summary information.
	 */
	lp = &sump[fs->fs_contigsumsize];
	for (i = fs->fs_contigsumsize; i > 0; i--)
		if (*lp-- > 0)
			break;
	fs->fs_maxcluster[cgp->cg_cgx] = i;
}

/*
 * Fserr prints the name of a file system with an error diagnostic.
 *
 * The form of the error message is:
 *	fs: error message
 */
static void
ffs_fserr(fs, uid, cp)
	struct fs *fs;
	u_int uid;
	char *cp;
{

	log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->fs_fsmnt, cp);
}