
Contents of /head/sys/kern/vfs_subr.c



Revision 368360
Sat Dec 5 05:56:23 2020 UTC by mjg
File MIME type: text/plain
File size: 167741 bytes
vfs: keep bad ops on vnode reclaim

They were only modified to accommodate a redundant assertion.

This runs into problems as lockless lookup can still try to use the vnode
and crash instead of getting an error.

The bug was only present in kernels with INVARIANTS.

Reported by:	kevans
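
A rough illustration of the pattern this log message refers to (an editorial
sketch under assumptions, not the actual diff): a vnode undergoing reclaim
keeps an operation vector whose entries simply fail, so a lockless lookup that
races with the reclaim receives an error instead of calling through cleared
state.

	/* Illustrative only; the names are hypothetical, not from this commit. */
	static int
	example_bad_op(struct vop_generic_args *ap)
	{
		return (EOPNOTSUPP);	/* always fail; racing callers see an error */
	}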

1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993
5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
37 */
38
39 /*
40 * External virtual filesystem routines
41 */
42
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45
46 #include "opt_ddb.h"
47 #include "opt_watchdog.h"
48
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/bio.h>
52 #include <sys/buf.h>
53 #include <sys/capsicum.h>
54 #include <sys/condvar.h>
55 #include <sys/conf.h>
56 #include <sys/counter.h>
57 #include <sys/dirent.h>
58 #include <sys/event.h>
59 #include <sys/eventhandler.h>
60 #include <sys/extattr.h>
61 #include <sys/file.h>
62 #include <sys/fcntl.h>
63 #include <sys/jail.h>
64 #include <sys/kdb.h>
65 #include <sys/kernel.h>
66 #include <sys/kthread.h>
67 #include <sys/ktr.h>
68 #include <sys/lockf.h>
69 #include <sys/malloc.h>
70 #include <sys/mount.h>
71 #include <sys/namei.h>
72 #include <sys/pctrie.h>
73 #include <sys/priv.h>
74 #include <sys/reboot.h>
75 #include <sys/refcount.h>
76 #include <sys/rwlock.h>
77 #include <sys/sched.h>
78 #include <sys/sleepqueue.h>
79 #include <sys/smr.h>
80 #include <sys/smp.h>
81 #include <sys/stat.h>
82 #include <sys/sysctl.h>
83 #include <sys/syslog.h>
84 #include <sys/vmmeter.h>
85 #include <sys/vnode.h>
86 #include <sys/watchdog.h>
87
88 #include <machine/stdarg.h>
89
90 #include <security/mac/mac_framework.h>
91
92 #include <vm/vm.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_extern.h>
95 #include <vm/pmap.h>
96 #include <vm/vm_map.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_kern.h>
99 #include <vm/uma.h>
100
101 #ifdef DDB
102 #include <ddb/ddb.h>
103 #endif
104
105 static void delmntque(struct vnode *vp);
106 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
107 int slpflag, int slptimeo);
108 static void syncer_shutdown(void *arg, int howto);
109 static int vtryrecycle(struct vnode *vp);
110 static void v_init_counters(struct vnode *);
111 static void vgonel(struct vnode *);
112 static bool vhold_recycle_free(struct vnode *);
113 static void vfs_knllock(void *arg);
114 static void vfs_knlunlock(void *arg);
115 static void vfs_knl_assert_lock(void *arg, int what);
116 static void destroy_vpollinfo(struct vpollinfo *vi);
117 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
118 daddr_t startlbn, daddr_t endlbn);
119 static void vnlru_recalc(void);
120
121 /*
122 * These fences are intended for cases where some synchronization is
123 * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
124 * and v_usecount) updates. Access to v_iflags is generally synchronized
125 * by the interlock, but we have some internal assertions that check vnode
126 * flags without acquiring the lock. Thus, these fences are INVARIANTS-only
127 * for now.
128 */
129 #ifdef INVARIANTS
130 #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq()
131 #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel()
132 #else
133 #define VNODE_REFCOUNT_FENCE_ACQ()
134 #define VNODE_REFCOUNT_FENCE_REL()
135 #endif
136
137 /*
138 * Number of vnodes in existence. Increased whenever getnewvnode()
139 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
140 */
141 static u_long __exclusive_cache_line numvnodes;
142
143 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
144 "Number of vnodes in existence");
145
146 static counter_u64_t vnodes_created;
147 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
148 "Number of vnodes created by getnewvnode");
149
150 /*
151 * Conversion tables for conversion from vnode types to inode formats
152 * and back.
153 */
154 enum vtype iftovt_tab[16] = {
155 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
156 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
157 };
158 int vttoif_tab[10] = {
159 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
160 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
161 };
162
163 /*
164 * List of allocated vnodes in the system.
165 */
166 static TAILQ_HEAD(freelst, vnode) vnode_list;
167 static struct vnode *vnode_list_free_marker;
168 static struct vnode *vnode_list_reclaim_marker;
169
170 /*
171 * "Free" vnode target. Free vnodes are rarely completely free, but are
172 * just ones that are cheap to recycle. Usually they are for files which
173 * have been stat'd but not read; these usually have inode and namecache
174 * data attached to them. This target is the preferred minimum size of a
175 * sub-cache consisting mostly of such files. The system balances the size
176 * of this sub-cache with its complement to try to prevent either from
177 * thrashing while the other is relatively inactive. The targets express
178 * a preference for the best balance.
179 *
180 * "Above" this target there are 2 further targets (watermarks) related
181 * to recycling of free vnodes. In the best-operating case, the cache is
182 * exactly full, the free list has size between vlowat and vhiwat above the
183 * free target, and recycling from it and normal use maintains this state.
184 * Sometimes the free list is below vlowat or even empty, but this state
185 * is even better for immediate use provided the cache is not full.
186 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
187 * ones) to reach one of these states. The watermarks are currently hard-
188 * coded as 4% and 9% of the available space higher. These and the default
189 * of 25% for wantfreevnodes are too large if the memory size is large.
190 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
191 * whenever vnlru_proc() becomes active.
192 */
193 static long wantfreevnodes;
194 static long __exclusive_cache_line freevnodes;
195 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
196 &freevnodes, 0, "Number of \"free\" vnodes");
197 static long freevnodes_old;
198
199 static counter_u64_t recycles_count;
200 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
201 "Number of vnodes recycled to meet vnode cache targets");
202
203 static counter_u64_t recycles_free_count;
204 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
205 "Number of free vnodes recycled to meet vnode cache targets");
206
207 static counter_u64_t deferred_inact;
208 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
209 "Number of times inactive processing was deferred");
210
211 /* To keep more than one thread at a time from running vfs_getnewfsid */
212 static struct mtx mntid_mtx;
213
214 /*
215 * Lock for any access to the following:
216 * vnode_list
217 * numvnodes
218 * freevnodes
219 */
220 static struct mtx __exclusive_cache_line vnode_list_mtx;
221
222 /* Publicly exported FS */
223 struct nfs_public nfs_pub;
224
225 static uma_zone_t buf_trie_zone;
226 static smr_t buf_trie_smr;
227
228 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
229 static uma_zone_t vnode_zone;
230 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll");
231
232 __read_frequently smr_t vfs_smr;
233
234 /*
235 * The workitem queue.
236 *
237 * It is useful to delay writes of file data and filesystem metadata
238 * for tens of seconds so that quickly created and deleted files need
239 * not waste disk bandwidth being created and removed. To realize this,
240 * we append vnodes to a "workitem" queue. When running with a soft
241 * updates implementation, most pending metadata dependencies should
242 * not wait for more than a few seconds. Thus, metadata updates on mounted
243 * block devices are delayed only about half the time that file data is delayed.
244 * Similarly, directory updates are more critical, so are only delayed
245 * about a third the time that file data is delayed. Thus, there are
246 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
247 * one each second (driven off the filesystem syncer process). The
248 * syncer_delayno variable indicates the next queue that is to be processed.
249 * Items that need to be processed soon are placed in this queue:
250 *
251 * syncer_workitem_pending[syncer_delayno]
252 *
253 * A delay of fifteen seconds is done by placing the request fifteen
254 * entries later in the queue:
255 *
256 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
257 *
258 */
259 static int syncer_delayno;
260 static long syncer_mask;
261 LIST_HEAD(synclist, bufobj);
262 static struct synclist *syncer_workitem_pending;
263 /*
264 * The sync_mtx protects:
265 * bo->bo_synclist
266 * sync_vnode_count
267 * syncer_delayno
268 * syncer_state
269 * syncer_workitem_pending
270 * syncer_worklist_len
271 * rushjob
272 */
273 static struct mtx sync_mtx;
274 static struct cv sync_wakeup;
275
276 #define SYNCER_MAXDELAY 32
277 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
278 static int syncdelay = 30; /* max time to delay syncing data */
279 static int filedelay = 30; /* time to delay syncing files */
280 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
281 "Time to delay syncing files (in seconds)");
282 static int dirdelay = 29; /* time to delay syncing directories */
283 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
284 "Time to delay syncing directories (in seconds)");
285 static int metadelay = 28; /* time to delay syncing metadata */
286 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
287 "Time to delay syncing metadata (in seconds)");
288 static int rushjob; /* number of slots to run ASAP */
289 static int stat_rush_requests; /* number of times I/O speeded up */
290 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
291 "Number of times I/O speeded up (rush requests)");
292
293 #define VDBATCH_SIZE 8
294 struct vdbatch {
295 u_int index;
296 long freevnodes;
297 struct mtx lock;
298 struct vnode *tab[VDBATCH_SIZE];
299 };
300 DPCPU_DEFINE_STATIC(struct vdbatch, vd);
301
302 static void vdbatch_dequeue(struct vnode *vp);
303
304 /*
305 * When shutting down the syncer, run it at four times normal speed.
306 */
307 #define SYNCER_SHUTDOWN_SPEEDUP 4
308 static int sync_vnode_count;
309 static int syncer_worklist_len;
310 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
311 syncer_state;
312
313 /* Target for maximum number of vnodes. */
314 u_long desiredvnodes;
315 static u_long gapvnodes; /* gap between wanted and desired */
316 static u_long vhiwat; /* enough extras after expansion */
317 static u_long vlowat; /* minimal extras before expansion */
318 static u_long vstir; /* nonzero to stir non-free vnodes */
319 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
320
321 static u_long vnlru_read_freevnodes(void);
322
323 /*
324 * Note that no attempt is made to sanitize these parameters.
325 */
326 static int
327 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
328 {
329 u_long val;
330 int error;
331
332 val = desiredvnodes;
333 error = sysctl_handle_long(oidp, &val, 0, req);
334 if (error != 0 || req->newptr == NULL)
335 return (error);
336
337 if (val == desiredvnodes)
338 return (0);
339 mtx_lock(&vnode_list_mtx);
340 desiredvnodes = val;
341 wantfreevnodes = desiredvnodes / 4;
342 vnlru_recalc();
343 mtx_unlock(&vnode_list_mtx);
344 /*
345 * XXX There is no protection against multiple threads changing
346 * desiredvnodes at the same time. Locking above only helps vnlru and
347 * getnewvnode.
348 */
349 vfs_hash_changesize(desiredvnodes);
350 cache_changesize(desiredvnodes);
351 return (0);
352 }
353
354 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
355 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
356 "LU", "Target for maximum number of vnodes");
357
358 static int
359 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
360 {
361 u_long val;
362 int error;
363
364 val = wantfreevnodes;
365 error = sysctl_handle_long(oidp, &val, 0, req);
366 if (error != 0 || req->newptr == NULL)
367 return (error);
368
369 if (val == wantfreevnodes)
370 return (0);
371 mtx_lock(&vnode_list_mtx);
372 wantfreevnodes = val;
373 vnlru_recalc();
374 mtx_unlock(&vnode_list_mtx);
375 return (0);
376 }
377
378 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
379 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
380 "LU", "Target for minimum number of \"free\" vnodes");
381
382 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
383 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
384 static int vnlru_nowhere;
385 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
386 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
387
388 static int
389 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
390 {
391 struct vnode *vp;
392 struct nameidata nd;
393 char *buf;
394 unsigned long ndflags;
395 int error;
396
397 if (req->newptr == NULL)
398 return (EINVAL);
399 if (req->newlen >= PATH_MAX)
400 return (E2BIG);
401
402 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
403 error = SYSCTL_IN(req, buf, req->newlen);
404 if (error != 0)
405 goto out;
406
407 buf[req->newlen] = '\0';
408
409 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME;
410 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
411 if ((error = namei(&nd)) != 0)
412 goto out;
413 vp = nd.ni_vp;
414
415 if (VN_IS_DOOMED(vp)) {
416 /*
417 * This vnode is being recycled. Return != 0 to let the caller
418 * know that the sysctl had no effect. Return EAGAIN because a
419 * subsequent call will likely succeed (since namei will create
420 * a new vnode if necessary)
421 */
422 error = EAGAIN;
423 goto putvnode;
424 }
425
426 counter_u64_add(recycles_count, 1);
427 vgone(vp);
428 putvnode:
429 NDFREE(&nd, 0);
430 out:
431 free(buf, M_TEMP);
432 return (error);
433 }
434
435 static int
436 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
437 {
438 struct thread *td = curthread;
439 struct vnode *vp;
440 struct file *fp;
441 int error;
442 int fd;
443
444 if (req->newptr == NULL)
445 return (EBADF);
446
447 error = sysctl_handle_int(oidp, &fd, 0, req);
448 if (error != 0)
449 return (error);
450 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
451 if (error != 0)
452 return (error);
453 vp = fp->f_vnode;
454
455 error = vn_lock(vp, LK_EXCLUSIVE);
456 if (error != 0)
457 goto drop;
458
459 counter_u64_add(recycles_count, 1);
460 vgone(vp);
461 VOP_UNLOCK(vp);
462 drop:
463 fdrop(fp, td);
464 return (error);
465 }
466
467 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
468 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
469 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
470 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
471 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
472 sysctl_ftry_reclaim_vnode, "I",
473 "Try to reclaim a vnode by its file descriptor");
474
475 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
476 static int vnsz2log;
477
478 /*
479 * Support for the bufobj clean & dirty pctrie.
480 */
481 static void *
482 buf_trie_alloc(struct pctrie *ptree)
483 {
484 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
485 }
486
487 static void
488 buf_trie_free(struct pctrie *ptree, void *node)
489 {
490 uma_zfree_smr(buf_trie_zone, node);
491 }
492 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
493 buf_trie_smr);
494
495 /*
496 * Initialize the vnode management data structures.
497 *
498 * Reevaluate the following cap on the number of vnodes after the physical
499 * memory size exceeds 512GB. In the limit, as the physical memory size
500 * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
501 */
502 #ifndef MAXVNODES_MAX
503 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */
504 #endif
505
506 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
507
508 static struct vnode *
509 vn_alloc_marker(struct mount *mp)
510 {
511 struct vnode *vp;
512
513 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
514 vp->v_type = VMARKER;
515 vp->v_mount = mp;
516
517 return (vp);
518 }
519
520 static void
521 vn_free_marker(struct vnode *vp)
522 {
523
524 MPASS(vp->v_type == VMARKER);
525 free(vp, M_VNODE_MARKER);
526 }
527
528 /*
529 * Initialize a vnode as it first enters the zone.
530 */
531 static int
532 vnode_init(void *mem, int size, int flags)
533 {
534 struct vnode *vp;
535
536 vp = mem;
537 bzero(vp, size);
538 /*
539 * Setup locks.
540 */
541 vp->v_vnlock = &vp->v_lock;
542 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
543 /*
544 * By default, don't allow shared locks unless filesystems opt-in.
545 */
546 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
547 LK_NOSHARE | LK_IS_VNODE);
548 /*
549 * Initialize bufobj.
550 */
551 bufobj_init(&vp->v_bufobj, vp);
552 /*
553 * Initialize namecache.
554 */
555 cache_vnode_init(vp);
556 /*
557 * Initialize rangelocks.
558 */
559 rangelock_init(&vp->v_rl);
560
561 vp->v_dbatchcpu = NOCPU;
562
563 /*
564 * Check vhold_recycle_free for an explanation.
565 */
566 vp->v_holdcnt = VHOLD_NO_SMR;
567 vp->v_type = VNON;
568 mtx_lock(&vnode_list_mtx);
569 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
570 mtx_unlock(&vnode_list_mtx);
571 return (0);
572 }
573
574 /*
575 * Free a vnode when it is cleared from the zone.
576 */
577 static void
578 vnode_fini(void *mem, int size)
579 {
580 struct vnode *vp;
581 struct bufobj *bo;
582
583 vp = mem;
584 vdbatch_dequeue(vp);
585 mtx_lock(&vnode_list_mtx);
586 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
587 mtx_unlock(&vnode_list_mtx);
588 rangelock_destroy(&vp->v_rl);
589 lockdestroy(vp->v_vnlock);
590 mtx_destroy(&vp->v_interlock);
591 bo = &vp->v_bufobj;
592 rw_destroy(BO_LOCKPTR(bo));
593 }
594
595 /*
596 * Provide the size of NFS nclnode and NFS fh for calculation of the
597 * vnode memory consumption. The size is specified directly to
598 * eliminate dependency on NFS-private header.
599 *
600 * Other filesystems may use bigger or smaller (like UFS and ZFS)
601 * private inode data, but the NFS-based estimation is ample enough.
602 * Still, we care about differences in the size between 64- and 32-bit
603 * platforms.
604 *
605 * Namecache structure size is heuristically
606 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
607 */
608 #ifdef _LP64
609 #define NFS_NCLNODE_SZ (528 + 64)
610 #define NC_SZ 148
611 #else
612 #define NFS_NCLNODE_SZ (360 + 32)
613 #define NC_SZ 92
614 #endif
615
616 static void
617 vntblinit(void *dummy __unused)
618 {
619 struct vdbatch *vd;
620 int cpu, physvnodes, virtvnodes;
621 u_int i;
622
623 /*
624 * Desiredvnodes is a function of the physical memory size and the
625 * kernel's heap size. Generally speaking, it scales with the
626 * physical memory size. The ratio of desiredvnodes to the physical
627 * memory size is 1:16 until desiredvnodes exceeds 98,304.
628 * Thereafter, the
629 * marginal ratio of desiredvnodes to the physical memory size is
630 * 1:64. However, desiredvnodes is limited by the kernel's heap
631 * size. The memory required by desiredvnodes vnodes and vm objects
632 * must not exceed 1/10th of the kernel's heap size.
633 */
634 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
635 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
636 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
637 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
638 desiredvnodes = min(physvnodes, virtvnodes);
639 if (desiredvnodes > MAXVNODES_MAX) {
640 if (bootverbose)
641 printf("Reducing kern.maxvnodes %lu -> %lu\n",
642 desiredvnodes, MAXVNODES_MAX);
643 desiredvnodes = MAXVNODES_MAX;
644 }
645 wantfreevnodes = desiredvnodes / 4;
646 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
647 TAILQ_INIT(&vnode_list);
648 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
649 /*
650 * The lock is taken to appease WITNESS.
651 */
652 mtx_lock(&vnode_list_mtx);
653 vnlru_recalc();
654 mtx_unlock(&vnode_list_mtx);
655 vnode_list_free_marker = vn_alloc_marker(NULL);
656 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
657 vnode_list_reclaim_marker = vn_alloc_marker(NULL);
658 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
659 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
660 vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
661 uma_zone_set_smr(vnode_zone, vfs_smr);
662 /*
663 * Preallocate enough nodes to support one-per buf so that
664 * we can not fail an insert. reassignbuf() callers can not
665 * tolerate the insertion failure.
666 */
667 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
668 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
669 UMA_ZONE_NOFREE | UMA_ZONE_SMR);
670 buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
671 uma_prealloc(buf_trie_zone, nbuf);
672
673 vnodes_created = counter_u64_alloc(M_WAITOK);
674 recycles_count = counter_u64_alloc(M_WAITOK);
675 recycles_free_count = counter_u64_alloc(M_WAITOK);
676 deferred_inact = counter_u64_alloc(M_WAITOK);
677
678 /*
679 * Initialize the filesystem syncer.
680 */
681 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
682 &syncer_mask);
683 syncer_maxdelay = syncer_mask + 1;
684 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
685 cv_init(&sync_wakeup, "syncer");
686 for (i = 1; i <= sizeof(struct vnode); i <<= 1)
687 vnsz2log++;
688 vnsz2log--;
689
690 CPU_FOREACH(cpu) {
691 vd = DPCPU_ID_PTR((cpu), vd);
692 bzero(vd, sizeof(*vd));
693 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
694 }
695 }
696 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
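
/*
 * Worked example for the vnsz2log computation above (illustrative; the real
 * sizeof(struct vnode) varies by platform and options): if a vnode were, say,
 * 448 bytes, the loop would stop at i = 512 and leave vnsz2log = 8, i.e.
 * floor(log2(448)), so v_hash discards the low 8 bits of the vnode pointer.
 */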
697
698 /*
699 * Mark a mount point as busy. Used to synchronize access and to delay
700 * unmounting. Note that mountlist_mtx is not released on failure.
701 *
702 * vfs_busy() is a custom lock; it can block the caller.
703 * vfs_busy() only sleeps if an unmount is active on the mount point.
704 * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
705 * lock of any vnode belonging to mp.
706 *
707 * Lookup uses vfs_busy() to traverse mount points.
708 * root fs var fs
709 * / vnode lock A / vnode lock (/var) D
710 * /var vnode lock B /log vnode lock(/var/log) E
711 * vfs_busy lock C vfs_busy lock F
712 *
713 * Within each file system, the lock order is C->A->B and F->D->E.
714 *
715 * When traversing across mounts, the system follows that lock order:
716 *
717 * C->A->B
718 * |
719 * +->F->D->E
720 *
721 * The lookup() process for namei("/var") illustrates the ordering:
722 * VOP_LOOKUP() obtains B while A is held
723 * vfs_busy() obtains a shared lock on F while A and B are held
724 * vput() releases lock on B
725 * vput() releases lock on A
726 * VFS_ROOT() obtains lock on D while shared lock on F is held
727 * vfs_unbusy() releases shared lock on F
728 * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
729 * Attempt to lock A (instead of vp_crossmp) while D is held would
730 * violate the global order, causing deadlocks.
731 *
732 * dounmount() locks B while F is drained.
733 */
734 int
735 vfs_busy(struct mount *mp, int flags)
736 {
737 struct mount_pcpu *mpcpu;
738
739 MPASS((flags & ~MBF_MASK) == 0);
740 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
741
742 if (vfs_op_thread_enter(mp, mpcpu)) {
743 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
744 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
745 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
746 vfs_mp_count_add_pcpu(mpcpu, ref, 1);
747 vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
748 vfs_op_thread_exit(mp, mpcpu);
749 if (flags & MBF_MNTLSTLOCK)
750 mtx_unlock(&mountlist_mtx);
751 return (0);
752 }
753
754 MNT_ILOCK(mp);
755 vfs_assert_mount_counters(mp);
756 MNT_REF(mp);
757 /*
758 * If the mount point is currently being unmounted, sleep until its
759 * fate is decided. If the thread doing the unmounting fails,
760 * it will clear the MNTK_UNMOUNT flag before waking us up, indicating
761 * that this mount point has survived the unmount attempt and vfs_busy
762 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE
763 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
764 * about to be really destroyed. vfs_busy needs to release its
765 * reference on the mount point in this case and return with ENOENT,
766 * telling the caller that the mount point it tried to busy is no longer
767 * valid.
768 */
769 while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
770 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
771 MNT_REL(mp);
772 MNT_IUNLOCK(mp);
773 CTR1(KTR_VFS, "%s: failed busying before sleeping",
774 __func__);
775 return (ENOENT);
776 }
777 if (flags & MBF_MNTLSTLOCK)
778 mtx_unlock(&mountlist_mtx);
779 mp->mnt_kern_flag |= MNTK_MWAIT;
780 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
781 if (flags & MBF_MNTLSTLOCK)
782 mtx_lock(&mountlist_mtx);
783 MNT_ILOCK(mp);
784 }
785 if (flags & MBF_MNTLSTLOCK)
786 mtx_unlock(&mountlist_mtx);
787 mp->mnt_lockref++;
788 MNT_IUNLOCK(mp);
789 return (0);
790 }
791
792 /*
793 * Free a busy filesystem.
794 */
795 void
796 vfs_unbusy(struct mount *mp)
797 {
798 struct mount_pcpu *mpcpu;
799 int c;
800
801 CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
802
803 if (vfs_op_thread_enter(mp, mpcpu)) {
804 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
805 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
806 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
807 vfs_op_thread_exit(mp, mpcpu);
808 return;
809 }
810
811 MNT_ILOCK(mp);
812 vfs_assert_mount_counters(mp);
813 MNT_REL(mp);
814 c = --mp->mnt_lockref;
815 if (mp->mnt_vfs_ops == 0) {
816 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
817 MNT_IUNLOCK(mp);
818 return;
819 }
820 if (c < 0)
821 vfs_dump_mount_counters(mp);
822 if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
823 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
824 CTR1(KTR_VFS, "%s: waking up waiters", __func__);
825 mp->mnt_kern_flag &= ~MNTK_DRAINING;
826 wakeup(&mp->mnt_lockref);
827 }
828 MNT_IUNLOCK(mp);
829 }
830
831 /*
832 * Lookup a mount point by filesystem identifier.
833 */
834 struct mount *
835 vfs_getvfs(fsid_t *fsid)
836 {
837 struct mount *mp;
838
839 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
840 mtx_lock(&mountlist_mtx);
841 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
842 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
843 vfs_ref(mp);
844 mtx_unlock(&mountlist_mtx);
845 return (mp);
846 }
847 }
848 mtx_unlock(&mountlist_mtx);
849 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
850 return ((struct mount *) 0);
851 }
852
853 /*
854 * Lookup a mount point by filesystem identifier, busying it before
855 * returning.
856 *
857 * To avoid congestion on mountlist_mtx, implement simple direct-mapped
858 * cache for popular filesystem identifiers. The cache is lockless, using
859 * the fact that struct mount's are never freed. In the worst case we may
860 * get a pointer to an unmounted or even a different filesystem, so we have
861 * to check what we got, and fall back to the slow path if so.
862 */
863 struct mount *
864 vfs_busyfs(fsid_t *fsid)
865 {
866 #define FSID_CACHE_SIZE 256
867 typedef struct mount * volatile vmp_t;
868 static vmp_t cache[FSID_CACHE_SIZE];
869 struct mount *mp;
870 int error;
871 uint32_t hash;
872
873 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
874 hash = fsid->val[0] ^ fsid->val[1];
875 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
876 mp = cache[hash];
877 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
878 goto slow;
879 if (vfs_busy(mp, 0) != 0) {
880 cache[hash] = NULL;
881 goto slow;
882 }
883 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
884 return (mp);
885 else
886 vfs_unbusy(mp);
887
888 slow:
889 mtx_lock(&mountlist_mtx);
890 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
891 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
892 error = vfs_busy(mp, MBF_MNTLSTLOCK);
893 if (error) {
894 cache[hash] = NULL;
895 mtx_unlock(&mountlist_mtx);
896 return (NULL);
897 }
898 cache[hash] = mp;
899 return (mp);
900 }
901 }
902 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
903 mtx_unlock(&mountlist_mtx);
904 return ((struct mount *) 0);
905 }
906
907 /*
908 * Check if a user can access privileged mount options.
909 */
910 int
911 vfs_suser(struct mount *mp, struct thread *td)
912 {
913 int error;
914
915 if (jailed(td->td_ucred)) {
916 /*
917 * If the jail of the calling thread lacks permission for
918 * this type of file system, deny immediately.
919 */
920 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
921 return (EPERM);
922
923 /*
924 * If the file system was mounted outside the jail of the
925 * calling thread, deny immediately.
926 */
927 if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
928 return (EPERM);
929 }
930
931 /*
932 * If the file system supports delegated administration, we don't check
933 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
934 * by the file system itself.
935 * If this is not the user that did the original mount, we check for
936 * the PRIV_VFS_MOUNT_OWNER privilege.
937 */
938 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
939 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
940 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
941 return (error);
942 }
943 return (0);
944 }
945
946 /*
947 * Get a new unique fsid. Try to make its val[0] unique, since this value
948 * will be used to create fake device numbers for stat(). Also try (but
949 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
950 * support 16-bit device numbers. We end up with unique val[0]'s for the
951 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
952 *
953 * Keep in mind that several mounts may be running in parallel. Starting
954 * the search one past where the previous search terminated is both a
955 * micro-optimization and a defense against returning the same fsid to
956 * different mounts.
957 */
958 void
959 vfs_getnewfsid(struct mount *mp)
960 {
961 static uint16_t mntid_base;
962 struct mount *nmp;
963 fsid_t tfsid;
964 int mtype;
965
966 CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
967 mtx_lock(&mntid_mtx);
968 mtype = mp->mnt_vfc->vfc_typenum;
969 tfsid.val[1] = mtype;
970 mtype = (mtype & 0xFF) << 24;
971 for (;;) {
972 tfsid.val[0] = makedev(255,
973 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
974 mntid_base++;
975 if ((nmp = vfs_getvfs(&tfsid)) == NULL)
976 break;
977 vfs_rel(nmp);
978 }
979 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
980 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
981 mtx_unlock(&mntid_mtx);
982 }
983
984 /*
985 * Knob to control the precision of file timestamps:
986 *
987 * 0 = seconds only; nanoseconds zeroed.
988 * 1 = seconds and nanoseconds, accurate within 1/HZ.
989 * 2 = seconds and nanoseconds, truncated to microseconds.
990 * >=3 = seconds and nanoseconds, maximum precision.
991 */
992 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
993
994 static int timestamp_precision = TSP_USEC;
995 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
996 &timestamp_precision, 0, "File timestamp precision (0: seconds, "
997 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
998 "3+: sec + ns (max. precision))");
999
1000 /*
1001 * Get a current timestamp.
1002 */
1003 void
1004 vfs_timestamp(struct timespec *tsp)
1005 {
1006 struct timeval tv;
1007
1008 switch (timestamp_precision) {
1009 case TSP_SEC:
1010 tsp->tv_sec = time_second;
1011 tsp->tv_nsec = 0;
1012 break;
1013 case TSP_HZ:
1014 getnanotime(tsp);
1015 break;
1016 case TSP_USEC:
1017 microtime(&tv);
1018 TIMEVAL_TO_TIMESPEC(&tv, tsp);
1019 break;
1020 case TSP_NSEC:
1021 default:
1022 nanotime(tsp);
1023 break;
1024 }
1025 }
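
/*
 * Example (illustrative): with the default vfs.timestamp_precision=2
 * (TSP_USEC), a timestamp taken here has its nanoseconds truncated to
 * microsecond resolution; setting the sysctl to 3 switches to nanotime()
 * and full resolution, while 0 zeroes tv_nsec entirely.
 */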
1026
1027 /*
1028 * Set vnode attributes to VNOVAL
1029 */
1030 void
1031 vattr_null(struct vattr *vap)
1032 {
1033
1034 vap->va_type = VNON;
1035 vap->va_size = VNOVAL;
1036 vap->va_bytes = VNOVAL;
1037 vap->va_mode = VNOVAL;
1038 vap->va_nlink = VNOVAL;
1039 vap->va_uid = VNOVAL;
1040 vap->va_gid = VNOVAL;
1041 vap->va_fsid = VNOVAL;
1042 vap->va_fileid = VNOVAL;
1043 vap->va_blocksize = VNOVAL;
1044 vap->va_rdev = VNOVAL;
1045 vap->va_atime.tv_sec = VNOVAL;
1046 vap->va_atime.tv_nsec = VNOVAL;
1047 vap->va_mtime.tv_sec = VNOVAL;
1048 vap->va_mtime.tv_nsec = VNOVAL;
1049 vap->va_ctime.tv_sec = VNOVAL;
1050 vap->va_ctime.tv_nsec = VNOVAL;
1051 vap->va_birthtime.tv_sec = VNOVAL;
1052 vap->va_birthtime.tv_nsec = VNOVAL;
1053 vap->va_flags = VNOVAL;
1054 vap->va_gen = VNOVAL;
1055 vap->va_vaflags = 0;
1056 }
1057
1058 /*
1059 * Try to reduce the total number of vnodes.
1060 *
1061 * This routine (and its user) are buggy in at least the following ways:
1062 * - all parameters were picked years ago when RAM sizes were significantly
1063 * smaller
1064 * - it can pick vnodes based on pages used by the vm object, but filesystems
1065 * like ZFS don't use it, making the pick broken
1066 * - since ZFS has its own aging policy it gets partially combated by this one
1067 * - a dedicated method should be provided for filesystems to let them decide
1068 * whether the vnode should be recycled
1069 *
1070 * This routine is called when we have too many vnodes. It attempts
1071 * to free <count> vnodes and will potentially free vnodes that still
1072 * have VM backing store (VM backing store is typically the cause
1073 * of a vnode blowout so we want to do this). Therefore, this operation
1074 * is not considered cheap.
1075 *
1076 * A number of conditions may prevent a vnode from being reclaimed:
1077 * the buffer cache may have references on the vnode, a directory
1078 * vnode may still have references due to the namei cache representing
1079 * underlying files, or the vnode may be in active use. It is not
1080 * desirable to reuse such vnodes. These conditions may cause the
1081 * number of vnodes to reach some minimum value regardless of what
1082 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
1083 *
1084 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
1085 * entries if this argument is true
1086 * @param trigger Only reclaim vnodes with fewer than this many resident
1087 * pages.
1088 * @param target How many vnodes to reclaim.
1089 * @return The number of vnodes that were reclaimed.
1090 */
1091 static int
1092 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
1093 {
1094 struct vnode *vp, *mvp;
1095 struct mount *mp;
1096 struct vm_object *object;
1097 u_long done;
1098 bool retried;
1099
1100 mtx_assert(&vnode_list_mtx, MA_OWNED);
1101
1102 retried = false;
1103 done = 0;
1104
1105 mvp = vnode_list_reclaim_marker;
1106 restart:
1107 vp = mvp;
1108 while (done < target) {
1109 vp = TAILQ_NEXT(vp, v_vnodelist);
1110 if (__predict_false(vp == NULL))
1111 break;
1112
1113 if (__predict_false(vp->v_type == VMARKER))
1114 continue;
1115
1116 /*
1117 * If it's been deconstructed already, it's still
1118 * referenced, or it exceeds the trigger, skip it.
1119 * Also skip free vnodes. We are trying to make space
1120 * to expand the free list, not reduce it.
1121 */
1122 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
1123 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
1124 goto next_iter;
1125
1126 if (vp->v_type == VBAD || vp->v_type == VNON)
1127 goto next_iter;
1128
1129 object = atomic_load_ptr(&vp->v_object);
1130 if (object == NULL || object->resident_page_count > trigger) {
1131 goto next_iter;
1132 }
1133
1134 /*
1135 * Handle races against vnode allocation. Filesystems lock the
1136 * vnode some time after it gets returned from getnewvnode,
1137 * despite type and hold count being manipulated earlier.
1138 * Resorting to checking v_mount restores guarantees present
1139 * before the global list was reworked to contain all vnodes.
1140 */
1141 if (!VI_TRYLOCK(vp))
1142 goto next_iter;
1143 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
1144 VI_UNLOCK(vp);
1145 goto next_iter;
1146 }
1147 if (vp->v_mount == NULL) {
1148 VI_UNLOCK(vp);
1149 goto next_iter;
1150 }
1151 vholdl(vp);
1152 VI_UNLOCK(vp);
1153 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1154 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
1155 mtx_unlock(&vnode_list_mtx);
1156
1157 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1158 vdrop(vp);
1159 goto next_iter_unlocked;
1160 }
1161 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
1162 vdrop(vp);
1163 vn_finished_write(mp);
1164 goto next_iter_unlocked;
1165 }
1166
1167 VI_LOCK(vp);
1168 if (vp->v_usecount > 0 ||
1169 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
1170 (vp->v_object != NULL &&
1171 vp->v_object->resident_page_count > trigger)) {
1172 VOP_UNLOCK(vp);
1173 vdropl(vp);
1174 vn_finished_write(mp);
1175 goto next_iter_unlocked;
1176 }
1177 counter_u64_add(recycles_count, 1);
1178 vgonel(vp);
1179 VOP_UNLOCK(vp);
1180 vdropl(vp);
1181 vn_finished_write(mp);
1182 done++;
1183 next_iter_unlocked:
1184 if (should_yield())
1185 kern_yield(PRI_USER);
1186 mtx_lock(&vnode_list_mtx);
1187 goto restart;
1188 next_iter:
1189 MPASS(vp->v_type != VMARKER);
1190 if (!should_yield())
1191 continue;
1192 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1193 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
1194 mtx_unlock(&vnode_list_mtx);
1195 kern_yield(PRI_USER);
1196 mtx_lock(&vnode_list_mtx);
1197 goto restart;
1198 }
1199 if (done == 0 && !retried) {
1200 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1201 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
1202 retried = true;
1203 goto restart;
1204 }
1205 return (done);
1206 }
1207
1208 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
1209 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
1210 0,
1211 "limit on vnode free requests per call to the vnlru_free routine");
1212
1213 /*
1214 * Attempt to reduce the free list by the requested amount.
1215 */
1216 static int
1217 vnlru_free_locked(int count, struct vfsops *mnt_op)
1218 {
1219 struct vnode *vp, *mvp;
1220 struct mount *mp;
1221 int ocount;
1222
1223 mtx_assert(&vnode_list_mtx, MA_OWNED);
1224 if (count > max_vnlru_free)
1225 count = max_vnlru_free;
1226 ocount = count;
1227 mvp = vnode_list_free_marker;
1228 vp = mvp;
1229 for (;;) {
1230 if (count == 0) {
1231 break;
1232 }
1233 vp = TAILQ_NEXT(vp, v_vnodelist);
1234 if (__predict_false(vp == NULL)) {
1235 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1236 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
1237 break;
1238 }
1239 if (__predict_false(vp->v_type == VMARKER))
1240 continue;
1241 if (vp->v_holdcnt > 0)
1242 continue;
1243 /*
1244 * Don't recycle if our vnode is from different type
1245 * of mount point. Note that mp is type-safe, the
1246 * check does not reach unmapped address even if
1247 * vnode is reclaimed.
1248 */
1249 if (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
1250 mp->mnt_op != mnt_op) {
1251 continue;
1252 }
1253 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
1254 continue;
1255 }
1256 if (!vhold_recycle_free(vp))
1257 continue;
1258 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
1259 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
1260 mtx_unlock(&vnode_list_mtx);
1261 if (vtryrecycle(vp) == 0)
1262 count--;
1263 mtx_lock(&vnode_list_mtx);
1264 vp = mvp;
1265 }
1266 return (ocount - count);
1267 }
1268
1269 void
1270 vnlru_free(int count, struct vfsops *mnt_op)
1271 {
1272
1273 mtx_lock(&vnode_list_mtx);
1274 vnlru_free_locked(count, mnt_op);
1275 mtx_unlock(&vnode_list_mtx);
1276 }
1277
1278 static void
1279 vnlru_recalc(void)
1280 {
1281
1282 mtx_assert(&vnode_list_mtx, MA_OWNED);
1283 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
1284 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
1285 vlowat = vhiwat / 2;
1286 }
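
/*
 * Worked example (illustrative numbers): with desiredvnodes = 1,000,000 and
 * wantfreevnodes = 250,000, vnlru_recalc() yields gapvnodes = 750,000,
 * vhiwat = 68,181 (~9% of the gap) and vlowat = 34,090 (~4.5%), in line with
 * the 4%/9% watermark description near the top of this file.
 */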
1287
1288 /*
1289 * Attempt to recycle vnodes in a context that is always safe to block.
1290 * Calling vlrureclaim() from the bowels of filesystem code has some
1291 * interesting deadlock problems.
1292 */
1293 static struct proc *vnlruproc;
1294 static int vnlruproc_sig;
1295
1296 /*
1297 * The main freevnodes counter is only updated when threads requeue their vnode
1298 * batches. CPUs are conditionally walked to compute a more accurate total.
1299 *
1300 * Limit how much slop we are willing to tolerate. Note: the actual value
1301 * at any given moment can still exceed the slop, but it should not differ
1302 * by a significant margin in practice.
1303 */
1304 #define VNLRU_FREEVNODES_SLOP 128
1305
1306 static __inline void
1307 vn_freevnodes_inc(void)
1308 {
1309 struct vdbatch *vd;
1310
1311 critical_enter();
1312 vd = DPCPU_PTR(vd);
1313 vd->freevnodes++;
1314 critical_exit();
1315 }
1316
1317 static __inline void
1318 vn_freevnodes_dec(void)
1319 {
1320 struct vdbatch *vd;
1321
1322 critical_enter();
1323 vd = DPCPU_PTR(vd);
1324 vd->freevnodes--;
1325 critical_exit();
1326 }
1327
1328 static u_long
1329 vnlru_read_freevnodes(void)
1330 {
1331 struct vdbatch *vd;
1332 long slop;
1333 int cpu;
1334
1335 mtx_assert(&vnode_list_mtx, MA_OWNED);
1336 if (freevnodes > freevnodes_old)
1337 slop = freevnodes - freevnodes_old;
1338 else
1339 slop = freevnodes_old - freevnodes;
1340 if (slop < VNLRU_FREEVNODES_SLOP)
1341 return (freevnodes >= 0 ? freevnodes : 0);
1342 freevnodes_old = freevnodes;
1343 CPU_FOREACH(cpu) {
1344 vd = DPCPU_ID_PTR((cpu), vd);
1345 freevnodes_old += vd->freevnodes;
1346 }
1347 return (freevnodes_old >= 0 ? freevnodes_old : 0);
1348 }
1349
1350 static bool
1351 vnlru_under(u_long rnumvnodes, u_long limit)
1352 {
1353 u_long rfreevnodes, space;
1354
1355 if (__predict_false(rnumvnodes > desiredvnodes))
1356 return (true);
1357
1358 space = desiredvnodes - rnumvnodes;
1359 if (space < limit) {
1360 rfreevnodes = vnlru_read_freevnodes();
1361 if (rfreevnodes > wantfreevnodes)
1362 space += rfreevnodes - wantfreevnodes;
1363 }
1364 return (space < limit);
1365 }
1366
1367 static bool
1368 vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
1369 {
1370 long rfreevnodes, space;
1371
1372 if (__predict_false(rnumvnodes > desiredvnodes))
1373 return (true);
1374
1375 space = desiredvnodes - rnumvnodes;
1376 if (space < limit) {
1377 rfreevnodes = atomic_load_long(&freevnodes);
1378 if (rfreevnodes > wantfreevnodes)
1379 space += rfreevnodes - wantfreevnodes;
1380 }
1381 return (space < limit);
1382 }
1383
1384 static void
1385 vnlru_kick(void)
1386 {
1387
1388 mtx_assert(&vnode_list_mtx, MA_OWNED);
1389 if (vnlruproc_sig == 0) {
1390 vnlruproc_sig = 1;
1391 wakeup(vnlruproc);
1392 }
1393 }
1394
1395 static void
1396 vnlru_proc(void)
1397 {
1398 u_long rnumvnodes, rfreevnodes, target;
1399 unsigned long onumvnodes;
1400 int done, force, trigger, usevnodes;
1401 bool reclaim_nc_src, want_reread;
1402
1403 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
1404 SHUTDOWN_PRI_FIRST);
1405
1406 force = 0;
1407 want_reread = false;
1408 for (;;) {
1409 kproc_suspend_check(vnlruproc);
1410 mtx_lock(&vnode_list_mtx);
1411 rnumvnodes = atomic_load_long(&numvnodes);
1412
1413 if (want_reread) {
1414 force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
1415 want_reread = false;
1416 }
1417
1418 /*
1419 * If numvnodes is too large (due to desiredvnodes being
1420 * adjusted using its sysctl, or emergency growth), first
1421 * try to reduce it by discarding from the free list.
1422 */
1423 if (rnumvnodes > desiredvnodes) {
1424 vnlru_free_locked(rnumvnodes - desiredvnodes, NULL);
1425 rnumvnodes = atomic_load_long(&numvnodes);
1426 }
1427 /*
1428 * Sleep if the vnode cache is in a good state. This is
1429 * when it is not over-full and has space for about a 4%
1430 * or 9% expansion (by growing its size or inexcessively
1431 * reducing its free list). Otherwise, try to reclaim
1432 * space for a 10% expansion.
1433 */
1434 if (vstir && force == 0) {
1435 force = 1;
1436 vstir = 0;
1437 }
1438 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
1439 vnlruproc_sig = 0;
1440 wakeup(&vnlruproc_sig);
1441 msleep(vnlruproc, &vnode_list_mtx,
1442 PVFS|PDROP, "vlruwt", hz);
1443 continue;
1444 }
1445 rfreevnodes = vnlru_read_freevnodes();
1446
1447 onumvnodes = rnumvnodes;
1448 /*
1449 * Calculate parameters for recycling. These are the same
1450 * throughout the loop to give some semblance of fairness.
1451 * The trigger point is to avoid recycling vnodes with lots
1452 * of resident pages. We aren't trying to free memory; we
1453 * are trying to recycle or at least free vnodes.
1454 */
1455 if (rnumvnodes <= desiredvnodes)
1456 usevnodes = rnumvnodes - rfreevnodes;
1457 else
1458 usevnodes = rnumvnodes;
1459 if (usevnodes <= 0)
1460 usevnodes = 1;
1461 /*
1462 * The trigger value is chosen to give a conservatively
1463 * large value to ensure that it alone doesn't prevent
1464 * making progress. The value can easily be so large that
1465 * it is effectively infinite in some congested and
1466 * misconfigured cases, and this is necessary. Normally
1467 * it is about 8 to 100 (pages), which is quite large.
1468 */
1469 trigger = vm_cnt.v_page_count * 2 / usevnodes;
1470 if (force < 2)
1471 trigger = vsmalltrigger;
1472 reclaim_nc_src = force >= 3;
1473 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
1474 target = target / 10 + 1;
1475 done = vlrureclaim(reclaim_nc_src, trigger, target);
1476 mtx_unlock(&vnode_list_mtx);
1477 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
1478 uma_reclaim(UMA_RECLAIM_DRAIN);
1479 if (done == 0) {
1480 if (force == 0 || force == 1) {
1481 force = 2;
1482 continue;
1483 }
1484 if (force == 2) {
1485 force = 3;
1486 continue;
1487 }
1488 want_reread = true;
1489 force = 0;
1490 vnlru_nowhere++;
1491 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
1492 } else {
1493 want_reread = true;
1494 kern_yield(PRI_USER);
1495 }
1496 }
1497 }
1498
1499 static struct kproc_desc vnlru_kp = {
1500 "vnlru",
1501 vnlru_proc,
1502 &vnlruproc
1503 };
1504 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
1505 &vnlru_kp);
1506
1507 /*
1508 * Routines having to do with the management of the vnode table.
1509 */
1510
1511 /*
1512 * Try to recycle a freed vnode. We abort if anyone picks up a reference
1513 * before we actually vgone(). This function must be called with the vnode
1514 * held to prevent the vnode from being returned to the free list midway
1515 * through vgone().
1516 */
1517 static int
1518 vtryrecycle(struct vnode *vp)
1519 {
1520 struct mount *vnmp;
1521
1522 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
1523 VNASSERT(vp->v_holdcnt, vp,
1524 ("vtryrecycle: Recycling vp %p without a reference.", vp));
1525 /*
1526 * This vnode may be found and locked via some other list; if so, we
1527 * can't recycle it yet.
1528 */
1529 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
1530 CTR2(KTR_VFS,
1531 "%s: impossible to recycle, vp %p lock is already held",
1532 __func__, vp);
1533 vdrop(vp);
1534 return (EWOULDBLOCK);
1535 }
1536 /*
1537 * Don't recycle if its filesystem is being suspended.
1538 */
1539 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
1540 VOP_UNLOCK(vp);
1541 CTR2(KTR_VFS,
1542 "%s: impossible to recycle, cannot start the write for %p",
1543 __func__, vp);
1544 vdrop(vp);
1545 return (EBUSY);
1546 }
1547 /*
1548 * If we got this far, we need to acquire the interlock and see if
1549 * anyone picked up this vnode from another list. If not, we will
1550 * mark it with DOOMED via vgonel() so that anyone who does find it
1551 * will skip over it.
1552 */
1553 VI_LOCK(vp);
1554 if (vp->v_usecount) {
1555 VOP_UNLOCK(vp);
1556 vdropl(vp);
1557 vn_finished_write(vnmp);
1558 CTR2(KTR_VFS,
1559 "%s: impossible to recycle, %p is already referenced",
1560 __func__, vp);
1561 return (EBUSY);
1562 }
1563 if (!VN_IS_DOOMED(vp)) {
1564 counter_u64_add(recycles_free_count, 1);
1565 vgonel(vp);
1566 }
1567 VOP_UNLOCK(vp);
1568 vdropl(vp);
1569 vn_finished_write(vnmp);
1570 return (0);
1571 }
1572
1573 /*
1574 * Allocate a new vnode.
1575 *
1576 * The operation never returns an error. Returning an error was disabled
1577 * in r145385 (dated 2005) with the following comment:
1578 *
1579 * XXX Not all VFS_VGET/ffs_vget callers check returns.
1580 *
1581 * Given the age of this commit (almost 15 years at the time of writing this
1582 * comment) restoring the ability to fail requires a significant audit of
1583 * all codepaths.
1584 *
1585 * The routine can try to free a vnode or stall for up to 1 second waiting for
1586 * vnlru to clear things up, but ultimately always performs an M_WAITOK allocation.
1587 */
1588 static u_long vn_alloc_cyclecount;
1589
1590 static struct vnode * __noinline
1591 vn_alloc_hard(struct mount *mp)
1592 {
1593 u_long rnumvnodes, rfreevnodes;
1594
1595 mtx_lock(&vnode_list_mtx);
1596 rnumvnodes = atomic_load_long(&numvnodes);
1597 if (rnumvnodes + 1 < desiredvnodes) {
1598 vn_alloc_cyclecount = 0;
1599 goto alloc;
1600 }
1601 rfreevnodes = vnlru_read_freevnodes();
1602 if (vn_alloc_cyclecount++ >= rfreevnodes) {
1603 vn_alloc_cyclecount = 0;
1604 vstir = 1;
1605 }
1606 /*
1607 * Grow the vnode cache if it will not be above its target max
1608 * after growing. Otherwise, if the free list is nonempty, try
1609 * to reclaim 1 item from it before growing the cache (possibly
1610 * above its target max if the reclamation failed or is delayed).
1611 * Otherwise, wait for some space. In all cases, schedule
1612 * vnlru_proc() if we are getting short of space. The watermarks
1613 * should be chosen so that we never wait or even reclaim from
1614 * the free list to below its target minimum.
1615 */
1616 if (vnlru_free_locked(1, NULL) > 0)
1617 goto alloc;
1618 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
1619 /*
1620 * Wait for space for a new vnode.
1621 */
1622 vnlru_kick();
1623 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
1624 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
1625 vnlru_read_freevnodes() > 1)
1626 vnlru_free_locked(1, NULL);
1627 }
1628 alloc:
1629 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
1630 if (vnlru_under(rnumvnodes, vlowat))
1631 vnlru_kick();
1632 mtx_unlock(&vnode_list_mtx);
1633 return (uma_zalloc_smr(vnode_zone, M_WAITOK));
1634 }
1635
1636 static struct vnode *
1637 vn_alloc(struct mount *mp)
1638 {
1639 u_long rnumvnodes;
1640
1641 if (__predict_false(vn_alloc_cyclecount != 0))
1642 return (vn_alloc_hard(mp));
1643 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
1644 if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
1645 atomic_subtract_long(&numvnodes, 1);
1646 return (vn_alloc_hard(mp));
1647 }
1648
1649 return (uma_zalloc_smr(vnode_zone, M_WAITOK));
1650 }
1651
1652 static void
1653 vn_free(struct vnode *vp)
1654 {
1655
1656 atomic_subtract_long(&numvnodes, 1);
1657 uma_zfree_smr(vnode_zone, vp);
1658 }
1659
1660 /*
1661 * Return the next vnode from the free list.
1662 */
1663 int
1664 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1665 struct vnode **vpp)
1666 {
1667 struct vnode *vp;
1668 struct thread *td;
1669 struct lock_object *lo;
1670
1671 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1672
1673 KASSERT(vops->registered,
1674 ("%s: not registered vector op %p\n", __func__, vops));
1675
1676 td = curthread;
1677 if (td->td_vp_reserved != NULL) {
1678 vp = td->td_vp_reserved;
1679 td->td_vp_reserved = NULL;
1680 } else {
1681 vp = vn_alloc(mp);
1682 }
1683 counter_u64_add(vnodes_created, 1);
1684 /*
1685 * Locks are given the generic name "vnode" when created.
1686 * Follow the historic practice of using the filesystem
1687 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
1688 *
1689 * Locks live in a witness group keyed on their name. Thus,
1690 * when a lock is renamed, it must also move from the witness
1691 * group of its old name to the witness group of its new name.
1692 *
1693 * The change only needs to be made when the vnode moves
1694 * from one filesystem type to another. We ensure that each
1695 * filesystem uses a single static name pointer for its tag so
1696 * that we can compare pointers rather than doing a strcmp().
1697 */
1698 lo = &vp->v_vnlock->lock_object;
1699 #ifdef WITNESS
1700 if (lo->lo_name != tag) {
1701 #endif
1702 lo->lo_name = tag;
1703 #ifdef WITNESS
1704 WITNESS_DESTROY(lo);
1705 WITNESS_INIT(lo, tag);
1706 }
1707 #endif
1708 /*
1709 * By default, don't allow shared locks unless filesystems opt-in.
1710 */
1711 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
1712 /*
1713 * Finalize various vnode identity bits.
1714 */
1715 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
1716 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
1717 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
1718 vp->v_type = VNON;
1719 vp->v_op = vops;
1720 v_init_counters(vp);
1721 vp->v_bufobj.bo_ops = &buf_ops_bio;
1722 #ifdef DIAGNOSTIC
1723 if (mp == NULL && vops != &dead_vnodeops)
1724 printf("NULL mp in getnewvnode(9), tag %s\n", tag);
1725 #endif
1726 #ifdef MAC
1727 mac_vnode_init(vp);
1728 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1729 mac_vnode_associate_singlelabel(mp, vp);
1730 #endif
1731 if (mp != NULL) {
1732 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
1733 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1734 vp->v_vflag |= VV_NOKNOTE;
1735 }
1736
1737 /*
1738 * For the filesystems which do not use vfs_hash_insert(),
1739 * still initialize v_hash to have vfs_hash_index() useful.
1740 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1741 * its own hashing.
1742 */
1743 vp->v_hash = (uintptr_t)vp >> vnsz2log;
1744
1745 *vpp = vp;
1746 return (0);
1747 }
1748
1749 void
1750 getnewvnode_reserve(void)
1751 {
1752 struct thread *td;
1753
1754 td = curthread;
1755 MPASS(td->td_vp_reserved == NULL);
1756 td->td_vp_reserved = vn_alloc(NULL);
1757 }
1758
1759 void
1760 getnewvnode_drop_reserve(void)
1761 {
1762 struct thread *td;
1763
1764 td = curthread;
1765 if (td->td_vp_reserved != NULL) {
1766 vn_free(td->td_vp_reserved);
1767 td->td_vp_reserved = NULL;
1768 }
1769 }
1770
1771 static void __noinline
1772 freevnode(struct vnode *vp)
1773 {
1774 struct bufobj *bo;
1775
1776 /*
1777 * The vnode has been marked for destruction, so free it.
1778 *
1779 * The vnode will be returned to the zone where it will
1780 * normally remain until it is needed for another vnode. We
1781 * need to cleanup (or verify that the cleanup has already
1782 * been done) any residual data left from its current use
1783 * so as not to contaminate the freshly allocated vnode.
1784 */
1785 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
1786 /*
1787 * Paired with vgone.
1788 */
1789 vn_seqc_write_end_locked(vp);
1790 VNPASS(vp->v_seqc_users == 0, vp);
1791
1792 bo = &vp->v_bufobj;
1793 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
1794 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
1795 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
1796 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
1797 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
1798 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
1799 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
1800 ("clean blk trie not empty"));
1801 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
1802 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
1803 ("dirty blk trie not empty"));
1804 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
1805 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
1806 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
1807 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
1808 ("Dangling rangelock waiters"));
1809 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
1810 ("Leaked inactivation"));
1811 VI_UNLOCK(vp);
1812 #ifdef MAC
1813 mac_vnode_destroy(vp);
1814 #endif
1815 if (vp->v_pollinfo != NULL) {
1816 destroy_vpollinfo(vp->v_pollinfo);
1817 vp->v_pollinfo = NULL;
1818 }
1819 vp->v_mountedhere = NULL;
1820 vp->v_unpcb = NULL;
1821 vp->v_rdev = NULL;
1822 vp->v_fifoinfo = NULL;
1823 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
1824 vp->v_irflag = 0;
1825 vp->v_iflag = 0;
1826 vp->v_vflag = 0;
1827 bo->bo_flag = 0;
1828 vn_free(vp);
1829 }
1830
1831 /*
1832 * Delete from old mount point vnode list, if on one.
1833 */
1834 static void
1835 delmntque(struct vnode *vp)
1836 {
1837 struct mount *mp;
1838
1839 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
1840
1841 mp = vp->v_mount;
1842 if (mp == NULL)
1843 return;
1844 MNT_ILOCK(mp);
1845 VI_LOCK(vp);
1846 vp->v_mount = NULL;
1847 VI_UNLOCK(vp);
1848 VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1849 ("bad mount point vnode list size"));
1850 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1851 mp->mnt_nvnodelistsize--;
1852 MNT_REL(mp);
1853 MNT_IUNLOCK(mp);
1854 }
1855
1856 static void
1857 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1858 {
1859
1860 vp->v_data = NULL;
1861 vp->v_op = &dead_vnodeops;
1862 vgone(vp);
1863 vput(vp);
1864 }
1865
1866 /*
1867 * Insert into list of vnodes for the new mount point, if available.
1868 */
1869 int
1870 insmntque1(struct vnode *vp, struct mount *mp,
1871 void (*dtr)(struct vnode *, void *), void *dtr_arg)
1872 {
1873
1874 KASSERT(vp->v_mount == NULL,
1875 ("insmntque: vnode already on per mount vnode list"));
1876 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1877 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1878
1879 /*
1880 * We acquire the vnode interlock early to ensure that the
1881 * vnode cannot be recycled by another process releasing a
1882 * holdcnt on it before we get it on both the vnode list
1883 * and the active vnode list. The mount mutex protects only
1884 * manipulation of the vnode list and the vnode freelist
1885 * mutex protects only manipulation of the active vnode list.
1886 * Hence the need to hold the vnode interlock throughout.
1887 */
1888 MNT_ILOCK(mp);
1889 VI_LOCK(vp);
1890 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
1891 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1892 mp->mnt_nvnodelistsize == 0)) &&
1893 (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1894 VI_UNLOCK(vp);
1895 MNT_IUNLOCK(mp);
1896 if (dtr != NULL)
1897 dtr(vp, dtr_arg);
1898 return (EBUSY);
1899 }
1900 vp->v_mount = mp;
1901 MNT_REF(mp);
1902 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1903 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1904 ("neg mount point vnode list size"));
1905 mp->mnt_nvnodelistsize++;
1906 VI_UNLOCK(vp);
1907 MNT_IUNLOCK(mp);
1908 return (0);
1909 }
1910
1911 int
1912 insmntque(struct vnode *vp, struct mount *mp)
1913 {
1914
1915 return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1916 }
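
/*
 * Example (sketch; "foofs" and np are placeholders): a typical node
 * allocation path creates the vnode, locks it and only then inserts it
 * onto the mount's vnode list.  On failure insmntque() has already run
 * the default destructor above, so the vnode must not be touched again:
 *
 *	error = getnewvnode("foofs", mp, &foofs_vnodeops, &vp);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	vp->v_data = np;
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);
 */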
1917
1918 /*
1919 * Flush out and invalidate all buffers associated with a bufobj
1920 * Called with the underlying object locked.
1921 */
1922 int
1923 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1924 {
1925 int error;
1926
1927 BO_LOCK(bo);
1928 if (flags & V_SAVE) {
1929 error = bufobj_wwait(bo, slpflag, slptimeo);
1930 if (error) {
1931 BO_UNLOCK(bo);
1932 return (error);
1933 }
1934 if (bo->bo_dirty.bv_cnt > 0) {
1935 BO_UNLOCK(bo);
1936 do {
1937 error = BO_SYNC(bo, MNT_WAIT);
1938 } while (error == ERELOOKUP);
1939 if (error != 0)
1940 return (error);
1941 /*
1942 * XXX We could save a lock/unlock if this was only
1943 * enabled under INVARIANTS
1944 */
1945 BO_LOCK(bo);
1946 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1947 panic("vinvalbuf: dirty bufs");
1948 }
1949 }
1950 /*
1951 * If you alter this loop please notice that interlock is dropped and
1952 * reacquired in flushbuflist. Special care is needed to ensure that
1953 * no race conditions occur from this.
1954 */
1955 do {
1956 error = flushbuflist(&bo->bo_clean,
1957 flags, bo, slpflag, slptimeo);
1958 if (error == 0 && !(flags & V_CLEANONLY))
1959 error = flushbuflist(&bo->bo_dirty,
1960 flags, bo, slpflag, slptimeo);
1961 if (error != 0 && error != EAGAIN) {
1962 BO_UNLOCK(bo);
1963 return (error);
1964 }
1965 } while (error != 0);
1966
1967 /*
1968 * Wait for I/O to complete. XXX needs cleaning up. The vnode can
1969 * have write I/O in-progress but if there is a VM object then the
1970 * VM object can also have read-I/O in-progress.
1971 */
1972 do {
1973 bufobj_wwait(bo, 0, 0);
1974 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
1975 BO_UNLOCK(bo);
1976 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
1977 BO_LOCK(bo);
1978 }
1979 } while (bo->bo_numoutput > 0);
1980 BO_UNLOCK(bo);
1981
1982 /*
1983 * Destroy the copy in the VM cache, too.
1984 */
1985 if (bo->bo_object != NULL &&
1986 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
1987 VM_OBJECT_WLOCK(bo->bo_object);
1988 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1989 OBJPR_CLEANONLY : 0);
1990 VM_OBJECT_WUNLOCK(bo->bo_object);
1991 }
1992
1993 #ifdef INVARIANTS
1994 BO_LOCK(bo);
1995 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
1996 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
1997 bo->bo_clean.bv_cnt > 0))
1998 panic("vinvalbuf: flush failed");
1999 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
2000 bo->bo_dirty.bv_cnt > 0)
2001 panic("vinvalbuf: flush dirty failed");
2002 BO_UNLOCK(bo);
2003 #endif
2004 return (0);
2005 }
2006
2007 /*
2008 * Flush out and invalidate all buffers associated with a vnode.
2009 * Called with the underlying object locked.
2010 */
2011 int
2012 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
2013 {
2014
2015 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2016 ASSERT_VOP_LOCKED(vp, "vinvalbuf");
2017 if (vp->v_object != NULL && vp->v_object->handle != vp)
2018 return (0);
2019 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
2020 }
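
/*
 * Example (mirrors the reclaim path in vgonel() later in this file):
 * callers that want dirty data written out first pass V_SAVE and fall
 * back to discarding everything if that fails:
 *
 *	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
 *		while (vinvalbuf(vp, 0, 0, 0) != 0)
 *			;
 *	}
 */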
2021
2022 /*
2023 * Flush out buffers on the specified list.
2024 *
2025 */
2026 static int
2027 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
2028 int slptimeo)
2029 {
2030 struct buf *bp, *nbp;
2031 int retval, error;
2032 daddr_t lblkno;
2033 b_xflags_t xflags;
2034
2035 ASSERT_BO_WLOCKED(bo);
2036
2037 retval = 0;
2038 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
2039 /*
2040 * If we are flushing both V_NORMAL and V_ALT buffers then
2041 * do not skip any buffers. If we are flushing only V_NORMAL
2042 * buffers then skip buffers marked as BX_ALTDATA. If we are
2043 * flushing only V_ALT buffers then skip buffers not marked
2044 * as BX_ALTDATA.
2045 */
2046 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
2047 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
2048 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
2049 continue;
2050 }
2051 if (nbp != NULL) {
2052 lblkno = nbp->b_lblkno;
2053 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
2054 }
2055 retval = EAGAIN;
2056 error = BUF_TIMELOCK(bp,
2057 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
2058 "flushbuf", slpflag, slptimeo);
2059 if (error) {
2060 BO_LOCK(bo);
2061 return (error != ENOLCK ? error : EAGAIN);
2062 }
2063 KASSERT(bp->b_bufobj == bo,
2064 ("bp %p wrong b_bufobj %p should be %p",
2065 bp, bp->b_bufobj, bo));
2066 /*
2067 * XXX Since there are no node locks for NFS, I
2068 * believe there is a slight chance that a delayed
2069 * write will occur while sleeping just above, so
2070 * check for it.
2071 */
2072 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
2073 (flags & V_SAVE)) {
2074 bremfree(bp);
2075 bp->b_flags |= B_ASYNC;
2076 bwrite(bp);
2077 BO_LOCK(bo);
2078 return (EAGAIN); /* XXX: why not loop ? */
2079 }
2080 bremfree(bp);
2081 bp->b_flags |= (B_INVAL | B_RELBUF);
2082 bp->b_flags &= ~B_ASYNC;
2083 brelse(bp);
2084 BO_LOCK(bo);
2085 if (nbp == NULL)
2086 break;
2087 nbp = gbincore(bo, lblkno);
2088 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2089 != xflags)
2090 break; /* nbp invalid */
2091 }
2092 return (retval);
2093 }
2094
2095 int
2096 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
2097 {
2098 struct buf *bp;
2099 int error;
2100 daddr_t lblkno;
2101
2102 ASSERT_BO_LOCKED(bo);
2103
2104 for (lblkno = startn;;) {
2105 again:
2106 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
2107 if (bp == NULL || bp->b_lblkno >= endn ||
2108 bp->b_lblkno < startn)
2109 break;
2110 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
2111 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
2112 if (error != 0) {
2113 BO_RLOCK(bo);
2114 if (error == ENOLCK)
2115 goto again;
2116 return (error);
2117 }
2118 KASSERT(bp->b_bufobj == bo,
2119 ("bp %p wrong b_bufobj %p should be %p",
2120 bp, bp->b_bufobj, bo));
2121 lblkno = bp->b_lblkno + 1;
2122 if ((bp->b_flags & B_MANAGED) == 0)
2123 bremfree(bp);
2124 bp->b_flags |= B_RELBUF;
2125 /*
2126 * In the VMIO case, use the B_NOREUSE flag to hint that the
2127 * pages backing each buffer in the range are unlikely to be
2128 * reused. Dirty buffers will have the hint applied once
2129 * they've been written.
2130 */
2131 if ((bp->b_flags & B_VMIO) != 0)
2132 bp->b_flags |= B_NOREUSE;
2133 brelse(bp);
2134 BO_RLOCK(bo);
2135 }
2136 return (0);
2137 }
2138
2139 /*
2140 * Truncate a file's buffers and pages to a specified length. This
2141 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
2142 * sync activity.
2143 */
2144 int
2145 vtruncbuf(struct vnode *vp, off_t length, int blksize)
2146 {
2147 struct buf *bp, *nbp;
2148 struct bufobj *bo;
2149 daddr_t startlbn;
2150
2151 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
2152 vp, blksize, (uintmax_t)length);
2153
2154 /*
2155 * Round up to the *next* lbn.
2156 */
2157 startlbn = howmany(length, blksize);
2158
2159 ASSERT_VOP_LOCKED(vp, "vtruncbuf");
2160
2161 bo = &vp->v_bufobj;
2162 restart_unlocked:
2163 BO_LOCK(bo);
2164
2165 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
2166 ;
2167
2168 if (length > 0) {
2169 restartsync:
2170 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
2171 if (bp->b_lblkno > 0)
2172 continue;
2173 /*
2174 * Since we hold the vnode lock this should only
2175 * fail if we're racing with the buf daemon.
2176 */
2177 if (BUF_LOCK(bp,
2178 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
2179 BO_LOCKPTR(bo)) == ENOLCK)
2180 goto restart_unlocked;
2181
2182 VNASSERT((bp->b_flags & B_DELWRI), vp,
2183 ("buf(%p) on dirty queue without DELWRI", bp));
2184
2185 bremfree(bp);
2186 bawrite(bp);
2187 BO_LOCK(bo);
2188 goto restartsync;
2189 }
2190 }
2191
2192 bufobj_wwait(bo, 0, 0);
2193 BO_UNLOCK(bo);
2194 vnode_pager_setsize(vp, length);
2195
2196 return (0);
2197 }
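
/*
 * Example (sketch, fs_bsize is a placeholder for the filesystem's logical
 * block size): a truncation path, with the vnode locked, would typically
 * discard buffers past the new end of file with:
 *
 *	error = vtruncbuf(vp, newsize, fs_bsize);
 *
 * vtruncbuf() also informs the VM layer of the new size via
 * vnode_pager_setsize().
 */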
2198
2199 /*
2200 * Invalidate a file's cached buffers and pages within the range of block
2201 * numbers [startlbn, endlbn).
2202 */
2203 void
2204 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
2205 int blksize)
2206 {
2207 struct bufobj *bo;
2208 off_t start, end;
2209
2210 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
2211
2212 start = blksize * startlbn;
2213 end = blksize * endlbn;
2214
2215 bo = &vp->v_bufobj;
2216 BO_LOCK(bo);
2217 MPASS(blksize == bo->bo_bsize);
2218
2219 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
2220 ;
2221
2222 BO_UNLOCK(bo);
2223 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
2224 }
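
/*
 * Example (sketch): a filesystem rewriting the on-disk contents of blocks
 * [start, end) behind the buffer cache's back could first discard the
 * stale copies with:
 *
 *	v_inval_buf_range(vp, start, end, bsize);
 *
 * where bsize must match the bufobj block size (see the MPASS above).
 */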
2225
2226 static int
2227 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
2228 daddr_t startlbn, daddr_t endlbn)
2229 {
2230 struct buf *bp, *nbp;
2231 bool anyfreed;
2232
2233 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
2234 ASSERT_BO_LOCKED(bo);
2235
2236 do {
2237 anyfreed = false;
2238 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
2239 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
2240 continue;
2241 if (BUF_LOCK(bp,
2242 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
2243 BO_LOCKPTR(bo)) == ENOLCK) {
2244 BO_LOCK(bo);
2245 return (EAGAIN);
2246 }
2247
2248 bremfree(bp);
2249 bp->b_flags |= B_INVAL | B_RELBUF;
2250 bp->b_flags &= ~B_ASYNC;
2251 brelse(bp);
2252 anyfreed = true;
2253
2254 BO_LOCK(bo);
2255 if (nbp != NULL &&
2256 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
2257 nbp->b_vp != vp ||
2258 (nbp->b_flags & B_DELWRI) != 0))
2259 return (EAGAIN);
2260 }
2261
2262 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
2263 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
2264 continue;
2265 if (BUF_LOCK(bp,
2266 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
2267 BO_LOCKPTR(bo)) == ENOLCK) {
2268 BO_LOCK(bo);
2269 return (EAGAIN);
2270 }
2271 bremfree(bp);
2272 bp->b_flags |= B_INVAL | B_RELBUF;
2273 bp->b_flags &= ~B_ASYNC;
2274 brelse(bp);
2275 anyfreed = true;
2276
2277 BO_LOCK(bo);
2278 if (nbp != NULL &&
2279 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
2280 (nbp->b_vp != vp) ||
2281 (nbp->b_flags & B_DELWRI) == 0))
2282 return (EAGAIN);
2283 }
2284 } while (anyfreed);
2285 return (0);
2286 }
2287
2288 static void
2289 buf_vlist_remove(struct buf *bp)
2290 {
2291 struct bufv *bv;
2292 b_xflags_t flags;
2293
2294 flags = bp->b_xflags;
2295
2296 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
2297 ASSERT_BO_WLOCKED(bp->b_bufobj);
2298 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 &&
2299 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN),
2300 ("%s: buffer %p has invalid queue state", __func__, bp));
2301
2302 if ((flags & BX_VNDIRTY) != 0)
2303 bv = &bp->b_bufobj->bo_dirty;
2304 else
2305 bv = &bp->b_bufobj->bo_clean;
2306 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
2307 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
2308 bv->bv_cnt--;
2309 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
2310 }
2311
2312 /*
2313 * Add the buffer to the sorted clean or dirty block list.
2314 *
2315 * NOTE: xflags is passed as a constant, optimizing this inline function!
2316 */
2317 static void
2318 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
2319 {
2320 struct bufv *bv;
2321 struct buf *n;
2322 int error;
2323
2324 ASSERT_BO_WLOCKED(bo);
2325 KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
2326 ("buf_vlist_add: bo %p does not allow bufs", bo));
2327 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
2328 ("dead bo %p", bo));
2329 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
2330 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
2331 bp->b_xflags |= xflags;
2332 if (xflags & BX_VNDIRTY)
2333 bv = &bo->bo_dirty;
2334 else
2335 bv = &bo->bo_clean;
2336
2337 /*
2338 * Keep the list ordered. Optimize empty list insertion. Assume
2339 * we tend to grow at the tail so lookup_le should usually be cheaper
2340 * than _ge.
2341 */
2342 if (bv->bv_cnt == 0 ||
2343 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
2344 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
2345 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
2346 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
2347 else
2348 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
2349 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
2350 if (error)
2351 panic("buf_vlist_add: Preallocated nodes insufficient.");
2352 bv->bv_cnt++;
2353 }
2354
2355 /*
2356 * Look up a buffer using the buffer tries.
2357 */
2358 struct buf *
2359 gbincore(struct bufobj *bo, daddr_t lblkno)
2360 {
2361 struct buf *bp;
2362
2363 ASSERT_BO_LOCKED(bo);
2364 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
2365 if (bp != NULL)
2366 return (bp);
2367 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
2368 }
2369
2370 /*
2371 * Look up a buf using the buffer tries, without the bufobj lock. This relies
2372 * on SMR for safe lookup, and bufs being in a no-free zone to provide type
2373 * stability of the result. Like other lockless lookups, the found buf may
2374 * already be invalid by the time this function returns.
2375 */
2376 struct buf *
2377 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno)
2378 {
2379 struct buf *bp;
2380
2381 ASSERT_BO_UNLOCKED(bo);
2382 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno);
2383 if (bp != NULL)
2384 return (bp);
2385 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno));
2386 }
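
/*
 * Example (sketch): since the result may be stale, a lockless consumer is
 * expected to lock the buffer and then revalidate its identity, roughly:
 *
 *	bp = gbincore_unlocked(bo, lblkno);
 *	if (bp != NULL &&
 *	    BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
 *		if (bp->b_bufobj != bo || bp->b_lblkno != lblkno) {
 *			BUF_UNLOCK(bp);
 *			bp = NULL;	... and retry the lookup ...
 *		}
 *	}
 */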
2387
2388 /*
2389 * Associate a buffer with a vnode.
2390 */
2391 void
2392 bgetvp(struct vnode *vp, struct buf *bp)
2393 {
2394 struct bufobj *bo;
2395
2396 bo = &vp->v_bufobj;
2397 ASSERT_BO_WLOCKED(bo);
2398 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
2399
2400 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
2401 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
2402 ("bgetvp: bp already attached! %p", bp));
2403
2404 vhold(vp);
2405 bp->b_vp = vp;
2406 bp->b_bufobj = bo;
2407 /*
2408 * Insert onto list for new vnode.
2409 */
2410 buf_vlist_add(bp, bo, BX_VNCLEAN);
2411 }
2412
2413 /*
2414 * Disassociate a buffer from a vnode.
2415 */
2416 void
2417 brelvp(struct buf *bp)
2418 {
2419 struct bufobj *bo;
2420 struct vnode *vp;
2421
2422 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
2423 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
2424
2425 /*
2426 * Delete from old vnode list, if on one.
2427 */
2428 vp = bp->b_vp; /* XXX */
2429 bo = bp->b_bufobj;
2430 BO_LOCK(bo);
2431 buf_vlist_remove(bp);
2432 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2433 bo->bo_flag &= ~BO_ONWORKLST;
2434 mtx_lock(&sync_mtx);
2435 LIST_REMOVE(bo, bo_synclist);
2436 syncer_worklist_len--;
2437 mtx_unlock(&sync_mtx);
2438 }
2439 bp->b_vp = NULL;
2440 bp->b_bufobj = NULL;
2441 BO_UNLOCK(bo);
2442 vdrop(vp);
2443 }
2444
2445 /*
2446 * Add an item to the syncer work queue.
2447 */
2448 static void
2449 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
2450 {
2451 int slot;
2452
2453 ASSERT_BO_WLOCKED(bo);
2454
2455 mtx_lock(&sync_mtx);
2456 if (bo->bo_flag & BO_ONWORKLST)
2457 LIST_REMOVE(bo, bo_synclist);
2458 else {
2459 bo->bo_flag |= BO_ONWORKLST;
2460 syncer_worklist_len++;
2461 }
2462
2463 if (delay > syncer_maxdelay - 2)
2464 delay = syncer_maxdelay - 2;
2465 slot = (syncer_delayno + delay) & syncer_mask;
2466
2467 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
2468 mtx_unlock(&sync_mtx);
2469 }
2470
2471 static int
2472 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
2473 {
2474 int error, len;
2475
2476 mtx_lock(&sync_mtx);
2477 len = syncer_worklist_len - sync_vnode_count;
2478 mtx_unlock(&sync_mtx);
2479 error = SYSCTL_OUT(req, &len, sizeof(len));
2480 return (error);
2481 }
2482
2483 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len,
2484 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0,
2485 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
2486
2487 static struct proc *updateproc;
2488 static void sched_sync(void);
2489 static struct kproc_desc up_kp = {
2490 "syncer",
2491 sched_sync,
2492 &updateproc
2493 };
2494 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
2495
2496 static int
2497 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
2498 {
2499 struct vnode *vp;
2500 struct mount *mp;
2501
2502 *bo = LIST_FIRST(slp);
2503 if (*bo == NULL)
2504 return (0);
2505 vp = bo2vnode(*bo);
2506 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
2507 return (1);
2508 /*
2509 * We use vhold in case the vnode does not
2510 * successfully sync. vhold prevents the vnode from
2511 * going away when we unlock the sync_mtx so that
2512 * we can acquire the vnode interlock.
2513 */
2514 vholdl(vp);
2515 mtx_unlock(&sync_mtx);
2516 VI_UNLOCK(vp);
2517 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2518 vdrop(vp);
2519 mtx_lock(&sync_mtx);
2520 return (*bo == LIST_FIRST(slp));
2521 }
2522 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2523 (void) VOP_FSYNC(vp, MNT_LAZY, td);
2524 VOP_UNLOCK(vp);
2525 vn_finished_write(mp);
2526 BO_LOCK(*bo);
2527 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
2528 /*
2529 * Put us back on the worklist. The worklist
2530 * routine will remove us from our current
2531 * position and then add us back in at a later
2532 * position.
2533 */
2534 vn_syncer_add_to_worklist(*bo, syncdelay);
2535 }
2536 BO_UNLOCK(*bo);
2537 vdrop(vp);
2538 mtx_lock(&sync_mtx);
2539 return (0);
2540 }
2541
2542 static int first_printf = 1;
2543
2544 /*
2545 * System filesystem synchronizer daemon.
2546 */
2547 static void
2548 sched_sync(void)
2549 {
2550 struct synclist *next, *slp;
2551 struct bufobj *bo;
2552 long starttime;
2553 struct thread *td = curthread;
2554 int last_work_seen;
2555 int net_worklist_len;
2556 int syncer_final_iter;
2557 int error;
2558
2559 last_work_seen = 0;
2560 syncer_final_iter = 0;
2561 syncer_state = SYNCER_RUNNING;
2562 starttime = time_uptime;
2563 td->td_pflags |= TDP_NORUNNINGBUF;
2564
2565 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
2566 SHUTDOWN_PRI_LAST);
2567
2568 mtx_lock(&sync_mtx);
2569 for (;;) {
2570 if (syncer_state == SYNCER_FINAL_DELAY &&
2571 syncer_final_iter == 0) {
2572 mtx_unlock(&sync_mtx);
2573 kproc_suspend_check(td->td_proc);
2574 mtx_lock(&sync_mtx);
2575 }
2576 net_worklist_len = syncer_worklist_len - sync_vnode_count;
2577 if (syncer_state != SYNCER_RUNNING &&
2578 starttime != time_uptime) {
2579 if (first_printf) {
2580 printf("\nSyncing disks, vnodes remaining... ");
2581 first_printf = 0;
2582 }
2583 printf("%d ", net_worklist_len);
2584 }
2585 starttime = time_uptime;
2586
2587 /*
2588 * Push files whose dirty time has expired. Be careful
2589 * of interrupt race on slp queue.
2590 *
2591 * Skip over empty worklist slots when shutting down.
2592 */
2593 do {
2594 slp = &syncer_workitem_pending[syncer_delayno];
2595 syncer_delayno += 1;
2596 if (syncer_delayno == syncer_maxdelay)
2597 syncer_delayno = 0;
2598 next = &syncer_workitem_pending[syncer_delayno];
2599 /*
2600 * If the worklist has wrapped since it
2601 * was emptied of all but syncer vnodes,
2602 * switch to the FINAL_DELAY state and run
2603 * for one more second.
2604 */
2605 if (syncer_state == SYNCER_SHUTTING_DOWN &&
2606 net_worklist_len == 0 &&
2607 last_work_seen == syncer_delayno) {
2608 syncer_state = SYNCER_FINAL_DELAY;
2609 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
2610 }
2611 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
2612 syncer_worklist_len > 0);
2613
2614 /*
2615 * Keep track of the last time there was anything
2616 * on the worklist other than syncer vnodes.
2617 * Return to the SHUTTING_DOWN state if any
2618 * new work appears.
2619 */
2620 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
2621 last_work_seen = syncer_delayno;
2622 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
2623 syncer_state = SYNCER_SHUTTING_DOWN;
2624 while (!LIST_EMPTY(slp)) {
2625 error = sync_vnode(slp, &bo, td);
2626 if (error == 1) {
2627 LIST_REMOVE(bo, bo_synclist);
2628 LIST_INSERT_HEAD(next, bo, bo_synclist);
2629 continue;
2630 }
2631
2632 if (first_printf == 0) {
2633 /*
2634 * Drop the sync mutex, because some watchdog
2635 * drivers need to sleep while patting the watchdog.
2636 */
2637 mtx_unlock(&sync_mtx);
2638 wdog_kern_pat(WD_LASTVAL);
2639 mtx_lock(&sync_mtx);
2640 }
2641 }
2642 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
2643 syncer_final_iter--;
2644 /*
2645 * The variable rushjob allows the kernel to speed up the
2646 * processing of the filesystem syncer process. A rushjob
2647 * value of N tells the filesystem syncer to process the next
2648 * N seconds worth of work on its queue ASAP. Currently rushjob
2649 * is used by the soft update code to speed up the filesystem
2650 * syncer process when the incore state is getting so far
2651 * ahead of the disk that the kernel memory pool is being
2652 * threatened with exhaustion.
2653 */
2654 if (rushjob > 0) {
2655 rushjob -= 1;
2656 continue;
2657 }
2658 /*
2659 * Just sleep for a short period of time between
2660 * iterations when shutting down to allow some I/O
2661 * to happen.
2662 *
2663 * If it has taken us less than a second to process the
2664 * current work, then wait. Otherwise start right over
2665 * again. We can still lose time if any single round
2666 * takes more than two seconds, but it does not really
2667 * matter as we are just trying to generally pace the
2668 * filesystem activity.
2669 */
2670 if (syncer_state != SYNCER_RUNNING ||
2671 time_uptime == starttime) {
2672 thread_lock(td);
2673 sched_prio(td, PPAUSE);
2674 thread_unlock(td);
2675 }
2676 if (syncer_state != SYNCER_RUNNING)
2677 cv_timedwait(&sync_wakeup, &sync_mtx,
2678 hz / SYNCER_SHUTDOWN_SPEEDUP);
2679 else if (time_uptime == starttime)
2680 cv_timedwait(&sync_wakeup, &sync_mtx, hz);
2681 }
2682 }
2683
2684 /*
2685 * Request the syncer daemon to speed up its work.
2686 * We never push it to speed up more than half of its
2687 * normal turn time, otherwise it could take over the cpu.
2688 */
2689 int
2690 speedup_syncer(void)
2691 {
2692 int ret = 0;
2693
2694 mtx_lock(&sync_mtx);
2695 if (rushjob < syncdelay / 2) {
2696 rushjob += 1;
2697 stat_rush_requests += 1;
2698 ret = 1;
2699 }
2700 mtx_unlock(&sync_mtx);
2701 cv_broadcast(&sync_wakeup);
2702 return (ret);
2703 }
2704
2705 /*
2706 * Tell the syncer to speed up its work and run through its work
2707 * list several times, then tell it to shut down.
2708 */
2709 static void
2710 syncer_shutdown(void *arg, int howto)
2711 {
2712
2713 if (howto & RB_NOSYNC)
2714 return;
2715 mtx_lock(&sync_mtx);
2716 syncer_state = SYNCER_SHUTTING_DOWN;
2717 rushjob = 0;
2718 mtx_unlock(&sync_mtx);
2719 cv_broadcast(&sync_wakeup);
2720 kproc_shutdown(arg, howto);
2721 }
2722
2723 void
2724 syncer_suspend(void)
2725 {
2726
2727 syncer_shutdown(updateproc, 0);
2728 }
2729
2730 void
2731 syncer_resume(void)
2732 {
2733
2734 mtx_lock(&sync_mtx);
2735 first_printf = 1;
2736 syncer_state = SYNCER_RUNNING;
2737 mtx_unlock(&sync_mtx);
2738 cv_broadcast(&sync_wakeup);
2739 kproc_resume(updateproc);
2740 }
2741
2742 /*
2743 * Move the buffer between the clean and dirty lists of its vnode.
2744 */
2745 void
2746 reassignbuf(struct buf *bp)
2747 {
2748 struct vnode *vp;
2749 struct bufobj *bo;
2750 int delay;
2751 #ifdef INVARIANTS
2752 struct bufv *bv;
2753 #endif
2754
2755 vp = bp->b_vp;
2756 bo = bp->b_bufobj;
2757
2758 KASSERT((bp->b_flags & B_PAGING) == 0,
2759 ("%s: cannot reassign paging buffer %p", __func__, bp));
2760
2761 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2762 bp, bp->b_vp, bp->b_flags);
2763
2764 BO_LOCK(bo);
2765 buf_vlist_remove(bp);
2766
2767 /*
2768 * If dirty, put on list of dirty buffers; otherwise insert onto list
2769 * of clean buffers.
2770 */
2771 if (bp->b_flags & B_DELWRI) {
2772 if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2773 switch (vp->v_type) {
2774 case VDIR:
2775 delay = dirdelay;
2776 break;
2777 case VCHR:
2778 delay = metadelay;
2779 break;
2780 default:
2781 delay = filedelay;
2782 }
2783 vn_syncer_add_to_worklist(bo, delay);
2784 }
2785 buf_vlist_add(bp, bo, BX_VNDIRTY);
2786 } else {
2787 buf_vlist_add(bp, bo, BX_VNCLEAN);
2788
2789 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2790 mtx_lock(&sync_mtx);
2791 LIST_REMOVE(bo, bo_synclist);
2792 syncer_worklist_len--;
2793 mtx_unlock(&sync_mtx);
2794 bo->bo_flag &= ~BO_ONWORKLST;
2795 }
2796 }
2797 #ifdef INVARIANTS
2798 bv = &bo->bo_clean;
2799 bp = TAILQ_FIRST(&bv->bv_hd);
2800 KASSERT(bp == NULL || bp->b_bufobj == bo,
2801 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2802 bp = TAILQ_LAST(&bv->bv_hd, buflists);
2803 KASSERT(bp == NULL || bp->b_bufobj == bo,
2804 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2805 bv = &bo->bo_dirty;
2806 bp = TAILQ_FIRST(&bv->bv_hd);
2807 KASSERT(bp == NULL || bp->b_bufobj == bo,
2808 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2809 bp = TAILQ_LAST(&bv->bv_hd, buflists);
2810 KASSERT(bp == NULL || bp->b_bufobj == bo,
2811 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2812 #endif
2813 BO_UNLOCK(bo);
2814 }
2815
2816 static void
2817 v_init_counters(struct vnode *vp)
2818 {
2819
2820 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
2821 vp, ("%s called for an initialized vnode", __FUNCTION__));
2822 ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
2823
2824 refcount_init(&vp->v_holdcnt, 1);
2825 refcount_init(&vp->v_usecount, 1);
2826 }
2827
2828 /*
2829 * Grab a particular vnode from the free list, increment its
2830 * reference count and lock it. VIRF_DOOMED is set if the vnode
2831 * is being destroyed. Only callers who specify LK_RETRY will
2832 * see doomed vnodes. If inactive processing was delayed in
2833 * vput try to do it here.
2834 *
2835 * usecount is manipulated using atomics without holding any locks.
2836 *
2837 * holdcnt can be manipulated using atomics without holding any locks,
2838 * except when transitioning 1<->0, in which case the interlock is held.
2839 *
2840 * Consumers which don't guarantee liveness of the vnode can use SMR to
2841 * try to get a reference. Note this operation can fail since the vnode
2842 * may already be awaiting freeing by the time they get to it.
2843 */
2844 enum vgetstate
2845 vget_prep_smr(struct vnode *vp)
2846 {
2847 enum vgetstate vs;
2848
2849 VFS_SMR_ASSERT_ENTERED();
2850
2851 if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2852 vs = VGET_USECOUNT;
2853 } else {
2854 if (vhold_smr(vp))
2855 vs = VGET_HOLDCNT;
2856 else
2857 vs = VGET_NONE;
2858 }
2859 return (vs);
2860 }
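
/*
 * Example (sketch): a lockless consumer, e.g. a name cache lookup, uses
 * this roughly as follows:
 *
 *	vfs_smr_enter();
 *	vp = ...;		... found via an SMR-protected structure ...
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		... the vnode is being freed, fall back to a locked lookup ...
 *	error = vget_finish(vp, LK_SHARED, vs);
 */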
2861
2862 enum vgetstate
2863 vget_prep(struct vnode *vp)
2864 {
2865 enum vgetstate vs;
2866
2867 if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2868 vs = VGET_USECOUNT;
2869 } else {
2870 vhold(vp);
2871 vs = VGET_HOLDCNT;
2872 }
2873 return (vs);
2874 }
2875
2876 void
2877 vget_abort(struct vnode *vp, enum vgetstate vs)
2878 {
2879
2880 switch (vs) {
2881 case VGET_USECOUNT:
2882 vrele(vp);
2883 break;
2884 case VGET_HOLDCNT:
2885 vdrop(vp);
2886 break;
2887 default:
2888 __assert_unreachable();
2889 }
2890 }
2891
2892 int
2893 vget(struct vnode *vp, int flags)
2894 {
2895 enum vgetstate vs;
2896
2897 vs = vget_prep(vp);
2898 return (vget_finish(vp, flags, vs));
2899 }
2900
2901 int
2902 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
2903 {
2904 int error;
2905
2906 if ((flags & LK_INTERLOCK) != 0)
2907 ASSERT_VI_LOCKED(vp, __func__);
2908 else
2909 ASSERT_VI_UNLOCKED(vp, __func__);
2910 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
2911 VNPASS(vp->v_holdcnt > 0, vp);
2912 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
2913
2914 error = vn_lock(vp, flags);
2915 if (__predict_false(error != 0)) {
2916 vget_abort(vp, vs);
2917 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2918 vp);
2919 return (error);
2920 }
2921
2922 vget_finish_ref(vp, vs);
2923 return (0);
2924 }
2925
2926 void
2927 vget_finish_ref(struct vnode *vp, enum vgetstate vs)
2928 {
2929 int old;
2930
2931 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
2932 VNPASS(vp->v_holdcnt > 0, vp);
2933 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
2934
2935 if (vs == VGET_USECOUNT)
2936 return;
2937
2938 /*
2939 * We hold the vnode. If the usecount is 0 it will be utilized to keep
2940 * the vnode around. Otherwise someone else lent their hold count and
2941 * we have to drop ours.
2942 */
2943 old = atomic_fetchadd_int(&vp->v_usecount, 1);
2944 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
2945 if (old != 0) {
2946 #ifdef INVARIANTS
2947 old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
2948 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
2949 #else
2950 refcount_release(&vp->v_holdcnt);
2951 #endif
2952 }
2953 }
2954
2955 void
2956 vref(struct vnode *vp)
2957 {
2958 enum vgetstate vs;
2959
2960 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2961 vs = vget_prep(vp);
2962 vget_finish_ref(vp, vs);
2963 }
2964
2965 void
2966 vrefact(struct vnode *vp)
2967 {
2968
2969 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2970 #ifdef INVARIANTS
2971 int old = atomic_fetchadd_int(&vp->v_usecount, 1);
2972 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
2973 #else
2974 refcount_acquire(&vp->v_usecount);
2975 #endif
2976 }
2977
2978 void
2979 vlazy(struct vnode *vp)
2980 {
2981 struct mount *mp;
2982
2983 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
2984
2985 if ((vp->v_mflag & VMP_LAZYLIST) != 0)
2986 return;
2987 /*
2988 * We may get here for inactive routines after the vnode got doomed.
2989 */
2990 if (VN_IS_DOOMED(vp))
2991 return;
2992 mp = vp->v_mount;
2993 mtx_lock(&mp->mnt_listmtx);
2994 if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
2995 vp->v_mflag |= VMP_LAZYLIST;
2996 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
2997 mp->mnt_lazyvnodelistsize++;
2998 }
2999 mtx_unlock(&mp->mnt_listmtx);
3000 }
3001
3002 /*
3003 * This routine is only meant to be called from vgonel prior to dooming
3004 * the vnode.
3005 */
3006 static void
3007 vunlazy_gone(struct vnode *vp)
3008 {
3009 struct mount *mp;
3010
3011 ASSERT_VOP_ELOCKED(vp, __func__);
3012 ASSERT_VI_LOCKED(vp, __func__);
3013 VNPASS(!VN_IS_DOOMED(vp), vp);
3014
3015 if (vp->v_mflag & VMP_LAZYLIST) {
3016 mp = vp->v_mount;
3017 mtx_lock(&mp->mnt_listmtx);
3018 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
3019 vp->v_mflag &= ~VMP_LAZYLIST;
3020 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
3021 mp->mnt_lazyvnodelistsize--;
3022 mtx_unlock(&mp->mnt_listmtx);
3023 }
3024 }
3025
3026 static void
3027 vdefer_inactive(struct vnode *vp)
3028 {
3029
3030 ASSERT_VI_LOCKED(vp, __func__);
3031 VNASSERT(vp->v_holdcnt > 0, vp,
3032 ("%s: vnode without hold count", __func__));
3033 if (VN_IS_DOOMED(vp)) {
3034 vdropl(vp);
3035 return;
3036 }
3037 if (vp->v_iflag & VI_DEFINACT) {
3038 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
3039 vdropl(vp);
3040 return;
3041 }
3042 if (vp->v_usecount > 0) {
3043 vp->v_iflag &= ~VI_OWEINACT;
3044 vdropl(vp);
3045 return;
3046 }
3047 vlazy(vp);
3048 vp->v_iflag |= VI_DEFINACT;
3049 VI_UNLOCK(vp);
3050 counter_u64_add(deferred_inact, 1);
3051 }
3052
3053 static void
3054 vdefer_inactive_unlocked(struct vnode *vp)
3055 {
3056
3057 VI_LOCK(vp);
3058 if ((vp->v_iflag & VI_OWEINACT) == 0) {
3059 vdropl(vp);
3060 return;
3061 }
3062 vdefer_inactive(vp);
3063 }
3064
3065 enum vput_op { VRELE, VPUT, VUNREF };
3066
3067 /*
3068 * Handle ->v_usecount transitioning to 0.
3069 *
3070 * By releasing the last usecount we take ownership of the hold count which
3071 * provides liveness of the vnode, meaning we have to vdrop.
3072 *
3073 * For all vnodes we may need to perform inactive processing. It requires an
3074 * exclusive lock on the vnode, while it is legal to call here with only a
3075 * shared lock (or no locks). If locking the vnode in an expected manner fails,
3076 * inactive processing gets deferred to the syncer.
3077 *
3078 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend
3079 * on the lock being held all the way until VOP_INACTIVE. This in particular
3080 * happens with UFS which adds half-constructed vnodes to the hash, where they
3081 * can be found by other code.
3082 */
3083 static void
3084 vput_final(struct vnode *vp, enum vput_op func)
3085 {
3086 int error;
3087 bool want_unlock;
3088
3089 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3090 VNPASS(vp->v_holdcnt > 0, vp);
3091
3092 VI_LOCK(vp);
3093
3094 /*
3095 * By the time we got here someone else might have transitioned
3096 * the count back to > 0.
3097 */
3098 if (vp->v_usecount > 0)
3099 goto out;
3100
3101 /*
3102 * If the vnode is doomed vgone already performed inactive processing
3103 * (if needed).
3104 */
3105 if (VN_IS_DOOMED(vp))
3106 goto out;
3107
3108 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0))
3109 goto out;
3110
3111 if (vp->v_iflag & VI_DOINGINACT)
3112 goto out;
3113
3114 /*
3115 * Locking operations here will drop the interlock and possibly the
3116 * vnode lock, opening a window where the vnode can get doomed all the
3117 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to
3118 * perform inactive.
3119 */
3120 vp->v_iflag |= VI_OWEINACT;
3121 want_unlock = false;
3122 error = 0;
3123 switch (func) {
3124 case VRELE:
3125 switch (VOP_ISLOCKED(vp)) {
3126 case LK_EXCLUSIVE:
3127 break;
3128 case LK_EXCLOTHER:
3129 case 0:
3130 want_unlock = true;
3131 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
3132 VI_LOCK(vp);
3133 break;
3134 default:
3135 /*
3136 * The lock has at least one sharer, but we have no way
3137 * to conclude whether this is us. Play it safe and
3138 * defer processing.
3139 */
3140 error = EAGAIN;
3141 break;
3142 }
3143 break;
3144 case VPUT:
3145 want_unlock = true;
3146 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
3147 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
3148 LK_NOWAIT);
3149 VI_LOCK(vp);
3150 }
3151 break;
3152 case VUNREF:
3153 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
3154 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
3155 VI_LOCK(vp);
3156 }
3157 break;
3158 }
3159 if (error == 0) {
3160 vinactive(vp);
3161 if (want_unlock)
3162 VOP_UNLOCK(vp);
3163 vdropl(vp);
3164 } else {
3165 vdefer_inactive(vp);
3166 }
3167 return;
3168 out:
3169 if (func == VPUT)
3170 VOP_UNLOCK(vp);
3171 vdropl(vp);
3172 }
3173
3174 /*
3175 * Decrement ->v_usecount for a vnode.
3176 *
3177 * Releasing the last use count requires additional processing, see vput_final
3178 * above for details.
3179 *
3180 * Comment above each variant denotes lock state on entry and exit.
3181 */
3182
3183 /*
3184 * in: any
3185 * out: same as passed in
3186 */
3187 void
3188 vrele(struct vnode *vp)
3189 {
3190
3191 ASSERT_VI_UNLOCKED(vp, __func__);
3192 if (!refcount_release(&vp->v_usecount))
3193 return;
3194 vput_final(vp, VRELE);
3195 }
3196
3197 /*
3198 * in: locked
3199 * out: unlocked
3200 */
3201 void
3202 vput(struct vnode *vp)
3203 {
3204
3205 ASSERT_VOP_LOCKED(vp, __func__);
3206 ASSERT_VI_UNLOCKED(vp, __func__);
3207 if (!refcount_release(&vp->v_usecount)) {
3208 VOP_UNLOCK(vp);
3209 return;
3210 }
3211 vput_final(vp, VPUT);
3212 }
3213
3214 /*
3215 * in: locked
3216 * out: locked
3217 */
3218 void
3219 vunref(struct vnode *vp)
3220 {
3221
3222 ASSERT_VOP_LOCKED(vp, __func__);
3223 ASSERT_VI_UNLOCKED(vp, __func__);
3224 if (!refcount_release(&vp->v_usecount))
3225 return;
3226 vput_final(vp, VUNREF);
3227 }
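
/*
 * Example (sketch): a caller that obtained both a use reference and the
 * vnode lock, e.g. via vget(vp, LK_EXCLUSIVE), typically releases both in
 * one step:
 *
 *	error = vget(vp, LK_EXCLUSIVE);
 *	if (error == 0) {
 *		... operate on the locked vnode ...
 *		vput(vp);
 *	}
 */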
3228
3229 void
3230 vhold(struct vnode *vp)
3231 {
3232 int old;
3233
3234 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3235 old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3236 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
3237 ("%s: wrong hold count %d", __func__, old));
3238 if (old == 0)
3239 vn_freevnodes_dec();
3240 }
3241
3242 void
3243 vholdnz(struct vnode *vp)
3244 {
3245
3246 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3247 #ifdef INVARIANTS
3248 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3249 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
3250 ("%s: wrong hold count %d", __func__, old));
3251 #else
3252 atomic_add_int(&vp->v_holdcnt, 1);
3253 #endif
3254 }
3255
3256 /*
3257 * Grab a hold count unless the vnode is freed.
3258 *
3259 * Only use this routine if vfs smr is the only protection you have against
3260 * freeing the vnode.
3261 *
3262 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag
3263 * is not set. After the flag is set the vnode becomes immutable to anyone but
3264 * the thread which managed to set the flag.
3265 *
3266 * It may be tempting to replace the loop with:
3267 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1);
3268 * if (count & VHOLD_NO_SMR) {
3269 * backpedal and error out;
3270 * }
3271 *
3272 * However, while this is more performant, it hinders debugging by eliminating
3273 * the previously mentioned invariant.
3274 */
3275 bool
3276 vhold_smr(struct vnode *vp)
3277 {
3278 int count;
3279
3280 VFS_SMR_ASSERT_ENTERED();
3281
3282 count = atomic_load_int(&vp->v_holdcnt);
3283 for (;;) {
3284 if (count & VHOLD_NO_SMR) {
3285 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
3286 ("non-zero hold count with flags %d\n", count));
3287 return (false);
3288 }
3289 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
3290 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
3291 if (count == 0)
3292 vn_freevnodes_dec();
3293 return (true);
3294 }
3295 }
3296 }
3297
3298 /*
3299 * Hold a free vnode for recycling.
3300 *
3301 * Note: vnode_init references this comment.
3302 *
3303 * Attempts to recycle only need the global vnode list lock and have no use for
3304 * SMR.
3305 *
3306 * However, vnodes get inserted into the global list before they get fully
3307 * initialized and stay there until UMA decides to free the memory. This in
3308 * particular means the target can be found before it becomes usable and after
3309 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to
3310 * VHOLD_NO_SMR.
3311 *
3312 * Note: the vnode may gain more references after we transition the count 0->1.
3313 */
3314 static bool
3315 vhold_recycle_free(struct vnode *vp)
3316 {
3317 int count;
3318
3319 mtx_assert(&vnode_list_mtx, MA_OWNED);
3320
3321 count = atomic_load_int(&vp->v_holdcnt);
3322 for (;;) {
3323 if (count & VHOLD_NO_SMR) {
3324 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
3325 ("non-zero hold count with flags %d\n", count));
3326 return (false);
3327 }
3328 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
3329 if (count > 0) {
3330 return (false);
3331 }
3332 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
3333 vn_freevnodes_dec();
3334 return (true);
3335 }
3336 }
3337 }
3338
3339 static void __noinline
3340 vdbatch_process(struct vdbatch *vd)
3341 {
3342 struct vnode *vp;
3343 int i;
3344
3345 mtx_assert(&vd->lock, MA_OWNED);
3346 MPASS(curthread->td_pinned > 0);
3347 MPASS(vd->index == VDBATCH_SIZE);
3348
3349 mtx_lock(&vnode_list_mtx);
3350 critical_enter();
3351 freevnodes += vd->freevnodes;
3352 for (i = 0; i < VDBATCH_SIZE; i++) {
3353 vp = vd->tab[i];
3354 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
3355 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
3356 MPASS(vp->v_dbatchcpu != NOCPU);
3357 vp->v_dbatchcpu = NOCPU;
3358 }
3359 mtx_unlock(&vnode_list_mtx);
3360 vd->freevnodes = 0;
3361 bzero(vd->tab, sizeof(vd->tab));
3362 vd->index = 0;
3363 critical_exit();
3364 }
3365
3366 static void
3367 vdbatch_enqueue(struct vnode *vp)
3368 {
3369 struct vdbatch *vd;
3370
3371 ASSERT_VI_LOCKED(vp, __func__);
3372 VNASSERT(!VN_IS_DOOMED(vp), vp,
3373 ("%s: deferring requeue of a doomed vnode", __func__));
3374
3375 if (vp->v_dbatchcpu != NOCPU) {
3376 VI_UNLOCK(vp);
3377 return;
3378 }
3379
3380 sched_pin();
3381 vd = DPCPU_PTR(vd);
3382 mtx_lock(&vd->lock);
3383 MPASS(vd->index < VDBATCH_SIZE);
3384 MPASS(vd->tab[vd->index] == NULL);
3385 /*
3386 * A hack: we depend on being pinned so that we know what to put in
3387 * ->v_dbatchcpu.
3388 */
3389 vp->v_dbatchcpu = curcpu;
3390 vd->tab[vd->index] = vp;
3391 vd->index++;
3392 VI_UNLOCK(vp);
3393 if (vd->index == VDBATCH_SIZE)
3394 vdbatch_process(vd);
3395 mtx_unlock(&vd->lock);
3396 sched_unpin();
3397 }
3398
3399 /*
3400 * This routine must only be called for vnodes which are about to be
3401 * deallocated. Supporting dequeue for arbitrary vnodes would require
3402 * validating that the locked batch matches.
3403 */
3404 static void
3405 vdbatch_dequeue(struct vnode *vp)
3406 {
3407 struct vdbatch *vd;
3408 int i;
3409 short cpu;
3410
3411 VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
3412 ("%s: called for a used vnode\n", __func__));
3413
3414 cpu = vp->v_dbatchcpu;
3415 if (cpu == NOCPU)
3416 return;
3417
3418 vd = DPCPU_ID_PTR(cpu, vd);
3419 mtx_lock(&vd->lock);
3420 for (i = 0; i < vd->index; i++) {
3421 if (vd->tab[i] != vp)
3422 continue;
3423 vp->v_dbatchcpu = NOCPU;
3424 vd->index--;
3425 vd->tab[i] = vd->tab[vd->index];
3426 vd->tab[vd->index] = NULL;
3427 break;
3428 }
3429 mtx_unlock(&vd->lock);
3430 /*
3431 * Either we dequeued the vnode above or the target CPU beat us to it.
3432 */
3433 MPASS(vp->v_dbatchcpu == NOCPU);
3434 }
3435
3436 /*
3437 * Drop the hold count of the vnode. If this is the last reference to
3438 * the vnode we place it on the free list unless it has been vgone'd
3439 * (marked VIRF_DOOMED) in which case we will free it.
3440 *
3441 * Because the vnode vm object keeps a hold reference on the vnode if
3442 * there is at least one resident non-cached page, the vnode cannot
3443 * leave the active list without the page cleanup done.
3444 */
3445 static void
3446 vdrop_deactivate(struct vnode *vp)
3447 {
3448 struct mount *mp;
3449
3450 ASSERT_VI_LOCKED(vp, __func__);
3451 /*
3452 * Mark a vnode as free: remove it from its active list
3453 * and put it up for recycling on the freelist.
3454 */
3455 VNASSERT(!VN_IS_DOOMED(vp), vp,
3456 ("vdrop: returning doomed vnode"));
3457 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
3458 ("vnode with VI_OWEINACT set"));
3459 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp,
3460 ("vnode with VI_DEFINACT set"));
3461 if (vp->v_mflag & VMP_LAZYLIST) {
3462 mp = vp->v_mount;
3463 mtx_lock(&mp->mnt_listmtx);
3464 VNASSERT(vp->v_mflag & VMP_LAZYLIST, vp, ("lost VMP_LAZYLIST"));
3465 /*
3466 * Don't remove the vnode from the lazy list if another thread
3467 * has increased the hold count. It may have re-enqueued the
3468 * vnode to the lazy list and is now responsible for its
3469 * removal.
3470 */
3471 if (vp->v_holdcnt == 0) {
3472 vp->v_mflag &= ~VMP_LAZYLIST;
3473 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
3474 mp->mnt_lazyvnodelistsize--;
3475 }
3476 mtx_unlock(&mp->mnt_listmtx);
3477 }
3478 vdbatch_enqueue(vp);
3479 }
3480
3481 static void __noinline
3482 vdropl_final(struct vnode *vp)
3483 {
3484
3485 ASSERT_VI_LOCKED(vp, __func__);
3486 VNPASS(VN_IS_DOOMED(vp), vp);
3487 /*
3488 * Set the VHOLD_NO_SMR flag.
3489 *
3490 * We may be racing against vhold_smr. If they win we can just pretend
3491 * we never got this far, they will vdrop later.
3492 */
3493 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
3494 vn_freevnodes_inc();
3495 VI_UNLOCK(vp);
3496 /*
3497 * We lost the aforementioned race. Any subsequent access is
3498 * invalid as they might have managed to vdropl on their own.
3499 */
3500 return;
3501 }
3502 /*
3503 * Don't bump freevnodes as this one is going away.
3504 */
3505 freevnode(vp);
3506 }
3507
3508 void
3509 vdrop(struct vnode *vp)
3510 {
3511
3512 ASSERT_VI_UNLOCKED(vp, __func__);
3513 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3514 if (refcount_release_if_not_last(&vp->v_holdcnt))
3515 return;
3516 VI_LOCK(vp);
3517 vdropl(vp);
3518 }
3519
3520 void
3521 vdropl(struct vnode *vp)
3522 {
3523
3524 ASSERT_VI_LOCKED(vp, __func__);
3525 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3526 if (!refcount_release(&vp->v_holdcnt)) {
3527 VI_UNLOCK(vp);
3528 return;
3529 }
3530 if (!VN_IS_DOOMED(vp)) {
3531 vn_freevnodes_inc();
3532 vdrop_deactivate(vp);
3533 /*
3534 * Also unlocks the interlock. We can't assert on it as we
3535 * released our hold and by now the vnode might have been
3536 * freed.
3537 */
3538 return;
3539 }
3540 vdropl_final(vp);
3541 }
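
/*
 * Example (mirrors sync_vnode() above): code that already has the vnode
 * secured (here via the interlock) and needs to keep it from being freed
 * across a blocking operation takes a hold rather than a use reference:
 *
 *	vholdl(vp);
 *	VI_UNLOCK(vp);
 *	... sleep, perform I/O, etc. ...
 *	vdrop(vp);
 */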
3542
3543 /*
3544 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
3545 * flags. DOINGINACT prevents us from recursing in calls to vinactive.
3546 */
3547 static void
3548 vinactivef(struct vnode *vp)
3549 {
3550 struct vm_object *obj;
3551
3552 ASSERT_VOP_ELOCKED(vp, "vinactive");
3553 ASSERT_VI_LOCKED(vp, "vinactive");
3554 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
3555 ("vinactive: recursed on VI_DOINGINACT"));
3556 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3557 vp->v_iflag |= VI_DOINGINACT;
3558 vp->v_iflag &= ~VI_OWEINACT;
3559 VI_UNLOCK(vp);
3560 /*
3561 * Before moving off the active list, we must be sure that any
3562 * modified pages are converted into the vnode's dirty
3563 * buffers, since these will no longer be checked once the
3564 * vnode is on the inactive list.
3565 *
3566 * The write-out of the dirty pages is asynchronous. At the
3567 * point that VOP_INACTIVE() is called, there could still be
3568 * pending I/O and dirty pages in the object.
3569 */
3570 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
3571 vm_object_mightbedirty(obj)) {
3572 VM_OBJECT_WLOCK(obj);
3573 vm_object_page_clean(obj, 0, 0, 0);
3574 VM_OBJECT_WUNLOCK(obj);
3575 }
3576 VOP_INACTIVE(vp);
3577 VI_LOCK(vp);
3578 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
3579 ("vinactive: lost VI_DOINGINACT"));
3580 vp->v_iflag &= ~VI_DOINGINACT;
3581 }
3582
3583 void
3584 vinactive(struct vnode *vp)
3585 {
3586
3587 ASSERT_VOP_ELOCKED(vp, "vinactive");
3588 ASSERT_VI_LOCKED(vp, "vinactive");
3589 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3590
3591 if ((vp->v_iflag & VI_OWEINACT) == 0)
3592 return;
3593 if (vp->v_iflag & VI_DOINGINACT)
3594 return;
3595 if (vp->v_usecount > 0) {
3596 vp->v_iflag &= ~VI_OWEINACT;
3597 return;
3598 }
3599 vinactivef(vp);
3600 }
3601
3602 /*
3603 * Remove any vnodes in the vnode table belonging to mount point mp.
3604 *
3605 * If FORCECLOSE is not specified, there should not be any active ones,
3606 * return error if any are found (nb: this is a user error, not a
3607 * system error). If FORCECLOSE is specified, detach any active vnodes
3608 * that are found.
3609 *
3610 * If WRITECLOSE is set, only flush out regular file vnodes open for
3611 * writing.
3612 *
3613 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
3614 *
3615 * `rootrefs' specifies the base reference count for the root vnode
3616 * of this filesystem. The root vnode is considered busy if its
3617 * v_usecount exceeds this value. On a successful return, vflush()
3618 * will call vrele() on the root vnode exactly rootrefs times.
3619 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
3620 * be zero.
3621 */
3622 #ifdef DIAGNOSTIC
3623 static int busyprt = 0; /* print out busy vnodes */
3624 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
3625 #endif
3626
3627 int
3628 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
3629 {
3630 struct vnode *vp, *mvp, *rootvp = NULL;
3631 struct vattr vattr;
3632 int busy = 0, error;
3633
3634 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
3635 rootrefs, flags);
3636 if (rootrefs > 0) {
3637 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
3638 ("vflush: bad args"));
3639 /*
3640 * Get the filesystem root vnode. We can vput() it
3641 * immediately, since with rootrefs > 0, it won't go away.
3642 */
3643 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
3644 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
3645 __func__, error);
3646 return (error);
3647 }
3648 vput(rootvp);
3649 }
3650 loop:
3651 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3652 vholdl(vp);
3653 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
3654 if (error) {
3655 vdrop(vp);
3656 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
3657 goto loop;
3658 }
3659 /*
3660 * Skip over vnodes marked VV_SYSTEM.
3661 */
3662 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
3663 VOP_UNLOCK(vp);
3664 vdrop(vp);
3665 continue;
3666 }
3667 /*
3668 * If WRITECLOSE is set, flush out unlinked but still open
3669 * files (even if open only for reading) and regular file
3670 * vnodes open for writing.
3671 */
3672 if (flags & WRITECLOSE) {
3673 if (vp->v_object != NULL) {
3674 VM_OBJECT_WLOCK(vp->v_object);
3675 vm_object_page_clean(vp->v_object, 0, 0, 0);
3676 VM_OBJECT_WUNLOCK(vp->v_object);
3677 }
3678 do {
3679 error = VOP_FSYNC(vp, MNT_WAIT, td);
3680 } while (error == ERELOOKUP);
3681 if (error != 0) {
3682 VOP_UNLOCK(vp);
3683 vdrop(vp);
3684 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
3685 return (error);
3686 }
3687 error = VOP_GETATTR(vp, &vattr, td->td_ucred);
3688 VI_LOCK(vp);
3689
3690 if ((vp->v_type == VNON ||
3691 (error == 0 && vattr.va_nlink > 0)) &&
3692 (vp->v_writecount <= 0 || vp->v_type != VREG)) {
3693 VOP_UNLOCK(vp);
3694 vdropl(vp);
3695 continue;
3696 }
3697 } else
3698 VI_LOCK(vp);
3699 /*
3700 * With v_usecount == 0, all we need to do is clear out the
3701 * vnode data structures and we are done.
3702 *
3703 * If FORCECLOSE is set, forcibly close the vnode.
3704 */
3705 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
3706 vgonel(vp);
3707 } else {
3708 busy++;
3709 #ifdef DIAGNOSTIC
3710 if (busyprt)
3711 vn_printf(vp, "vflush: busy vnode ");
3712 #endif
3713 }
3714 VOP_UNLOCK(vp);
3715 vdropl(vp);
3716 }
3717 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
3718 /*
3719 * If just the root vnode is busy, and if its refcount
3720 * is equal to `rootrefs', then go ahead and kill it.
3721 */
3722 VI_LOCK(rootvp);
3723 KASSERT(busy > 0, ("vflush: not busy"));
3724 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
3725 ("vflush: usecount %d < rootrefs %d",
3726 rootvp->v_usecount, rootrefs));
3727 if (busy == 1 && rootvp->v_usecount == rootrefs) {
3728 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
3729 vgone(rootvp);
3730 VOP_UNLOCK(rootvp);
3731 busy = 0;
3732 } else
3733 VI_UNLOCK(rootvp);
3734 }
3735 if (busy) {
3736 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
3737 busy);
3738 return (EBUSY);
3739 }
3740 for (; rootrefs > 0; rootrefs--)
3741 vrele(rootvp);
3742 return (0);
3743 }
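
/*
 * Example (sketch): a typical VFS_UNMOUNT implementation translates the
 * user's force request and flushes all vnodes before tearing down the
 * mount:
 *
 *	flags = 0;
 *	if (mntflags & MNT_FORCE)
 *		flags |= FORCECLOSE;
 *	error = vflush(mp, 0, flags, curthread);
 *	if (error != 0)
 *		return (error);
 *
 * Filesystems that hold extra references on their root vnode pass that
 * count as rootrefs instead of 0.
 */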
3744
3745 /*
3746 * Recycle an unused vnode to the front of the free list.
3747 */
3748 int
3749 vrecycle(struct vnode *vp)
3750 {
3751 int recycled;
3752
3753 VI_LOCK(vp);
3754 recycled = vrecyclel(vp);
3755 VI_UNLOCK(vp);
3756 return (recycled);
3757 }
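
/*
 * Example (sketch, np is the filesystem's private per-vnode data): an
 * inactive routine that decides the node is not worth caching, e.g.
 * because its link count dropped to zero, can request immediate
 * reclamation, similar in spirit to what UFS does:
 *
 *	if (np->n_nlink == 0)
 *		vrecycle(vp);
 */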
3758
3759 /*
3760 * vrecycle, with the vp interlock held.
3761 */
3762 int
3763 vrecyclel(struct vnode *vp)
3764 {
3765 int recycled;
3766
3767 ASSERT_VOP_ELOCKED(vp, __func__);
3768 ASSERT_VI_LOCKED(vp, __func__);
3769 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3770 recycled = 0;
3771 if (vp->v_usecount == 0) {
3772 recycled = 1;
3773 vgonel(vp);
3774 }
3775 return (recycled);
3776 }
3777
3778 /*
3779 * Eliminate all activity associated with a vnode
3780 * in preparation for reuse.
3781 */
3782 void
3783 vgone(struct vnode *vp)
3784 {
3785 VI_LOCK(vp);
3786 vgonel(vp);
3787 VI_UNLOCK(vp);
3788 }
3789
3790 static void
3791 notify_lowervp_vfs_dummy(struct mount *mp __unused,
3792 struct vnode *lowervp __unused)
3793 {
3794 }
3795
3796 /*
3797 * Notify upper mounts about reclaimed or unlinked vnode.
3798 */
3799 void
3800 vfs_notify_upper(struct vnode *vp, int event)
3801 {
3802 static struct vfsops vgonel_vfsops = {
3803 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
3804 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
3805 };
3806 struct mount *mp, *ump, *mmp;
3807
3808 mp = vp->v_mount;
3809 if (mp == NULL)
3810 return;
3811 if (TAILQ_EMPTY(&mp->mnt_uppers))
3812 return;
3813
3814 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
3815 mmp->mnt_op = &vgonel_vfsops;
3816 mmp->mnt_kern_flag |= MNTK_MARKER;
3817 MNT_ILOCK(mp);
3818 mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
3819 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
3820 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
3821 ump = TAILQ_NEXT(ump, mnt_upper_link);
3822 continue;
3823 }
3824 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
3825 MNT_IUNLOCK(mp);
3826 switch (event) {
3827 case VFS_NOTIFY_UPPER_RECLAIM:
3828 VFS_RECLAIM_LOWERVP(ump, vp);
3829 break;
3830 case VFS_NOTIFY_UPPER_UNLINK:
3831 VFS_UNLINK_LOWERVP(ump, vp);
3832 break;
3833 default:
3834 KASSERT(0, ("invalid event %d", event));
3835 break;
3836 }
3837 MNT_ILOCK(mp);
3838 ump = TAILQ_NEXT(mmp, mnt_upper_link);
3839 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
3840 }
3841 free(mmp, M_TEMP);
3842 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
3843 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
3844 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
3845 wakeup(&mp->mnt_uppers);
3846 }
3847 MNT_IUNLOCK(mp);
3848 }
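
/*
 * Consumer sketch (not compiled): a stacking filesystem holding references
 * to lower vnodes can register the callbacks invoked above in its vfsops;
 * vfs_notify_upper() will then call them whenever a lower vnode it sits on
 * is reclaimed or unlinked.  "stackfs" and the handler bodies are
 * hypothetical; the vfsops field names are the ones used above.
 */
#if 0
static void
stackfs_reclaim_lowervp(struct mount *mp, struct vnode *lowervp)
{
	/* Purge the upper vnode that caches this lower vnode. */
}

static void
stackfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp)
{
	/* Invalidate name cache entries pointing at lowervp. */
}

static struct vfsops stackfs_vfsops = {
	/* ... mount, unmount, root and friends ... */
	.vfs_reclaim_lowervp =	stackfs_reclaim_lowervp,
	.vfs_unlink_lowervp =	stackfs_unlink_lowervp,
};
#endif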
3849
3850 /*
3851 * vgone, with the vp interlock held.
3852 */
3853 static void
3854 vgonel(struct vnode *vp)
3855 {
3856 struct thread *td;
3857 struct mount *mp;
3858 vm_object_t object;
3859 bool active, doinginact, oweinact;
3860
3861 ASSERT_VOP_ELOCKED(vp, "vgonel");
3862 ASSERT_VI_LOCKED(vp, "vgonel");
3863 VNASSERT(vp->v_holdcnt, vp,
3864 ("vgonel: vp %p has no reference.", vp));
3865 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3866 td = curthread;
3867
3868 /*
3869 * Don't vgonel if we're already doomed.
3870 */
3871 if (vp->v_irflag & VIRF_DOOMED)
3872 return;
3873 /*
3874 * Paired with freevnode.
3875 */
3876 vn_seqc_write_begin_locked(vp);
3877 vunlazy_gone(vp);
3878 vp->v_irflag |= VIRF_DOOMED;
3879
3880 /*
3881 * Check to see if the vnode is in use. If so, we have to
3882 * call VOP_CLOSE() and VOP_INACTIVE().
3883 *
3884 * It could be that VOP_INACTIVE() requested reclamation, in
3885 * which case we should avoid recursion, so check
3886 * VI_DOINGINACT. This is not precise but good enough.
3887 */
3888 active = vp->v_usecount > 0;
3889 oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
3890 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0;
3891
3892 /*
3893 * If we need to do inactive, VI_OWEINACT will be set.
3894 */
3895 if (vp->v_iflag & VI_DEFINACT) {
3896 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
3897 vp->v_iflag &= ~VI_DEFINACT;
3898 vdropl(vp);
3899 } else {
3900 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count"));
3901 VI_UNLOCK(vp);
3902 }
3903 cache_purge_vgone(vp);
3904 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
3905
3906 /*
3907 * If purging an active vnode, it must be closed and
3908 * deactivated before being reclaimed.
3909 */
3910 if (active)
3911 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
3912 if ((oweinact || active) && !doinginact) {
3913 VI_LOCK(vp);
3914 vinactivef(vp);
3915 VI_UNLOCK(vp);
3916 }
3917 if (vp->v_type == VSOCK)
3918 vfs_unp_reclaim(vp);
3919
3920 /*
3921 * Clean out any buffers associated with the vnode.
3922 * If the flush fails, just toss the buffers.
3923 */
3924 mp = NULL;
3925 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
3926 (void) vn_start_secondary_write(vp, &mp, V_WAIT);
3927 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
3928 while (vinvalbuf(vp, 0, 0, 0) != 0)
3929 ;
3930 }
3931
3932 BO_LOCK(&vp->v_bufobj);
3933 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
3934 vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
3935 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
3936 vp->v_bufobj.bo_clean.bv_cnt == 0,
3937 ("vp %p bufobj not invalidated", vp));
3938
3939 /*
3940 * For VMIO bufobj, BO_DEAD is set later, or in
3941 * vm_object_terminate() after the object's page queue is
3942 * flushed.
3943 */
3944 object = vp->v_bufobj.bo_object;
3945 if (object == NULL)
3946 vp->v_bufobj.bo_flag |= BO_DEAD;
3947 BO_UNLOCK(&vp->v_bufobj);
3948
3949 /*
3950 * Handle the VM part. Tmpfs handles v_object on its own (the
3951 * OBJT_VNODE check). Nullfs or other bypassing filesystems
3952 * should not touch the object borrowed from the lower vnode
3953 * (the handle check).
3954 */
3955 if (object != NULL && object->type == OBJT_VNODE &&
3956 object->handle == vp)
3957 vnode_destroy_vobject(vp);
3958
3959 /*
3960 * Reclaim the vnode.
3961 */
3962 if (VOP_RECLAIM(vp))
3963 panic("vgone: cannot reclaim");
3964 if (mp != NULL)
3965 vn_finished_secondary_write(mp);
3966 VNASSERT(vp->v_object == NULL, vp,
3967 ("vop_reclaim left v_object vp=%p", vp));
3968 /*
3969 * Clear the advisory locks and wake up waiting threads.
3970 */
3971 (void)VOP_ADVLOCKPURGE(vp);
3972 vp->v_lockf = NULL;
3973 /*
3974 * Delete from old mount point vnode list.
3975 */
3976 delmntque(vp);
3977 /*
3978 * Done with purge, reset to the standard lock and invalidate
3979 * the vnode.
3980 */
3981 VI_LOCK(vp);
3982 vp->v_vnlock = &vp->v_lock;
3983 vp->v_op = &dead_vnodeops;
3984 vp->v_type = VBAD;
3985 }
3986
3987 /*
3988 * Print out a description of a vnode.
3989 */
3990 static const char * const typename[] =
3991 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
3992 "VMARKER"};
3993
3994 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
3995 "new hold count flag not added to vn_printf");
3996
3997 void
3998 vn_printf(struct vnode *vp, const char *fmt, ...)
3999 {
4000 va_list ap;
4001 char buf[256], buf2[16];
4002 u_long flags;
4003 u_int holdcnt;
4004
4005 va_start(ap, fmt);
4006 vprintf(fmt, ap);
4007 va_end(ap);
4008 printf("%p: ", (void *)vp);
4009 printf("type %s\n", typename[vp->v_type]);
4010 holdcnt = atomic_load_int(&vp->v_holdcnt);
4011 printf(" usecount %d, writecount %d, refcount %d seqc users %d",
4012 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
4013 vp->v_seqc_users);
4014 switch (vp->v_type) {
4015 case VDIR:
4016 printf(" mountedhere %p\n", vp->v_mountedhere);
4017 break;
4018 case VCHR:
4019 printf(" rdev %p\n", vp->v_rdev);
4020 break;
4021 case VSOCK:
4022 printf(" socket %p\n", vp->v_unpcb);
4023 break;
4024 case VFIFO:
4025 printf(" fifoinfo %p\n", vp->v_fifoinfo);
4026 break;
4027 default:
4028 printf("\n");
4029 break;
4030 }
4031 buf[0] = '\0';
4032 buf[1] = '\0';
4033 if (holdcnt & VHOLD_NO_SMR)
4034 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
4035 printf(" hold count flags (%s)\n", buf + 1);
4036
4037 buf[0] = '\0';
4038 buf[1] = '\0';
4039 if (vp->v_irflag & VIRF_DOOMED)
4040 strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
4041 if (vp->v_irflag & VIRF_PGREAD)
4042 strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
4043 flags = vp->v_irflag & ~(VIRF_DOOMED | VIRF_PGREAD);
4044 if (flags != 0) {
4045 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
4046 strlcat(buf, buf2, sizeof(buf));
4047 }
4048 if (vp->v_vflag & VV_ROOT)
4049 strlcat(buf, "|VV_ROOT", sizeof(buf));
4050 if (vp->v_vflag & VV_ISTTY)
4051 strlcat(buf, "|VV_ISTTY", sizeof(buf));
4052 if (vp->v_vflag & VV_NOSYNC)
4053 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
4054 if (vp->v_vflag & VV_ETERNALDEV)
4055 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
4056 if (vp->v_vflag & VV_CACHEDLABEL)
4057 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
4058 if (vp->v_vflag & VV_VMSIZEVNLOCK)
4059 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
4060 if (vp->v_vflag & VV_COPYONWRITE)
4061 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
4062 if (vp->v_vflag & VV_SYSTEM)
4063 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
4064 if (vp->v_vflag & VV_PROCDEP)
4065 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
4066 if (vp->v_vflag & VV_NOKNOTE)
4067 strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
4068 if (vp->v_vflag & VV_DELETED)
4069 strlcat(buf, "|VV_DELETED", sizeof(buf));
4070 if (vp->v_vflag & VV_MD)
4071 strlcat(buf, "|VV_MD", sizeof(buf));
4072 if (vp->v_vflag & VV_FORCEINSMQ)
4073 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
4074 if (vp->v_vflag & VV_READLINK)
4075 strlcat(buf, "|VV_READLINK", sizeof(buf));
4076 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
4077 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
4078 VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ |
4079 VV_READLINK);
4080 if (flags != 0) {
4081 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
4082 strlcat(buf, buf2, sizeof(buf));
4083 }
4084 if (vp->v_iflag & VI_TEXT_REF)
4085 strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
4086 if (vp->v_iflag & VI_MOUNT)
4087 strlcat(buf, "|VI_MOUNT", sizeof(buf));
4088 if (vp->v_iflag & VI_DOINGINACT)
4089 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
4090 if (vp->v_iflag & VI_OWEINACT)
4091 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
4092 if (vp->v_iflag & VI_DEFINACT)
4093 strlcat(buf, "|VI_DEFINACT", sizeof(buf));
4094 flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT |
4095 VI_OWEINACT | VI_DEFINACT);
4096 if (flags != 0) {
4097 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
4098 strlcat(buf, buf2, sizeof(buf));
4099 }
4100 if (vp->v_mflag & VMP_LAZYLIST)
4101 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
4102 flags = vp->v_mflag & ~(VMP_LAZYLIST);
4103 if (flags != 0) {
4104 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
4105 strlcat(buf, buf2, sizeof(buf));
4106 }
4107 printf(" flags (%s)", buf + 1);
4108 if (mtx_owned(VI_MTX(vp)))
4109 printf(" VI_LOCKed");
4110 printf("\n");
4111 if (vp->v_object != NULL)
4112 printf(" v_object %p ref %d pages %d "
4113 "cleanbuf %d dirtybuf %d\n",
4114 vp->v_object, vp->v_object->ref_count,
4115 vp->v_object->resident_page_count,
4116 vp->v_bufobj.bo_clean.bv_cnt,
4117 vp->v_bufobj.bo_dirty.bv_cnt);
4118 printf(" ");
4119 lockmgr_printinfo(vp->v_vnlock);
4120 if (vp->v_data != NULL)
4121 VOP_PRINT(vp);
4122 }
4123
4124 #ifdef DDB
4125 /*
4126 * List all of the locked vnodes in the system.
4127 * Called when debugging the kernel.
4128 */
4129 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
4130 {
4131 struct mount *mp;
4132 struct vnode *vp;
4133
4134 /*
4135 * Note: because this is DDB, we can't obey the locking semantics
4136 * for these structures, which means we could catch an inconsistent
4137 * state and dereference a nasty pointer. Not much to be done
4138 * about that.
4139 */
4140 db_printf("Locked vnodes\n");
4141 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4142 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4143 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
4144 vn_printf(vp, "vnode ");
4145 }
4146 }
4147 }
4148
4149 /*
4150 * Show details about the given vnode.
4151 */
4152 DB_SHOW_COMMAND(vnode, db_show_vnode)
4153 {
4154 struct vnode *vp;
4155
4156 if (!have_addr)
4157 return;
4158 vp = (struct vnode *)addr;
4159 vn_printf(vp, "vnode ");
4160 }
4161
4162 /*
4163 * Show details about the given mount point.
4164 */
4165 DB_SHOW_COMMAND(mount, db_show_mount)
4166 {
4167 struct mount *mp;
4168 struct vfsopt *opt;
4169 struct statfs *sp;
4170 struct vnode *vp;
4171 char buf[512];
4172 uint64_t mflags;
4173 u_int flags;
4174
4175 if (!have_addr) {
4176 /* No address given, print short info about all mount points. */
4177 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4178 db_printf("%p %s on %s (%s)\n", mp,
4179 mp->mnt_stat.f_mntfromname,
4180 mp->mnt_stat.f_mntonname,
4181 mp->mnt_stat.f_fstypename);
4182 if (db_pager_quit)
4183 break;
4184 }
4185 db_printf("\nMore info: show mount <addr>\n");
4186 return;
4187 }
4188
4189 mp = (struct mount *)addr;
4190 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
4191 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
4192
4193 buf[0] = '\0';
4194 mflags = mp->mnt_flag;
4195 #define MNT_FLAG(flag) do { \
4196 if (mflags & (flag)) { \
4197 if (buf[0] != '\0') \
4198 strlcat(buf, ", ", sizeof(buf)); \
4199 strlcat(buf, (#flag) + 4, sizeof(buf)); \
4200 mflags &= ~(flag); \
4201 } \
4202 } while (0)
4203 MNT_FLAG(MNT_RDONLY);
4204 MNT_FLAG(MNT_SYNCHRONOUS);
4205 MNT_FLAG(MNT_NOEXEC);
4206 MNT_FLAG(MNT_NOSUID);
4207 MNT_FLAG(MNT_NFS4ACLS);
4208 MNT_FLAG(MNT_UNION);
4209 MNT_FLAG(MNT_ASYNC);
4210 MNT_FLAG(MNT_SUIDDIR);
4211 MNT_FLAG(MNT_SOFTDEP);
4212 MNT_FLAG(MNT_NOSYMFOLLOW);
4213 MNT_FLAG(MNT_GJOURNAL);
4214 MNT_FLAG(MNT_MULTILABEL);
4215 MNT_FLAG(MNT_ACLS);
4216 MNT_FLAG(MNT_NOATIME);
4217 MNT_FLAG(MNT_NOCLUSTERR);
4218 MNT_FLAG(MNT_NOCLUSTERW);
4219 MNT_FLAG(MNT_SUJ);
4220 MNT_FLAG(MNT_EXRDONLY);
4221 MNT_FLAG(MNT_EXPORTED);
4222 MNT_FLAG(MNT_DEFEXPORTED);
4223 MNT_FLAG(MNT_EXPORTANON);
4224 MNT_FLAG(MNT_EXKERB);
4225 MNT_FLAG(MNT_EXPUBLIC);
4226 MNT_FLAG(MNT_LOCAL);
4227 MNT_FLAG(MNT_QUOTA);
4228 MNT_FLAG(MNT_ROOTFS);
4229 MNT_FLAG(MNT_USER);
4230 MNT_FLAG(MNT_IGNORE);
4231 MNT_FLAG(MNT_UPDATE);
4232 MNT_FLAG(MNT_DELEXPORT);
4233 MNT_FLAG(MNT_RELOAD);
4234 MNT_FLAG(MNT_FORCE);
4235 MNT_FLAG(MNT_SNAPSHOT);
4236 MNT_FLAG(MNT_BYFSID);
4237 #undef MNT_FLAG
4238 if (mflags != 0) {
4239 if (buf[0] != '\0')
4240 strlcat(buf, ", ", sizeof(buf));
4241 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
4242 "0x%016jx", mflags);
4243 }
4244 db_printf(" mnt_flag = %s\n", buf);
4245
4246 buf[0] = '\0';
4247 flags = mp->mnt_kern_flag;
4248 #define MNT_KERN_FLAG(flag) do { \
4249 if (flags & (flag)) { \
4250 if (buf[0] != '\0') \
4251 strlcat(buf, ", ", sizeof(buf)); \
4252 strlcat(buf, (#flag) + 5, sizeof(buf)); \
4253 flags &= ~(flag); \
4254 } \
4255 } while (0)
4256 MNT_KERN_FLAG(MNTK_UNMOUNTF);
4257 MNT_KERN_FLAG(MNTK_ASYNC);
4258 MNT_KERN_FLAG(MNTK_SOFTDEP);
4259 MNT_KERN_FLAG(MNTK_DRAINING);
4260 MNT_KERN_FLAG(MNTK_REFEXPIRE);
4261 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
4262 MNT_KERN_FLAG(MNTK_SHARED_WRITES);
4263 MNT_KERN_FLAG(MNTK_NO_IOPF);
4264 MNT_KERN_FLAG(MNTK_VGONE_UPPER);
4265 MNT_KERN_FLAG(MNTK_VGONE_WAITER);
4266 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
4267 MNT_KERN_FLAG(MNTK_MARKER);
4268 MNT_KERN_FLAG(MNTK_USES_BCACHE);
4269 MNT_KERN_FLAG(MNTK_FPLOOKUP);
4270 MNT_KERN_FLAG(MNTK_NOASYNC);
4271 MNT_KERN_FLAG(MNTK_UNMOUNT);
4272 MNT_KERN_FLAG(MNTK_MWAIT);
4273 MNT_KERN_FLAG(MNTK_SUSPEND);
4274 MNT_KERN_FLAG(MNTK_SUSPEND2);
4275 MNT_KERN_FLAG(MNTK_SUSPENDED);
4276 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
4277 MNT_KERN_FLAG(MNTK_NOKNOTE);
4278 #undef MNT_KERN_FLAG
4279 if (flags != 0) {
4280 if (buf[0] != '\0')
4281 strlcat(buf, ", ", sizeof(buf));
4282 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
4283 "0x%08x", flags);
4284 }
4285 db_printf(" mnt_kern_flag = %s\n", buf);
4286
4287 db_printf(" mnt_opt = ");
4288 opt = TAILQ_FIRST(mp->mnt_opt);
4289 if (opt != NULL) {
4290 db_printf("%s", opt->name);
4291 opt = TAILQ_NEXT(opt, link);
4292 while (opt != NULL) {
4293 db_printf(", %s", opt->name);
4294 opt = TAILQ_NEXT(opt, link);
4295 }
4296 }
4297 db_printf("\n");
4298
4299 sp = &mp->mnt_stat;
4300 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
4301 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
4302 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
4303 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
4304 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
4305 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
4306 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
4307 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
4308 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
4309 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
4310 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
4311 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
4312
4313 db_printf(" mnt_cred = { uid=%u ruid=%u",
4314 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
4315 if (jailed(mp->mnt_cred))
4316 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
4317 db_printf(" }\n");
4318 db_printf(" mnt_ref = %d (with %d in the struct)\n",
4319 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
4320 db_printf(" mnt_gen = %d\n", mp->mnt_gen);
4321 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
4322 db_printf(" mnt_lazyvnodelistsize = %d\n",
4323 mp->mnt_lazyvnodelistsize);
4324 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n",
4325 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
4326 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
4327 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
4328 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
4329 db_printf(" mnt_lockref = %d (with %d in the struct)\n",
4330 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
4331 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
4332 db_printf(" mnt_secondary_accwrites = %d\n",
4333 mp->mnt_secondary_accwrites);
4334 db_printf(" mnt_gjprovider = %s\n",
4335 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
4336 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
4337
4338 db_printf("\n\nList of active vnodes\n");
4339 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4340 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) {
4341 vn_printf(vp, "vnode ");
4342 if (db_pager_quit)
4343 break;
4344 }
4345 }
4346 db_printf("\n\nList of inactive vnodes\n");
4347 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4348 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) {
4349 vn_printf(vp, "vnode ");
4350 if (db_pager_quit)
4351 break;
4352 }
4353 }
4354 }
4355 #endif /* DDB */
4356
4357 /*
4358 * Fill in a struct xvfsconf based on a struct vfsconf.
4359 */
4360 static int
4361 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
4362 {
4363 struct xvfsconf xvfsp;
4364
4365 bzero(&xvfsp, sizeof(xvfsp));
4366 strcpy(xvfsp.vfc_name, vfsp->vfc_name);
4367 xvfsp.vfc_typenum = vfsp->vfc_typenum;
4368 xvfsp.vfc_refcount = vfsp->vfc_refcount;
4369 xvfsp.vfc_flags = vfsp->vfc_flags;
4370 /*
4371 * These are unused in userland; we keep them
4372 * to avoid breaking binary compatibility.
4373 */
4374 xvfsp.vfc_vfsops = NULL;
4375 xvfsp.vfc_next = NULL;
4376 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
4377 }
4378
4379 #ifdef COMPAT_FREEBSD32
4380 struct xvfsconf32 {
4381 uint32_t vfc_vfsops;
4382 char vfc_name[MFSNAMELEN];
4383 int32_t vfc_typenum;
4384 int32_t vfc_refcount;
4385 int32_t vfc_flags;
4386 uint32_t vfc_next;
4387 };
4388
4389 static int
4390 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
4391 {
4392 struct xvfsconf32 xvfsp;
4393
4394 bzero(&xvfsp, sizeof(xvfsp));
4395 strcpy(xvfsp.vfc_name, vfsp->vfc_name);
4396 xvfsp.vfc_typenum = vfsp->vfc_typenum;
4397 xvfsp.vfc_refcount = vfsp->vfc_refcount;
4398 xvfsp.vfc_flags = vfsp->vfc_flags;
4399 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
4400 }
4401 #endif
4402
4403 /*
4404 * Top level filesystem related information gathering.
4405 */
4406 static int
4407 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
4408 {
4409 struct vfsconf *vfsp;
4410 int error;
4411
4412 error = 0;
4413 vfsconf_slock();
4414 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4415 #ifdef COMPAT_FREEBSD32
4416 if (req->flags & SCTL_MASK32)
4417 error = vfsconf2x32(req, vfsp);
4418 else
4419 #endif
4420 error = vfsconf2x(req, vfsp);
4421 if (error)
4422 break;
4423 }
4424 vfsconf_sunlock();
4425 return (error);
4426 }
4427
4428 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
4429 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
4430 "S,xvfsconf", "List of all configured filesystems");
4431
4432 #ifndef BURN_BRIDGES
4433 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
4434
4435 static int
4436 vfs_sysctl(SYSCTL_HANDLER_ARGS)
4437 {
4438 int *name = (int *)arg1 - 1; /* XXX */
4439 u_int namelen = arg2 + 1; /* XXX */
4440 struct vfsconf *vfsp;
4441
4442 log(LOG_WARNING, "userland calling deprecated sysctl, "
4443 "please rebuild world\n");
4444
4445 #if 1 || defined(COMPAT_PRELITE2)
4446 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
4447 if (namelen == 1)
4448 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
4449 #endif
4450
4451 switch (name[1]) {
4452 case VFS_MAXTYPENUM:
4453 if (namelen != 2)
4454 return (ENOTDIR);
4455 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
4456 case VFS_CONF:
4457 if (namelen != 3)
4458 return (ENOTDIR); /* overloaded */
4459 vfsconf_slock();
4460 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4461 if (vfsp->vfc_typenum == name[2])
4462 break;
4463 }
4464 vfsconf_sunlock();
4465 if (vfsp == NULL)
4466 return (EOPNOTSUPP);
4467 #ifdef COMPAT_FREEBSD32
4468 if (req->flags & SCTL_MASK32)
4469 return (vfsconf2x32(req, vfsp));
4470 else
4471 #endif
4472 return (vfsconf2x(req, vfsp));
4473 }
4474 return (EOPNOTSUPP);
4475 }
4476
4477 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
4478 CTLFLAG_MPSAFE, vfs_sysctl,
4479 "Generic filesystem");
4480
4481 #if 1 || defined(COMPAT_PRELITE2)
4482
4483 static int
4484 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
4485 {
4486 int error;
4487 struct vfsconf *vfsp;
4488 struct ovfsconf ovfs;
4489
4490 vfsconf_slock();
4491 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
4492 bzero(&ovfs, sizeof(ovfs));
4493 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
4494 strcpy(ovfs.vfc_name, vfsp->vfc_name);
4495 ovfs.vfc_index = vfsp->vfc_typenum;
4496 ovfs.vfc_refcount = vfsp->vfc_refcount;
4497 ovfs.vfc_flags = vfsp->vfc_flags;
4498 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
4499 if (error != 0) {
4500 vfsconf_sunlock();
4501 return (error);
4502 }
4503 }
4504 vfsconf_sunlock();
4505 return (0);
4506 }
4507
4508 #endif /* 1 || COMPAT_PRELITE2 */
4509 #endif /* !BURN_BRIDGES */
4510
4511 #define KINFO_VNODESLOP 10
4512 #ifdef notyet
4513 /*
4514 * Dump vnode list (via sysctl).
4515 */
4516 /* ARGSUSED */
4517 static int
4518 sysctl_vnode(SYSCTL_HANDLER_ARGS)
4519 {
4520 struct xvnode *xvn;
4521 struct mount *mp;
4522 struct vnode *vp;
4523 int error, len, n;
4524
4525 /*
4526 * Stale numvnodes access is not fatal here.
4527 */
4528 req->lock = 0;
4529 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
4530 if (!req->oldptr)
4531 /* Make an estimate */
4532 return (SYSCTL_OUT(req, 0, len));
4533
4534 error = sysctl_wire_old_buffer(req, 0);
4535 if (error != 0)
4536 return (error);
4537 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
4538 n = 0;
4539 mtx_lock(&mountlist_mtx);
4540 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
4541 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
4542 continue;
4543 MNT_ILOCK(mp);
4544 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4545 if (n == len)
4546 break;
4547 vref(vp);
4548 xvn[n].xv_size = sizeof *xvn;
4549 xvn[n].xv_vnode = vp;
4550 xvn[n].xv_id = 0; /* XXX compat */
4551 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
4552 XV_COPY(usecount);
4553 XV_COPY(writecount);
4554 XV_COPY(holdcnt);
4555 XV_COPY(mount);
4556 XV_COPY(numoutput);
4557 XV_COPY(type);
4558 #undef XV_COPY
4559 xvn[n].xv_flag = vp->v_vflag;
4560
4561 switch (vp->v_type) {
4562 case VREG:
4563 case VDIR:
4564 case VLNK:
4565 break;
4566 case VBLK:
4567 case VCHR:
4568 if (vp->v_rdev == NULL) {
4569 vrele(vp);
4570 continue;
4571 }
4572 xvn[n].xv_dev = dev2udev(vp->v_rdev);
4573 break;
4574 case VSOCK:
4575 xvn[n].xv_socket = vp->v_socket;
4576 break;
4577 case VFIFO:
4578 xvn[n].xv_fifo = vp->v_fifoinfo;
4579 break;
4580 case VNON:
4581 case VBAD:
4582 default:
4583 /* shouldn't happen? */
4584 vrele(vp);
4585 continue;
4586 }
4587 vrele(vp);
4588 ++n;
4589 }
4590 MNT_IUNLOCK(mp);
4591 mtx_lock(&mountlist_mtx);
4592 vfs_unbusy(mp);
4593 if (n == len)
4594 break;
4595 }
4596 mtx_unlock(&mountlist_mtx);
4597
4598 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
4599 free(xvn, M_TEMP);
4600 return (error);
4601 }
4602
4603 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
4604 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
4605 "");
4606 #endif
4607
4608 static void
4609 unmount_or_warn(struct mount *mp)
4610 {
4611 int error;
4612
4613 error = dounmount(mp, MNT_FORCE, curthread);
4614 if (error != 0) {
4615 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
4616 if (error == EBUSY)
4617 printf("BUSY)\n");
4618 else
4619 printf("%d)\n", error);
4620 }
4621 }
4622
4623 /*
4624 * Unmount all filesystems. The list is traversed in reverse order
4625 * of mounting to avoid dependencies.
4626 */
4627 void
4628 vfs_unmountall(void)
4629 {
4630 struct mount *mp, *tmp;
4631
4632 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
4633
4634 /*
4635 * Since this only runs when rebooting, it is not interlocked.
4636 */
4637 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
4638 vfs_ref(mp);
4639
4640 /*
4641 * Forcibly unmounting "/dev" before "/" would prevent clean
4642 * unmount of the latter.
4643 */
4644 if (mp == rootdevmp)
4645 continue;
4646
4647 unmount_or_warn(mp);
4648 }
4649
4650 if (rootdevmp != NULL)
4651 unmount_or_warn(rootdevmp);
4652 }
4653
4654 static void
4655 vfs_deferred_inactive(struct vnode *vp, int lkflags)
4656 {
4657
4658 ASSERT_VI_LOCKED(vp, __func__);
4659 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set"));
4660 if ((vp->v_iflag & VI_OWEINACT) == 0) {
4661 vdropl(vp);
4662 return;
4663 }
4664 if (vn_lock(vp, lkflags) == 0) {
4665 VI_LOCK(vp);
4666 vinactive(vp);
4667 VOP_UNLOCK(vp);
4668 vdropl(vp);
4669 return;
4670 }
4671 vdefer_inactive_unlocked(vp);
4672 }
4673
4674 static int
4675 vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
4676 {
4677
4678 return (vp->v_iflag & VI_DEFINACT);
4679 }
4680
4681 static void __noinline
4682 vfs_periodic_inactive(struct mount *mp, int flags)
4683 {
4684 struct vnode *vp, *mvp;
4685 int lkflags;
4686
4687 lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
4688 if (flags != MNT_WAIT)
4689 lkflags |= LK_NOWAIT;
4690
4691 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
4692 if ((vp->v_iflag & VI_DEFINACT) == 0) {
4693 VI_UNLOCK(vp);
4694 continue;
4695 }
4696 vp->v_iflag &= ~VI_DEFINACT;
4697 vfs_deferred_inactive(vp, lkflags);
4698 }
4699 }
4700
4701 static inline bool
4702 vfs_want_msync(struct vnode *vp)
4703 {
4704 struct vm_object *obj;
4705
4706 /*
4707 * This test may be performed without any locks held.
4708 * We rely on vm_object's type stability.
4709 */
4710 if (vp->v_vflag & VV_NOSYNC)
4711 return (false);
4712 obj = vp->v_object;
4713 return (obj != NULL && vm_object_mightbedirty(obj));
4714 }
4715
4716 static int
4717 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
4718 {
4719
4720 if (vp->v_vflag & VV_NOSYNC)
4721 return (false);
4722 if (vp->v_iflag & VI_DEFINACT)
4723 return (true);
4724 return (vfs_want_msync(vp));
4725 }
4726
4727 static void __noinline
4728 vfs_periodic_msync_inactive(struct mount *mp, int flags)
4729 {
4730 struct vnode *vp, *mvp;
4731 struct vm_object *obj;
4732 int lkflags, objflags;
4733 bool seen_defer;
4734
4735 lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
4736 if (flags != MNT_WAIT) {
4737 lkflags |= LK_NOWAIT;
4738 objflags = OBJPC_NOSYNC;
4739 } else {
4740 objflags = OBJPC_SYNC;
4741 }
4742
4743 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
4744 seen_defer = false;
4745 if (vp->v_iflag & VI_DEFINACT) {
4746 vp->v_iflag &= ~VI_DEFINACT;
4747 seen_defer = true;
4748 }
4749 if (!vfs_want_msync(vp)) {
4750 if (seen_defer)
4751 vfs_deferred_inactive(vp, lkflags);
4752 else
4753 VI_UNLOCK(vp);
4754 continue;
4755 }
4756 if (vget(vp, lkflags) == 0) {
4757 obj = vp->v_object;
4758 if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
4759 VM_OBJECT_WLOCK(obj);
4760 vm_object_page_clean(obj, 0, 0, objflags);
4761 VM_OBJECT_WUNLOCK(obj);
4762 }
4763 vput(vp);
4764 if (seen_defer)
4765 vdrop(vp);
4766 } else {
4767 if (seen_defer)
4768 vdefer_inactive_unlocked(vp);
4769 }
4770 }
4771 }
4772
4773 void
4774 vfs_periodic(struct mount *mp, int flags)
4775 {
4776
4777 CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
4778
4779 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
4780 vfs_periodic_inactive(mp, flags);
4781 else
4782 vfs_periodic_msync_inactive(mp, flags);
4783 }
4784
4785 static void
4786 destroy_vpollinfo_free(struct vpollinfo *vi)
4787 {
4788
4789 knlist_destroy(&vi->vpi_selinfo.si_note);
4790 mtx_destroy(&vi->vpi_lock);
4791 free(vi, M_VNODEPOLL);
4792 }
4793
4794 static void
4795 destroy_vpollinfo(struct vpollinfo *vi)
4796 {
4797
4798 knlist_clear(&vi->vpi_selinfo.si_note, 1);
4799 seldrain(&vi->vpi_selinfo);
4800 destroy_vpollinfo_free(vi);
4801 }
4802
4803 /*
4804 * Initialize per-vnode helper structure to hold poll-related state.
4805 */
4806 void
4807 v_addpollinfo(struct vnode *vp)
4808 {
4809 struct vpollinfo *vi;
4810
4811 if (vp->v_pollinfo != NULL)
4812 return;
4813 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
4814 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
4815 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
4816 vfs_knlunlock, vfs_knl_assert_lock);
4817 VI_LOCK(vp);
4818 if (vp->v_pollinfo != NULL) {
4819 VI_UNLOCK(vp);
4820 destroy_vpollinfo_free(vi);
4821 return;
4822 }
4823 vp->v_pollinfo = vi;
4824 VI_UNLOCK(vp);
4825 }
4826
4827 /*
4828 * Record a process's interest in events which might happen to
4829 * a vnode. Because poll uses the historic select-style interface
4830 * internally, this routine serves as both the ``check for any
4831 * pending events'' and the ``record my interest in future events''
4832 * functions. (These are done together, while the lock is held,
4833 * to avoid race conditions.)
4834 */
4835 int
4836 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
4837 {
4838
4839 v_addpollinfo(vp);
4840 mtx_lock(&vp->v_pollinfo->vpi_lock);
4841 if (vp->v_pollinfo->vpi_revents & events) {
4842 /*
4843 * This leaves events we are not interested
4844 * in available for the other process which
4845 * presumably had requested them
4846 * (otherwise they would never have been
4847 * recorded).
4848 */
4849 events &= vp->v_pollinfo->vpi_revents;
4850 vp->v_pollinfo->vpi_revents &= ~events;
4851
4852 mtx_unlock(&vp->v_pollinfo->vpi_lock);
4853 return (events);
4854 }
4855 vp->v_pollinfo->vpi_events |= events;
4856 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
4857 mtx_unlock(&vp->v_pollinfo->vpi_lock);
4858 return (0);
4859 }
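
/*
 * Caller sketch (not compiled): a minimal VOP_POLL implementation can report
 * regular I/O readiness directly and fall back to vn_pollrecord() for the
 * remaining event types, mirroring what the stock vop_stdpoll() does.
 */
#if 0
static int
examplefs_poll(struct vop_poll_args *ap)
{

	if (ap->a_events & ~POLLSTANDARD)
		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
#endif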
4860
4861 /*
4862 * Routine to create and manage a filesystem syncer vnode.
4863 */
4864 #define sync_close ((int (*)(struct vop_close_args *))nullop)
4865 static int sync_fsync(struct vop_fsync_args *);
4866 static int sync_inactive(struct vop_inactive_args *);
4867 static int sync_reclaim(struct vop_reclaim_args *);
4868
4869 static struct vop_vector sync_vnodeops = {
4870 .vop_bypass = VOP_EOPNOTSUPP,
4871 .vop_close = sync_close, /* close */
4872 .vop_fsync = sync_fsync, /* fsync */
4873 .vop_inactive = sync_inactive, /* inactive */
4874 .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
4875 .vop_reclaim = sync_reclaim, /* reclaim */
4876 .vop_lock1 = vop_stdlock, /* lock */
4877 .vop_unlock = vop_stdunlock, /* unlock */
4878 .vop_islocked = vop_stdislocked, /* islocked */
4879 };
4880 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
4881
4882 /*
4883 * Create a new filesystem syncer vnode for the specified mount point.
4884 */
4885 void
4886 vfs_allocate_syncvnode(struct mount *mp)
4887 {
4888 struct vnode *vp;
4889 struct bufobj *bo;
4890 static long start, incr, next;
4891 int error;
4892
4893 /* Allocate a new vnode */
4894 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
4895 if (error != 0)
4896 panic("vfs_allocate_syncvnode: getnewvnode() failed");
4897 vp->v_type = VNON;
4898 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4899 vp->v_vflag |= VV_FORCEINSMQ;
4900 error = insmntque(vp, mp);
4901 if (error != 0)
4902 panic("vfs_allocate_syncvnode: insmntque() failed");
4903 vp->v_vflag &= ~VV_FORCEINSMQ;
4904 VOP_UNLOCK(vp);
4905 /*
4906 * Place the vnode onto the syncer worklist. We attempt to
4907 * scatter them about on the list so that they will go off
4908 * at evenly distributed times even if all the filesystems
4909 * are mounted at once.
4910 */
4911 next += incr;
4912 if (next == 0 || next > syncer_maxdelay) {
4913 start /= 2;
4914 incr /= 2;
4915 if (start == 0) {
4916 start = syncer_maxdelay / 2;
4917 incr = syncer_maxdelay;
4918 }
4919 next = start;
4920 }
4921 bo = &vp->v_bufobj;
4922 BO_LOCK(bo);
4923 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
4924 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
4925 mtx_lock(&sync_mtx);
4926 sync_vnode_count++;
4927 if (mp->mnt_syncer == NULL) {
4928 mp->mnt_syncer = vp;
4929 vp = NULL;
4930 }
4931 mtx_unlock(&sync_mtx);
4932 BO_UNLOCK(bo);
4933 if (vp != NULL) {
4934 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4935 vgone(vp);
4936 vput(vp);
4937 }
4938 }
4939
4940 void
4941 vfs_deallocate_syncvnode(struct mount *mp)
4942 {
4943 struct vnode *vp;
4944
4945 mtx_lock(&sync_mtx);
4946 vp = mp->mnt_syncer;
4947 if (vp != NULL)
4948 mp->mnt_syncer = NULL;
4949 mtx_unlock(&sync_mtx);
4950 if (vp != NULL)
4951 vrele(vp);
4952 }
4953
4954 /*
4955 * Do a lazy sync of the filesystem.
4956 */
4957 static int
4958 sync_fsync(struct vop_fsync_args *ap)
4959 {
4960 struct vnode *syncvp = ap->a_vp;
4961 struct mount *mp = syncvp->v_mount;
4962 int error, save;
4963 struct bufobj *bo;
4964
4965 /*
4966 * We only need to do something if this is a lazy evaluation.
4967 */
4968 if (ap->a_waitfor != MNT_LAZY)
4969 return (0);
4970
4971 /*
4972 * Move ourselves to the back of the sync list.
4973 */
4974 bo = &syncvp->v_bufobj;
4975 BO_LOCK(bo);
4976 vn_syncer_add_to_worklist(bo, syncdelay);
4977 BO_UNLOCK(bo);
4978
4979 /*
4980 * Walk the list of vnodes pushing all that are dirty and
4981 * not already on the sync list.
4982 */
4983 if (vfs_busy(mp, MBF_NOWAIT) != 0)
4984 return (0);
4985 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
4986 vfs_unbusy(mp);
4987 return (0);
4988 }
4989 save = curthread_pflags_set(TDP_SYNCIO);
4990 /*
4991 * The filesystem at hand may be idle with free vnodes stored in the
4992 * batch. Return them instead of letting them stay there indefinitely.
4993 */
4994 vfs_periodic(mp, MNT_NOWAIT);
4995 error = VFS_SYNC(mp, MNT_LAZY);
4996 curthread_pflags_restore(save);
4997 vn_finished_write(mp);
4998 vfs_unbusy(mp);
4999 return (error);
5000 }
5001
5002 /*
5003 * The syncer vnode is no longer referenced.
5004 */
5005 static int
5006 sync_inactive(struct vop_inactive_args *ap)
5007 {
5008
5009 vgone(ap->a_vp);
5010 return (0);
5011 }
5012
5013 /*
5014 * The syncer vnode is no longer needed and is being decommissioned.
5015 *
5016 * Modifications to the worklist must be protected by sync_mtx.
5017 */
5018 static int
5019 sync_reclaim(struct vop_reclaim_args *ap)
5020 {
5021 struct vnode *vp = ap->a_vp;
5022 struct bufobj *bo;
5023
5024 bo = &vp->v_bufobj;
5025 BO_LOCK(bo);
5026 mtx_lock(&sync_mtx);
5027 if (vp->v_mount->mnt_syncer == vp)
5028 vp->v_mount->mnt_syncer = NULL;
5029 if (bo->bo_flag & BO_ONWORKLST) {
5030 LIST_REMOVE(bo, bo_synclist);
5031 syncer_worklist_len--;
5032 sync_vnode_count--;
5033 bo->bo_flag &= ~BO_ONWORKLST;
5034 }
5035 mtx_unlock(&sync_mtx);
5036 BO_UNLOCK(bo);
5037
5038 return (0);
5039 }
5040
5041 int
5042 vn_need_pageq_flush(struct vnode *vp)
5043 {
5044 struct vm_object *obj;
5045 int need;
5046
5047 MPASS(mtx_owned(VI_MTX(vp)));
5048 need = 0;
5049 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
5050 vm_object_mightbedirty(obj))
5051 need = 1;
5052 return (need);
5053 }
5054
5055 /*
5056 * Check if vnode represents a disk device
5057 */
5058 bool
5059 vn_isdisk_error(struct vnode *vp, int *errp)
5060 {
5061 int error;
5062
5063 if (vp->v_type != VCHR) {
5064 error = ENOTBLK;
5065 goto out;
5066 }
5067 error = 0;
5068 dev_lock();
5069 if (vp->v_rdev == NULL)
5070 error = ENXIO;
5071 else if (vp->v_rdev->si_devsw == NULL)
5072 error = ENXIO;
5073 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
5074 error = ENOTBLK;
5075 dev_unlock();
5076 out:
5077 *errp = error;
5078 return (error == 0);
5079 }
5080
5081 bool
5082 vn_isdisk(struct vnode *vp)
5083 {
5084 int error;
5085
5086 return (vn_isdisk_error(vp, &error));
5087 }
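
/*
 * Caller sketch (not compiled): code handing a vnode to a disk-only consumer
 * (swap, dump devices, disk quotas, ...) can validate it first; the _error
 * variant reports why the vnode does not qualify.  example_setup_on_disk()
 * is hypothetical.
 */
#if 0
static int
example_setup_on_disk(struct vnode *vp)
{
	int error;

	if (!vn_isdisk_error(vp, &error))
		return (error);
	/* ... proceed with the device-backed setup ... */
	return (0);
}
#endif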
5088
5089 /*
5090 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
5091 * the comment above cache_fplookup for details.
5092 */
5093 int
5094 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
5095 {
5096 int error;
5097
5098 VFS_SMR_ASSERT_ENTERED();
5099
5100 /* Check the owner. */
5101 if (cred->cr_uid == file_uid) {
5102 if (file_mode & S_IXUSR)
5103 return (0);
5104 goto out_error;
5105 }
5106
5107 /* Otherwise, check the groups (first match) */
5108 if (groupmember(file_gid, cred)) {
5109 if (file_mode & S_IXGRP)
5110 return (0);
5111 goto out_error;
5112 }
5113
5114 /* Otherwise, check everyone else. */
5115 if (file_mode & S_IXOTH)
5116 return (0);
5117 out_error:
5118 /*
5119 * Permission check failed, but it is possible the denial will be
5120 * overridden (e.g., when root is traversing through a 700 directory
5121 * owned by someone else).
5122 *
5123 * vaccess() calls priv_check_cred() which in turn can descend into
5124 * MAC modules overriding this result. It is unclear what semantics
5125 * they are allowed to rely on, so for safety we do not call them
5126 * from within the SMR section. This also means that if any such
5127 * modules are present, we have to let the regular lookup decide.
5128 */
5129 error = priv_check_cred_vfs_lookup_nomac(cred);
5130 switch (error) {
5131 case 0:
5132 return (0);
5133 case EAGAIN:
5134 /*
5135 * MAC modules present.
5136 */
5137 return (EAGAIN);
5138 case EPERM:
5139 return (EACCES);
5140 default:
5141 return (error);
5142 }
5143 }
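
/*
 * Caller sketch (not compiled): a VOP_FPLOOKUP_VEXEC method runs inside the
 * vfs_smr section, may only read fields that stay stable there, and defers
 * the actual permission check to vaccess_vexec_smr().  The "examplefs" node
 * and its fields are hypothetical; returning EAGAIN makes the lockless
 * lookup fall back to the locked path.
 */
#if 0
struct examplefs_node {			/* hypothetical in-memory inode */
	mode_t	n_mode;
	uid_t	n_uid;
	gid_t	n_gid;
};

static int
examplefs_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct examplefs_node *ip;

	ip = atomic_load_ptr(&vp->v_data);
	if (__predict_false(ip == NULL))
		return (EAGAIN);
	return (vaccess_vexec_smr(ip->n_mode, ip->n_uid, ip->n_gid,
	    ap->a_cred));
}
#endif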
5144
5145 /*
5146 * Common filesystem object access control check routine. Accepts a
5147 * vnode's type, "mode", uid and gid, requested access mode, and credentials.
5148 * Returns 0 on success, or an errno on failure.
5149 */
5150 int
5151 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
5152 accmode_t accmode, struct ucred *cred)
5153 {
5154 accmode_t dac_granted;
5155 accmode_t priv_granted;
5156
5157 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
5158 ("invalid bit in accmode"));
5159 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
5160 ("VAPPEND without VWRITE"));
5161
5162 /*
5163 * Look for a normal, non-privileged way to access the file/directory
5164 * as requested. If it exists, go with that.
5165 */
5166
5167 dac_granted = 0;
5168
5169 /* Check the owner. */
5170 if (cred->cr_uid == file_uid) {
5171 dac_granted |= VADMIN;
5172 if (file_mode & S_IXUSR)
5173 dac_granted |= VEXEC;
5174 if (file_mode & S_IRUSR)
5175 dac_granted |= VREAD;
5176 if (file_mode & S_IWUSR)
5177 dac_granted |= (VWRITE | VAPPEND);
5178
5179 if ((accmode & dac_granted) == accmode)
5180 return (0);
5181
5182 goto privcheck;
5183 }
5184
5185 /* Otherwise, check the groups (first match) */
5186 if (groupmember(file_gid, cred)) {
5187 if (file_mode & S_IXGRP)
5188 dac_granted |= VEXEC;
5189 if (file_mode & S_IRGRP)
5190 dac_granted |= VREAD;
5191 if (file_mode & S_IWGRP)
5192 dac_granted |= (VWRITE | VAPPEND);
5193
5194 if ((accmode & dac_granted) == accmode)
5195 return (0);
5196
5197 goto privcheck;
5198 }
5199
5200 /* Otherwise, check everyone else. */
5201 if (file_mode & S_IXOTH)
5202 dac_granted |= VEXEC;
5203 if (file_mode & S_IROTH)
5204 dac_granted |= VREAD;
5205 if (file_mode & S_IWOTH)
5206 dac_granted |= (VWRITE | VAPPEND);
5207 if ((accmode & dac_granted) == accmode)
5208 return (0);
5209
5210 privcheck:
5211 /*
5212 * Build a privilege mask to determine if the set of privileges
5213 * satisfies the requirements when combined with the granted mask
5214 * from above. For each privilege, if the privilege is required,
5215 * bitwise or the request type onto the priv_granted mask.
5216 */
5217 priv_granted = 0;
5218
5219 if (type == VDIR) {
5220 /*
5221 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
5222 * requests, instead of PRIV_VFS_EXEC.
5223 */
5224 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
5225 !priv_check_cred(cred, PRIV_VFS_LOOKUP))
5226 priv_granted |= VEXEC;
5227 } else {
5228 /*
5229 * Ensure that at least one execute bit is on. Otherwise,
5230 * a privileged user will always succeed, and we don't want
5231 * this to happen unless the file really is executable.
5232 */
5233 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
5234 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
5235 !priv_check_cred(cred, PRIV_VFS_EXEC))
5236 priv_granted |= VEXEC;
5237 }
5238
5239 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
5240 !priv_check_cred(cred, PRIV_VFS_READ))
5241 priv_granted |= VREAD;
5242
5243 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
5244 !priv_check_cred(cred, PRIV_VFS_WRITE))
5245 priv_granted |= (VWRITE | VAPPEND);
5246
5247 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
5248 !priv_check_cred(cred, PRIV_VFS_ADMIN))
5249 priv_granted |= VADMIN;
5250
5251 if ((accmode & (priv_granted | dac_granted)) == accmode) {
5252 return (0);
5253 }
5254
5255 return ((accmode & VADMIN) ? EPERM : EACCES);
5256 }
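
/*
 * Caller sketch (not compiled): filesystems commonly implement VOP_ACCESS by
 * feeding the object's type, mode and ownership into vaccess().  The
 * "examplefs" node layout is hypothetical; the vaccess() signature is the
 * one defined above.
 */
#if 0
struct examplefs_node {			/* hypothetical in-memory inode */
	mode_t	n_mode;
	uid_t	n_uid;
	gid_t	n_gid;
};

static int
examplefs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct examplefs_node *ip = vp->v_data;

	return (vaccess(vp->v_type, ip->n_mode, ip->n_uid, ip->n_gid,
	    ap->a_accmode, ap->a_cred));
}
#endif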
5257
5258 /*
5259 * Credential check based on process requesting service, and per-attribute
5260 * permissions.
5261 */
5262 int
5263 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
5264 struct thread *td, accmode_t accmode)
5265 {
5266
5267 /*
5268 * Kernel-invoked requests always succeed.
5269 */
5270 if (cred == NOCRED)
5271 return (0);
5272
5273 /*
5274 * Do not allow privileged processes in jail to directly manipulate
5275 * system attributes.
5276 */
5277 switch (attrnamespace) {
5278 case EXTATTR_NAMESPACE_SYSTEM:
5279 /* Potentially should be: return (EPERM); */
5280 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
5281 case EXTATTR_NAMESPACE_USER:
5282 return (VOP_ACCESS(vp, accmode, cred, td));
5283 default:
5284 return (EPERM);
5285 }
5286 }
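
/*
 * Caller sketch (not compiled): VOP_GETEXTATTR and VOP_SETEXTATTR
 * implementations typically run this check with VREAD or VWRITE before
 * touching the attribute data.  "examplefs" is hypothetical.
 */
#if 0
static int
examplefs_getextattr(struct vop_getextattr_args *ap)
{
	int error;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);
	/* ... copy the attribute value into ap->a_uio ... */
	return (0);
}
#endif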
5287
5288 #ifdef DEBUG_VFS_LOCKS
5289 /*
5290 * This only exists to suppress warnings from unlocked specfs accesses. It is
5291 * no longer ok to have an unlocked VFS.
5292 */
5293 #define IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL || \
5294 (vp)->v_type == VCHR || (vp)->v_type == VBAD)
5295
5296 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
5297 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
5298 "Drop into debugger on lock violation");
5299
5300 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
5301 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
5302 0, "Check for interlock across VOPs");
5303
5304 int vfs_badlock_print = 1; /* Print lock violations. */
5305 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
5306 0, "Print lock violations");
5307
5308 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */
5309 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
5310 0, "Print vnode details on lock violations");
5311
5312 #ifdef KDB
5313 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */
5314 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
5315 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
5316 #endif
5317
5318 static void
5319 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
5320 {
5321
5322 #ifdef KDB
5323 if (vfs_badlock_backtrace)
5324 kdb_backtrace();
5325 #endif
5326 if (vfs_badlock_vnode)
5327 vn_printf(vp, "vnode ");
5328 if (vfs_badlock_print)
5329 printf("%s: %p %s\n", str, (void *)vp, msg);
5330 if (vfs_badlock_ddb)
5331 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
5332 }
5333
5334 void
5335 assert_vi_locked(struct vnode *vp, const char *str)
5336 {
5337
5338 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
5339 vfs_badlock("interlock is not locked but should be", str, vp);
5340 }
5341
5342 void
5343 assert_vi_unlocked(struct vnode *vp, const char *str)
5344 {
5345
5346 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
5347 vfs_badlock("interlock is locked but should not be", str, vp);
5348 }
5349
5350 void
5351 assert_vop_locked(struct vnode *vp, const char *str)
5352 {
5353 int locked;
5354
5355 if (!IGNORE_LOCK(vp)) {
5356 locked = VOP_ISLOCKED(vp);
5357 if (locked == 0 || locked == LK_EXCLOTHER)
5358 vfs_badlock("is not locked but should be", str, vp);
5359 }
5360 }
5361
5362 void
5363 assert_vop_unlocked(struct vnode *vp, const char *str)
5364 {
5365
5366 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
5367 vfs_badlock("is locked but should not be", str, vp);
5368 }
5369
5370 void
5371 assert_vop_elocked(struct vnode *vp, const char *str)
5372 {
5373
5374 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
5375 vfs_badlock("is not exclusive locked but should be", str, vp);
5376 }
5377 #endif /* DEBUG_VFS_LOCKS */
5378
5379 void
5380 vop_rename_fail(struct vop_rename_args *ap)
5381 {
5382
5383 if (ap->a_tvp != NULL)
5384 vput(ap->a_tvp);
5385 if (ap->a_tdvp == ap->a_tvp)
5386 vrele(ap->a_tdvp);
5387 else
5388 vput(ap->a_tdvp);
5389 vrele(ap->a_fdvp);
5390 vrele(ap->a_fvp);
5391 }
5392
5393 void
5394 vop_rename_pre(void *ap)
5395 {
5396 struct vop_rename_args *a = ap;
5397
5398 #ifdef DEBUG_VFS_LOCKS
5399 if (a->a_tvp)
5400 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
5401 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
5402 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
5403 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
5404
5405 /* Check the source (from). */
5406 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
5407 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
5408 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
5409 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
5410 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
5411
5412 /* Check the target. */
5413 if (a->a_tvp)
5414 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
5415 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
5416 #endif
5417 /*
5418 * It may be tempting to add vn_seqc_write_begin/end calls here and
5419 * in vop_rename_post but that's not going to work out since some
5420 * filesystems relookup vnodes mid-rename. This is probably a bug.
5421 *
5422 * For now filesystems are expected to do the relevant calls after they
5423 * decide what vnodes to operate on.
5424 */
5425 if (a->a_tdvp != a->a_fdvp)
5426 vhold(a->a_fdvp);
5427 if (a->a_tvp != a->a_fvp)
5428 vhold(a->a_fvp);
5429 vhold(a->a_tdvp);
5430 if (a->a_tvp)
5431 vhold(a->a_tvp);
5432 }
5433
5434 #ifdef DEBUG_VFS_LOCKS
5435 void
5436 vop_fplookup_vexec_debugpre(void *ap __unused)
5437 {
5438
5439 VFS_SMR_ASSERT_ENTERED();
5440 }
5441
5442 void
5443 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused)
5444 {
5445
5446 VFS_SMR_ASSERT_ENTERED();
5447 }
5448
5449 void
5450 vop_strategy_debugpre(void *ap)
5451 {
5452 struct vop_strategy_args *a;
5453 struct buf *bp;
5454
5455 a = ap;
5456 bp = a->a_bp;
5457
5458 /*
5459 * Cluster ops lock their component buffers but not the IO container.
5460 */
5461 if ((bp->b_flags & B_CLUSTER) != 0)
5462 return;
5463
5464 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) {
5465 if (vfs_badlock_print)
5466 printf(
5467 "VOP_STRATEGY: bp is not locked but should be\n");
5468 if (vfs_badlock_ddb)
5469 kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
5470 }
5471 }
5472
5473 void
5474 vop_lock_debugpre(void *ap)
5475 {
5476 struct vop_lock1_args *a = ap;
5477
5478 if ((a->a_flags & LK_INTERLOCK) == 0)
5479 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
5480 else
5481 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
5482 }
5483
5484 void
5485 vop_lock_debugpost(void *ap, int rc)
5486 {
5487 struct vop_lock1_args *a = ap;
5488
5489 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
5490 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
5491 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
5492 }
5493
5494 void
5495 vop_unlock_debugpre(void *ap)
5496 {
5497 struct vop_unlock_args *a = ap;
5498
5499 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
5500 }
5501
5502 void
5503 vop_need_inactive_debugpre(void *ap)
5504 {
5505 struct vop_need_inactive_args *a = ap;
5506
5507 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
5508 }
5509
5510 void
5511 vop_need_inactive_debugpost(void *ap, int rc)
5512 {
5513 struct vop_need_inactive_args *a = ap;
5514
5515 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
5516 }
5517 #endif
5518
5519 void
5520 vop_create_pre(void *ap)
5521 {
5522 struct vop_create_args *a;
5523 struct vnode *dvp;
5524
5525 a = ap;
5526 dvp = a->a_dvp;
5527 vn_seqc_write_begin(dvp);
5528 }
5529
5530 void
5531 vop_create_post(void *ap, int rc)
5532 {
5533 struct vop_create_args *a;
5534 struct vnode *dvp;
5535
5536 a = ap;
5537 dvp = a->a_dvp;
5538 vn_seqc_write_end(dvp);
5539 if (!rc)
5540 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
5541 }
5542
5543 void
5544 vop_whiteout_pre(void *ap)
5545 {
5546 struct vop_whiteout_args *a;
5547 struct vnode *dvp;
5548
5549 a = ap;
5550 dvp = a->a_dvp;
5551 vn_seqc_write_begin(dvp);
5552 }
5553
5554 void
5555 vop_whiteout_post(void *ap, int rc)
5556 {
5557 struct vop_whiteout_args *a;
5558 struct vnode *dvp;
5559
5560 a = ap;
5561 dvp = a->a_dvp;
5562 vn_seqc_write_end(dvp);
5563 }
5564
5565 void
5566 vop_deleteextattr_pre(void *ap)
5567 {
5568 struct vop_deleteextattr_args *a;
5569 struct vnode *vp;
5570
5571 a = ap;
5572 vp = a->a_vp;
5573 vn_seqc_write_begin(vp);
5574 }
5575
5576 void
5577 vop_deleteextattr_post(void *ap, int rc)
5578 {
5579 struct vop_deleteextattr_args *a;
5580 struct vnode *vp;
5581
5582 a = ap;
5583 vp = a->a_vp;
5584 vn_seqc_write_end(vp);
5585 if (!rc)
5586 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
5587 }
5588
5589 void
5590 vop_link_pre(void *ap)
5591 {
5592 struct vop_link_args *a;
5593 struct vnode *vp, *tdvp;
5594
5595 a = ap;
5596 vp = a->a_vp;
5597 tdvp = a->a_tdvp;
5598 vn_seqc_write_begin(vp);
5599 vn_seqc_write_begin(tdvp);
5600 }
5601
5602 void
5603 vop_link_post(void *ap, int rc)
5604 {
5605 struct vop_link_args *a;
5606 struct vnode *vp, *tdvp;
5607
5608 a = ap;
5609 vp = a->a_vp;
5610 tdvp = a->a_tdvp;
5611 vn_seqc_write_end(vp);
5612 vn_seqc_write_end(tdvp);
5613 if (!rc) {
5614 VFS_KNOTE_LOCKED(vp, NOTE_LINK);
5615 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
5616 }
5617 }
5618
5619 void
5620 vop_mkdir_pre(void *ap)
5621 {
5622 struct vop_mkdir_args *a;
5623 struct vnode *dvp;
5624
5625 a = ap;
5626 dvp = a->a_dvp;
5627 vn_seqc_write_begin(dvp);
5628 }
5629
5630 void
5631 vop_mkdir_post(void *ap, int rc)
5632 {
5633 struct vop_mkdir_args *a;
5634 struct vnode *dvp;
5635
5636 a = ap;
5637 dvp = a->a_dvp;
5638 vn_seqc_write_end(dvp);
5639 if (!rc)
5640 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
5641 }
5642
5643 #ifdef DEBUG_VFS_LOCKS
5644 void
5645 vop_mkdir_debugpost(void *ap, int rc)
5646 {
5647 struct vop_mkdir_args *a;
5648
5649 a = ap;
5650 if (!rc)
5651 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp);
5652 }
5653 #endif
5654
5655 void
5656 vop_mknod_pre(void *ap)
5657 {
5658 struct vop_mknod_args *a;
5659 struct vnode *dvp;
5660
5661 a = ap;
5662 dvp = a->a_dvp;
5663 vn_seqc_write_begin(dvp);
5664 }
5665
5666 void
5667 vop_mknod_post(void *ap, int rc)
5668 {
5669 struct vop_mknod_args *a;
5670 struct vnode *dvp;
5671
5672 a = ap;
5673 dvp = a->a_dvp;
5674 vn_seqc_write_end(dvp);
5675 if (!rc)
5676 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
5677 }
5678
5679 void
5680 vop_reclaim_post(void *ap, int rc)
5681 {
5682 struct vop_reclaim_args *a;
5683 struct vnode *vp;
5684
5685 a = ap;
5686 vp = a->a_vp;
5687 ASSERT_VOP_IN_SEQC(vp);
5688 if (!rc)
5689 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
5690 }
5691
5692 void
5693 vop_remove_pre(void *ap)
5694 {
5695 struct vop_remove_args *a;
5696 struct vnode *dvp, *vp;
5697
5698 a = ap;
5699 dvp = a->a_dvp;
5700 vp = a->a_vp;
5701 vn_seqc_write_begin(dvp);
5702 vn_seqc_write_begin(vp);
5703 }
5704
5705 void
5706 vop_remove_post(void *ap, int rc)
5707 {
5708 struct vop_remove_args *a;
5709 struct vnode *dvp, *vp;
5710
5711 a = ap;
5712 dvp = a->a_dvp;
5713 vp = a->a_vp;
5714 vn_seqc_write_end(dvp);
5715 vn_seqc_write_end(vp);
5716 if (!rc) {
5717 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
5718 VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
5719 }
5720 }
5721
5722 void
5723 vop_rename_post(void *ap, int rc)
5724 {
5725 struct vop_rename_args *a = ap;
5726 long hint;
5727
5728 if (!rc) {
5729 hint = NOTE_WRITE;
5730 if (a->a_fdvp == a->a_tdvp) {
5731 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
5732 hint |= NOTE_LINK;
5733 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
5734 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
5735 } else {
5736 hint |= NOTE_EXTEND;
5737 if (a->a_fvp->v_type == VDIR)
5738 hint |= NOTE_LINK;
5739 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
5740
5741 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
5742 a->a_tvp->v_type == VDIR)
5743 hint &= ~NOTE_LINK;
5744 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
5745 }
5746
5747 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
5748 if (a->a_tvp)
5749 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
5750 }
5751 if (a->a_tdvp != a->a_fdvp)
5752 vdrop(a->a_fdvp);
5753 if (a->a_tvp != a->a_fvp)
5754 vdrop(a->a_fvp);
5755 vdrop(a->a_tdvp);
5756 if (a->a_tvp)
5757 vdrop(a->a_tvp);
5758 }
5759
5760 void
5761 vop_rmdir_pre(void *ap)
5762 {
5763 struct vop_rmdir_args *a;
5764 struct vnode *dvp, *vp;
5765
5766 a = ap;
5767 dvp = a->a_dvp;
5768 vp = a->a_vp;
5769 vn_seqc_write_begin(dvp);
5770 vn_seqc_write_begin(vp);
5771 }
5772
5773 void
5774 vop_rmdir_post(void *ap, int rc)
5775 {
5776 struct vop_rmdir_args *a;
5777 struct vnode *dvp, *vp;
5778
5779 a = ap;
5780 dvp = a->a_dvp;
5781 vp = a->a_vp;
5782 vn_seqc_write_end(dvp);
5783 vn_seqc_write_end(vp);
5784 if (!rc) {
5785 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
5786 VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
5787 }
5788 }
5789
5790 void
5791 vop_setattr_pre(void *ap)
5792 {
5793 struct vop_setattr_args *a;
5794 struct vnode *vp;
5795
5796 a = ap;
5797 vp = a->a_vp;
5798 vn_seqc_write_begin(vp);
5799 }
5800
5801 void
5802 vop_setattr_post(void *ap, int rc)
5803 {
5804 struct vop_setattr_args *a;
5805 struct vnode *vp;
5806
5807 a = ap;
5808 vp = a->a_vp;
5809 vn_seqc_write_end(vp);
5810 if (!rc)
5811 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
5812 }
5813
5814 void
5815 vop_setacl_pre(void *ap)
5816 {
5817 struct vop_setacl_args *a;
5818 struct vnode *vp;
5819
5820 a = ap;
5821 vp = a->a_vp;
5822 vn_seqc_write_begin(vp);
5823 }
5824
5825 void
5826 vop_setacl_post(void *ap, int rc __unused)
5827 {
5828 struct vop_setacl_args *a;
5829 struct vnode *vp;
5830
5831 a = ap;
5832 vp = a->a_vp;
5833 vn_seqc_write_end(vp);
5834 }
5835
5836 void
5837 vop_setextattr_pre(void *ap)
5838 {
5839 struct vop_setextattr_args *a;
5840 struct vnode *vp;
5841
5842 a = ap;
5843 vp = a->a_vp;
5844 vn_seqc_write_begin(vp);
5845 }
5846
5847 void
5848 vop_setextattr_post(void *ap, int rc)
5849 {
5850 struct vop_setextattr_args *a;
5851 struct vnode *vp;
5852
5853 a = ap;
5854 vp = a->a_vp;
5855 vn_seqc_write_end(vp);
5856 if (!rc)
5857 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
5858 }
5859
5860 void
5861 vop_symlink_pre(void *ap)
5862 {
5863 struct vop_symlink_args *a;
5864 struct vnode *dvp;
5865
5866 a = ap;
5867 dvp = a->a_dvp;
5868 vn_seqc_write_begin(dvp);
5869 }
5870
5871 void
5872 vop_symlink_post(void *ap, int rc)
5873 {
5874 struct vop_symlink_args *a;
5875 struct vnode *dvp;
5876
5877 a = ap;
5878 dvp = a->a_dvp;
5879 vn_seqc_write_end(dvp);
5880 if (!rc)
5881 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
5882 }
5883
5884 void
5885 vop_open_post(void *ap, int rc)
5886 {
5887 struct vop_open_args *a = ap;
5888
5889 if (!rc)
5890 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
5891 }
5892
5893 void
5894 vop_close_post(void *ap, int rc)
5895 {
5896 struct vop_close_args *a = ap;
5897
5898 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
5899 !VN_IS_DOOMED(a->a_vp))) {
5900 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
5901 NOTE_CLOSE_WRITE : NOTE_CLOSE);
5902 }
5903 }
5904
5905 void
5906 vop_read_post(void *ap, int rc)
5907 {
5908 struct vop_read_args *a = ap;
5909
5910 if (!rc)
5911 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
5912 }
5913
5914 void
5915 vop_read_pgcache_post(void *ap, int rc)
5916 {
5917 struct vop_read_pgcache_args *a = ap;
5918
5919 if (!rc)
5920 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
5921 }
5922
5923 void
5924 vop_readdir_post(void *ap, int rc)
5925 {
5926 struct vop_readdir_args *a = ap;
5927
5928 if (!rc)
5929 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
5930 }
5931
5932 static struct knlist fs_knlist;
5933
5934 static void
5935 vfs_event_init(void *arg)
5936 {
5937 knlist_init_mtx(&fs_knlist, NULL);
5938 }
5939 /* XXX - correct order? */
5940 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
5941
5942 void
5943 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
5944 {
5945
5946 KNOTE_UNLOCKED(&fs_knlist, event);
5947 }
5948
5949 static int filt_fsattach(struct knote *kn);
5950 static void filt_fsdetach(struct knote *kn);
5951 static int filt_fsevent(struct knote *kn, long hint);
5952
5953 struct filterops fs_filtops = {
5954 .f_isfd = 0,
5955 .f_attach = filt_fsattach,
5956 .f_detach = filt_fsdetach,
5957 .f_event = filt_fsevent
5958 };
5959
5960 static int
5961 filt_fsattach(struct knote *kn)
5962 {
5963
5964 kn->kn_flags |= EV_CLEAR;
5965 knlist_add(&fs_knlist, kn, 0);
5966 return (0);
5967 }
5968
5969 static void
5970 filt_fsdetach(struct knote *kn)
5971 {
5972
5973 knlist_remove(&fs_knlist, kn, 0);
5974 }
5975
5976 static int
5977 filt_fsevent(struct knote *kn, long hint)
5978 {
5979
5980 kn->kn_fflags |= hint;
5981 return (kn->kn_fflags != 0);
5982 }
5983
5984 static int
5985 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
5986 {
5987 struct vfsidctl vc;
5988 int error;
5989 struct mount *mp;
5990
5991 error = SYSCTL_IN(req, &vc, sizeof(vc));
5992 if (error)
5993 return (error);
5994 if (vc.vc_vers != VFS_CTL_VERS1)
5995 return (EINVAL);
5996 mp = vfs_getvfs(&vc.vc_fsid);
5997 if (mp == NULL)
5998 return (ENOENT);
5999 /* ensure that a specific sysctl goes to the right filesystem. */
6000 if (strcmp(vc.vc_fstypename, "*") != 0 &&
6001 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
6002 vfs_rel(mp);
6003 return (EINVAL);
6004 }
6005 VCTLTOREQ(&vc, req);
6006 error = VFS_SYSCTL(mp, vc.vc_op, req);
6007 vfs_rel(mp);
6008 return (error);
6009 }
6010
6011 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR,
6012 NULL, 0, sysctl_vfs_ctl, "",
6013 "Sysctl by fsid");
6014
6015 /*
6016 * Function to initialize a va_filerev field sensibly.
6017 * XXX: Wouldn't a random number make a lot more sense ??
6018 */
6019 u_quad_t
6020 init_va_filerev(void)
6021 {
6022 struct bintime bt;
6023
6024 getbinuptime(&bt);
6025 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
6026 }
6027
6028 static int filt_vfsread(struct knote *kn, long hint);
6029 static int filt_vfswrite(struct knote *kn, long hint);
6030 static int filt_vfsvnode(struct knote *kn, long hint);
6031 static void filt_vfsdetach(struct knote *kn);
6032 static struct filterops vfsread_filtops = {
6033 .f_isfd = 1,
6034 .f_detach = filt_vfsdetach,
6035 .f_event = filt_vfsread
6036 };
6037 static struct filterops vfswrite_filtops = {
6038 .f_isfd = 1,
6039 .f_detach = filt_vfsdetach,
6040 .f_event = filt_vfswrite
6041 };
6042 static struct filterops vfsvnode_filtops = {
6043 .f_isfd = 1,
6044 .f_detach = filt_vfsdetach,
6045 .f_event = filt_vfsvnode
6046 };
6047
6048 static void
6049 vfs_knllock(void *arg)
6050 {
6051 struct vnode *vp = arg;
6052
6053 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
6054 }
6055
6056 static void
6057 vfs_knlunlock(void *arg)
6058 {
6059 struct vnode *vp = arg;
6060
6061 VOP_UNLOCK(vp);
6062 }
6063
6064 static void
6065 vfs_knl_assert_lock(void *arg, int what)
6066 {
6067 #ifdef DEBUG_VFS_LOCKS
6068 struct vnode *vp = arg;
6069
6070 if (what == LA_LOCKED)
6071 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
6072 else
6073 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
6074 #endif
6075 }
6076
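/*
 * Generic VOP_KQFILTER() implementation: attach a knote to a vnode for the
 * EVFILT_READ, EVFILT_WRITE and EVFILT_VNODE kqueue(2) filters.
 *
 * Illustrative userspace consumer (a sketch; the path is hypothetical):
 *
 *     struct kevent kev;
 *     int kq, fd;
 *
 *     kq = kqueue();
 *     fd = open("/some/file", O_RDONLY);
 *     EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *         NOTE_WRITE | NOTE_DELETE | NOTE_ATTRIB, 0, NULL);
 *     kevent(kq, &kev, 1, NULL, 0, NULL);    -- register the filter
 *     kevent(kq, NULL, 0, &kev, 1, NULL);    -- wait for an event
 */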
6077 int
6078 vfs_kqfilter(struct vop_kqfilter_args *ap)
6079 {
6080 struct vnode *vp = ap->a_vp;
6081 struct knote *kn = ap->a_kn;
6082 struct knlist *knl;
6083
6084 switch (kn->kn_filter) {
6085 case EVFILT_READ:
6086 kn->kn_fop = &vfsread_filtops;
6087 break;
6088 case EVFILT_WRITE:
6089 kn->kn_fop = &vfswrite_filtops;
6090 break;
6091 case EVFILT_VNODE:
6092 kn->kn_fop = &vfsvnode_filtops;
6093 break;
6094 default:
6095 return (EINVAL);
6096 }
6097
6098 kn->kn_hook = (caddr_t)vp;
6099
6100 v_addpollinfo(vp);
6101 if (vp->v_pollinfo == NULL)
6102 return (ENOMEM);
6103 knl = &vp->v_pollinfo->vpi_selinfo.si_note;
6104 vhold(vp);
6105 knlist_add(knl, kn, 0);
6106
6107 return (0);
6108 }
6109
6110 /*
6111 * Detach knote from vnode
6112 */
6113 static void
6114 filt_vfsdetach(struct knote *kn)
6115 {
6116 struct vnode *vp = (struct vnode *)kn->kn_hook;
6117
6118 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
6119 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
6120 vdrop(vp);
6121 }
6122
6123 /*ARGSUSED*/
6124 static int
6125 filt_vfsread(struct knote *kn, long hint)
6126 {
6127 struct vnode *vp = (struct vnode *)kn->kn_hook;
6128 struct vattr va;
6129 int res;
6130
6131 /*
6132 * filesystem is gone, so set the EOF flag and schedule
6133 * the knote for deletion.
6134 */
6135 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
6136 VI_LOCK(vp);
6137 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
6138 VI_UNLOCK(vp);
6139 return (1);
6140 }
6141
6142 if (VOP_GETATTR(vp, &va, curthread->td_ucred))
6143 return (0);
6144
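/*
 * Report how much is left to read past the caller's current file offset;
 * with NOTE_FILE_POLL requested the event fires regardless of kn_data.
 */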
6145 VI_LOCK(vp);
6146 kn->kn_data = va.va_size - kn->kn_fp->f_offset;
6147 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
6148 VI_UNLOCK(vp);
6149 return (res);
6150 }
6151
6152 /*ARGSUSED*/
6153 static int
6154 filt_vfswrite(struct knote *kn, long hint)
6155 {
6156 struct vnode *vp = (struct vnode *)kn->kn_hook;
6157
6158 VI_LOCK(vp);
6159
6160 /*
6161 * filesystem is gone, so set the EOF flag and schedule
6162 * the knote for deletion.
6163 */
6164 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
6165 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
6166
6167 kn->kn_data = 0;
6168 VI_UNLOCK(vp);
6169 return (1);
6170 }
6171
6172 static int
6173 filt_vfsvnode(struct knote *kn, long hint)
6174 {
6175 struct vnode *vp = (struct vnode *)kn->kn_hook;
6176 int res;
6177
6178 VI_LOCK(vp);
6179 if (kn->kn_sfflags & hint)
6180 kn->kn_fflags |= hint;
6181 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
6182 kn->kn_flags |= EV_EOF;
6183 VI_UNLOCK(vp);
6184 return (1);
6185 }
6186 res = (kn->kn_fflags != 0);
6187 VI_UNLOCK(vp);
6188 return (res);
6189 }
6190
6191 /*
6192 * Check whether the directory is empty.  Returns 0 if it is empty
6193 * and an error value (which may be ENOTEMPTY) otherwise.  A directory
6194 * is considered empty if it contains no entries other than "." and
6195 * ".." (whiteout entries are ignored).
6196 */
6197 int
6198 vfs_emptydir(struct vnode *vp)
6199 {
6200 struct uio uio;
6201 struct iovec iov;
6202 struct dirent *dirent, *dp, *endp;
6203 int error, eof;
6204
6205 error = 0;
6206 eof = 0;
6207
6208 ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
6209
6210 dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
6211 iov.iov_base = dirent;
6212 iov.iov_len = sizeof(struct dirent);
6213
6214 uio.uio_iov = &iov;
6215 uio.uio_iovcnt = 1;
6216 uio.uio_offset = 0;
6217 uio.uio_resid = sizeof(struct dirent);
6218 uio.uio_segflg = UIO_SYSSPACE;
6219 uio.uio_rw = UIO_READ;
6220 uio.uio_td = curthread;
6221
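/*
 * Read the directory one struct dirent at a time.  Any entry other than
 * ".", ".." or a whiteout means the directory is not empty.
 */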
6222 while (eof == 0 && error == 0) {
6223 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
6224 NULL, NULL);
6225 if (error != 0)
6226 break;
6227 endp = (void *)((uint8_t *)dirent +
6228 sizeof(struct dirent) - uio.uio_resid);
6229 for (dp = dirent; dp < endp;
6230 dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
6231 if (dp->d_type == DT_WHT)
6232 continue;
6233 if (dp->d_namlen == 0)
6234 continue;
6235 if (dp->d_type != DT_DIR &&
6236 dp->d_type != DT_UNKNOWN) {
6237 error = ENOTEMPTY;
6238 break;
6239 }
6240 if (dp->d_namlen > 2) {
6241 error = ENOTEMPTY;
6242 break;
6243 }
6244 if (dp->d_namlen == 1 &&
6245 dp->d_name[0] != '.') {
6246 error = ENOTEMPTY;
6247 break;
6248 }
6249 if (dp->d_namlen == 2 && (dp->d_name[0] != '.' ||
6250 dp->d_name[1] != '.')) {
6251 error = ENOTEMPTY;
6252 break;
6253 }
6254 uio.uio_resid = sizeof(struct dirent);
6255 }
6256 }
6257 free(dirent, M_TEMP);
6258 return (error);
6259 }
6260
6261 int
6262 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
6263 {
6264 int error;
6265
6266 if (dp->d_reclen > ap->a_uio->uio_resid)
6267 return (ENAMETOOLONG);
6268 error = uiomove(dp, dp->d_reclen, ap->a_uio);
6269 if (error) {
6270 if (ap->a_ncookies != NULL) {
6271 if (ap->a_cookies != NULL)
6272 free(ap->a_cookies, M_TEMP);
6273 ap->a_cookies = NULL;
6274 *ap->a_ncookies = 0;
6275 }
6276 return (error);
6277 }
6278 if (ap->a_ncookies == NULL)
6279 return (0);
6280
6281 KASSERT(ap->a_cookies,
6282 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
6283
6284 *ap->a_cookies = realloc(*ap->a_cookies,
6285 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
6286 (*ap->a_cookies)[*ap->a_ncookies] = off;
6287 *ap->a_ncookies += 1;
6288 return (0);
6289 }
6290
6291 /*
6292 * The purpose of this routine is to remove granularity from accmode_t,
6293 * reducing it to the standard unix access bits - VEXEC, VREAD, VWRITE,
6294 * VADMIN and VAPPEND.
6295 *
6296 * If it returns 0, the caller is supposed to continue with the usual
6297 * access checks using 'accmode' as modified by this routine. If it
6298 * returns a nonzero value, the caller is supposed to return that value
6299 * as errno.
6300 *
6301 * Note that after this routine runs, accmode may be zero.
6302 */
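/*
 * An illustrative caller (a sketch; the names are hypothetical) in a
 * filesystem access method:
 *
 *     accmode_t accmode = ap->a_accmode;
 *     int error;
 *
 *     error = vfs_unixify_accmode(&accmode);
 *     if (error != 0)
 *         return (error);
 *     if (accmode == 0)
 *         return (0);
 *     ... continue with the usual VEXEC/VREAD/VWRITE/VADMIN/VAPPEND checks
 */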
6303 int
6304 vfs_unixify_accmode(accmode_t *accmode)
6305 {
6306 /*
6307 * There is no way to specify an explicit "deny" rule using
6308 * file mode or POSIX.1e ACLs.
6309 */
6310 if (*accmode & VEXPLICIT_DENY) {
6311 *accmode = 0;
6312 return (0);
6313 }
6314
6315 /*
6316 * None of these can be translated into usual access bits.
6317 * Also, the common case for NFSv4 ACLs is to not contain
6318 * either of these bits. Caller should check for VWRITE
6319 * on the containing directory instead.
6320 */
6321 if (*accmode & (VDELETE_CHILD | VDELETE))
6322 return (EPERM);
6323
6324 if (*accmode & VADMIN_PERMS) {
6325 *accmode &= ~VADMIN_PERMS;
6326 *accmode |= VADMIN;
6327 }
6328
6329 /*
6330 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
6331 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
6332 */
6333 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
6334
6335 return (0);
6336 }
6337
6338 /*
6339 * Clear out a doomed vnode (if any) and replace it with a new one as long
6340 * as the fs is not being unmounted. Return the root vnode to the caller.
6341 */
6342 static int __noinline
6343 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
6344 {
6345 struct vnode *vp;
6346 int error;
6347
6348 restart:
6349 if (mp->mnt_rootvnode != NULL) {
6350 MNT_ILOCK(mp);
6351 vp = mp->mnt_rootvnode;
6352 if (vp != NULL) {
6353 if (!VN_IS_DOOMED(vp)) {
6354 vrefact(vp);
6355 MNT_IUNLOCK(mp);
6356 error = vn_lock(vp, flags);
6357 if (error == 0) {
6358 *vpp = vp;
6359 return (0);
6360 }
6361 vrele(vp);
6362 goto restart;
6363 }
6364 /*
6365 * Clear the old one.
6366 */
6367 mp->mnt_rootvnode = NULL;
6368 }
6369 MNT_IUNLOCK(mp);
6370 if (vp != NULL) {
6371 vfs_op_barrier_wait(mp);
6372 vrele(vp);
6373 }
6374 }
6375 error = VFS_CACHEDROOT(mp, flags, vpp);
6376 if (error != 0)
6377 return (error);
6378 if (mp->mnt_vfs_ops == 0) {
6379 MNT_ILOCK(mp);
6380 if (mp->mnt_vfs_ops != 0) {
6381 MNT_IUNLOCK(mp);
6382 return (0);
6383 }
6384 if (mp->mnt_rootvnode == NULL) {
6385 vrefact(*vpp);
6386 mp->mnt_rootvnode = *vpp;
6387 } else {
6388 if (mp->mnt_rootvnode != *vpp) {
6389 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
6390 panic("%s: mismatch between vnode returned "
6391 "by VFS_CACHEDROOT and the one cached "
6392 "(%p != %p)",
6393 __func__, *vpp, mp->mnt_rootvnode);
6394 }
6395 }
6396 }
6397 MNT_IUNLOCK(mp);
6398 }
6399 return (0);
6400 }
6401
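/*
 * Return a referenced and locked root vnode for the given mount point.  The
 * fast path reads mnt_rootvnode without the mount interlock, relying on the
 * per-cpu vfs_op_thread_enter() section; if that fails, or the cached vnode
 * is doomed or cannot be locked, fall back to vfs_cache_root_fallback().
 */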
6402 int
6403 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
6404 {
6405 struct mount_pcpu *mpcpu;
6406 struct vnode *vp;
6407 int error;
6408
6409 if (!vfs_op_thread_enter(mp, mpcpu))
6410 return (vfs_cache_root_fallback(mp, flags, vpp));
6411 vp = atomic_load_ptr(&mp->mnt_rootvnode);
6412 if (vp == NULL || VN_IS_DOOMED(vp)) {
6413 vfs_op_thread_exit(mp, mpcpu);
6414 return (vfs_cache_root_fallback(mp, flags, vpp));
6415 }
6416 vrefact(vp);
6417 vfs_op_thread_exit(mp, mpcpu);
6418 error = vn_lock(vp, flags);
6419 if (error != 0) {
6420 vrele(vp);
6421 return (vfs_cache_root_fallback(mp, flags, vpp));
6422 }
6423 *vpp = vp;
6424 return (0);
6425 }
6426
6427 struct vnode *
6428 vfs_cache_root_clear(struct mount *mp)
6429 {
6430 struct vnode *vp;
6431
6432 /*
6433 * mnt_vfs_ops > 0 guarantees nobody can see this vnode via the lockless fast path
6434 */
6435 MPASS(mp->mnt_vfs_ops > 0);
6436 vp = mp->mnt_rootvnode;
6437 if (vp != NULL)
6438 vn_seqc_write_begin(vp);
6439 mp->mnt_rootvnode = NULL;
6440 return (vp);
6441 }
6442
6443 void
6444 vfs_cache_root_set(struct mount *mp, struct vnode *vp)
6445 {
6446
6447 MPASS(mp->mnt_vfs_ops > 0);
6448 vrefact(vp);
6449 mp->mnt_rootvnode = vp;
6450 }
6451
6452 /*
6453 * These are helper functions for filesystems to traverse all
6454 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
6455 *
6456 * This interface replaces MNT_VNODE_FOREACH.
6457 */
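/*
 * A typical consumer (a sketch; the VREG filter is just an example) looks
 * like:
 *
 *     struct vnode *vp, *mvp;
 *
 *     MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *         if (vp->v_type != VREG) {
 *             VI_UNLOCK(vp);
 *             continue;
 *         }
 *         ... vp is returned with its interlock held ...
 *         VI_UNLOCK(vp);
 *     }
 *
 * MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp) stops the iteration early and frees
 * the marker vnode.
 */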
6458
6459 struct vnode *
6460 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
6461 {
6462 struct vnode *vp;
6463
6464 if (should_yield())
6465 kern_yield(PRI_USER);
6466 MNT_ILOCK(mp);
6467 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
6468 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
6469 vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
6470 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
6471 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
6472 continue;
6473 VI_LOCK(vp);
6474 if (VN_IS_DOOMED(vp)) {
6475 VI_UNLOCK(vp);
6476 continue;
6477 }
6478 break;
6479 }
6480 if (vp == NULL) {
6481 __mnt_vnode_markerfree_all(mvp, mp);
6482 /* MNT_IUNLOCK(mp); -- done in above function */
6483 mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
6484 return (NULL);
6485 }
6486 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
6487 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
6488 MNT_IUNLOCK(mp);
6489 return (vp);
6490 }
6491
6492 struct vnode *
6493 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
6494 {
6495 struct vnode *vp;
6496
6497 *mvp = vn_alloc_marker(mp);
6498 MNT_ILOCK(mp);
6499 MNT_REF(mp);
6500
6501 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
6502 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
6503 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
6504 continue;
6505 VI_LOCK(vp);
6506 if (VN_IS_DOOMED(vp)) {
6507 VI_UNLOCK(vp);
6508 continue;
6509 }
6510 break;
6511 }
6512 if (vp == NULL) {
6513 MNT_REL(mp);
6514 MNT_IUNLOCK(mp);
6515 vn_free_marker(*mvp);
6516 *mvp = NULL;
6517 return (NULL);
6518 }
6519 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
6520 MNT_IUNLOCK(mp);
6521 return (vp);
6522 }
6523
6524 void
6525 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
6526 {
6527
6528 if (*mvp == NULL) {
6529 MNT_IUNLOCK(mp);
6530 return;
6531 }
6532
6533 mtx_assert(MNT_MTX(mp), MA_OWNED);
6534
6535 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
6536 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
6537 MNT_REL(mp);
6538 MNT_IUNLOCK(mp);
6539 vn_free_marker(*mvp);
6540 *mvp = NULL;
6541 }
6542
6543 /*
6544 * These are helper functions for filesystems to traverse their
6545 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
6546 */
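/*
 * The consumer-facing interface is MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, cb,
 * cbarg): only vnodes for which cb(vp, cbarg) returns true are visited, each
 * returned with its interlock held.  MNT_VNODE_FOREACH_LAZY_ABORT(mp, mvp)
 * ends the iteration early.
 */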
6547 static void
6548 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
6549 {
6550
6551 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
6552
6553 MNT_ILOCK(mp);
6554 MNT_REL(mp);
6555 MNT_IUNLOCK(mp);
6556 vn_free_marker(*mvp);
6557 *mvp = NULL;
6558 }
6559
6560 /*
6561 * Relock the mp mount vnode list lock with the vp vnode interlock in the
6562 * conventional lock order during mnt_vnode_next_lazy iteration.
6563 *
6564 * On entry, the mount vnode list lock is held and the vnode interlock is not.
6565 * The list lock is dropped and reacquired. On success, both locks are held.
6566 * On failure, the mount vnode list lock is held but the vnode interlock is
6567 * not, and the procedure may have yielded.
6568 */
6569 static bool
6570 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
6571 struct vnode *vp)
6572 {
6573
6574 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
6575 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
6576 ("%s: bad marker", __func__));
6577 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
6578 ("%s: inappropriate vnode", __func__));
6579 ASSERT_VI_UNLOCKED(vp, __func__);
6580 mtx_assert(&mp->mnt_listmtx, MA_OWNED);
6581
6582 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
6583 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);
6584
6585 /*
6586 * Note we may be racing against vdrop which transitioned the hold
6587 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine;
6588 * if we are the only user after we get the interlock, we will just
6589 * vdrop.
6590 */
6591 vhold(vp);
6592 mtx_unlock(&mp->mnt_listmtx);
6593 VI_LOCK(vp);
6594 if (VN_IS_DOOMED(vp)) {
6595 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
6596 goto out_lost;
6597 }
6598 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
6599 /*
6600 * There is nothing to do if we are the last user.
6601 */
6602 if (!refcount_release_if_not_last(&vp->v_holdcnt))
6603 goto out_lost;
6604 mtx_lock(&mp->mnt_listmtx);
6605 return (true);
6606 out_lost:
6607 vdropl(vp);
6608 maybe_yield();
6609 mtx_lock(&mp->mnt_listmtx);
6610 return (false);
6611 }
6612
6613 static struct vnode *
6614 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
6615 void *cbarg)
6616 {
6617 struct vnode *vp;
6618
6619 mtx_assert(&mp->mnt_listmtx, MA_OWNED);
6620 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
6621 restart:
6622 vp = TAILQ_NEXT(*mvp, v_lazylist);
6623 while (vp != NULL) {
6624 if (vp->v_type == VMARKER) {
6625 vp = TAILQ_NEXT(vp, v_lazylist);
6626 continue;
6627 }
6628 /*
6629 * See if we want to process the vnode. Note we may encounter a
6630 * long string of vnodes we don't care about and hog the list
6631 * as a result. Check for it and requeue the marker.
6632 */
6633 VNPASS(!VN_IS_DOOMED(vp), vp);
6634 if (!cb(vp, cbarg)) {
6635 if (!should_yield()) {
6636 vp = TAILQ_NEXT(vp, v_lazylist);
6637 continue;
6638 }
6639 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
6640 v_lazylist);
6641 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
6642 v_lazylist);
6643 mtx_unlock(&mp->mnt_listmtx);
6644 kern_yield(PRI_USER);
6645 mtx_lock(&mp->mnt_listmtx);
6646 goto restart;
6647 }
6648 /*
6649 * Try-lock because this is the wrong lock order.
6650 */
6651 if (!VI_TRYLOCK(vp) &&
6652 !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
6653 goto restart;
6654 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
6655 KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
6656 ("alien vnode on the lazy list %p %p", vp, mp));
6657 VNPASS(vp->v_mount == mp, vp);
6658 VNPASS(!VN_IS_DOOMED(vp), vp);
6659 break;
6660 }
6661 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
6662
6663 /* Check if we are done */
6664 if (vp == NULL) {
6665 mtx_unlock(&mp->mnt_listmtx);
6666 mnt_vnode_markerfree_lazy(mvp, mp);
6667 return (NULL);
6668 }
6669 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
6670 mtx_unlock(&mp->mnt_listmtx);
6671 ASSERT_VI_LOCKED(vp, "lazy iter");
6672 return (vp);
6673 }
6674
6675 struct vnode *
6676 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
6677 void *cbarg)
6678 {
6679
6680 if (should_yield())
6681 kern_yield(PRI_USER);
6682 mtx_lock(&mp->mnt_listmtx);
6683 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
6684 }
6685
6686 struct vnode *
6687 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
6688 void *cbarg)
6689 {
6690 struct vnode *vp;
6691
6692 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist))
6693 return (NULL);
6694
6695 *mvp = vn_alloc_marker(mp);
6696 MNT_ILOCK(mp);
6697 MNT_REF(mp);
6698 MNT_IUNLOCK(mp);
6699
6700 mtx_lock(&mp->mnt_listmtx);
6701 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
6702 if (vp == NULL) {
6703 mtx_unlock(&mp->mnt_listmtx);
6704 mnt_vnode_markerfree_lazy(mvp, mp);
6705 return (NULL);
6706 }
6707 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
6708 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
6709 }
6710
6711 void
6712 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
6713 {
6714
6715 if (*mvp == NULL)
6716 return;
6717
6718 mtx_lock(&mp->mnt_listmtx);
6719 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
6720 mtx_unlock(&mp->mnt_listmtx);
6721 mnt_vnode_markerfree_lazy(mvp, mp);
6722 }
6723
6724 int
6725 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
6726 {
6727
6728 if ((cnp->cn_flags & NOEXECCHECK) != 0) {
6729 cnp->cn_flags &= ~NOEXECCHECK;
6730 return (0);
6731 }
6732
6733 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread));
6734 }
6735
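/*
 * Vnode sequence counter (v_seqc) write sections.  Lockless lookup (see
 * cache_fplookup() in kern/vfs_cache.c) snapshots the counter and re-checks
 * it to detect concurrent modification; code which changes vnode state
 * observed by lockless lookup brackets the change with vn_seqc_write_begin()
 * and vn_seqc_write_end().
 */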
6736 /*
6737 * Do not use this variant unless you have means other than the hold count
6738 * to prevent the vnode from getting freed.
6739 */
6740 void
6741 vn_seqc_write_begin_unheld_locked(struct vnode *vp)
6742 {
6743
6744 ASSERT_VI_LOCKED(vp, __func__);
6745 VNPASS(vp->v_seqc_users >= 0, vp);
6746 vp->v_seqc_users++;
6747 if (vp->v_seqc_users == 1)
6748 seqc_sleepable_write_begin(&vp->v_seqc);
6749 }
6750
6751 void
6752 vn_seqc_write_begin_locked(struct vnode *vp)
6753 {
6754
6755 ASSERT_VI_LOCKED(vp, __func__);
6756 VNPASS(vp->v_holdcnt > 0, vp);
6757 vn_seqc_write_begin_unheld_locked(vp);
6758 }
6759
6760 void
6761 vn_seqc_write_begin(struct vnode *vp)
6762 {
6763
6764 VI_LOCK(vp);
6765 vn_seqc_write_begin_locked(vp);
6766 VI_UNLOCK(vp);
6767 }
6768
6769 void
6770 vn_seqc_write_begin_unheld(struct vnode *vp)
6771 {
6772
6773 VI_LOCK(vp);
6774 vn_seqc_write_begin_unheld_locked(vp);
6775 VI_UNLOCK(vp);
6776 }
6777
6778 void
6779 vn_seqc_write_end_locked(struct vnode *vp)
6780 {
6781
6782 ASSERT_VI_LOCKED(vp, __func__);
6783 VNPASS(vp->v_seqc_users > 0, vp);
6784 vp->v_seqc_users--;
6785 if (vp->v_seqc_users == 0)
6786 seqc_sleepable_write_end(&vp->v_seqc);
6787 }
6788
6789 void
6790 vn_seqc_write_end(struct vnode *vp)
6791 {
6792
6793 VI_LOCK(vp);
6794 vn_seqc_write_end_locked(vp);
6795 VI_UNLOCK(vp);
6796 }
