diff options
Diffstat (limited to 'fs')
436 files changed, 15591 insertions, 11857 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 18f74ec4dce9..9d03d1ebca6f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1001,44 +1001,6 @@ done: } /** - * v9fs_vfs_readlink - read a symlink's location - * @dentry: dentry for symlink - * @buffer: buffer to load symlink location into - * @buflen: length of buffer - * - */ - -static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer, - int buflen) -{ - int retval; - int ret; - char *link = __getname(); - - if (unlikely(!link)) - return -ENOMEM; - - if (buflen > PATH_MAX) - buflen = PATH_MAX; - - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); - - retval = v9fs_readlink(dentry, link, buflen); - - if (retval > 0) { - if ((ret = copy_to_user(buffer, link, retval)) != 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "problem copying to user: %d\n", ret); - retval = ret; - } - } - - __putname(link); - return retval; -} - -/** * v9fs_vfs_follow_link - follow a symlink path * @dentry: dentry for symlink * @nd: nameidata @@ -1230,7 +1192,6 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = { .rmdir = v9fs_vfs_rmdir, .mknod = v9fs_vfs_mknod, .rename = v9fs_vfs_rename, - .readlink = v9fs_vfs_readlink, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; @@ -1253,7 +1214,7 @@ static const struct inode_operations v9fs_file_inode_operations = { }; static const struct inode_operations v9fs_symlink_inode_operations = { - .readlink = v9fs_vfs_readlink, + .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, .put_link = v9fs_vfs_put_link, .getattr = v9fs_vfs_getattr, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 14a86448572c..69357c0d9899 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -188,7 +188,8 @@ static void v9fs_kill_super(struct super_block *s) P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); - v9fs_dentry_release(s->s_root); /* clunk root */ + if (s->s_root) + v9fs_dentry_release(s->s_root); /* clunk root */ kill_anon_super(s); diff --git a/fs/affs/affs.h b/fs/affs/affs.h index e511dc621a2e..0e40caaba456 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -106,8 +106,8 @@ struct affs_sb_info { u32 s_last_bmap; struct buffer_head *s_bmap_bh; char *s_prefix; /* Prefix for volumes and assigns. */ - int s_prefix_len; /* Length of prefix. */ char s_volume[32]; /* Volume prefix for absolute symlinks. */ + spinlock_t symlink_lock; /* protects the previous two */ }; #define SF_INTL 0x0001 /* International filesystem. */ diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 960d336ec694..d70bbbac6b7b 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -341,10 +341,13 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) p = (char *)AFFS_HEAD(bh)->table; lc = '/'; if (*symname == '/') { + struct affs_sb_info *sbi = AFFS_SB(sb); while (*symname == '/') symname++; - while (AFFS_SB(sb)->s_volume[i]) /* Cannot overflow */ - *p++ = AFFS_SB(sb)->s_volume[i++]; + spin_lock(&sbi->symlink_lock); + while (sbi->s_volume[i]) /* Cannot overflow */ + *p++ = sbi->s_volume[i++]; + spin_unlock(&sbi->symlink_lock); } while (i < maxlen && (c = *symname++)) { if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') { diff --git a/fs/affs/super.c b/fs/affs/super.c index 104fdcb3a7fc..d41e9673cd97 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -203,7 +203,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s switch (token) { case Opt_bs: if (match_int(&args[0], &n)) - return -EINVAL; + return 0; if (n != 512 && n != 1024 && n != 2048 && n != 4096) { printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); @@ -213,7 +213,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_mode: if (match_octal(&args[0], &option)) - return 1; + return 0; *mode = option & 0777; *mount_opts |= SF_SETMODE; break; @@ -221,8 +221,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s *mount_opts |= SF_MUFS; break; case Opt_prefix: - /* Free any previous prefix */ - kfree(*prefix); *prefix = match_strdup(&args[0]); if (!*prefix) return 0; @@ -233,21 +231,21 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_reserved: if (match_int(&args[0], reserved)) - return 1; + return 0; break; case Opt_root: if (match_int(&args[0], root)) - return 1; + return 0; break; case Opt_setgid: if (match_int(&args[0], &option)) - return 1; + return 0; *gid = option; *mount_opts |= SF_SETGID; break; case Opt_setuid: if (match_int(&args[0], &option)) - return -EINVAL; + return 0; *uid = option; *mount_opts |= SF_SETUID; break; @@ -311,11 +309,14 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; mutex_init(&sbi->s_bmlock); + spin_lock_init(&sbi->symlink_lock); if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); + kfree(sbi->s_prefix); + kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ @@ -516,14 +517,18 @@ affs_remount(struct super_block *sb, int *flags, char *data) unsigned long mount_flags; int res = 0; char *new_opts = kstrdup(data, GFP_KERNEL); + char volume[32]; + char *prefix = NULL; pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); *flags |= MS_NODIRATIME; + memcpy(volume, sbi->s_volume, 32); if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, - &blocksize, &sbi->s_prefix, sbi->s_volume, + &blocksize, &prefix, volume, &mount_flags)) { + kfree(prefix); kfree(new_opts); return -EINVAL; } @@ -534,6 +539,14 @@ affs_remount(struct super_block *sb, int *flags, char *data) sbi->s_mode = mode; sbi->s_uid = uid; sbi->s_gid = gid; + /* protect against readers */ + spin_lock(&sbi->symlink_lock); + if (prefix) { + kfree(sbi->s_prefix); + sbi->s_prefix = prefix; + } + memcpy(sbi->s_volume, volume, 32); + spin_unlock(&sbi->symlink_lock); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { unlock_kernel(); diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 41782539c907..ee00f08c4f53 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -20,7 +20,6 @@ static int affs_symlink_readpage(struct file *file, struct page *page) int i, j; char c; char lc; - char *pf; pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino); @@ -32,11 +31,15 @@ static int affs_symlink_readpage(struct file *file, struct page *page) j = 0; lf = (struct slink_front *)bh->b_data; lc = 0; - pf = AFFS_SB(inode->i_sb)->s_prefix ? AFFS_SB(inode->i_sb)->s_prefix : "/"; if (strchr(lf->symname,':')) { /* Handle assign or volume name */ + struct affs_sb_info *sbi = AFFS_SB(inode->i_sb); + char *pf; + spin_lock(&sbi->symlink_lock); + pf = sbi->s_prefix ? sbi->s_prefix : "/"; while (i < 1023 && (c = pf[i])) link[i++] = c; + spin_unlock(&sbi->symlink_lock); while (i < 1023 && lf->symname[j] != ':') link[i++] = lf->symname[j++]; if (i < 1023) diff --git a/fs/afs/write.c b/fs/afs/write.c index c63a3c8beb73..5e15a21dbf9f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -671,7 +671,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); ssize_t result; size_t count = iov_length(iov, nr_segs); - int ret; _enter("{%x.%u},{%zu},%lu,", vnode->fid.vid, vnode->fid.vnode, count, nr_segs); @@ -691,13 +690,6 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, return result; } - /* return error values for O_SYNC and IS_SYNC() */ - if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) { - ret = afs_fsync(iocb->ki_filp, dentry, 1); - if (ret < 0) - result = ret; - } - _leave(" = %zd", result); return result; } @@ -711,10 +711,8 @@ static ssize_t aio_run_iocb(struct kiocb *iocb) */ ret = retry(iocb); - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { - BUG_ON(!list_empty(&iocb->ki_wait.task_list)); + if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) aio_complete(iocb, ret, 0); - } out: spin_lock_irq(&ctx->ctx_lock); @@ -866,13 +864,6 @@ static void try_queue_kicked_iocb(struct kiocb *iocb) unsigned long flags; int run = 0; - /* We're supposed to be the only path putting the iocb back on the run - * list. If we find that the iocb is *back* on a wait queue already - * than retry has happened before we could queue the iocb. This also - * means that the retry could have completed and freed our iocb, no - * good. */ - BUG_ON((!list_empty(&iocb->ki_wait.task_list))); - spin_lock_irqsave(&ctx->ctx_lock, flags); /* set this inside the lock so that we can't race with aio_run_iocb() * testing it and putting the iocb on the run list under the lock */ @@ -886,7 +877,7 @@ static void try_queue_kicked_iocb(struct kiocb *iocb) /* * kick_iocb: * Called typically from a wait queue callback context - * (aio_wake_function) to trigger a retry of the iocb. + * to trigger a retry of the iocb. * The retry is usually executed by aio workqueue * threads (See aio_kick_handler). */ @@ -1520,31 +1511,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb) return 0; } -/* - * aio_wake_function: - * wait queue callback function for aio notification, - * Simply triggers a retry of the operation via kick_iocb. - * - * This callback is specified in the wait queue entry in - * a kiocb. - * - * Note: - * This routine is executed with the wait queue lock held. - * Since kick_iocb acquires iocb->ctx->ctx_lock, it nests - * the ioctx lock inside the wait queue lock. This is safe - * because this callback isn't used for wait queues which - * are nested inside ioctx lock (i.e. ctx->wait) - */ -static int aio_wake_function(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait); - - list_del_init(&wait->task_list); - kick_iocb(iocb); - return 1; -} - static void aio_batch_add(struct address_space *mapping, struct hlist_head *batch_hash) { @@ -1642,8 +1608,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - init_waitqueue_func_entry(&req->ki_wait, aio_wake_function); - INIT_LIST_HEAD(&req->ki_wait.task_list); ret = aio_setup_iocb(req); diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 2ca7a7cafdbf..9f0bf13291e5 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -35,14 +35,13 @@ static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags, mnt); } -static int anon_inodefs_delete_dentry(struct dentry *dentry) +/* + * anon_inodefs_dname() is called from d_path(). + */ +static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) { - /* - * We faked vfs to believe the dentry was hashed when we created it. - * Now we restore the flag so that dput() will work correctly. - */ - dentry->d_flags |= DCACHE_UNHASHED; - return 1; + return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s", + dentry->d_name.name); } static struct file_system_type anon_inode_fs_type = { @@ -51,7 +50,7 @@ static struct file_system_type anon_inode_fs_type = { .kill_sb = kill_anon_super, }; static const struct dentry_operations anon_inodefs_dentry_operations = { - .d_delete = anon_inodefs_delete_dentry, + .d_dname = anon_inodefs_dname, }; /* @@ -88,7 +87,7 @@ struct file *anon_inode_getfile(const char *name, void *priv, int flags) { struct qstr this; - struct dentry *dentry; + struct path path; struct file *file; int error; @@ -106,10 +105,11 @@ struct file *anon_inode_getfile(const char *name, this.name = name; this.len = strlen(name); this.hash = 0; - dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); - if (!dentry) + path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); + if (!path.dentry) goto err_module; + path.mnt = mntget(anon_inode_mnt); /* * We know the anon_inode inode count is always greater than zero, * so we can avoid doing an igrab() and we can use an open-coded @@ -117,27 +117,24 @@ struct file *anon_inode_getfile(const char *name, */ atomic_inc(&anon_inode_inode->i_count); - dentry->d_op = &anon_inodefs_dentry_operations; - /* Do not publish this dentry inside the global dentry hash table */ - dentry->d_flags &= ~DCACHE_UNHASHED; - d_instantiate(dentry, anon_inode_inode); + path.dentry->d_op = &anon_inodefs_dentry_operations; + d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; - file = alloc_file(anon_inode_mnt, dentry, - FMODE_READ | FMODE_WRITE, fops); + file = alloc_file(&path, OPEN_FMODE(flags), fops); if (!file) goto err_dput; file->f_mapping = anon_inode_inode->i_mapping; file->f_pos = 0; - file->f_flags = O_RDWR | (flags & O_NONBLOCK); + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); file->f_version = 0; file->private_data = priv; return file; err_dput: - dput(dentry); + path_put(&path); err_module: module_put(fops->owner); return ERR_PTR(error); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 8f7cdde41733..0118d67221b2 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -60,6 +60,11 @@ do { \ current->pid, __func__, ##args); \ } while (0) +struct rehash_entry { + struct task_struct *task; + struct list_head list; +}; + /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. It holds a reference to the dentry, so dentries are never @@ -75,6 +80,9 @@ struct autofs_info { struct completion expire_complete; struct list_head active; + int active_count; + struct list_head rehash_list; + struct list_head expiring; struct autofs_sb_info *sbi; @@ -95,6 +103,8 @@ struct autofs_info { #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ #define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ +#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ +#define AUTOFS_INF_REHASH (1<<3) /* dentry in transit to ->lookup() */ struct autofs_wait_queue { wait_queue_head_t queue; @@ -161,7 +171,7 @@ static inline int autofs4_ispending(struct dentry *dentry) { struct autofs_info *inf = autofs4_dentry_ino(dentry); - if (dentry->d_flags & DCACHE_AUTOFS_PENDING) + if (inf->flags & AUTOFS_INF_PENDING) return 1; if (inf->flags & AUTOFS_INF_EXPIRING) @@ -264,5 +274,31 @@ out: return ret; } +static inline void autofs4_add_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static inline void autofs4_del_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + } + return; +} + void autofs4_dentry_release(struct dentry *); extern void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 3da18d453488..74bc9aa6df31 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -27,7 +27,7 @@ static inline int autofs4_can_expire(struct dentry *dentry, return 0; /* No point expiring a pending mount */ - if (dentry->d_flags & DCACHE_AUTOFS_PENDING) + if (ino->flags & AUTOFS_INF_PENDING) return 0; if (!do_now) { @@ -279,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, root->d_mounted--; } ino->flags |= AUTOFS_INF_EXPIRING; + autofs4_add_expiring(root); init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; @@ -406,6 +407,7 @@ found: expired, (int)expired->d_name.len, expired->d_name.name); ino = autofs4_dentry_ino(expired); ino->flags |= AUTOFS_INF_EXPIRING; + autofs4_add_expiring(expired); init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); spin_lock(&dcache_lock); @@ -433,7 +435,7 @@ int autofs4_expire_wait(struct dentry *dentry) DPRINTK("expire done status=%d", status); - if (d_unhashed(dentry)) + if (d_unhashed(dentry) && IS_DEADDIR(dentry->d_inode)) return -EAGAIN; return status; @@ -473,6 +475,7 @@ int autofs4_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); ino->flags &= ~AUTOFS_INF_EXPIRING; + autofs4_del_expiring(dentry); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -503,6 +506,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, ino->flags &= ~AUTOFS_INF_MOUNTPOINT; } ino->flags &= ~AUTOFS_INF_EXPIRING; + autofs4_del_expiring(dentry); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 69c8142da838..d0a3de247458 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -49,6 +49,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, ino->dentry = NULL; ino->size = 0; INIT_LIST_HEAD(&ino->active); + INIT_LIST_HEAD(&ino->rehash_list); + ino->active_count = 0; INIT_LIST_HEAD(&ino->expiring); atomic_set(&ino->count, 0); } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index b96a3c57359d..30cc9ddf4b70 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -72,6 +72,139 @@ const struct inode_operations autofs4_dir_inode_operations = { .rmdir = autofs4_dir_rmdir, }; +static void autofs4_add_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!ino->active_count) { + if (list_empty(&ino->active)) + list_add(&ino->active, &sbi->active_list); + } + ino->active_count++; + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static void autofs4_del_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + ino->active_count--; + if (!ino->active_count) { + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + } + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static void autofs4_add_rehash_entry(struct autofs_info *ino, + struct rehash_entry *entry) +{ + entry->task = current; + INIT_LIST_HEAD(&entry->list); + list_add(&entry->list, &ino->rehash_list); + return; +} + +static void autofs4_remove_rehash_entry(struct autofs_info *ino) +{ + struct list_head *head = &ino->rehash_list; + struct rehash_entry *entry; + list_for_each_entry(entry, head, list) { + if (entry->task == current) { + list_del(&entry->list); + kfree(entry); + break; + } + } + return; +} + +static void autofs4_remove_rehash_entrys(struct autofs_info *ino) +{ + struct autofs_sb_info *sbi = ino->sbi; + struct rehash_entry *entry, *next; + struct list_head *head; + + spin_lock(&sbi->fs_lock); + spin_lock(&sbi->lookup_lock); + if (!(ino->flags & AUTOFS_INF_REHASH)) { + spin_unlock(&sbi->lookup_lock); + spin_unlock(&sbi->fs_lock); + return; + } + ino->flags &= ~AUTOFS_INF_REHASH; + head = &ino->rehash_list; + list_for_each_entry_safe(entry, next, head, list) { + list_del(&entry->list); + kfree(entry); + } + spin_unlock(&sbi->lookup_lock); + spin_unlock(&sbi->fs_lock); + dput(ino->dentry); + + return; +} + +static void autofs4_revalidate_drop(struct dentry *dentry, + struct rehash_entry *entry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + /* + * Add to the active list so we can pick this up in + * ->lookup(). Also add an entry to a rehash list so + * we know when there are no dentrys in flight so we + * know when we can rehash the dentry. + */ + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->active)) + list_add(&ino->active, &sbi->active_list); + autofs4_add_rehash_entry(ino, entry); + spin_unlock(&sbi->lookup_lock); + if (!(ino->flags & AUTOFS_INF_REHASH)) { + ino->flags |= AUTOFS_INF_REHASH; + dget(dentry); + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + } + return; +} + +static void autofs4_revalidate_rehash(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino->flags & AUTOFS_INF_REHASH) { + spin_lock(&sbi->lookup_lock); + autofs4_remove_rehash_entry(ino); + if (list_empty(&ino->rehash_list)) { + spin_unlock(&sbi->lookup_lock); + ino->flags &= ~AUTOFS_INF_REHASH; + d_rehash(dentry); + dput(ino->dentry); + } else + spin_unlock(&sbi->lookup_lock); + } + return; +} + +static unsigned int autofs4_need_mount(unsigned int flags) +{ + unsigned int res = 0; + if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS)) + res = 1; + return res; +} + static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; @@ -93,7 +226,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * it. */ spin_lock(&dcache_lock); - if (!d_mountpoint(dentry) && __simple_empty(dentry)) { + if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { spin_unlock(&dcache_lock); return -ENOENT; } @@ -103,7 +236,7 @@ out: return dcache_dir_open(inode, file); } -static int try_to_fill_dentry(struct dentry *dentry, int flags) +static int try_to_fill_dentry(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); @@ -116,55 +249,17 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) * Wait for a pending mount, triggering one if there * isn't one already */ - if (dentry->d_inode == NULL) { - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_MOUNT); - - DPRINTK("mount done status=%d", status); - - /* Turn this into a real negative dentry? */ - if (status == -ENOENT) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&dentry->d_lock); - return status; - } else if (status) { - /* Return a negative dentry, but leave it "pending" */ - return status; - } - /* Trigger mount for path component or follow link */ - } else if (dentry->d_flags & DCACHE_AUTOFS_PENDING || - flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) || - current->link_count) { - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); - - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_AUTOFS_PENDING; - spin_unlock(&dentry->d_lock); - status = autofs4_wait(sbi, dentry, NFY_MOUNT); + DPRINTK("waiting for mount name=%.*s", + dentry->d_name.len, dentry->d_name.name); - DPRINTK("mount done status=%d", status); + status = autofs4_wait(sbi, dentry, NFY_MOUNT); - if (status) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&dentry->d_lock); - return status; - } - } - - /* Initialize expiry counter after successful mount */ - if (ino) - ino->last_used = jiffies; + DPRINTK("mount done status=%d", status); - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&dentry->d_lock); + /* Update expiry counter */ + ino->last_used = jiffies; - return 0; + return status; } /* For autofs direct mounts the follow link triggers the mount */ @@ -202,27 +297,39 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) autofs4_expire_wait(dentry); /* We trigger a mount for almost all flags */ - lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS); - if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING)) + lookup_type = autofs4_need_mount(nd->flags); + spin_lock(&sbi->fs_lock); + spin_lock(&dcache_lock); + if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) { + spin_unlock(&dcache_lock); + spin_unlock(&sbi->fs_lock); goto follow; + } /* * If the dentry contains directories then it is an autofs * multi-mount with no root mount offset. So don't try to * mount it again. */ - spin_lock(&dcache_lock); - if (dentry->d_flags & DCACHE_AUTOFS_PENDING || - (!d_mountpoint(dentry) && __simple_empty(dentry))) { + if (ino->flags & AUTOFS_INF_PENDING || + (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) { + ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&dcache_lock); + spin_unlock(&sbi->fs_lock); + + status = try_to_fill_dentry(dentry); + + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); - status = try_to_fill_dentry(dentry, 0); if (status) goto out_error; goto follow; } spin_unlock(&dcache_lock); + spin_unlock(&sbi->fs_lock); follow: /* * If there is no root mount it must be an autofs @@ -254,18 +361,47 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir = dentry->d_parent->d_inode; struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - int oz_mode = autofs4_oz_mode(sbi); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct rehash_entry *entry; int flags = nd ? nd->flags : 0; - int status = 1; + unsigned int mutex_aquired; + + DPRINTK("name = %.*s oz_mode = %d", + dentry->d_name.len, dentry->d_name.name, oz_mode); + + /* Daemon never causes a mount to trigger */ + if (autofs4_oz_mode(sbi)) + return 1; + + entry = kmalloc(sizeof(struct rehash_entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + mutex_aquired = mutex_trylock(&dir->i_mutex); - /* Pending dentry */ spin_lock(&sbi->fs_lock); + spin_lock(&dcache_lock); + /* Pending dentry */ if (autofs4_ispending(dentry)) { - /* The daemon never causes a mount to trigger */ - spin_unlock(&sbi->fs_lock); + int status; - if (oz_mode) - return 1; + /* + * We can only unhash and send this to ->lookup() if + * the directory mutex is held over d_revalidate() and + * ->lookup(). This prevents the VFS from incorrectly + * seeing the dentry as non-existent. + */ + ino->flags |= AUTOFS_INF_PENDING; + if (!mutex_aquired) { + autofs4_revalidate_drop(dentry, entry); + spin_unlock(&dcache_lock); + spin_unlock(&sbi->fs_lock); + return 0; + } + spin_unlock(&dcache_lock); + spin_unlock(&sbi->fs_lock); + mutex_unlock(&dir->i_mutex); + kfree(entry); /* * If the directory has gone away due to an expire @@ -279,46 +415,82 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) * A zero status is success otherwise we have a * negative error code. */ - status = try_to_fill_dentry(dentry, flags); + status = try_to_fill_dentry(dentry); + + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + if (status == 0) return 1; return status; } - spin_unlock(&sbi->fs_lock); - - /* Negative dentry.. invalidate if "old" */ - if (dentry->d_inode == NULL) - return 0; /* Check for a non-mountpoint directory with no contents */ - spin_lock(&dcache_lock); if (S_ISDIR(dentry->d_inode->i_mode) && - !d_mountpoint(dentry) && - __simple_empty(dentry)) { + !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); - spin_unlock(&dcache_lock); - /* The daemon never causes a mount to trigger */ - if (oz_mode) - return 1; + if (autofs4_need_mount(flags) || current->link_count) { + int status; - /* - * A zero status is success otherwise we have a - * negative error code. - */ - status = try_to_fill_dentry(dentry, flags); - if (status == 0) - return 1; + /* + * We can only unhash and send this to ->lookup() if + * the directory mutex is held over d_revalidate() and + * ->lookup(). This prevents the VFS from incorrectly + * seeing the dentry as non-existent. + */ + ino->flags |= AUTOFS_INF_PENDING; + if (!mutex_aquired) { + autofs4_revalidate_drop(dentry, entry); + spin_unlock(&dcache_lock); + spin_unlock(&sbi->fs_lock); + return 0; + } + spin_unlock(&dcache_lock); + spin_unlock(&sbi->fs_lock); + mutex_unlock(&dir->i_mutex); + kfree(entry); - return status; + /* + * A zero status is success otherwise we have a + * negative error code. + */ + status = try_to_fill_dentry(dentry); + + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + + if (status == 0) + return 1; + + return status; + } } spin_unlock(&dcache_lock); + spin_unlock(&sbi->fs_lock); + + if (mutex_aquired) + mutex_unlock(&dir->i_mutex); + + kfree(entry); return 1; } +static void autofs4_free_rehash_entrys(struct autofs_info *inf) +{ + struct list_head *head = &inf->rehash_list; + struct rehash_entry *entry, *next; + list_for_each_entry_safe(entry, next, head, list) { + list_del(&entry->list); + kfree(entry); + } +} + void autofs4_dentry_release(struct dentry *de) { struct autofs_info *inf; @@ -337,6 +509,8 @@ void autofs4_dentry_release(struct dentry *de) list_del(&inf->active); if (!list_empty(&inf->expiring)) list_del(&inf->expiring); + if (!list_empty(&inf->rehash_list)) + autofs4_free_rehash_entrys(inf); spin_unlock(&sbi->lookup_lock); } @@ -359,35 +533,52 @@ static const struct dentry_operations autofs4_dentry_operations = { .d_release = autofs4_dentry_release, }; -static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) +static struct dentry *autofs4_lookup_active(struct dentry *dentry) { + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + struct qstr *name = &dentry->d_name; unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; struct list_head *p, *head; +restart: spin_lock(&dcache_lock); spin_lock(&sbi->lookup_lock); head = &sbi->active_list; list_for_each(p, head) { struct autofs_info *ino; - struct dentry *dentry; + struct dentry *active; struct qstr *qstr; ino = list_entry(p, struct autofs_info, active); - dentry = ino->dentry; + active = ino->dentry; - spin_lock(&dentry->d_lock); + spin_lock(&active->d_lock); /* Already gone? */ - if (atomic_read(&dentry->d_count) == 0) + if (atomic_read(&active->d_count) == 0) goto next; - qstr = &dentry->d_name; + if (active->d_inode && IS_DEADDIR(active->d_inode)) { + if (!list_empty(&ino->rehash_list)) { + dget(active); + spin_unlock(&active->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + autofs4_remove_rehash_entrys(ino); + dput(active); + goto restart; + } + goto next; + } + + qstr = &active->d_name; - if (dentry->d_name.hash != hash) + if (active->d_name.hash != hash) goto next; - if (dentry->d_parent != parent) + if (active->d_parent != parent) goto next; if (qstr->len != len) @@ -395,15 +586,13 @@ static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct d if (memcmp(qstr->name, str, len)) goto next; - if (d_unhashed(dentry)) { - dget(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); - return dentry; - } + dget(active); + spin_unlock(&active->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + return active; next: - spin_unlock(&dentry->d_lock); + spin_unlock(&active->d_lock); } spin_unlock(&sbi->lookup_lock); spin_unlock(&dcache_lock); @@ -411,8 +600,11 @@ next: return NULL; } -static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) +static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) { + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + struct qstr *name = &dentry->d_name; unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; @@ -423,23 +615,23 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct head = &sbi->expiring_list; list_for_each(p, head) { struct autofs_info *ino; - struct dentry *dentry; + struct dentry *expiring; struct qstr *qstr; ino = list_entry(p, struct autofs_info, expiring); - dentry = ino->dentry; + expiring = ino->dentry; - spin_lock(&dentry->d_lock); + spin_lock(&expiring->d_lock); /* Bad luck, we've already been dentry_iput */ - if (!dentry->d_inode) + if (!expiring->d_inode) goto next; - qstr = &dentry->d_name; + qstr = &expiring->d_name; - if (dentry->d_name.hash != hash) + if (expiring->d_name.hash != hash) goto next; - if (dentry->d_parent != parent) + if (expiring->d_parent != parent) goto next; if (qstr->len != len) @@ -447,15 +639,13 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct if (memcmp(qstr->name, str, len)) goto next; - if (d_unhashed(dentry)) { - dget(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); - return dentry; - } + dget(expiring); + spin_unlock(&expiring->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + return expiring; next: - spin_unlock(&dentry->d_lock); + spin_unlock(&expiring->d_lock); } spin_unlock(&sbi->lookup_lock); spin_unlock(&dcache_lock); @@ -463,13 +653,56 @@ next: return NULL; } +static struct autofs_info *init_new_dentry(struct autofs_sb_info *sbi, + struct dentry *dentry, int oz_mode) +{ + struct autofs_info *ino; + + /* + * Mark the dentry incomplete but don't hash it. We do this + * to serialize our inode creation operations (symlink and + * mkdir) which prevents deadlock during the callback to + * the daemon. Subsequent user space lookups for the same + * dentry are placed on the wait queue while the daemon + * itself is allowed passage unresticted so the create + * operation itself can then hash the dentry. Finally, + * we check for the hashed dentry and return the newly + * hashed dentry. + */ + dentry->d_op = &autofs4_root_dentry_operations; + + /* + * And we need to ensure that the same dentry is used for + * all following lookup calls until it is hashed so that + * the dentry flags are persistent throughout the request. + */ + ino = autofs4_init_ino(NULL, sbi, 0555); + if (!ino) + return ERR_PTR(-ENOMEM); + + dentry->d_fsdata = ino; + ino->dentry = dentry; + + /* + * Only set the mount pending flag for new dentrys not created + * by the daemon. + */ + if (!oz_mode) + ino->flags |= AUTOFS_INF_PENDING; + + d_instantiate(dentry, NULL); + + return ino; +} + /* Lookups in the root directory */ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; struct autofs_info *ino; - struct dentry *expiring, *unhashed; + struct dentry *expiring, *active; int oz_mode; + int status = 0; DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name); @@ -484,123 +717,100 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); - unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name); - if (unhashed) - dentry = unhashed; - else { - /* - * Mark the dentry incomplete but don't hash it. We do this - * to serialize our inode creation operations (symlink and - * mkdir) which prevents deadlock during the callback to - * the daemon. Subsequent user space lookups for the same - * dentry are placed on the wait queue while the daemon - * itself is allowed passage unresticted so the create - * operation itself can then hash the dentry. Finally, - * we check for the hashed dentry and return the newly - * hashed dentry. - */ - dentry->d_op = &autofs4_root_dentry_operations; - - /* - * And we need to ensure that the same dentry is used for - * all following lookup calls until it is hashed so that - * the dentry flags are persistent throughout the request. - */ - ino = autofs4_init_ino(NULL, sbi, 0555); - if (!ino) - return ERR_PTR(-ENOMEM); - - dentry->d_fsdata = ino; - ino->dentry = dentry; - - spin_lock(&sbi->lookup_lock); - list_add(&ino->active, &sbi->active_list); - spin_unlock(&sbi->lookup_lock); - - d_instantiate(dentry, NULL); + spin_lock(&sbi->fs_lock); + active = autofs4_lookup_active(dentry); + if (active) { + dentry = active; + ino = autofs4_dentry_ino(dentry); + /* If this came from revalidate, rehash it */ + autofs4_revalidate_rehash(dentry); + spin_unlock(&sbi->fs_lock); + } else { + spin_unlock(&sbi->fs_lock); + ino = init_new_dentry(sbi, dentry, oz_mode); + if (IS_ERR(ino)) + return (struct dentry *) ino; } + autofs4_add_active(dentry); + if (!oz_mode) { + expiring = autofs4_lookup_expiring(dentry); mutex_unlock(&dir->i_mutex); - expiring = autofs4_lookup_expiring(sbi, - dentry->d_parent, - &dentry->d_name); if (expiring) { /* * If we are racing with expire the request might not * be quite complete but the directory has been removed * so it must have been successful, so just wait for it. */ - ino = autofs4_dentry_ino(expiring); autofs4_expire_wait(expiring); - spin_lock(&sbi->lookup_lock); - if (!list_empty(&ino->expiring)) - list_del_init(&ino->expiring); - spin_unlock(&sbi->lookup_lock); dput(expiring); } - - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_AUTOFS_PENDING; - spin_unlock(&dentry->d_lock); - if (dentry->d_op && dentry->d_op->d_revalidate) - (dentry->d_op->d_revalidate)(dentry, nd); + status = try_to_fill_dentry(dentry); mutex_lock(&dir->i_mutex); + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); } + autofs4_del_active(dentry); + /* - * If we are still pending, check if we had to handle + * If we had a mount fail, check if we had to handle * a signal. If so we can force a restart.. */ - if (dentry->d_flags & DCACHE_AUTOFS_PENDING) { + if (status) { /* See if we were interrupted */ if (signal_pending(current)) { sigset_t *sigset = ¤t->pending.signal; if (sigismember (sigset, SIGKILL) || sigismember (sigset, SIGQUIT) || sigismember (sigset, SIGINT)) { - if (unhashed) - dput(unhashed); + if (active) + dput(active); return ERR_PTR(-ERESTARTNOINTR); } } - if (!oz_mode) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&dentry->d_lock); + } + + /* + * User space can (and has done in the past) remove and re-create + * this directory during the callback. This can leave us with an + * unhashed dentry, but a successful mount! So we need to + * perform another cached lookup in case the dentry now exists. + */ + if (!oz_mode && !have_submounts(dentry)) { + struct dentry *new; + new = d_lookup(dentry->d_parent, &dentry->d_name); + if (new) { + if (active) + dput(active); + return new; + } else { + if (!status) + status = -ENOENT; } } /* - * If this dentry is unhashed, then we shouldn't honour this - * lookup. Returning ENOENT here doesn't do the right thing - * for all system calls, but it should be OK for the operations - * we permit from an autofs. + * If we had a mount failure, return status to user space. + * If the mount succeeded and we used a dentry from the active queue + * return it. */ - if (!oz_mode && d_unhashed(dentry)) { + if (status) { + dentry = ERR_PTR(status); + if (active) + dput(active); + return dentry; + } else { /* - * A user space application can (and has done in the past) - * remove and re-create this directory during the callback. - * This can leave us with an unhashed dentry, but a - * successful mount! So we need to perform another - * cached lookup in case the dentry now exists. + * Valid successful mount, return active dentry or NULL + * for a new dentry. */ - struct dentry *parent = dentry->d_parent; - struct dentry *new = d_lookup(parent, &dentry->d_name); - if (new != NULL) - dentry = new; - else - dentry = ERR_PTR(-ENOENT); - - if (unhashed) - dput(unhashed); - - return dentry; + if (active) + return active; } - if (unhashed) - return unhashed; - return NULL; } @@ -624,11 +834,6 @@ static int autofs4_dir_symlink(struct inode *dir, if (!ino) return -ENOMEM; - spin_lock(&sbi->lookup_lock); - if (!list_empty(&ino->active)) - list_del_init(&ino->active); - spin_unlock(&sbi->lookup_lock); - ino->size = strlen(symname); cp = kmalloc(ino->size + 1, GFP_KERNEL); if (!cp) { @@ -705,10 +910,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; spin_lock(&dcache_lock); - spin_lock(&sbi->lookup_lock); - if (list_empty(&ino->expiring)) - list_add(&ino->expiring, &sbi->expiring_list); - spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -734,10 +935,6 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); return -ENOTEMPTY; } - spin_lock(&sbi->lookup_lock); - if (list_empty(&ino->expiring)) - list_add(&ino->expiring, &sbi->expiring_list); - spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -775,11 +972,6 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (!ino) return -ENOMEM; - spin_lock(&sbi->lookup_lock); - if (!list_empty(&ino->active)) - list_del_init(&ino->active); - spin_unlock(&sbi->lookup_lock); - inode = autofs4_get_inode(dir->i_sb, ino); if (!inode) { if (!dentry->d_fsdata) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 6f60336c6628..8f3d9fd89604 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -353,35 +353,35 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) struct inode *inode; unsigned i, imap_len; struct bfs_sb_info *info; - long ret = -EINVAL; + int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; + mutex_init(&info->bfs_lock); s->s_fs_info = info; sb_set_blocksize(s, BFS_BSIZE); - bh = sb_bread(s, 0); - if(!bh) + info->si_sbh = sb_bread(s, 0); + if (!info->si_sbh) goto out; - bfs_sb = (struct bfs_super_block *)bh->b_data; + bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data; if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { if (!silent) printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic)); - goto out; + goto out1; } if (BFS_UNCLEAN(bfs_sb, s) && !silent) printf("%s is unclean, continuing\n", s->s_id); s->s_magic = BFS_MAGIC; - info->si_sbh = bh; if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { printf("Superblock is corrupted\n"); - goto out; + goto out1; } info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / @@ -390,7 +390,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) imap_len = (info->si_lasti / 8) + 1; info->si_imap = kzalloc(imap_len, GFP_KERNEL); if (!info->si_imap) - goto out; + goto out1; for (i = 0; i < BFS_ROOT_INO; i++) set_bit(i, info->si_imap); @@ -398,15 +398,13 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) inode = bfs_iget(s, BFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - kfree(info->si_imap); - goto out; + goto out2; } s->s_root = d_alloc_root(inode); if (!s->s_root) { iput(inode); ret = -ENOMEM; - kfree(info->si_imap); - goto out; + goto out2; } info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; @@ -419,10 +417,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) bh = sb_bread(s, info->si_blocks - 1); if (!bh) { printf("Last block not available: %lu\n", info->si_blocks - 1); - iput(inode); ret = -EIO; - kfree(info->si_imap); - goto out; + goto out3; } brelse(bh); @@ -459,11 +455,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) printf("Inode 0x%08x corrupted\n", i); brelse(bh); - s->s_root = NULL; - kfree(info->si_imap); - kfree(info); - s->s_fs_info = NULL; - return -EIO; + ret = -EIO; + goto out3; } if (!di->i_ino) { @@ -483,11 +476,17 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_dirt = 1; } dump_imap("read_super", s); - mutex_init(&info->bfs_lock); return 0; +out3: + dput(s->s_root); + s->s_root = NULL; +out2: + kfree(info->si_imap); +out1: + brelse(info->si_sbh); out: - brelse(bh); + mutex_destroy(&info->bfs_lock); kfree(info); s->s_fs_info = NULL; return ret; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index b639dcf7c778..fdd397099172 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -32,7 +32,7 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int aout_core_dump(struct coredump_params *cprm); static struct linux_binfmt aout_format = { .module = THIS_MODULE, @@ -89,8 +89,9 @@ if (file->f_op->llseek) { \ * dumping of the process results in another error.. */ -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int aout_core_dump(struct coredump_params *cprm) { + struct file *file = cprm->file; mm_segment_t fs; int has_dumped = 0; unsigned long dump_start, dump_size; @@ -108,16 +109,16 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); - dump.signal = signr; - aout_dump_thread(regs, &dump); + dump.signal = cprm->signr; + aout_dump_thread(cprm->regs, &dump); /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. */ - if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) + if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit) dump.u_dsize = 0; /* Make sure we have enough room to write the stack and data areas. */ - if ((dump.u_ssize + 1) * PAGE_SIZE > limit) + if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) dump.u_ssize = 0; /* make sure we actually have a data and stack area to dump */ @@ -263,6 +264,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) #else set_personality(PER_LINUX); #endif + setup_new_exec(bprm); current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d15ea1790bfb..fd5b2ea5d299 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -44,8 +44,8 @@ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, * If we don't support core dumping, then supply a NULL so we * don't even try. */ -#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) -static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +#ifdef CONFIG_ELF_CORE +static int elf_core_dump(struct coredump_params *cprm); #else #define elf_core_dump NULL #endif @@ -662,27 +662,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') goto out_free_interp; - /* - * The early SET_PERSONALITY here is so that the lookup - * for the interpreter happens in the namespace of the - * to-be-execed image. SET_PERSONALITY can select an - * alternate root. - * - * However, SET_PERSONALITY is NOT allowed to switch - * this task into the new images's memory mapping - * policy - that is, TASK_SIZE must still evaluate to - * that which is appropriate to the execing application. - * This is because exit_mmap() needs to have TASK_SIZE - * evaluate to the size of the old image. - * - * So if (say) a 64-bit application is execing a 32-bit - * application it is the architecture's responsibility - * to defer changing the value of TASK_SIZE until the - * switch really is going to happen - do this in - * flush_thread(). - akpm - */ - SET_PERSONALITY(loc->elf_ex); - interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) @@ -730,9 +709,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* Verify the interpreter has a valid arch */ if (!elf_check_arch(&loc->interp_elf_ex)) goto out_free_dentry; - } else { - /* Executables without an interpreter also need a personality */ - SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@ -752,7 +728,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) current->flags |= PF_RANDOMIZE; - arch_pick_mmap_layout(current->mm); + + setup_new_exec(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ @@ -1101,12 +1078,7 @@ out: return error; } -/* - * Note that some platforms still use traditional core dumps and not - * the ELF core dump. Each platform can select it as appropriate. - */ -#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) - +#ifdef CONFIG_ELF_CORE /* * ELF core dumper * @@ -1277,8 +1249,9 @@ static int writenote(struct memelfnote *men, struct file *file, } #undef DUMP_WRITE -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > cprm->limit || \ + !dump_write(cprm->file, (addr), (nr))) \ goto end_coredump; static void fill_elf_header(struct elfhdr *elf, int segs, @@ -1906,7 +1879,7 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, * and then they are actually written out. If we run out of core limit * we just truncate. */ -static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; mm_segment_t fs; @@ -1952,7 +1925,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un * notes. This also sets up the file header. */ if (!fill_note_info(elf, segs + 1, /* including notes section */ - &info, signr, regs)) + &info, cprm->signr, cprm->regs)) goto cleanup; has_dumped = 1; @@ -2014,14 +1987,14 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un #endif /* write out the notes section */ - if (!write_note_info(&info, file, &foffset)) + if (!write_note_info(&info, cprm->file, &foffset)) goto end_coredump; - if (elf_coredump_extra_notes_write(file, &foffset)) + if (elf_coredump_extra_notes_write(cprm->file, &foffset)) goto end_coredump; /* Align to page */ - if (!dump_seek(file, dataoff - foffset)) + if (!dump_seek(cprm->file, dataoff - foffset)) goto end_coredump; for (vma = first_vma(current, gate_vma); vma != NULL; @@ -2038,12 +2011,13 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un page = get_dump_page(addr); if (page) { void *kaddr = kmap(page); - stop = ((size += PAGE_SIZE) > limit) || - !dump_write(file, kaddr, PAGE_SIZE); + stop = ((size += PAGE_SIZE) > cprm->limit) || + !dump_write(cprm->file, kaddr, + PAGE_SIZE); kunmap(page); page_cache_release(page); } else - stop = !dump_seek(file, PAGE_SIZE); + stop = !dump_seek(cprm->file, PAGE_SIZE); if (stop) goto end_coredump; } @@ -2063,7 +2037,7 @@ out: return has_dumped; } -#endif /* USE_ELF_CORE_DUMP */ +#endif /* CONFIG_ELF_CORE */ static int __init init_elf_binfmt(void) { diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 38502c67987c..18d77297ccc8 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -75,14 +75,14 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, struct file *, struct mm_struct *); -#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) -static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit); +#ifdef CONFIG_ELF_CORE +static int elf_fdpic_core_dump(struct coredump_params *cprm); #endif static struct linux_binfmt elf_fdpic_format = { .module = THIS_MODULE, .load_binary = load_elf_fdpic_binary, -#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) +#ifdef CONFIG_ELF_CORE .core_dump = elf_fdpic_core_dump, #endif .min_coredump = ELF_EXEC_PAGESIZE, @@ -171,6 +171,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, #ifdef ELF_FDPIC_PLAT_INIT unsigned long dynaddr; #endif +#ifndef CONFIG_MMU + unsigned long stack_prot; +#endif struct file *interpreter = NULL; /* to shut gcc up */ char *interpreter_name = NULL; int executable_stack; @@ -316,6 +319,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, * defunct, deceased, etc. after this point we have to exit via * error_kill */ set_personality(PER_LINUX_FDPIC); + if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + setup_new_exec(bprm); + set_binfmt(&elf_fdpic_format); current->mm->start_code = 0; @@ -377,10 +385,15 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, if (stack_size < PAGE_SIZE * 2) stack_size = PAGE_SIZE * 2; + stack_prot = PROT_READ | PROT_WRITE; + if (executable_stack == EXSTACK_ENABLE_X || + (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC)) + stack_prot |= PROT_EXEC; + down_write(¤t->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, 0, stack_size, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, + current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot, + MAP_PRIVATE | MAP_ANONYMOUS | + MAP_UNINITIALIZED | MAP_GROWSDOWN, 0); if (IS_ERR_VALUE(current->mm->start_brk)) { @@ -1200,7 +1213,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, * * Modelled on fs/binfmt_elf.c core dumper */ -#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) +#ifdef CONFIG_ELF_CORE /* * These are the only things you should do on a core-file: use only these @@ -1325,8 +1338,9 @@ static int writenote(struct memelfnote *men, struct file *file) #undef DUMP_WRITE #undef DUMP_SEEK -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > cprm->limit || \ + !dump_write(cprm->file, (addr), (nr))) \ goto end_coredump; static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) @@ -1581,8 +1595,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, * and then they are actually written out. If we run out of core limit * we just truncate. */ -static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, - struct file *file, unsigned long limit) +static int elf_fdpic_core_dump(struct coredump_params *cprm) { #define NUM_NOTES 6 int has_dumped = 0; @@ -1641,7 +1654,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, goto cleanup; #endif - if (signr) { + if (cprm->signr) { struct core_thread *ct; struct elf_thread_status *tmp; @@ -1660,14 +1673,14 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, int sz; tmp = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(signr, tmp); + sz = elf_dump_thread_status(cprm->signr, tmp); thread_status_size += sz; } } /* now collect the dump for the current */ - fill_prstatus(prstatus, current, signr); - elf_core_copy_regs(&prstatus->pr_reg, regs); + fill_prstatus(prstatus, current, cprm->signr); + elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); segs = current->mm->map_count; #ifdef ELF_CORE_EXTRA_PHDRS @@ -1702,7 +1715,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, /* Try to dump the FPU. */ if ((prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, regs, fpu))) + elf_core_copy_task_fpregs(current, cprm->regs, fpu))) fill_note(notes + numnote++, "CORE", NT_PRFPREG, sizeof(*fpu), fpu); #ifdef ELF_CORE_COPY_XFPREGS @@ -1773,7 +1786,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, /* write out the notes section */ for (i = 0; i < numnote; i++) - if (!writenote(notes + i, file)) + if (!writenote(notes + i, cprm->file)) goto end_coredump; /* write out the thread status notes section */ @@ -1782,25 +1795,26 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, list_entry(t, struct elf_thread_status, list); for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], file)) + if (!writenote(&tmp->notes[i], cprm->file)) goto end_coredump; } - if (!dump_seek(file, dataoff)) + if (!dump_seek(cprm->file, dataoff)) goto end_coredump; - if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) + if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, + mm_flags) < 0) goto end_coredump; #ifdef ELF_CORE_WRITE_EXTRA_DATA ELF_CORE_WRITE_EXTRA_DATA; #endif - if (file->f_pos != offset) { + if (cprm->file->f_pos != offset) { /* Sanity check */ printk(KERN_WARNING "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", - file->f_pos, offset); + cprm->file->f_pos, offset); } end_coredump: @@ -1825,4 +1839,4 @@ cleanup: #undef NUM_NOTES } -#endif /* USE_ELF_CORE_DUMP */ +#endif /* CONFIG_ELF_CORE */ diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a2796651e756..42c6b4a54445 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -87,7 +87,7 @@ static int load_flat_shared_library(int id, struct lib_info *p); #endif static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); -static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { .module = THIS_MODULE, @@ -102,10 +102,10 @@ static struct linux_binfmt flat_format = { * Currently only a stub-function. */ -static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int flat_core_dump(struct coredump_params *cprm) { printk("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, (int) signr); + current->comm, current->pid, (int) cprm->signr); return(1); } @@ -519,6 +519,7 @@ static int load_flat_file(struct linux_binprm * bprm, /* OK, This is the point of no return */ set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); } /* diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index eff74b9c9e77..cc8560f6c9b0 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -43,7 +43,7 @@ static int load_som_library(struct file *); * don't even try. */ #if 0 -static int som_core_dump(long signr, struct pt_regs *regs, unsigned long limit); +static int som_core_dump(struct coredump_params *cprm); #else #define som_core_dump NULL #endif @@ -227,6 +227,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* OK, This is the point of no return */ current->flags &= ~PF_FORKNOEXEC; current->personality = PER_HPUX; + setup_new_exec(bprm); /* Set the task size for HP-UX processes such that * the gateway page is outside the address space. diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 49a34e7f7306..a16f29e888cd 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -61,7 +61,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr) static inline int use_bip_pool(unsigned int idx) { - if (idx == BIOVEC_NR_POOLS) + if (idx == BIOVEC_MAX_IDX) return 1; return 0; @@ -95,6 +95,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, /* Use mempool if lower order alloc failed or max vecs were requested */ if (bip == NULL) { + idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); if (unlikely(bip == NULL)) { @@ -78,7 +78,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) i = 0; while (i < bio_slab_nr) { - struct bio_slab *bslab = &bio_slabs[i]; + bslab = &bio_slabs[i]; if (!bslab->slab && entry == -1) entry = i; @@ -542,13 +542,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { + unsigned int prev_bv_len = prev->bv_len; prev->bv_len += len; if (q->merge_bvec_fn) { struct bvec_merge_data bvm = { + /* prev_bvec is already charged in + bi_size, discharge it in order to + simulate merging updated prev_bvec + as new bvec. */ .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, + .bi_size = bio->bi_size - prev_bv_len, .bi_rw = bio->bi_rw, }; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 361604244271..6df6d6ed74fd 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -73,13 +73,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return acl; } -static int btrfs_xattr_get_acl(struct inode *inode, int type, - void *value, size_t size) +static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) { struct posix_acl *acl; int ret = 0; - acl = btrfs_get_acl(inode, type); + acl = btrfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); @@ -94,7 +94,8 @@ static int btrfs_xattr_get_acl(struct inode *inode, int type, /* * Needs to be called with fs_mutex held */ -static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; const char *name; @@ -111,12 +112,14 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: mode = inode->i_mode; - ret = posix_acl_equiv_mode(acl, &mode); - if (ret < 0) - return ret; - ret = 0; - inode->i_mode = mode; name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + inode->i_mode = mode; + } + ret = 0; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) @@ -140,8 +143,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) goto out; } - ret = __btrfs_setxattr(inode, name, value, size, 0); - + ret = __btrfs_setxattr(trans, inode, name, value, size, 0); out: kfree(value); @@ -151,10 +153,10 @@ out: return ret; } -static int btrfs_xattr_set_acl(struct inode *inode, int type, - const void *value, size_t size) +static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { - int ret = 0; + int ret; struct posix_acl *acl = NULL; if (value) { @@ -167,38 +169,13 @@ static int btrfs_xattr_set_acl(struct inode *inode, int type, } } - ret = btrfs_set_acl(inode, acl, type); + ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); posix_acl_release(acl); return ret; } - -static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - -static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - int btrfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; @@ -221,7 +198,8 @@ int btrfs_check_acl(struct inode *inode, int mask) * stuff has been fixed to work with that. If the locking stuff changes, we * need to re-evaluate the acl locking stuff. */ -int btrfs_init_acl(struct inode *inode, struct inode *dir) +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; int ret = 0; @@ -246,7 +224,8 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) mode_t mode; if (S_ISDIR(inode->i_mode)) { - ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); + ret = btrfs_set_acl(trans, inode, acl, + ACL_TYPE_DEFAULT); if (ret) goto failed; } @@ -261,10 +240,11 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) inode->i_mode = mode; if (ret > 0) { /* we need an acl */ - ret = btrfs_set_acl(inode, clone, + ret = btrfs_set_acl(trans, inode, clone, ACL_TYPE_ACCESS); } } + posix_acl_release(clone); } failed: posix_acl_release(acl); @@ -294,7 +274,7 @@ int btrfs_acl_chmod(struct inode *inode) ret = posix_acl_chmod_masq(clone, inode->i_mode); if (!ret) - ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); + ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); posix_acl_release(clone); @@ -303,14 +283,16 @@ int btrfs_acl_chmod(struct inode *inode) struct xattr_handler btrfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, - .get = btrfs_xattr_acl_default_get, - .set = btrfs_xattr_acl_default_set, + .flags = ACL_TYPE_DEFAULT, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, }; struct xattr_handler btrfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, - .get = btrfs_xattr_acl_access_get, - .set = btrfs_xattr_acl_access_set, + .flags = ACL_TYPE_ACCESS, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, }; #else /* CONFIG_BTRFS_FS_POSIX_ACL */ @@ -320,7 +302,8 @@ int btrfs_acl_chmod(struct inode *inode) return 0; } -int btrfs_init_acl(struct inode *inode, struct inode *dir) +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { return 0; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index f6783a42f010..3f1f50d9d916 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,9 +44,6 @@ struct btrfs_inode { */ struct extent_io_tree io_failure_tree; - /* held while inesrting or deleting extents from files */ - struct mutex extent_mutex; - /* held while logging the inode in tree-log.c */ struct mutex log_mutex; @@ -166,7 +163,7 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode) static inline void btrfs_i_size_write(struct inode *inode, u64 size) { - inode->i_size = size; + i_size_write(inode, size); BTRFS_I(inode)->disk_i_size = size; } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ec96f3a6d536..c4bc570a396e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,6 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); +static int setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); + struct btrfs_path *btrfs_alloc_path(void) { @@ -451,9 +456,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, extent_buffer_get(cow); spin_unlock(&root->node_lock); - btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + btrfs_free_tree_block(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, level); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -468,9 +472,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + btrfs_free_tree_block(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, level); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -1030,8 +1033,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); - ret = btrfs_free_extent(trans, root, mid->start, mid->len, - 0, root->root_key.objectid, level, 1); + ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, + 0, root->root_key.objectid, level); /* once for the root ptr */ free_extent_buffer(mid); return ret; @@ -1095,10 +1098,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, 1); if (wret) ret = wret; - wret = btrfs_free_extent(trans, root, bytenr, - blocksize, 0, - root->root_key.objectid, - level, 0); + wret = btrfs_free_tree_block(trans, root, + bytenr, blocksize, 0, + root->root_key.objectid, + level); if (wret) ret = wret; } else { @@ -1143,9 +1146,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = del_ptr(trans, root, path, level + 1, pslot); if (wret) ret = wret; - wret = btrfs_free_extent(trans, root, bytenr, blocksize, - 0, root->root_key.objectid, - level, 0); + wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, + 0, root->root_key.objectid, level); if (wret) ret = wret; } else { @@ -2997,75 +2999,85 @@ again: return ret; } -/* - * This function splits a single item into two items, - * giving 'new_key' to the new item and splitting the - * old one at split_offset (from the start of the item). - * - * The path may be released by this operation. After - * the split, the path is pointing to the old item. The - * new item is going to be in the same node as the old one. - * - * Note, the item being split must be smaller enough to live alone on - * a tree block with room for one extra struct btrfs_item - * - * This allows us to split the item in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_split_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *new_key, - unsigned long split_offset) +static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int ins_len) { - u32 item_size; + struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_key orig_key; - struct btrfs_item *item; - struct btrfs_item *new_item; - int ret = 0; - int slot; - u32 nritems; - u32 orig_offset; - struct btrfs_disk_key disk_key; - char *buf; + struct btrfs_file_extent_item *fi; + u64 extent_len = 0; + u32 item_size; + int ret; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); - if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) - goto split; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_EXTENT_CSUM_KEY); + + if (btrfs_leaf_free_space(root, leaf) >= ins_len) + return 0; item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + } btrfs_release_path(root, path); - path->search_for_split = 1; path->keep_locks = 1; - - ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); + path->search_for_split = 1; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); path->search_for_split = 0; + if (ret < 0) + goto err; + ret = -EAGAIN; + leaf = path->nodes[0]; /* if our item isn't there or got smaller, return now */ - if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], - path->slots[0])) { - path->keep_locks = 0; - return -EAGAIN; + if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + goto err; + + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) + goto err; } btrfs_set_path_blocking(path); - ret = split_leaf(trans, root, &orig_key, path, - sizeof(struct btrfs_item), 1); - path->keep_locks = 0; + ret = split_leaf(trans, root, &key, path, ins_len, 1); BUG_ON(ret); + path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); + return 0; +err: + path->keep_locks = 0; + return ret; +} + +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_item *new_item; + int slot; + char *buf; + u32 nritems; + u32 item_size; + u32 orig_offset; + struct btrfs_disk_key disk_key; + leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); -split: - /* - * make sure any changes to the path from split_leaf leave it - * in a blocking state - */ btrfs_set_path_blocking(path); item = btrfs_item_nr(leaf, path->slots[0]); @@ -3073,19 +3085,19 @@ split: item_size = btrfs_item_size(leaf, item); buf = kmalloc(item_size, GFP_NOFS); + if (!buf) + return -ENOMEM; + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, path->slots[0]), item_size); - slot = path->slots[0] + 1; - leaf = path->nodes[0]; + slot = path->slots[0] + 1; nritems = btrfs_header_nritems(leaf); - if (slot != nritems) { /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3113,16 +3125,81 @@ split: item_size - split_offset); btrfs_mark_buffer_dirty(leaf); - ret = 0; - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } + BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); kfree(buf); + return 0; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + int ret; + ret = setup_leaf_for_split(trans, root, path, + sizeof(struct btrfs_item)); + if (ret) + return ret; + + ret = split_item(trans, root, path, new_key, split_offset); return ret; } /* + * This function duplicate a item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item + * is contiguous with the original item. + * + * This allows us to split file extent in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + ret = setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* * make the item pointed to by the path smaller. new_size indicates * how small to make it, and from_end tells us if we just chop bytes * off the end of the item or if we shift the item to chop bytes off @@ -3714,8 +3791,8 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, - 0, root->root_key.objectid, 0, 0); + ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, + 0, root->root_key.objectid, 0); return ret; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 444b3e9b92a4..2aa8ec6a0981 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -310,6 +310,9 @@ struct btrfs_header { #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) - \ sizeof(struct btrfs_file_extent_item)) +#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) -\ + sizeof(struct btrfs_dir_item)) /* @@ -859,8 +862,9 @@ struct btrfs_fs_info { struct mutex ordered_operations_mutex; struct rw_semaphore extent_commit_sem; - struct rw_semaphore subvol_sem; + struct rw_semaphore cleanup_work_sem; + struct rw_semaphore subvol_sem; struct srcu_struct subvol_srcu; struct list_head trans_list; @@ -868,6 +872,9 @@ struct btrfs_fs_info { struct list_head dead_roots; struct list_head caching_block_groups; + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + atomic_t nr_async_submits; atomic_t async_submit_draining; atomic_t nr_async_bios; @@ -1034,12 +1041,12 @@ struct btrfs_root { int ref_cows; int track_dirty; int in_radix; + int clean_orphans; u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; int defrag_running; - int defrag_level; char *name; int in_sysfs; @@ -1154,6 +1161,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) #define BTRFS_MOUNT_DISCARD (1 << 10) +#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1975,6 +1983,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 parent, u64 root_objectid, int level); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2089,6 +2101,10 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_key *new_key, unsigned long split_offset); +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); @@ -2196,9 +2212,10 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_dir_item *di); int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - u16 name_len, const void *data, u16 data_len, - u64 dir); + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, @@ -2292,7 +2309,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); @@ -2332,6 +2349,8 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_root *root); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -2345,12 +2364,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_block, int drop_cache); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); @@ -2380,7 +2396,8 @@ int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL #endif -int btrfs_init_acl(struct inode *inode, struct inode *dir); +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); /* relocation.c */ diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f3a6075519cc..e9103b3baa49 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -68,12 +68,12 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle * into the tree */ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - u16 name_len, const void *data, u16 data_len, - u64 dir) + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len) { int ret = 0; - struct btrfs_path *path; struct btrfs_dir_item *dir_item; unsigned long name_ptr, data_ptr; struct btrfs_key key, location; @@ -81,15 +81,11 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 data_size; - key.objectid = dir; + BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); + + key.objectid = objectid; btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); key.offset = btrfs_name_hash(name, name_len); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - if (name_len + data_len + sizeof(struct btrfs_dir_item) > - BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item)) - return -ENOSPC; data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, @@ -117,7 +113,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, data, data_ptr, data_len); btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02b6afbd7450..87b25543d7d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -892,6 +892,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->stripesize = stripesize; root->ref_cows = 0; root->track_dirty = 0; + root->in_radix = 0; + root->clean_orphans = 0; root->fs_info = fs_info; root->objectid = objectid; @@ -928,7 +930,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->defrag_trans_start = fs_info->generation; init_completion(&root->kobj_unregister); root->defrag_running = 0; - root->defrag_level = 0; root->root_key.objectid = objectid; root->anon_super.s_root = NULL; root->anon_super.s_dev = 0; @@ -980,12 +981,12 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); if (ret) break; - clear_extent_dirty(&log_root_tree->dirty_log_pages, - start, end, GFP_NOFS); + clear_extent_bits(&log_root_tree->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } eb = fs_info->log_root_tree->node; @@ -1210,8 +1211,10 @@ again: ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) + if (ret == 0) { root->in_radix = 1; + root->clean_orphans = 1; + } spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); if (ret) { @@ -1225,10 +1228,6 @@ again: ret = btrfs_find_dead_roots(fs_info->tree_root, root->root_key.objectid); WARN_ON(ret); - - if (!(fs_info->sb->s_flags & MS_RDONLY)) - btrfs_orphan_cleanup(root); - return root; fail: free_fs_root(root); @@ -1477,6 +1476,7 @@ static int cleaner_kthread(void *arg) if (!(root->fs_info->sb->s_flags & MS_RDONLY) && mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_run_delayed_iputs(root); btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); } @@ -1606,6 +1606,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); @@ -1614,6 +1615,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->delayed_iput_lock); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; @@ -1689,6 +1691,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); @@ -1990,6 +1993,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!fs_info->fs_root) goto fail_trans_kthread; + if (!(sb->s_flags & MS_RDONLY)) { + down_read(&fs_info->cleanup_work_sem); + btrfs_orphan_cleanup(fs_info->fs_root); + up_read(&fs_info->cleanup_work_sem); + } + return tree_root; fail_trans_kthread: @@ -2386,8 +2395,14 @@ int btrfs_commit_super(struct btrfs_root *root) int ret; mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); + + /* wait until ongoing cleanup work done */ + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 94627c4cc193..432a2da4641e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -83,6 +83,17 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) return (cache->flags & bits) == bits; } +void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) + kfree(cache); +} + /* * this adds the block group to the fs_info rb tree for the block group * cache @@ -156,7 +167,7 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, } } if (ret) - atomic_inc(&ret->count); + btrfs_get_block_group(ret); spin_unlock(&info->block_group_cache_lock); return ret; @@ -195,6 +206,14 @@ static int exclude_super_stripes(struct btrfs_root *root, int stripe_len; int i, nr, ret; + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, cache->key.objectid, + stripe_len); + BUG_ON(ret); + } + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); ret = btrfs_rmap_block(&root->fs_info->mapping_tree, @@ -255,7 +274,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, if (ret) break; - if (extent_start == start) { + if (extent_start <= start) { start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; @@ -399,6 +418,8 @@ err: put_caching_control(caching_ctl); atomic_dec(&block_group->space_info->caching_threads); + btrfs_put_block_group(block_group); + return 0; } @@ -439,6 +460,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache) up_write(&fs_info->extent_commit_sem); atomic_inc(&cache->space_info->caching_threads); + btrfs_get_block_group(cache); tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", cache->key.objectid); @@ -478,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return cache; } -void btrfs_put_block_group(struct btrfs_block_group_cache *cache) -{ - if (atomic_dec_and_test(&cache->count)) - kfree(cache); -} - static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -2574,7 +2590,7 @@ next_block_group(struct btrfs_root *root, if (node) { cache = rb_entry(node, struct btrfs_block_group_cache, cache_node); - atomic_inc(&cache->count); + btrfs_get_block_group(cache); } else cache = NULL; spin_unlock(&root->fs_info->block_group_cache_lock); @@ -2880,9 +2896,9 @@ static noinline void flush_delalloc_async(struct btrfs_work *work) root = async->root; info = async->info; - btrfs_start_delalloc_inodes(root); + btrfs_start_delalloc_inodes(root, 0); wake_up(&info->flush_wait); - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); spin_lock(&info->lock); info->flushing = 0; @@ -2956,8 +2972,8 @@ static void flush_delalloc(struct btrfs_root *root, return; flush: - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); spin_lock(&info->lock); info->flushing = 0; @@ -3454,14 +3470,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, else old_val -= num_bytes; btrfs_set_super_bytes_used(&info->super_copy, old_val); - - /* block accounting for root item */ - old_val = btrfs_root_used(&root->root_item); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_root_used(&root->root_item, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -4049,6 +4057,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, return ret; } +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 parent, u64 root_objectid, int level) +{ + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) - blocksize; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + + return btrfs_free_extent(trans, root, bytenr, blocksize, + parent, root_objectid, level, 0); +} + static u64 stripe_align(struct btrfs_root *root, u64 val) { u64 mask = ((u64)root->stripesize - 1); @@ -4212,7 +4235,7 @@ search: u64 offset; int cached; - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); search_start = block_group->key.objectid; have_block_group: @@ -4300,7 +4323,7 @@ have_block_group: btrfs_put_block_group(block_group); block_group = last_ptr->block_group; - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); spin_unlock(&last_ptr->lock); spin_unlock(&last_ptr->refill_lock); @@ -4578,7 +4601,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, { int ret; u64 search_start = 0; - struct btrfs_fs_info *info = root->fs_info; data = btrfs_get_alloc_profile(root, data); again: @@ -4586,17 +4608,9 @@ again: * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations */ - if (empty_size || root->ref_cows) { - if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, - BTRFS_BLOCK_GROUP_METADATA | - (info->metadata_alloc_profile & - info->avail_metadata_alloc_bits), 0); - } + if (empty_size || root->ref_cows) ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes + 2 * 1024 * 1024, data, 0); - } WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, @@ -4897,6 +4911,14 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, extent_op); BUG_ON(ret); } + + if (root_objectid == root->root_key.objectid) { + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) + num_bytes; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + } return ret; } @@ -4919,8 +4941,16 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - set_extent_dirty(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + /* + * we allow two log transactions at a time, use different + * EXENT bit to differentiate dirty pages. + */ + if (root->log_transid % 2 == 0) + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + else + set_extent_new(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); } else { set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); @@ -7373,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); - - WARN_ON(atomic_read(&block_group->count) != 1); - kfree(block_group); + btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 46bea0f4dc7b..428fcac45f90 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -155,20 +155,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* - * look for an offset in the tree, and if it can't be found, return - * the first offset we can find smaller than 'offset'. - */ -static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) -{ - struct rb_node *prev; - struct rb_node *ret; - ret = __tree_search(root, offset, &prev, NULL); - if (!ret) - return prev; - return ret; -} - /* check to see if two extent_map structs are adjacent and safe to merge */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 06550affbd27..c02033596f02 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -179,18 +179,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } flags = em->flags; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - if (em->start <= start && - (!testend || em->start + em->len >= start + len)) { + if (testend && em->start + em->len >= start + len) { free_extent_map(em); write_unlock(&em_tree->lock); break; } - if (start < em->start) { - len = em->start - start; - } else { + start = em->start + em->len; + if (testend) len = start + len - (em->start + em->len); - start = em->start + em->len; - } free_extent_map(em); write_unlock(&em_tree->lock); continue; @@ -265,324 +261,253 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. - * - * inline_limit is used to tell this code which offsets in the file to keep - * if they contain inline extents. */ -noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_byte, int drop_cache) +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache) { - u64 extent_end = 0; - u64 search_start = start; - u64 ram_bytes = 0; - u64 disk_bytenr = 0; - u64 orig_locked_end = locked_end; - u8 compression; - u8 encryption; - u16 other_encoding = 0; + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *fi; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_file_extent_item old; - int keep; - int slot; - int bookend; - int found_type = 0; - int found_extent; - int found_inline; + struct btrfs_key new_key; + u64 search_start = start; + u64 disk_bytenr = 0; + u64 num_bytes = 0; + u64 extent_offset = 0; + u64 extent_end = 0; + int del_nr = 0; + int del_slot = 0; + int extent_type; int recow; int ret; - inline_limit = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); if (!path) return -ENOMEM; + while (1) { recow = 0; - btrfs_release_path(root, path); ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, search_start, -1); if (ret < 0) - goto out; - if (ret > 0) { - if (path->slots[0] == 0) { - ret = 0; - goto out; - } - path->slots[0]--; + break; + if (ret > 0 && path->slots[0] > 0 && search_start == start) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == inode->i_ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; } + ret = 0; next_slot: - keep = 0; - bookend = 0; - found_extent = 0; - found_inline = 0; - compression = 0; - encryption = 0; - extent = NULL; leaf = path->nodes[0]; - slot = path->slots[0]; - ret = 0; - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && - key.offset >= end) { - goto out; - } - if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || - key.objectid != inode->i_ino) { - goto out; - } - if (recow) { - search_start = max(key.offset, start); - continue; - } - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - compression = btrfs_file_extent_compression(leaf, - extent); - encryption = btrfs_file_extent_encryption(leaf, - extent); - other_encoding = btrfs_file_extent_other_encoding(leaf, - extent); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - extent_end = - btrfs_file_extent_disk_bytenr(leaf, - extent); - if (extent_end) - *hint_byte = extent_end; - - extent_end = key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - ram_bytes = btrfs_file_extent_ram_bytes(leaf, - extent); - found_extent = 1; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - found_inline = 1; - extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, extent); + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + BUG_ON(del_nr > 0); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; } + leaf = path->nodes[0]; + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid > inode->i_ino || + key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); } else { + WARN_ON(1); extent_end = search_start; } - /* we found nothing we can drop */ - if ((!found_extent && !found_inline) || - search_start >= extent_end) { - int nextret; - u32 nritems; - nritems = btrfs_header_nritems(leaf); - if (slot >= nritems - 1) { - nextret = btrfs_next_leaf(root, path); - if (nextret) - goto out; - recow = 1; - } else { - path->slots[0]++; - } + if (extent_end <= search_start) { + path->slots[0]++; goto next_slot; } - if (end <= extent_end && start >= key.offset && found_inline) - *hint_byte = EXTENT_MAP_INLINE; - - if (found_extent) { - read_extent_buffer(leaf, &old, (unsigned long)extent, - sizeof(old)); - } - - if (end < extent_end && end >= key.offset) { - bookend = 1; - if (found_inline && start <= key.offset) - keep = 1; + search_start = max(key.offset, start); + if (recow) { + btrfs_release_path(root, path); + continue; } - if (bookend && found_extent) { - if (locked_end < extent_end) { - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, - GFP_NOFS); - if (!ret) { - btrfs_release_path(root, path); - lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, - GFP_NOFS); - locked_end = extent_end; - continue; - } - locked_end = extent_end; + /* + * | - range to drop - | + * | -------- extent -------- | + */ + if (start > key.offset && end < extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = start; + ret = btrfs_duplicate_item(trans, root, path, + &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(root, path); + continue; } - disk_bytenr = le64_to_cpu(old.disk_bytenr); - if (disk_bytenr != 0) { + if (ret < 0) + break; + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + extent_offset += start - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - start); + btrfs_mark_buffer_dirty(leaf); + + if (disk_bytenr > 0) { ret = btrfs_inc_extent_ref(trans, root, - disk_bytenr, - le64_to_cpu(old.disk_num_bytes), 0, - root->root_key.objectid, - key.objectid, key.offset - - le64_to_cpu(old.offset)); + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + new_key.objectid, + start - extent_offset); BUG_ON(ret); + *hint_byte = disk_bytenr; } + key.offset = start; } + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start <= key.offset && end < extent_end) { + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - if (found_inline) { - u64 mask = root->sectorsize - 1; - search_start = (extent_end + mask) & ~mask; - } else - search_start = extent_end; - - /* truncate existing extent */ - if (start > key.offset) { - u64 new_num; - u64 old_num; - keep = 1; - WARN_ON(start & (root->sectorsize - 1)); - if (found_extent) { - new_num = start - key.offset; - old_num = btrfs_file_extent_num_bytes(leaf, - extent); - *hint_byte = - btrfs_file_extent_disk_bytenr(leaf, - extent); - if (btrfs_file_extent_disk_bytenr(leaf, - extent)) { - inode_sub_bytes(inode, old_num - - new_num); - } - btrfs_set_file_extent_num_bytes(leaf, - extent, new_num); - btrfs_mark_buffer_dirty(leaf); - } else if (key.offset < inline_limit && - (end > extent_end) && - (inline_limit < extent_end)) { - u32 new_size; - new_size = btrfs_file_extent_calc_inline_size( - inline_limit - key.offset); - inode_sub_bytes(inode, extent_end - - inline_limit); - btrfs_set_file_extent_ram_bytes(leaf, extent, - new_size); - if (!compression && !encryption) { - btrfs_truncate_item(trans, root, path, - new_size, 1); - } + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + + extent_offset += end - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, end - key.offset); + *hint_byte = disk_bytenr; } + break; } - /* delete the entire extent */ - if (!keep) { - if (found_inline) - inode_sub_bytes(inode, extent_end - - key.offset); - ret = btrfs_del_item(trans, root, path); - /* TODO update progress marker and return */ - BUG_ON(ret); - extent = NULL; - btrfs_release_path(root, path); - /* the extent will be freed later */ - } - if (bookend && found_inline && start <= key.offset) { - u32 new_size; - new_size = btrfs_file_extent_calc_inline_size( - extent_end - end); - inode_sub_bytes(inode, end - key.offset); - btrfs_set_file_extent_ram_bytes(leaf, extent, - new_size); - if (!compression && !encryption) - ret = btrfs_truncate_item(trans, root, path, - new_size, 0); - BUG_ON(ret); - } - /* create bookend, splitting the extent in two */ - if (bookend && found_extent) { - struct btrfs_key ins; - ins.objectid = inode->i_ino; - ins.offset = end; - btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); - btrfs_release_path(root, path); - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, root, path, &ins, - sizeof(*extent)); - BUG_ON(ret); + search_start = extent_end; + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start > key.offset && end >= extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - write_extent_buffer(leaf, &old, - (unsigned long)extent, sizeof(old)); - - btrfs_set_file_extent_compression(leaf, extent, - compression); - btrfs_set_file_extent_encryption(leaf, extent, - encryption); - btrfs_set_file_extent_other_encoding(leaf, extent, - other_encoding); - btrfs_set_file_extent_offset(leaf, extent, - le64_to_cpu(old.offset) + end - key.offset); - WARN_ON(le64_to_cpu(old.num_bytes) < - (extent_end - end)); - btrfs_set_file_extent_num_bytes(leaf, extent, - extent_end - end); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, extent_end - start); + *hint_byte = disk_bytenr; + } + if (end == extent_end) + break; - /* - * set the ram bytes to the size of the full extent - * before splitting. This is a worst case flag, - * but its the best we can do because we don't know - * how splitting affects compression - */ - btrfs_set_file_extent_ram_bytes(leaf, extent, - ram_bytes); - btrfs_set_file_extent_type(leaf, extent, found_type); - - btrfs_unlock_up_safe(path, 1); - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_set_lock_blocking(path->nodes[0]); - - path->leave_spinning = 0; - btrfs_release_path(root, path); - if (disk_bytenr != 0) - inode_add_bytes(inode, extent_end - end); + path->slots[0]++; + goto next_slot; } - if (found_extent && !keep) { - u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); + /* + * | ---- range to drop ----- | + * | ------ extent ------ | + */ + if (start <= key.offset && end >= extent_end) { + if (del_nr == 0) { + del_slot = path->slots[0]; + del_nr = 1; + } else { + BUG_ON(del_slot + del_nr != path->slots[0]); + del_nr++; + } - if (old_disk_bytenr != 0) { + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { inode_sub_bytes(inode, - le64_to_cpu(old.num_bytes)); + extent_end - key.offset); + extent_end = ALIGN(extent_end, + root->sectorsize); + } else if (disk_bytenr > 0) { ret = btrfs_free_extent(trans, root, - old_disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - 0, root->root_key.objectid, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, key.objectid, key.offset - - le64_to_cpu(old.offset)); + extent_offset); BUG_ON(ret); - *hint_byte = old_disk_bytenr; + inode_sub_bytes(inode, + extent_end - key.offset); + *hint_byte = disk_bytenr; } - } - if (search_start >= end) { - ret = 0; - goto out; + if (end == extent_end) + break; + + if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { + path->slots[0]++; + goto next_slot; + } + + ret = btrfs_del_items(trans, root, path, del_slot, + del_nr); + BUG_ON(ret); + + del_nr = 0; + del_slot = 0; + + btrfs_release_path(root, path); + continue; } + + BUG_ON(1); } -out: - btrfs_free_path(path); - if (locked_end > orig_locked_end) { - unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, - locked_end - 1, GFP_NOFS); + + if (del_nr > 0) { + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); } + + btrfs_free_path(path); return ret; } static int extent_mergeable(struct extent_buffer *leaf, int slot, - u64 objectid, u64 bytenr, u64 *start, u64 *end) + u64 objectid, u64 bytenr, u64 orig_offset, + u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; @@ -598,6 +523,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) @@ -620,23 +546,24 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, * two or three. */ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, u64 end) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_file_extent_item *fi; struct btrfs_key key; + struct btrfs_key new_key; u64 bytenr; u64 num_bytes; u64 extent_end; u64 orig_offset; u64 other_start; u64 other_end; - u64 split = start; - u64 locked_end = end; - int extent_type; - int split_end = 1; + u64 split; + int del_nr = 0; + int del_slot = 0; + int recow; int ret; btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -644,12 +571,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); again: + recow = 0; + split = start; key.objectid = inode->i_ino; key.type = BTRFS_EXTENT_DATA_KEY; - if (split == start) - key.offset = split; - else - key.offset = split - 1; + key.offset = split; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0 && path->slots[0] > 0) @@ -661,159 +587,156 @@ again: key.type != BTRFS_EXTENT_DATA_KEY); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_PREALLOC); extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); BUG_ON(key.offset > start || extent_end < end); bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + memcpy(&new_key, &key, sizeof(new_key)); - if (key.offset == start) - split = end; - - if (key.offset == start && extent_end == end) { - int del_nr = 0; - int del_slot = 0; - other_start = end; - other_end = 0; - if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, - bytenr, &other_start, &other_end)) { - extent_end = other_end; - del_slot = path->slots[0] + 1; - del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); - } + if (start == key.offset && end < extent_end) { other_start = 0; other_end = start; - if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, - bytenr, &other_start, &other_end)) { - key.offset = other_start; - del_slot = path->slots[0]; - del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); - } - split_end = 0; - if (del_nr == 0) { - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG); - goto done; - } - - fi = btrfs_item_ptr(leaf, del_slot - 1, - struct btrfs_file_extent_item); - btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - key.offset); - btrfs_mark_buffer_dirty(leaf); - - ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); - goto release; - } else if (split == start) { - if (locked_end < extent_end) { - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, GFP_NOFS); - if (!ret) { - btrfs_release_path(root, path); - lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, GFP_NOFS); - locked_end = extent_end; - goto again; - } - locked_end = extent_end; + if (extent_mergeable(leaf, path->slots[0] - 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_set_file_extent_offset(leaf, fi, + end - orig_offset); + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + end - other_start); + btrfs_mark_buffer_dirty(leaf); + goto out; } - btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); - } else { - BUG_ON(key.offset != start); - key.offset = split; - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_set_item_key_safe(trans, root, path, &key); - extent_end = split; } - if (extent_end == end) { - split_end = 0; - extent_type = BTRFS_FILE_EXTENT_REG; - } - if (extent_end == end && split == start) { + if (start > key.offset && end == extent_end) { other_start = end; other_end = 0; - if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, - bytenr, &other_start, &other_end)) { - path->slots[0]++; + if (extent_mergeable(leaf, path->slots[0] + 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - key.offset = split; - btrfs_set_item_key_safe(trans, root, path, &key); - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, - other_end - split); - goto done; - } - } - if (extent_end == end && split == end) { - other_start = 0; - other_end = start; - if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino, - bytenr, &other_start, &other_end)) { - path->slots[0]--; + start - key.offset); + path->slots[0]++; + new_key.offset = start; + btrfs_set_item_key_safe(trans, root, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - - other_start); - goto done; + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - start); + btrfs_set_file_extent_offset(leaf, fi, + start - orig_offset); + btrfs_mark_buffer_dirty(leaf); + goto out; } } - btrfs_mark_buffer_dirty(leaf); + while (start > key.offset || end < extent_end) { + if (key.offset == start) + split = end; - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, - root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); - btrfs_release_path(root, path); + new_key.offset = split; + ret = btrfs_duplicate_item(trans, root, path, &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(root, path); + goto again; + } + BUG_ON(ret < 0); - key.offset = start; - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi)); - BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + split - key.offset); - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_set_file_extent_type(leaf, fi, extent_type); - btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_compression(leaf, fi, 0); - btrfs_set_file_extent_encryption(leaf, fi, 0); - btrfs_set_file_extent_other_encoding(leaf, fi, 0); -done: - btrfs_mark_buffer_dirty(leaf); - -release: - btrfs_release_path(root, path); - if (split_end && split == start) { - split = end; - goto again; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - split); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); + + if (split == start) { + key.offset = start; + } else { + BUG_ON(start != key.offset); + path->slots[0]--; + extent_end = end; + } + recow = 1; } - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); } + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (del_nr == 0) { + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_mark_buffer_dirty(leaf); + } else { + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + } +out: btrfs_free_path(path); return 0; } @@ -909,7 +832,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, unsigned long last_index; int will_write; - will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || + will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, @@ -1076,7 +999,7 @@ out_nolock: if (err) num_written = err; - if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { trans = btrfs_start_transaction(root, 1); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b3ad168a0bfc..8cd109972fa6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,13 +88,14 @@ static noinline int cow_file_range(struct inode *inode, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); -static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) +static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { int err; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_acl(trans, inode, dir); if (!err) - err = btrfs_xattr_security_init(inode, dir); + err = btrfs_xattr_security_init(trans, inode, dir); return err; } @@ -188,8 +189,18 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + /* + * we're an inline extent, so nobody can + * extend the file past i_size without locking + * a page we already have locked. + * + * We must do any isize and inode updates + * before we unlock the pages. Otherwise we + * could end up racing with unlink. + */ BTRFS_I(inode)->disk_i_size = inode->i_size; btrfs_update_inode(trans, root, inode); + return 0; fail: btrfs_free_path(path); @@ -230,8 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } - ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, aligned_end, start, + ret = btrfs_drop_extents(trans, inode, start, aligned_end, &hint_byte, 1); BUG_ON(ret); @@ -416,7 +426,6 @@ again: start, end, total_compressed, pages); } - btrfs_end_transaction(trans, root); if (ret == 0) { /* * inline extent creation worked, we don't need @@ -430,9 +439,11 @@ again: EXTENT_CLEAR_DELALLOC | EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); - ret = 0; + + btrfs_end_transaction(trans, root); goto free_pages_out; } + btrfs_end_transaction(trans, root); } if (will_compress) { @@ -472,7 +483,8 @@ again: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + if (!btrfs_test_opt(root, FORCE_COMPRESS)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } if (will_compress) { *num_added += 1; @@ -543,7 +555,6 @@ static noinline int submit_compressed_extents(struct inode *inode, if (list_empty(&async_cow->extents)) return 0; - trans = btrfs_join_transaction(root, 1); while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, @@ -590,19 +601,15 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1, GFP_NOFS); - /* - * here we're doing allocation and writeback of the - * compressed pages - */ - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); + trans = btrfs_join_transaction(root, 1); ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, (u64)-1, &ins, 1); + btrfs_end_transaction(trans, root); + if (ret) { int i; for (i = 0; i < async_extent->nr_pages; i++) { @@ -618,6 +625,14 @@ retry: goto retry; } + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + em = alloc_extent_map(GFP_NOFS); em->start = async_extent->start; em->len = async_extent->ram_size; @@ -649,8 +664,6 @@ retry: BTRFS_ORDERED_COMPRESSED); BUG_ON(ret); - btrfs_end_transaction(trans, root); - /* * clear dirty, set writeback and unlock the pages. */ @@ -672,13 +685,11 @@ retry: async_extent->nr_pages); BUG_ON(ret); - trans = btrfs_join_transaction(root, 1); alloc_hint = ins.objectid + ins.offset; kfree(async_extent); cond_resched(); } - btrfs_end_transaction(trans, root); return 0; } @@ -742,6 +753,7 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -1596,7 +1608,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, - u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1622,9 +1633,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, locked_end, - file_pos, &hint, 0); + ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, + &hint, 0); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1730,23 +1740,32 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) } } - trans = btrfs_join_transaction(root, 1); - if (!ordered_extent) ordered_extent = btrfs_lookup_ordered_extent(inode, start); BUG_ON(!ordered_extent); - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) - goto nocow; + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + BUG_ON(!list_empty(&ordered_extent->list)); + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + trans = btrfs_join_transaction(root, 1); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); + } + goto out; + } lock_extent(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, GFP_NOFS); + trans = btrfs_join_transaction(root, 1); + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compressed = 1; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compressed); - ret = btrfs_mark_extent_written(trans, root, inode, + ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len); @@ -1758,8 +1777,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, - ordered_extent->file_offset + - ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); unpin_extent_cache(&BTRFS_I(inode)->extent_tree, @@ -1770,22 +1787,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) unlock_extent(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, GFP_NOFS); -nocow: add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - mutex_lock(&BTRFS_I(inode)->extent_mutex); - btrfs_ordered_update_i_size(inode, ordered_extent); - btrfs_update_inode(trans, root, inode); - btrfs_remove_ordered_extent(inode, ordered_extent); - mutex_unlock(&BTRFS_I(inode)->extent_mutex); - + /* this also removes the ordered extent from the tree */ + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); +out: /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - btrfs_end_transaction(trans, root); return 0; } @@ -2008,6 +2023,54 @@ zeroit: return -EIO; } +struct delayed_iput { + struct list_head list; + struct inode *inode; +}; + +void btrfs_add_delayed_iput(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct delayed_iput *delayed; + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); + delayed->inode = inode; + + spin_lock(&fs_info->delayed_iput_lock); + list_add_tail(&delayed->list, &fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); +} + +void btrfs_run_delayed_iputs(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + struct delayed_iput *delayed; + int empty; + + spin_lock(&fs_info->delayed_iput_lock); + empty = list_empty(&fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); + if (empty) + return; + + down_read(&root->fs_info->cleanup_work_sem); + spin_lock(&fs_info->delayed_iput_lock); + list_splice_init(&fs_info->delayed_iputs, &list); + spin_unlock(&fs_info->delayed_iput_lock); + + while (!list_empty(&list)) { + delayed = list_entry(list.next, struct delayed_iput, list); + list_del(&delayed->list); + iput(delayed->inode); + kfree(delayed); + } + up_read(&root->fs_info->cleanup_work_sem); +} + /* * This creates an orphan entry for the given inode in case something goes * wrong in the middle of an unlink/truncate. @@ -2080,16 +2143,17 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) struct inode *inode; int ret = 0, nr_unlink = 0, nr_truncate = 0; - path = btrfs_alloc_path(); - if (!path) + if (!xchg(&root->clean_orphans, 0)) return; + + path = btrfs_alloc_path(); + BUG_ON(!path); path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); key.offset = (u64)-1; - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { @@ -2834,37 +2898,40 @@ out: * min_type is the minimum key type to truncate down to. If set to 0, this * will kill all the items on this inode, including the INODE_ITEM_KEY. */ -noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, - u64 new_size, u32 min_type) +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) { - int ret; struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_key found_key; - u32 found_type = (u8)-1; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; u64 extent_start = 0; u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; + u64 mask = root->sectorsize - 1; + u32 found_type = (u8)-1; int found_extent; int del_item; int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; int encoding; - u64 mask = root->sectorsize - 1; + int ret; + int err = 0; + + BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); if (root->ref_cows) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + path = btrfs_alloc_path(); BUG_ON(!path); path->reada = -1; - /* FIXME, add redo link to tree so we don't leak on crash */ key.objectid = inode->i_ino; key.offset = (u64)-1; key.type = (u8)-1; @@ -2872,17 +2939,17 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, search_again: path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto error; + if (ret < 0) { + err = ret; + goto out; + } if (ret > 0) { /* there are no items in the tree for us to truncate, we're * done */ - if (path->slots[0] == 0) { - ret = 0; - goto error; - } + if (path->slots[0] == 0) + goto out; path->slots[0]--; } @@ -2917,28 +2984,17 @@ search_again: } item_end--; } - if (item_end < new_size) { - if (found_type == BTRFS_DIR_ITEM_KEY) - found_type = BTRFS_INODE_ITEM_KEY; - else if (found_type == BTRFS_EXTENT_ITEM_KEY) - found_type = BTRFS_EXTENT_DATA_KEY; - else if (found_type == BTRFS_EXTENT_DATA_KEY) - found_type = BTRFS_XATTR_ITEM_KEY; - else if (found_type == BTRFS_XATTR_ITEM_KEY) - found_type = BTRFS_INODE_REF_KEY; - else if (found_type) - found_type--; - else + if (found_type > min_type) { + del_item = 1; + } else { + if (item_end < new_size) break; - btrfs_set_key_type(&key, found_type); - goto next; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; } - if (found_key.offset >= new_size) - del_item = 1; - else - del_item = 0; found_extent = 0; - /* FIXME, shrink the extent if the ref count is only 1 */ if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; @@ -3025,42 +3081,36 @@ delete: inode->i_ino, extent_offset); BUG_ON(ret); } -next: - if (path->slots[0] == 0) { - if (pending_del_nr) - goto del_pending; - btrfs_release_path(root, path); - if (found_type == BTRFS_INODE_ITEM_KEY) - break; - goto search_again; - } - path->slots[0]--; - if (pending_del_nr && - path->slots[0] + 1 != pending_del_slot) { - struct btrfs_key debug; -del_pending: - btrfs_item_key_to_cpu(path->nodes[0], &debug, - pending_del_slot); - ret = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - BUG_ON(ret); - pending_del_nr = 0; + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot) { + if (root->ref_cows) { + err = -EAGAIN; + goto out; + } + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + } btrfs_release_path(root, path); - if (found_type == BTRFS_INODE_ITEM_KEY) - break; goto search_again; + } else { + path->slots[0]--; } } - ret = 0; -error: +out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); } btrfs_free_path(path); - return ret; + return err; } /* @@ -3180,10 +3230,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - err = btrfs_truncate_page(inode->i_mapping, inode->i_size); - if (err) - return err; - while (1) { struct btrfs_ordered_extent *ordered; btrfs_wait_ordered_range(inode, hole_start, @@ -3196,9 +3242,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) btrfs_put_ordered_extent(ordered); } - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - cur_offset = hole_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, @@ -3206,40 +3249,120 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) BUG_ON(IS_ERR(em) || !em); last_byte = min(extent_map_end(em), block_end); last_byte = (last_byte + mask) & ~mask; - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { u64 hint_byte = 0; hole_size = last_byte - cur_offset; - err = btrfs_drop_extents(trans, root, inode, - cur_offset, - cur_offset + hole_size, - block_end, - cur_offset, &hint_byte, 1); - if (err) - break; - err = btrfs_reserve_metadata_space(root, 1); + err = btrfs_reserve_metadata_space(root, 2); if (err) break; + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + err = btrfs_drop_extents(trans, inode, cur_offset, + cur_offset + hole_size, + &hint_byte, 1); + BUG_ON(err); + err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); + BUG_ON(err); + btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); - btrfs_unreserve_metadata_space(root, 1); + + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 2); } free_extent_map(em); cur_offset = last_byte; - if (err || cur_offset >= block_end) + if (cur_offset >= block_end) break; } - btrfs_end_transaction(trans, root); unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); return err; } +static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + unsigned long nr; + int ret; + + if (attr->ia_size == inode->i_size) + return 0; + + if (attr->ia_size > inode->i_size) { + unsigned long limit; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (attr->ia_size > inode->i_sb->s_maxbytes) + return -EFBIG; + if (limit != RLIM_INFINITY && attr->ia_size > limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + } + + ret = btrfs_reserve_metadata_space(root, 1); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 1); + btrfs_btree_balance_dirty(root, nr); + + if (attr->ia_size > inode->i_size) { + ret = btrfs_cont_expand(inode, attr->ia_size); + if (ret) { + btrfs_truncate(inode); + return ret; + } + + i_size_write(inode, attr->ia_size); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + if (inode->i_nlink > 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + return 0; + } + + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. + */ + if (attr->ia_size == 0) + BTRFS_I(inode)->ordered_data_close = 1; + + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + ret = vmtruncate(inode, attr->ia_size); + BUG_ON(ret); + + return 0; +} + static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; @@ -3250,23 +3373,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - if (attr->ia_size > inode->i_size) { - err = btrfs_cont_expand(inode, attr->ia_size); - if (err) - return err; - } else if (inode->i_size > 0 && - attr->ia_size == 0) { - - /* we're truncating a file that used to have good - * data down to zero. Make sure it gets into - * the ordered flush list so that any new writes - * get down to disk quickly. - */ - BTRFS_I(inode)->ordered_data_close = 1; - } + err = btrfs_setattr_size(inode, attr); + if (err) + return err; } + attr->ia_valid &= ~ATTR_SIZE; - err = inode_setattr(inode, attr); + if (attr->ia_valid) + err = inode_setattr(inode, attr); if (!err && ((attr->ia_valid & ATTR_MODE))) err = btrfs_acl_chmod(inode); @@ -3287,36 +3401,43 @@ void btrfs_delete_inode(struct inode *inode) } btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (root->fs_info->log_root_recovering) { + BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + goto no_delete; + } + if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0); goto no_delete; } btrfs_i_size_write(inode, 0); - trans = btrfs_join_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); - if (ret) { - btrfs_orphan_del(NULL, inode); - goto no_delete_lock; - } + while (1) { + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - btrfs_orphan_del(trans, inode); + if (ret != -EAGAIN) + break; - nr = trans->blocks_used; - clear_inode(inode); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + trans = NULL; + btrfs_btree_balance_dirty(root, nr); + } - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - return; + if (ret == 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } -no_delete_lock: nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); no_delete: clear_inode(inode); + return; } /* @@ -3569,7 +3690,6 @@ static noinline void init_btrfs_i(struct inode *inode) INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); - mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); } @@ -3695,6 +3815,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } srcu_read_unlock(&root->fs_info->subvol_srcu, index); + if (root != sub_root) { + down_read(&root->fs_info->cleanup_work_sem); + if (!(inode->i_sb->s_flags & MS_RDONLY)) + btrfs_orphan_cleanup(sub_root); + up_read(&root->fs_info->cleanup_work_sem); + } + return inode; } @@ -3869,7 +3996,11 @@ skip: /* Reached end of directory/root. Bump pos past the last item. */ if (key_type == BTRFS_DIR_INDEX_KEY) - filp->f_pos = INT_LIMIT(off_t); + /* + * 32-bit glibc will use getdents64, but then strtol - + * so the last number we can serve is this. + */ + filp->f_pos = 0x7fffffff; else filp->f_pos++; nopos: @@ -4219,7 +4350,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -4290,7 +4421,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -4336,6 +4467,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) return -ENOENT; + /* do not allow sys_link's with other subvols of the same device */ + if (root->objectid != BTRFS_I(inode)->root->objectid) + return -EPERM; + /* * 1 item for inode ref * 2 items for dir items @@ -4423,7 +4558,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) drop_on_err = 1; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) goto out_fail; @@ -5074,17 +5209,20 @@ static void btrfs_truncate(struct inode *inode) unsigned long nr; u64 mask = root->sectorsize - 1; - if (!S_ISREG(inode->i_mode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + if (!S_ISREG(inode->i_mode)) { + WARN_ON(1); return; + } ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) return; + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); /* * setattr is responsible for setting the ordered_data_close flag, @@ -5106,21 +5244,32 @@ static void btrfs_truncate(struct inode *inode) if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) btrfs_add_ordered_operation(trans, root, inode); - btrfs_set_trans_block_group(trans, inode); - btrfs_i_size_write(inode, inode->i_size); + while (1) { + ret = btrfs_truncate_inode_items(trans, root, inode, + inode->i_size, + BTRFS_EXTENT_DATA_KEY); + if (ret != -EAGAIN) + break; - ret = btrfs_orphan_add(trans, inode); - if (ret) - goto out; - /* FIXME, add redo link to tree so we don't leak on crash */ - ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, - BTRFS_EXTENT_DATA_KEY); - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + } - ret = btrfs_orphan_del(trans, inode); + if (ret == 0 && inode->i_nlink > 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + + ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); -out: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); BUG_ON(ret); @@ -5217,9 +5366,9 @@ void btrfs_destroy_inode(struct inode *inode) spin_lock(&root->list_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" - " list\n", inode->i_ino); - dump_stack(); + printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", + inode->i_ino); + list_del_init(&BTRFS_I(inode)->i_orphan); } spin_unlock(&root->list_lock); @@ -5476,7 +5625,7 @@ out_fail: * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; @@ -5495,7 +5644,10 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root) spin_unlock(&root->fs_info->delalloc_lock); if (inode) { filemap_flush(inode->i_mapping); - iput(inode); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); } cond_resched(); spin_lock(&root->fs_info->delalloc_lock); @@ -5569,7 +5721,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -5641,57 +5793,75 @@ out_fail: return err; } -static int prealloc_file_range(struct btrfs_trans_handle *trans, - struct inode *inode, u64 start, u64 end, - u64 locked_end, u64 alloc_hint, int mode) +static int prealloc_file_range(struct inode *inode, u64 start, u64 end, + u64 alloc_hint, int mode, loff_t actual_len) { + struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; u64 cur_offset = start; u64 num_bytes = end - start; int ret = 0; + u64 i_size; while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); - ret = btrfs_reserve_metadata_space(root, 1); - if (ret) - goto out; + trans = btrfs_start_transaction(root, 1); ret = btrfs_reserve_extent(trans, root, alloc_size, root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1); if (ret) { WARN_ON(1); - goto out; + goto stop_trans; + } + + ret = btrfs_reserve_metadata_space(root, 3); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, + ins.offset); + goto stop_trans; } + ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, locked_end, - 0, 0, 0, + ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); + num_bytes -= ins.offset; cur_offset += ins.offset; alloc_hint = ins.objectid + ins.offset; - btrfs_unreserve_metadata_space(root, 1); - } -out: - if (cur_offset > start) { + inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && - cur_offset > i_size_read(inode)) - btrfs_i_size_write(inode, cur_offset); + cur_offset > inode->i_size) { + if (cur_offset > actual_len) + i_size = actual_len; + else + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); + } + ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); + + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 3); } + return ret; +stop_trans: + btrfs_end_transaction(trans, root); return ret; + } static long btrfs_fallocate(struct inode *inode, int mode, @@ -5705,8 +5875,6 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; int ret; alloc_start = offset & ~mask; @@ -5725,9 +5893,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } - root = BTRFS_I(inode)->root; - - ret = btrfs_check_data_free_space(root, inode, + ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, alloc_end - alloc_start); if (ret) goto out; @@ -5736,12 +5902,6 @@ static long btrfs_fallocate(struct inode *inode, int mode, while (1) { struct btrfs_ordered_extent *ordered; - trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); - if (!trans) { - ret = -EIO; - goto out_free; - } - /* the extent lock is ordered inside the running * transaction */ @@ -5755,8 +5915,6 @@ static long btrfs_fallocate(struct inode *inode, int mode, btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); - btrfs_end_transaction(trans, BTRFS_I(inode)->root); - /* * we can't wait on the range with the transaction * running or with the extent lock held @@ -5777,10 +5935,12 @@ static long btrfs_fallocate(struct inode *inode, int mode, BUG_ON(IS_ERR(em) || !em); last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(trans, inode, cur_offset, - last_byte, locked_end + 1, - alloc_hint, mode); + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = prealloc_file_range(inode, + cur_offset, last_byte, + alloc_hint, mode, offset+len); if (ret < 0) { free_extent_map(em); break; @@ -5799,9 +5959,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); - btrfs_end_transaction(trans, BTRFS_I(inode)->root); -out_free: - btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start); + btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, + alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cdbb054102b9..645a17927a8f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -237,7 +237,6 @@ static noinline int create_subvol(struct btrfs_root *root, u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; - unsigned long nr = 1; /* * 1 - inode item @@ -290,7 +289,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_generation(&root_item, trans->transid); btrfs_set_root_level(&root_item, 0); btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, 0); + btrfs_set_root_used(&root_item, leaf->len); btrfs_set_root_last_snapshot(&root_item, 0); memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); @@ -342,24 +341,21 @@ static noinline int create_subvol(struct btrfs_root *root, d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: - nr = trans->blocks_used; err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; btrfs_unreserve_metadata_space(root, 6); - btrfs_btree_balance_dirty(root, nr); return ret; } static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen) { + struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; - int ret = 0; - int err; - unsigned long nr = 0; + int ret; if (!root->ref_cows) return -EINVAL; @@ -372,20 +368,20 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, */ ret = btrfs_reserve_metadata_space(root, 6); if (ret) - goto fail_unlock; + goto fail; pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) { ret = -ENOMEM; btrfs_unreserve_metadata_space(root, 6); - goto fail_unlock; + goto fail; } pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); if (!pending_snapshot->name) { ret = -ENOMEM; kfree(pending_snapshot); btrfs_unreserve_metadata_space(root, 6); - goto fail_unlock; + goto fail; } memcpy(pending_snapshot->name, name, namelen); pending_snapshot->name[namelen] = '\0'; @@ -395,10 +391,19 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot->root = root; list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - err = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + btrfs_unreserve_metadata_space(root, 6); -fail_unlock: - btrfs_btree_balance_dirty(root, nr); + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto fail; + } + BUG_ON(!inode); + d_instantiate(dentry, inode); + ret = 0; +fail: return ret; } @@ -1027,8 +1032,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off + len, - off + len, 0, &hint_byte, 1); + btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1); /* clone data */ key.objectid = src->i_ino; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5799bc46a309..5c2a9e78a949 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -291,16 +291,16 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) /* * remove an ordered extent from the tree. No references are dropped - * but, anyone waiting on this extent is woken up. + * and you must wake_up entry->wait. You must hold the tree mutex + * while you call this function. */ -int btrfs_remove_ordered_extent(struct inode *inode, +static int __btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); node = &entry->rb_node; rb_erase(node, &tree->tree); tree->last = NULL; @@ -326,16 +326,34 @@ int btrfs_remove_ordered_extent(struct inode *inode, } spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but any waiters are woken. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + ret = __btrfs_remove_ordered_extent(inode, entry); mutex_unlock(&tree->mutex); wake_up(&entry->wait); - return 0; + + return ret; } /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput) { struct list_head splice; struct list_head *cur; @@ -372,7 +390,10 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) if (inode) { btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); - iput(inode); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); } else { btrfs_put_ordered_extent(ordered); } @@ -430,7 +451,7 @@ again: btrfs_wait_ordered_range(inode, 0, (u64)-1); else filemap_flush(inode->i_mapping); - iput(inode); + btrfs_add_delayed_iput(inode); } cond_resched(); @@ -589,7 +610,7 @@ out: * After an extent is done, call this to conditionally update the on disk * i_size. i_size is updated to cover any fully written part of the file. */ -int btrfs_ordered_update_i_size(struct inode *inode, +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered) { struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; @@ -597,18 +618,32 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 disk_i_size; u64 new_i_size; u64 i_size_test; + u64 i_size = i_size_read(inode); struct rb_node *node; + struct rb_node *prev = NULL; struct btrfs_ordered_extent *test; + int ret = 1; + + if (ordered) + offset = entry_end(ordered); + else + offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); mutex_lock(&tree->mutex); disk_i_size = BTRFS_I(inode)->disk_i_size; + /* truncate file */ + if (disk_i_size > i_size) { + BTRFS_I(inode)->disk_i_size = i_size; + ret = 0; + goto out; + } + /* * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size >= inode->i_size || - ordered->file_offset + ordered->len <= disk_i_size) { + if (disk_i_size == i_size || offset <= disk_i_size) { goto out; } @@ -616,8 +651,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, * we can't update the disk_isize if there are delalloc bytes * between disk_i_size and this ordered extent */ - if (test_range_bit(io_tree, disk_i_size, - ordered->file_offset + ordered->len - 1, + if (test_range_bit(io_tree, disk_i_size, offset - 1, EXTENT_DELALLOC, 0, NULL)) { goto out; } @@ -626,20 +660,32 @@ int btrfs_ordered_update_i_size(struct inode *inode, * if we find an ordered extent then we can't update disk i_size * yet */ - node = &ordered->rb_node; - while (1) { - node = rb_prev(node); - if (!node) - break; + if (ordered) { + node = rb_prev(&ordered->rb_node); + } else { + prev = tree_search(tree, offset); + /* + * we insert file extents without involving ordered struct, + * so there should be no ordered struct cover this offset + */ + if (prev) { + test = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + BUG_ON(offset_in_entry(test, offset)); + } + node = prev; + } + while (node) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (test->file_offset + test->len <= disk_i_size) break; - if (test->file_offset >= inode->i_size) + if (test->file_offset >= i_size) break; if (test->file_offset >= disk_i_size) goto out; + node = rb_prev(node); } - new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); + new_i_size = min_t(u64, offset, i_size); /* * at this point, we know we can safely update i_size to at least @@ -647,7 +693,14 @@ int btrfs_ordered_update_i_size(struct inode *inode, * walk forward and see if ios from higher up in the file have * finished. */ - node = rb_next(&ordered->rb_node); + if (ordered) { + node = rb_next(&ordered->rb_node); + } else { + if (prev) + node = rb_next(prev); + else + node = rb_first(&tree->tree); + } i_size_test = 0; if (node) { /* @@ -655,10 +708,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, * between our ordered extent and the next one. */ test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset > entry_end(ordered)) + if (test->file_offset > offset) i_size_test = test->file_offset; } else { - i_size_test = i_size_read(inode); + i_size_test = i_size; } /* @@ -667,15 +720,25 @@ int btrfs_ordered_update_i_size(struct inode *inode, * are no delalloc bytes in this area, it is safe to update * disk_i_size to the end of the region. */ - if (i_size_test > entry_end(ordered) && - !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, - EXTENT_DELALLOC, 0, NULL)) { - new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + if (i_size_test > offset && + !test_range_bit(io_tree, offset, i_size_test - 1, + EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size); } BTRFS_I(inode)->disk_i_size = new_i_size; + ret = 0; out: + /* + * we need to remove the ordered extent with the tree lock held + * so that other people calling this function don't find our fully + * processed ordered entry and skip updating the i_size + */ + if (ordered) + __btrfs_remove_ordered_extent(inode, ordered); mutex_unlock(&tree->mutex); - return 0; + if (ordered) + wake_up(&ordered->wait); + return ret; } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f82e87488ca8..1fe1282ef47c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -150,12 +150,13 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); -int btrfs_ordered_update_i_size(struct inode *inode, +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput); #endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cfcc93c93a7b..ed3e4a2ec2c8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1561,6 +1561,20 @@ static int invalidate_extent_cache(struct btrfs_root *root, return 0; } +static void put_inodes(struct list_head *list) +{ + struct inodevec *ivec; + while (!list_empty(list)) { + ivec = list_entry(list->next, struct inodevec, list); + list_del(&ivec->list); + while (ivec->nr > 0) { + ivec->nr--; + iput(ivec->inode[ivec->nr]); + } + kfree(ivec); + } +} + static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) @@ -1723,6 +1737,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_btree_balance_dirty(root, nr); + /* + * put inodes outside transaction, otherwise we may deadlock. + */ + put_inodes(&inode_list); + if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); } @@ -1752,19 +1771,7 @@ out: btrfs_btree_balance_dirty(root, nr); - /* - * put inodes while we aren't holding the tree locks - */ - while (!list_empty(&inode_list)) { - struct inodevec *ivec; - ivec = list_entry(inode_list.next, struct inodevec, list); - list_del(&ivec->list); - while (ivec->nr > 0) { - ivec->nr--; - iput(ivec->inode[ivec->nr]); - } - kfree(ivec); - } + put_inodes(&inode_list); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -3274,8 +3281,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) return -ENOMEM; path = btrfs_alloc_path(); - if (!path) + if (!path) { + kfree(cluster); return -ENOMEM; + } rc->extents_found = 0; rc->extents_skipped = 0; @@ -3534,8 +3543,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->key.objectid, (unsigned long long)rc->block_group->flags); - btrfs_start_delalloc_inodes(fs_info->tree_root); - btrfs_wait_ordered_extents(fs_info->tree_root, 0); + btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); while (1) { rc->extents_found = 0; @@ -3755,6 +3764,7 @@ out: BTRFS_DATA_RELOC_TREE_OBJECTID); if (IS_ERR(fs_root)) err = PTR_ERR(fs_root); + btrfs_orphan_cleanup(fs_root); } return err; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 752a5463bf53..8a1ea6e64575 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,7 +66,8 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, + Opt_flushoncommit, Opt_discard, Opt_err, }; @@ -82,6 +83,7 @@ static match_table_t tokens = { {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, + {Opt_compress_force, "compress-force"}, {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, @@ -128,6 +130,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *num; int intarg; + int ret = 0; if (!options) return 0; @@ -172,6 +175,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: use compression\n"); btrfs_set_opt(info->mount_opt, COMPRESS); break; + case Opt_compress_force: + printk(KERN_INFO "btrfs: forcing compression\n"); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); @@ -262,12 +270,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_discard: btrfs_set_opt(info->mount_opt, DISCARD); break; + case Opt_err: + printk(KERN_INFO "btrfs: unrecognized mount option " + "'%s'\n", p); + ret = -EINVAL; + goto out; default: break; } } +out: kfree(options); - return 0; + return ret; } /* @@ -405,8 +419,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); @@ -450,6 +464,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(root, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); + if (btrfs_test_opt(root, DISCARD)) + seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c207e8c32c9b..b2acc79f1b34 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -333,6 +333,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (throttle) + btrfs_run_delayed_iputs(root); + return 0; } @@ -354,7 +357,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, * those extents are sent to disk but does not wait on them */ int btrfs_write_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, int mark) { int ret; int err = 0; @@ -367,7 +370,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_DIRTY); + mark); if (ret) break; while (start <= end) { @@ -413,7 +416,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, * on all the pages and clear them from the dirty pages state tree */ int btrfs_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, int mark) { int ret; int err = 0; @@ -425,12 +428,12 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, unsigned long index; while (1) { - ret = find_first_extent_bit(dirty_pages, 0, &start, &end, - EXTENT_DIRTY); + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); if (ret) break; - clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { index = start >> PAGE_CACHE_SHIFT; start = (u64)(index + 1) << PAGE_CACHE_SHIFT; @@ -460,13 +463,13 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, * those extents are on disk for transaction or log commit */ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, int mark) { int ret; int ret2; - ret = btrfs_write_marked_extents(root, dirty_pages); - ret2 = btrfs_wait_marked_extents(root, dirty_pages); + ret = btrfs_write_marked_extents(root, dirty_pages, mark); + ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); return ret || ret2; } @@ -479,7 +482,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, return filemap_write_and_wait(btree_inode->i_mapping); } return btrfs_write_and_wait_marked_extents(root, - &trans->transaction->dirty_pages); + &trans->transaction->dirty_pages, + EXTENT_DIRTY); } /* @@ -497,13 +501,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, { int ret; u64 old_root_bytenr; + u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; + old_root_used = btrfs_root_used(&root->root_item); btrfs_write_dirty_block_groups(trans, root); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); - if (old_root_bytenr == root->node->start) + if (old_root_bytenr == root->node->start && + old_root_used == btrfs_root_used(&root->root_item)) break; btrfs_set_root_node(&root->root_item, root->node); @@ -512,6 +519,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, &root->root_item); BUG_ON(ret); + old_root_used = btrfs_root_used(&root->root_item); ret = btrfs_write_dirty_block_groups(trans, root); BUG_ON(ret); } @@ -795,7 +803,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(&pending->root_key, &key, sizeof(key)); fail: kfree(new_root_item); - btrfs_unreserve_metadata_space(root, 6); return ret; } @@ -807,7 +814,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, u64 index = 0; struct btrfs_trans_handle *trans; struct inode *parent_inode; - struct inode *inode; struct btrfs_root *parent_root; parent_inode = pending->dentry->d_parent->d_inode; @@ -839,8 +845,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, BUG_ON(ret); - inode = btrfs_lookup_dentry(parent_inode, pending->dentry); - d_instantiate(pending->dentry, inode); fail: btrfs_end_transaction(trans, fs_info->fs_root); return ret; @@ -994,11 +998,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); if (flush_on_commit) { - btrfs_start_delalloc_inodes(root); - ret = btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); } else if (snap_pending) { - ret = btrfs_wait_ordered_extents(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); } @@ -1116,6 +1120,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, current->journal_info = NULL; kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (current != root->fs_info->transaction_kthread) + btrfs_run_delayed_iputs(root); + return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d4e3e7a6938c..93c7ccb33118 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -107,10 +107,10 @@ void btrfs_throttle(struct btrfs_root *root); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages); + struct extent_io_tree *dirty_pages, int mark); int btrfs_write_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages); + struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages); + struct extent_io_tree *dirty_pages, int mark); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 741666a7676a..4a9434b622ec 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -542,8 +542,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, extent_end, start, &alloc_hint, 1); + ret = btrfs_drop_extents(trans, inode, start, extent_end, + &alloc_hint, 1); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || @@ -930,6 +930,17 @@ out_nowrite: return 0; } +static int insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + int ret; + ret = btrfs_find_orphan_item(root, offset); + if (ret > 0) + ret = btrfs_insert_orphan_item(trans, root, offset); + return ret; +} + + /* * There are a few corners where the link count of the file can't * be properly maintained during replay. So, instead of adding @@ -997,9 +1008,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } BTRFS_I(inode)->index_cnt = (u64)-1; - if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, - inode->i_ino, 1); + if (inode->i_nlink == 0) { + if (S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + inode->i_ino, 1); + BUG_ON(ret); + } + ret = insert_orphan_item(trans, root, inode->i_ino); BUG_ON(ret); } btrfs_free_path(path); @@ -1587,7 +1602,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, /* inode keys are done during the first stage */ if (key.type == BTRFS_INODE_ITEM_KEY && wc->stage == LOG_WALK_REPLAY_INODES) { - struct inode *inode; struct btrfs_inode_item *inode_item; u32 mode; @@ -1603,31 +1617,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); BUG_ON(ret); - /* for regular files, truncate away - * extents past the new EOF + /* for regular files, make sure corresponding + * orhpan item exist. extents past the new EOF + * will be truncated later by orphan cleanup. */ if (S_ISREG(mode)) { - inode = read_one_inode(root, - key.objectid); - BUG_ON(!inode); - - ret = btrfs_truncate_inode_items(wc->trans, - root, inode, inode->i_size, - BTRFS_EXTENT_DATA_KEY); + ret = insert_orphan_item(wc->trans, root, + key.objectid); BUG_ON(ret); - - /* if the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done - */ - if (inode->i_nlink == 0) { - btrfs_inc_nlink(inode); - btrfs_update_inode(wc->trans, - root, inode); - } - iput(inode); } + ret = link_to_fixup_dir(wc->trans, root, path, key.objectid); BUG_ON(ret); @@ -1977,10 +1976,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, { int index1; int index2; + int mark; int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - u64 log_transid = 0; + unsigned long log_transid = 0; mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; @@ -2014,24 +2014,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } + log_transid = root->log_transid; + if (log_transid % 2 == 0) + mark = EXTENT_DIRTY; + else + mark = EXTENT_NEW; + /* we start IO on all the marked extents here, but we don't actually * wait for them until later. */ - ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); BUG_ON(ret); btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; - log_transid = root->log_transid; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; smp_mb(); /* - * log tree has been flushed to disk, new modifications of - * the log will be written to new positions. so it's safe to - * allow log writers to go in. + * IO has been started, blocks of the log tree have WRITTEN flag set + * in their headers. new modifications of the log will be written to + * new positions. so it's safe to allow log writers to go in. */ mutex_unlock(&root->log_mutex); @@ -2052,7 +2057,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { - btrfs_wait_marked_extents(log, &log->dirty_log_pages); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2072,16 +2077,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { - btrfs_wait_marked_extents(log, &log->dirty_log_pages); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; } ret = btrfs_write_and_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages); + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); BUG_ON(ret); - btrfs_wait_marked_extents(log, &log->dirty_log_pages); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_set_super_log_root(&root->fs_info->super_for_commit, log_root_tree->node->start); @@ -2147,12 +2153,12 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); if (ret) break; - clear_extent_dirty(&log->dirty_log_pages, - start, end, GFP_NOFS); + clear_extent_bits(&log->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } if (log->log_transid > 0) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7eda483d7b5a..41ecbb2347f2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1135,7 +1135,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_metadata_alloc_bits; if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->rw_devices <= 4) { + root->fs_info->fs_devices->num_devices <= 4) { printk(KERN_ERR "btrfs: unable to go below four devices " "on raid10\n"); ret = -EINVAL; @@ -1143,7 +1143,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->rw_devices <= 2) { + root->fs_info->fs_devices->num_devices <= 2) { printk(KERN_ERR "btrfs: unable to go below two " "devices on raid1\n"); ret = -EINVAL; @@ -1434,8 +1434,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) return -EINVAL; bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); - if (!bdev) - return -EIO; + if (IS_ERR(bdev)) + return PTR_ERR(bdev); if (root->fs_info->fs_devices->seeding) { seeding_dev = 1; @@ -2209,7 +2209,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = 10 * calc_size; min_stripe_size = 64 * 1024 * 1024; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_chunk_size = 4 * calc_size; + max_chunk_size = 256 * 1024 * 1024; min_stripe_size = 32 * 1024 * 1024; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { calc_size = 8 * 1024 * 1024; @@ -2538,6 +2538,11 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; + if (btrfs_test_opt(root, DEGRADED)) { + free_extent_map(em); + return 0; + } + map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { if (!map->stripes[i].dev->writeable) { @@ -2649,8 +2654,10 @@ again: em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); - if (!em && unplug_page) + if (!em && unplug_page) { + kfree(multi); return 0; + } if (!em) { printk(KERN_CRIT "unable to find logical %llu len %llu\n", diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b6dd5967c48a..193b58f7d3f3 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -85,22 +85,23 @@ out: return ret; } -int __btrfs_setxattr(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int do_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) { struct btrfs_dir_item *di; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; struct btrfs_path *path; - int ret = 0, mod = 0; + size_t name_len = strlen(name); + int ret = 0; + + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) + return -ENOSPC; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_join_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - /* first lets see if we already have this xattr */ di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, strlen(name), -1); @@ -118,15 +119,12 @@ int __btrfs_setxattr(struct inode *inode, const char *name, } ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; + BUG_ON(ret); btrfs_release_path(root, path); /* if we don't have a value then we are removing the xattr */ - if (!value) { - mod = 1; + if (!value) goto out; - } } else { btrfs_release_path(root, path); @@ -138,20 +136,45 @@ int __btrfs_setxattr(struct inode *inode, const char *name, } /* ok we have to create a completely new xattr */ - ret = btrfs_insert_xattr_item(trans, root, name, strlen(name), - value, size, inode->i_ino); + ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino, + name, name_len, value, size); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (trans) + return do_setxattr(trans, inode, name, value, size, flags); + + ret = btrfs_reserve_metadata_space(root, 2); if (ret) - goto out; - mod = 1; + return ret; -out: - if (mod) { - inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, inode); + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; } + btrfs_set_trans_block_group(trans, inode); - btrfs_end_transaction(trans, root); - btrfs_free_path(path); + ret = do_setxattr(trans, inode, name, value, size, flags); + if (ret) + goto out; + + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); +out: + btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 2); return ret; } @@ -314,7 +337,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (size == 0) value = ""; /* empty EA, do not remove */ - return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); + + return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) @@ -329,10 +354,13 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; - return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); + + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + XATTR_REPLACE); } -int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { int err; size_t len; @@ -354,7 +382,7 @@ int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) } else { strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - err = __btrfs_setxattr(inode, name, value, len, 0); + err = __btrfs_setxattr(trans, inode, name, value, len, 0); kfree(name); } diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index c71e9c3cf3f7..721efa0346e0 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -27,15 +27,16 @@ extern struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); -extern int __btrfs_setxattr(struct inode *inode, const char *name, - const void *value, size_t size, int flags); - +extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags); extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size); extern int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int btrfs_removexattr(struct dentry *dentry, const char *name); -extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir); +extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); #endif /* __XATTR__ */ diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 3797e0077b35..2906077ac798 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -84,7 +84,7 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) { struct cachefiles_object *fsdef; - struct nameidata nd; + struct path path; struct kstatfs stats; struct dentry *graveyard, *cachedir, *root; const struct cred *saved_cred; @@ -114,15 +114,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) _debug("- fsdef %p", fsdef); /* look up the directory at the root of the cache */ - memset(&nd, 0, sizeof(nd)); - - ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd); + ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); if (ret < 0) goto error_open_root; - cache->mnt = mntget(nd.path.mnt); - root = dget(nd.path.dentry); - path_put(&nd.path); + cache->mnt = path.mnt; + root = path.dentry; /* check parameters */ ret = -EOPNOTSUPP; diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 4618516dd994..c2413561ea75 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -21,6 +21,7 @@ #include <linux/mount.h> #include <linux/statfs.h> #include <linux/ctype.h> +#include <linux/string.h> #include <linux/fs_struct.h> #include "internal.h" @@ -257,8 +258,7 @@ static ssize_t cachefiles_daemon_write(struct file *file, if (args == data) goto error; *args = '\0'; - for (args++; isspace(*args); args++) - continue; + args = skip_spaces(++args); } /* run the appropriate command handler */ diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index a6c8c6fe8df9..1d8332563863 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -11,7 +11,6 @@ #include <linux/mount.h> #include <linux/file.h> -#include <linux/ima.h> #include "internal.h" /* @@ -923,7 +922,6 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) if (IS_ERR(file)) { ret = PTR_ERR(file); } else { - ima_counts_get(file); ret = -EIO; if (file->f_op->write) { pos = (loff_t) page->index << PAGE_SHIFT; diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 094ea65afc85..7b2600b380d7 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -5,7 +5,9 @@ have duplicated data). Fix oops in cifs_lookup. Workaround problem mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. Disable use of server inode numbers when server only partially supports them (e.g. for one server querying inode numbers on -FindFirst fails but QPathInfo queries works). +FindFirst fails but QPathInfo queries works). Fix oops with dfs in +cifs_put_smb_ses. Fix mmap to work on directio mounts (needed +for OpenOffice when on forcedirectio mount e.g.) Version 1.60 ------------- diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index fea9e898c4ba..b44ce0a0711c 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -269,7 +269,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, int err; mntget(newmnt); - err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist); + err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist); switch (err) { case 0: path_put(&nd->path); @@ -371,7 +371,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) if (IS_ERR(mnt)) goto out_err; - nd->path.mnt->mnt_flags |= MNT_SHRINKABLE; rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); out: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 29f1da761bbf..8c6a03627176 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -758,7 +758,7 @@ const struct file_operations cifs_file_ops = { }; const struct file_operations cifs_file_direct_ops = { - /* no mmap, no aio, no readv - + /* no aio, no readv - BB reevaluate whether they can be done with directio, no cache */ .read = cifs_user_read, .write = cifs_user_write, @@ -767,6 +767,7 @@ const struct file_operations cifs_file_direct_ops = { .lock = cifs_lock, .fsync = cifs_fsync, .flush = cifs_flush, + .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, #ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 63ea83ff687f..3bbcaa716b3c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2287,12 +2287,12 @@ int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, char *mount_data_global, const char *devname) { - int rc = 0; + int rc; int xid; struct smb_vol *volume_info; - struct cifsSesInfo *pSesInfo = NULL; - struct cifsTconInfo *tcon = NULL; - struct TCP_Server_Info *srvTcp = NULL; + struct cifsSesInfo *pSesInfo; + struct cifsTconInfo *tcon; + struct TCP_Server_Info *srvTcp; char *full_path; char *mount_data = mount_data_global; #ifdef CONFIG_CIFS_DFS_UPCALL @@ -2301,6 +2301,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, int referral_walks_count = 0; try_mount_again: #endif + rc = 0; + tcon = NULL; + pSesInfo = NULL; + srvTcp = NULL; full_path = NULL; xid = GetXid(); @@ -2597,6 +2601,7 @@ remote_path_check: cleanup_volume_info(&volume_info); referral_walks_count++; + FreeXid(xid); goto try_mount_again; } #else /* No DFS support, return error on mount */ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 1f42f772865a..6ccf7262d1b7 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -214,7 +214,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode, posix_flags |= SMB_O_EXCL; if (oflags & O_TRUNC) posix_flags |= SMB_O_TRUNC; - if (oflags & O_SYNC) + /* be safe and imply O_SYNC for O_DSYNC */ + if (oflags & O_DSYNC) posix_flags |= SMB_O_SYNC; if (oflags & O_DIRECTORY) posix_flags |= SMB_O_DIRECTORY; diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 75949d6a5f1b..6177f7cca16a 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -24,7 +24,7 @@ */ /* - * See Documentation/filesystems/Exporting + * See Documentation/filesystems/nfs/Exporting * and examples in fs/exportfs * * Since cifs is a network file system, an "fsid" must be included for diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 429337eb7afe..057e1dae12ab 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -76,8 +76,10 @@ static inline fmode_t cifs_posix_convert_flags(unsigned int flags) reopening a file. They had their effect on the original open */ if (flags & O_APPEND) posix_flags |= (fmode_t)O_APPEND; - if (flags & O_SYNC) - posix_flags |= (fmode_t)O_SYNC; + if (flags & O_DSYNC) + posix_flags |= (fmode_t)O_DSYNC; + if (flags & __O_SYNC) + posix_flags |= (fmode_t)__O_SYNC; if (flags & O_DIRECTORY) posix_flags |= (fmode_t)O_DIRECTORY; if (flags & O_NOFOLLOW) diff --git a/fs/compat.c b/fs/compat.c index 6c19040ffeef..00d90c2e66f0 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -38,8 +38,6 @@ #include <linux/dirent.h> #include <linux/fsnotify.h> #include <linux/highuid.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> #include <linux/nfsd/syscall.h> #include <linux/personality.h> #include <linux/rwsem.h> diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 2346895b3a77..c5c45de1a2ee 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -111,43 +111,40 @@ #include <linux/dvb/frontend.h> #include <linux/dvb/video.h> +#include <linux/sort.h> + #ifdef CONFIG_SPARC #include <asm/fbio.h> #endif -static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, - unsigned long arg, struct file *f) -{ - return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg)); -} - -static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg) +static int w_long(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); int err; unsigned long val; - + set_fs (KERNEL_DS); err = sys_ioctl(fd, cmd, (unsigned long)&val); set_fs (old_fs); - if (!err && put_user(val, (u32 __user *)compat_ptr(arg))) + if (!err && put_user(val, argp)) return -EFAULT; return err; } - -static int rw_long(unsigned int fd, unsigned int cmd, unsigned long arg) + +static int rw_long(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); - u32 __user *argptr = compat_ptr(arg); int err; unsigned long val; - - if(get_user(val, argptr)) + + if(get_user(val, argp)) return -EFAULT; set_fs (KERNEL_DS); err = sys_ioctl(fd, cmd, (unsigned long)&val); set_fs (old_fs); - if (!err && put_user(val, argptr)) + if (!err && put_user(val, argp)) return -EFAULT; return err; } @@ -161,7 +158,8 @@ struct compat_video_event { } u; }; -static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_video_get_event(unsigned int fd, unsigned int cmd, + struct compat_video_event __user *up) { struct video_event kevent; mm_segment_t old_fs = get_fs(); @@ -172,8 +170,6 @@ static int do_video_get_event(unsigned int fd, unsigned int cmd, unsigned long a set_fs(old_fs); if (!err) { - struct compat_video_event __user *up = compat_ptr(arg); - err = put_user(kevent.type, &up->type); err |= put_user(kevent.timestamp, &up->timestamp); err |= put_user(kevent.u.size.w, &up->u.size.w); @@ -192,15 +188,14 @@ struct compat_video_still_picture { int32_t size; }; -static int do_video_stillpicture(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_video_stillpicture(unsigned int fd, unsigned int cmd, + struct compat_video_still_picture __user *up) { - struct compat_video_still_picture __user *up; struct video_still_picture __user *up_native; compat_uptr_t fp; int32_t size; int err; - up = (struct compat_video_still_picture __user *) arg; err = get_user(fp, &up->iFrame); err |= get_user(size, &up->size); if (err) @@ -224,14 +219,13 @@ struct compat_video_spu_palette { compat_uptr_t palette; }; -static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, + struct compat_video_spu_palette __user *up) { - struct compat_video_spu_palette __user *up; struct video_spu_palette __user *up_native; compat_uptr_t palp; int length, err; - up = (struct compat_video_spu_palette __user *) arg; err = get_user(palp, &up->palette); err |= get_user(length, &up->length); @@ -299,16 +293,15 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov return 0; } -static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, + sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; - sg_io_hdr32_t __user *sgio32; u16 iovec_count; u32 data; void __user *dxferp; int err; - sgio32 = compat_ptr(arg); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -398,11 +391,11 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ int unused; }; -static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct + compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; - struct compat_sg_req_info __user *o = (void __user *)arg; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); err = sys_ioctl(fd,cmd,(unsigned long)r); if (err < 0) @@ -430,9 +423,9 @@ struct sock_fprog32 { #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) -static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, + struct sock_fprog32 __user *u_fprog32) { - struct sock_fprog32 __user *u_fprog32 = compat_ptr(arg); struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); void __user *fptr64; u32 fptr32; @@ -469,15 +462,14 @@ struct ppp_idle32 { }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) -static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ppp_gidle(unsigned int fd, unsigned int cmd, + struct ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; - struct ppp_idle32 __user *idle32; __kernel_time_t xmit, recv; int err; idle = compat_alloc_user_space(sizeof(*idle)); - idle32 = compat_ptr(arg); err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); @@ -491,15 +483,14 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, unsigned long arg) return err; } -static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ppp_scompress(unsigned int fd, unsigned int cmd, + struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; - struct ppp_option_data32 __user *odata32; __u32 data; void __user *datap; odata = compat_alloc_user_space(sizeof(*odata)); - odata32 = compat_ptr(arg); if (get_user(data, &odata32->ptr)) return -EFAULT; @@ -515,35 +506,6 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, unsigned long arg) return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); } -static int ppp_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - int err; - - switch (cmd) { - case PPPIOCGIDLE32: - err = ppp_gidle(fd, cmd, arg); - break; - - case PPPIOCSCOMPRESS32: - err = ppp_scompress(fd, cmd, arg); - break; - - default: - do { - static int count; - if (++count <= 20) - printk("ppp_ioctl: Unknown cmd fd(%d) " - "cmd(%08x) arg(%08x)\n", - (int)fd, (unsigned int)cmd, (unsigned int)arg); - } while(0); - err = -EINVAL; - break; - }; - - return err; -} - - #ifdef CONFIG_BLOCK struct mtget32 { compat_long_t mt_type; @@ -561,7 +523,7 @@ struct mtpos32 { }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) -static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) { mm_segment_t old_fs = get_fs(); struct mtget get; @@ -581,15 +543,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) kcmd = MTIOCGET; karg = &get; break; - default: - do { - static int count; - if (++count <= 20) - printk("mt_ioctl: Unknown cmd fd(%d) " - "cmd(%08x) arg(%08x)\n", - (int)fd, (unsigned int)cmd, (unsigned int)arg); - } while(0); - return -EINVAL; } set_fs (KERNEL_DS); err = sys_ioctl (fd, kcmd, (unsigned long)karg); @@ -598,11 +551,11 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) return err; switch (cmd) { case MTIOCPOS32: - upos32 = compat_ptr(arg); + upos32 = argp; err = __put_user(pos.mt_blkno, &upos32->mt_blkno); break; case MTIOCGET32: - umget32 = compat_ptr(arg); + umget32 = argp; err = __put_user(get.mt_type, &umget32->mt_type); err |= __put_user(get.mt_resid, &umget32->mt_resid); err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); @@ -617,162 +570,8 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) #endif /* CONFIG_BLOCK */ -#ifdef CONFIG_VT - -static int vt_check(struct file *file) -{ - struct tty_struct *tty; - struct inode *inode = file->f_path.dentry->d_inode; - struct vc_data *vc; - - if (file->f_op->unlocked_ioctl != tty_ioctl) - return -EINVAL; - - tty = (struct tty_struct *)file->private_data; - if (tty_paranoia_check(tty, inode, "tty_ioctl")) - return -EINVAL; - - if (tty->ops->ioctl != vt_ioctl) - return -EINVAL; - - vc = (struct vc_data *)tty->driver_data; - if (!vc_cons_allocated(vc->vc_num)) /* impossible? */ - return -ENOIOCTLCMD; - - /* - * To have permissions to do most of the vt ioctls, we either have - * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG. - */ - if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG)) - return 1; - return 0; -} - -struct consolefontdesc32 { - unsigned short charcount; /* characters in font (256 or 512) */ - unsigned short charheight; /* scan lines per character (1-32) */ - compat_caddr_t chardata; /* font data in expanded form */ -}; - -static int do_fontx_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file) -{ - struct consolefontdesc32 __user *user_cfd = compat_ptr(arg); - struct console_font_op op; - compat_caddr_t data; - int i, perm; - - perm = vt_check(file); - if (perm < 0) return perm; - - switch (cmd) { - case PIO_FONTX: - if (!perm) - return -EPERM; - op.op = KD_FONT_OP_SET; - op.flags = 0; - op.width = 8; - if (get_user(op.height, &user_cfd->charheight) || - get_user(op.charcount, &user_cfd->charcount) || - get_user(data, &user_cfd->chardata)) - return -EFAULT; - op.data = compat_ptr(data); - return con_font_op(vc_cons[fg_console].d, &op); - case GIO_FONTX: - op.op = KD_FONT_OP_GET; - op.flags = 0; - op.width = 8; - if (get_user(op.height, &user_cfd->charheight) || - get_user(op.charcount, &user_cfd->charcount) || - get_user(data, &user_cfd->chardata)) - return -EFAULT; - if (!data) - return 0; - op.data = compat_ptr(data); - i = con_font_op(vc_cons[fg_console].d, &op); - if (i) - return i; - if (put_user(op.height, &user_cfd->charheight) || - put_user(op.charcount, &user_cfd->charcount) || - put_user((compat_caddr_t)(unsigned long)op.data, - &user_cfd->chardata)) - return -EFAULT; - return 0; - } - return -EINVAL; -} - -struct console_font_op32 { - compat_uint_t op; /* operation code KD_FONT_OP_* */ - compat_uint_t flags; /* KD_FONT_FLAG_* */ - compat_uint_t width, height; /* font size */ - compat_uint_t charcount; - compat_caddr_t data; /* font data with height fixed to 32 */ -}; - -static int do_kdfontop_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file) -{ - struct console_font_op op; - struct console_font_op32 __user *fontop = compat_ptr(arg); - int perm = vt_check(file), i; - struct vc_data *vc; - - if (perm < 0) return perm; - - if (copy_from_user(&op, fontop, sizeof(struct console_font_op32))) - return -EFAULT; - if (!perm && op.op != KD_FONT_OP_GET) - return -EPERM; - op.data = compat_ptr(((struct console_font_op32 *)&op)->data); - op.flags |= KD_FONT_FLAG_OLD; - vc = ((struct tty_struct *)file->private_data)->driver_data; - i = con_font_op(vc, &op); - if (i) - return i; - ((struct console_font_op32 *)&op)->data = (unsigned long)op.data; - if (copy_to_user(fontop, &op, sizeof(struct console_font_op32))) - return -EFAULT; - return 0; -} - -struct unimapdesc32 { - unsigned short entry_ct; - compat_caddr_t entries; -}; - -static int do_unimap_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg, struct file *file) -{ - struct unimapdesc32 tmp; - struct unimapdesc32 __user *user_ud = compat_ptr(arg); - int perm = vt_check(file); - struct vc_data *vc; - - if (perm < 0) - return perm; - if (copy_from_user(&tmp, user_ud, sizeof tmp)) - return -EFAULT; - if (tmp.entries) - if (!access_ok(VERIFY_WRITE, compat_ptr(tmp.entries), - tmp.entry_ct*sizeof(struct unipair))) - return -EFAULT; - vc = ((struct tty_struct *)file->private_data)->driver_data; - switch (cmd) { - case PIO_UNIMAP: - if (!perm) - return -EPERM; - return con_set_unimap(vc, tmp.entry_ct, - compat_ptr(tmp.entries)); - case GIO_UNIMAP: - if (!perm && fg_console != vc->vc_num) - return -EPERM; - return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct), - compat_ptr(tmp.entries)); - } - return 0; -} - -#endif /* CONFIG_VT */ - -static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, + compat_uid_t __user *argp) { mm_segment_t old_fs = get_fs(); __kernel_uid_t kuid; @@ -785,20 +584,15 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, unsigned long a set_fs(old_fs); if (err >= 0) - err = put_user(kuid, (compat_uid_t __user *)compat_ptr(arg)); + err = put_user(kuid, argp); return err; } -static __used int -ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - return -EINVAL; -} - -static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg) +static int ioc_settimeout(unsigned int fd, unsigned int cmd, + compat_ulong_t __user *argp) { - return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg); + return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp); } /* Bluetooth ioctls */ @@ -856,7 +650,8 @@ static int set_raw32_request(struct raw_config_request *req, struct raw32_config return ret ? -EFAULT : 0; } -static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +static int raw_ioctl(unsigned fd, unsigned cmd, + struct raw32_config_request __user *user_req) { int ret; @@ -864,7 +659,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) case RAW_SETBIND: case RAW_GETBIND: { struct raw_config_request req; - struct raw32_config_request __user *user_req = compat_ptr(arg); mm_segment_t oldfs = get_fs(); if ((ret = get_raw32_request(&req, user_req))) @@ -879,9 +673,6 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg) } break; } - default: - ret = sys_ioctl(fd, cmd, arg); - break; } return ret; } @@ -909,11 +700,11 @@ struct serial_struct32 { compat_int_t reserved[1]; }; -static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +static int serial_struct_ioctl(unsigned fd, unsigned cmd, + struct serial_struct32 __user *ss32) { typedef struct serial_struct SS; typedef struct serial_struct32 SS32; - struct serial_struct32 __user *ss32 = compat_ptr(arg); int err; struct serial_struct ss; mm_segment_t oldseg = get_fs(); @@ -951,96 +742,6 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, unsigned long arg) return err; } -struct usbdevfs_ctrltransfer32 { - u8 bRequestType; - u8 bRequest; - u16 wValue; - u16 wIndex; - u16 wLength; - u32 timeout; /* in milliseconds */ - compat_caddr_t data; -}; - -#define USBDEVFS_CONTROL32 _IOWR('U', 0, struct usbdevfs_ctrltransfer32) - -static int do_usbdevfs_control(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct usbdevfs_ctrltransfer32 __user *p32 = compat_ptr(arg); - struct usbdevfs_ctrltransfer __user *p; - __u32 udata; - p = compat_alloc_user_space(sizeof(*p)); - if (copy_in_user(p, p32, (sizeof(*p32) - sizeof(compat_caddr_t))) || - get_user(udata, &p32->data) || - put_user(compat_ptr(udata), &p->data)) - return -EFAULT; - return sys_ioctl(fd, USBDEVFS_CONTROL, (unsigned long)p); -} - - -struct usbdevfs_bulktransfer32 { - compat_uint_t ep; - compat_uint_t len; - compat_uint_t timeout; /* in milliseconds */ - compat_caddr_t data; -}; - -#define USBDEVFS_BULK32 _IOWR('U', 2, struct usbdevfs_bulktransfer32) - -static int do_usbdevfs_bulk(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct usbdevfs_bulktransfer32 __user *p32 = compat_ptr(arg); - struct usbdevfs_bulktransfer __user *p; - compat_uint_t n; - compat_caddr_t addr; - - p = compat_alloc_user_space(sizeof(*p)); - - if (get_user(n, &p32->ep) || put_user(n, &p->ep) || - get_user(n, &p32->len) || put_user(n, &p->len) || - get_user(n, &p32->timeout) || put_user(n, &p->timeout) || - get_user(addr, &p32->data) || put_user(compat_ptr(addr), &p->data)) - return -EFAULT; - - return sys_ioctl(fd, USBDEVFS_BULK, (unsigned long)p); -} - - -/* - * USBDEVFS_SUBMITURB, USBDEVFS_REAPURB and USBDEVFS_REAPURBNDELAY - * are handled in usbdevfs core. -Christopher Li - */ - -struct usbdevfs_disconnectsignal32 { - compat_int_t signr; - compat_caddr_t context; -}; - -#define USBDEVFS_DISCSIGNAL32 _IOR('U', 14, struct usbdevfs_disconnectsignal32) - -static int do_usbdevfs_discsignal(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct usbdevfs_disconnectsignal kdis; - struct usbdevfs_disconnectsignal32 __user *udis; - mm_segment_t old_fs; - u32 uctx; - int err; - - udis = compat_ptr(arg); - - if (get_user(kdis.signr, &udis->signr) || - __get_user(uctx, &udis->context)) - return -EFAULT; - - kdis.context = compat_ptr(uctx); - - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = sys_ioctl(fd, USBDEVFS_DISCSIGNAL, (unsigned long) &kdis); - set_fs(old_fs); - - return err; -} - /* * I2C layer ioctls */ @@ -1069,9 +770,9 @@ struct i2c_rdwr_aligned { struct i2c_msg msgs[0]; }; -static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_rdwr_ioctl_data32 __user *udata) { - struct i2c_rdwr_ioctl_data32 __user *udata = compat_ptr(arg); struct i2c_rdwr_aligned __user *tdata; struct i2c_msg __user *tmsgs; struct i2c_msg32 __user *umsgs; @@ -1105,10 +806,10 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, unsigned long ar return sys_ioctl(fd, cmd, (unsigned long)tdata); } -static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) +static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, + struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; - struct i2c_smbus_ioctl_data32 __user *udata; compat_caddr_t datap; tdata = compat_alloc_user_space(sizeof(*tdata)); @@ -1117,7 +818,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a if (!access_ok(VERIFY_WRITE, tdata, sizeof(*tdata))) return -EFAULT; - udata = compat_ptr(arg); if (!access_ok(VERIFY_READ, udata, sizeof(*udata))) return -EFAULT; @@ -1137,7 +837,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) -static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) { mm_segment_t oldfs = get_fs(); compat_ulong_t val32; @@ -1155,29 +855,14 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) if (ret) return ret; val32 = kval; - return put_user(val32, (unsigned int __user *)arg); + return put_user(val32, (unsigned int __user *)argp); case RTC_IRQP_SET32: - return sys_ioctl(fd, RTC_IRQP_SET, arg); + return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: - return sys_ioctl(fd, RTC_EPOCH_SET, arg); - default: - /* unreached */ - return -ENOIOCTLCMD; + return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); } -} -static int -lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct compat_timeval __user *tc = (struct compat_timeval __user *)arg; - struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval)); - struct timeval ts; - if (get_user(ts.tv_sec, &tc->tv_sec) || - get_user(ts.tv_usec, &tc->tv_usec) || - put_user(ts.tv_sec, &tn->tv_sec) || - put_user(ts.tv_usec, &tn->tv_usec)) - return -EFAULT; - return sys_ioctl(fd, cmd, (unsigned long)tn); + return -ENOIOCTLCMD; } /* on ia32 l_start is on a 32-bit boundary */ @@ -1197,9 +882,9 @@ struct space_resv_32 { #define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) /* just account for different alignment */ -static int compat_ioctl_preallocate(struct file *file, unsigned long arg) +static int compat_ioctl_preallocate(struct file *file, + struct space_resv_32 __user *p32) { - struct space_resv_32 __user *p32 = compat_ptr(arg); struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || @@ -1215,27 +900,13 @@ static int compat_ioctl_preallocate(struct file *file, unsigned long arg) } #endif +/* + * simple reversible transform to make our table more evenly + * distributed after sorting. + */ +#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) -typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int, - unsigned long, struct file *); - -struct ioctl_trans { - unsigned long cmd; - ioctl_trans_handler_t handler; - struct ioctl_trans *next; -}; - -#define HANDLE_IOCTL(cmd,handler) \ - { (cmd), (ioctl_trans_handler_t)(handler) }, - -/* pointer to compatible structure or no argument */ -#define COMPATIBLE_IOCTL(cmd) \ - { (cmd), do_ioctl32_pointer }, - -/* argument is an unsigned long integer, not a pointer */ -#define ULONG_IOCTL(cmd) \ - { (cmd), (ioctl_trans_handler_t)sys_ioctl }, - +#define COMPATIBLE_IOCTL(cmd) XFORM(cmd), /* ioctl should not be warned about even if it's not implemented. Valid reasons to use this: - It is implemented with ->compat_ioctl on some device, but programs @@ -1245,7 +916,7 @@ struct ioctl_trans { Most other reasons are not valid. */ #define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) -static struct ioctl_trans ioctl_start[] = { +static unsigned int ioctl_pointer[] = { /* compatible ioctls first */ COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ @@ -1256,7 +927,6 @@ COMPATIBLE_IOCTL(TCSETA) COMPATIBLE_IOCTL(TCSETAW) COMPATIBLE_IOCTL(TCSETAF) COMPATIBLE_IOCTL(TCSBRK) -ULONG_IOCTL(TCSBRKP) COMPATIBLE_IOCTL(TCXONC) COMPATIBLE_IOCTL(TCFLSH) COMPATIBLE_IOCTL(TCGETS) @@ -1266,7 +936,6 @@ COMPATIBLE_IOCTL(TCSETSF) COMPATIBLE_IOCTL(TIOCLINUX) COMPATIBLE_IOCTL(TIOCSBRK) COMPATIBLE_IOCTL(TIOCCBRK) -ULONG_IOCTL(TIOCMIWAIT) COMPATIBLE_IOCTL(TIOCGICOUNT) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) @@ -1288,7 +957,6 @@ COMPATIBLE_IOCTL(TIOCSTI) COMPATIBLE_IOCTL(TIOCOUTQ) COMPATIBLE_IOCTL(TIOCSPGRP) COMPATIBLE_IOCTL(TIOCGPGRP) -ULONG_IOCTL(TIOCSCTTY) COMPATIBLE_IOCTL(TIOCGPTN) COMPATIBLE_IOCTL(TIOCSPTLCK) COMPATIBLE_IOCTL(TIOCSERGETLSR) @@ -1311,44 +979,11 @@ COMPATIBLE_IOCTL(FIGETBSZ) /* 'X' - originally XFS but some now in the VFS */ COMPATIBLE_IOCTL(FIFREEZE) COMPATIBLE_IOCTL(FITHAW) -/* RAID */ -COMPATIBLE_IOCTL(RAID_VERSION) -COMPATIBLE_IOCTL(GET_ARRAY_INFO) -COMPATIBLE_IOCTL(GET_DISK_INFO) -COMPATIBLE_IOCTL(PRINT_RAID_DEBUG) -COMPATIBLE_IOCTL(RAID_AUTORUN) -COMPATIBLE_IOCTL(CLEAR_ARRAY) -COMPATIBLE_IOCTL(ADD_NEW_DISK) -ULONG_IOCTL(HOT_REMOVE_DISK) -COMPATIBLE_IOCTL(SET_ARRAY_INFO) -COMPATIBLE_IOCTL(SET_DISK_INFO) -COMPATIBLE_IOCTL(WRITE_RAID_INFO) -COMPATIBLE_IOCTL(UNPROTECT_ARRAY) -COMPATIBLE_IOCTL(PROTECT_ARRAY) -ULONG_IOCTL(HOT_ADD_DISK) -ULONG_IOCTL(SET_DISK_FAULTY) -COMPATIBLE_IOCTL(RUN_ARRAY) -COMPATIBLE_IOCTL(STOP_ARRAY) -COMPATIBLE_IOCTL(STOP_ARRAY_RO) -COMPATIBLE_IOCTL(RESTART_ARRAY_RW) -COMPATIBLE_IOCTL(GET_BITMAP_FILE) -ULONG_IOCTL(SET_BITMAP_FILE) -/* Big K */ -COMPATIBLE_IOCTL(PIO_FONT) -COMPATIBLE_IOCTL(GIO_FONT) -COMPATIBLE_IOCTL(PIO_CMAP) -COMPATIBLE_IOCTL(GIO_CMAP) -ULONG_IOCTL(KDSIGACCEPT) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) -ULONG_IOCTL(KIOCSOUND) -ULONG_IOCTL(KDMKTONE) COMPATIBLE_IOCTL(KDGKBTYPE) -ULONG_IOCTL(KDSETMODE) COMPATIBLE_IOCTL(KDGETMODE) -ULONG_IOCTL(KDSKBMODE) COMPATIBLE_IOCTL(KDGKBMODE) -ULONG_IOCTL(KDSKBMETA) COMPATIBLE_IOCTL(KDGKBMETA) COMPATIBLE_IOCTL(KDGKBENT) COMPATIBLE_IOCTL(KDSKBENT) @@ -1358,15 +993,7 @@ COMPATIBLE_IOCTL(KDGKBDIACR) COMPATIBLE_IOCTL(KDSKBDIACR) COMPATIBLE_IOCTL(KDKBDREP) COMPATIBLE_IOCTL(KDGKBLED) -ULONG_IOCTL(KDSKBLED) COMPATIBLE_IOCTL(KDGETLED) -ULONG_IOCTL(KDSETLED) -COMPATIBLE_IOCTL(GIO_SCRNMAP) -COMPATIBLE_IOCTL(PIO_SCRNMAP) -COMPATIBLE_IOCTL(GIO_UNISCRNMAP) -COMPATIBLE_IOCTL(PIO_UNISCRNMAP) -COMPATIBLE_IOCTL(PIO_FONTRESET) -COMPATIBLE_IOCTL(PIO_UNIMAPCLR) #ifdef CONFIG_BLOCK /* Big S */ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) @@ -1378,20 +1005,9 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) #endif -/* Big V */ -COMPATIBLE_IOCTL(VT_SETMODE) -COMPATIBLE_IOCTL(VT_GETMODE) -COMPATIBLE_IOCTL(VT_GETSTATE) -COMPATIBLE_IOCTL(VT_OPENQRY) -ULONG_IOCTL(VT_ACTIVATE) -ULONG_IOCTL(VT_WAITACTIVE) -ULONG_IOCTL(VT_RELDISP) -ULONG_IOCTL(VT_DISALLOCATE) -COMPATIBLE_IOCTL(VT_RESIZE) -COMPATIBLE_IOCTL(VT_RESIZEX) -COMPATIBLE_IOCTL(VT_LOCKSWITCH) -COMPATIBLE_IOCTL(VT_UNLOCKSWITCH) -COMPATIBLE_IOCTL(VT_GETHIFONTMASK) +/* Big V (don't complain on serial console) */ +IGNORE_IOCTL(VT_OPENQRY) +IGNORE_IOCTL(VT_GETMODE) /* Little p (/dev/rtc, /dev/envctrl, etc.) */ COMPATIBLE_IOCTL(RTC_AIE_ON) COMPATIBLE_IOCTL(RTC_AIE_OFF) @@ -1420,11 +1036,12 @@ COMPATIBLE_IOCTL(MTIOCTOP) /* Socket level stuff */ COMPATIBLE_IOCTL(FIOQSIZE) #ifdef CONFIG_BLOCK +/* loop */ +IGNORE_IOCTL(LOOP_CLR_FD) /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) COMPATIBLE_IOCTL(SG_EMULATED_HOST) -ULONG_IOCTL(SG_SET_TRANSFORM) COMPATIBLE_IOCTL(SG_GET_TRANSFORM) COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) @@ -1478,8 +1095,6 @@ COMPATIBLE_IOCTL(PPPIOCGCHAN) /* PPPOX */ COMPATIBLE_IOCTL(PPPOEIOCSFWD) COMPATIBLE_IOCTL(PPPOEIOCDFWD) -/* LP */ -COMPATIBLE_IOCTL(LPGETSTATUS) /* ppdev */ COMPATIBLE_IOCTL(PPSETMODE) COMPATIBLE_IOCTL(PPRSTATUS) @@ -1661,8 +1276,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) COMPATIBLE_IOCTL(OSS_GETVERSION) /* AUTOFS */ -ULONG_IOCTL(AUTOFS_IOC_READY) -ULONG_IOCTL(AUTOFS_IOC_FAIL) COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC) COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER) COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) @@ -1755,30 +1368,11 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* USB */ -COMPATIBLE_IOCTL(USBDEVFS_RESETEP) -COMPATIBLE_IOCTL(USBDEVFS_SETINTERFACE) -COMPATIBLE_IOCTL(USBDEVFS_SETCONFIGURATION) -COMPATIBLE_IOCTL(USBDEVFS_GETDRIVER) -COMPATIBLE_IOCTL(USBDEVFS_DISCARDURB) -COMPATIBLE_IOCTL(USBDEVFS_CLAIMINTERFACE) -COMPATIBLE_IOCTL(USBDEVFS_RELEASEINTERFACE) -COMPATIBLE_IOCTL(USBDEVFS_CONNECTINFO) -COMPATIBLE_IOCTL(USBDEVFS_HUB_PORTINFO) -COMPATIBLE_IOCTL(USBDEVFS_RESET) -COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32) -COMPATIBLE_IOCTL(USBDEVFS_REAPURB32) -COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32) -COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT) /* NBD */ -ULONG_IOCTL(NBD_SET_SOCK) -ULONG_IOCTL(NBD_SET_BLKSIZE) -ULONG_IOCTL(NBD_SET_SIZE) COMPATIBLE_IOCTL(NBD_DO_IT) COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) COMPATIBLE_IOCTL(NBD_CLEAR_QUE) COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) -ULONG_IOCTL(NBD_SET_SIZE_BLOCKS) COMPATIBLE_IOCTL(NBD_DISCONNECT) /* i2c */ COMPATIBLE_IOCTL(I2C_SLAVE) @@ -1878,42 +1472,6 @@ COMPATIBLE_IOCTL(JSIOCGAXES) COMPATIBLE_IOCTL(JSIOCGBUTTONS) COMPATIBLE_IOCTL(JSIOCGNAME(0)) -/* now things that need handlers */ -#ifdef CONFIG_BLOCK -HANDLE_IOCTL(SG_IO,sg_ioctl_trans) -HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans) -#endif -HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans) -HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans) -HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans) -HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans) -#ifdef CONFIG_BLOCK -HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans) -HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans) -#endif -#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int) -HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout) -#ifdef CONFIG_VT -HANDLE_IOCTL(PIO_FONTX, do_fontx_ioctl) -HANDLE_IOCTL(GIO_FONTX, do_fontx_ioctl) -HANDLE_IOCTL(PIO_UNIMAP, do_unimap_ioctl) -HANDLE_IOCTL(GIO_UNIMAP, do_unimap_ioctl) -HANDLE_IOCTL(KDFONTOP, do_kdfontop_ioctl) -#endif -/* One SMB ioctl needs translations. */ -#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) -HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid) -/* block stuff */ -#ifdef CONFIG_BLOCK -/* loop */ -IGNORE_IOCTL(LOOP_CLR_FD) -/* Raw devices */ -HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) -HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) -#endif -/* Serial */ -HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl) -HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl) #ifdef TIOCGLTC COMPATIBLE_IOCTL(TIOCGLTC) COMPATIBLE_IOCTL(TIOCSLTC) @@ -1928,39 +1486,6 @@ COMPATIBLE_IOCTL(TIOCSLTC) COMPATIBLE_IOCTL(TIOCSTART) COMPATIBLE_IOCTL(TIOCSTOP) #endif -/* Usbdevfs */ -HANDLE_IOCTL(USBDEVFS_CONTROL32, do_usbdevfs_control) -HANDLE_IOCTL(USBDEVFS_BULK32, do_usbdevfs_bulk) -HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal) -COMPATIBLE_IOCTL(USBDEVFS_IOCTL32) -/* i2c */ -HANDLE_IOCTL(I2C_FUNCS, w_long) -HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl) -HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl) -/* Not implemented in the native kernel */ -HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl) -HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl) -HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl) -HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl) - -/* dvb */ -HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event) -HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture) -HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette) - -/* parport */ -COMPATIBLE_IOCTL(LPTIME) -COMPATIBLE_IOCTL(LPCHAR) -COMPATIBLE_IOCTL(LPABORTOPEN) -COMPATIBLE_IOCTL(LPCAREFUL) -COMPATIBLE_IOCTL(LPWAIT) -COMPATIBLE_IOCTL(LPSETIRQ) -COMPATIBLE_IOCTL(LPGETSTATUS) -COMPATIBLE_IOCTL(LPGETSTATUS) -COMPATIBLE_IOCTL(LPRESET) -/*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/ -COMPATIBLE_IOCTL(LPGETFLAGS) -HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) /* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, but we don't want warnings on other file systems. So declare @@ -1988,12 +1513,108 @@ IGNORE_IOCTL(FBIOGCURSOR32) #endif }; -#define IOCTL_HASHSIZE 256 -static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE]; - -static inline unsigned long ioctl32_hash(unsigned long cmd) +/* + * Convert common ioctl arguments based on their command number + * + * Please do not add any code in here. Instead, implement + * a compat_ioctl operation in the place that handleŃ• the + * ioctl for the native case. + */ +static long do_ioctl_trans(int fd, unsigned int cmd, + unsigned long arg, struct file *file) { - return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE; + void __user *argp = compat_ptr(arg); + + switch (cmd) { + case PPPIOCGIDLE32: + return ppp_gidle(fd, cmd, argp); + case PPPIOCSCOMPRESS32: + return ppp_scompress(fd, cmd, argp); + case PPPIOCSPASS32: + case PPPIOCSACTIVE32: + return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); +#ifdef CONFIG_BLOCK + case SG_IO: + return sg_ioctl_trans(fd, cmd, argp); + case SG_GET_REQUEST_TABLE: + return sg_grt_trans(fd, cmd, argp); + case MTIOCGET32: + case MTIOCPOS32: + return mt_ioctl_trans(fd, cmd, argp); + /* Raw devices */ + case RAW_SETBIND: + case RAW_GETBIND: + return raw_ioctl(fd, cmd, argp); +#endif +#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int) + case AUTOFS_IOC_SETTIMEOUT32: + return ioc_settimeout(fd, cmd, argp); + /* One SMB ioctl needs translations. */ +#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) + case SMB_IOC_GETMOUNTUID_32: + return do_smb_getmountuid(fd, cmd, argp); + /* Serial */ + case TIOCGSERIAL: + case TIOCSSERIAL: + return serial_struct_ioctl(fd, cmd, argp); + /* i2c */ + case I2C_FUNCS: + return w_long(fd, cmd, argp); + case I2C_RDWR: + return do_i2c_rdwr_ioctl(fd, cmd, argp); + case I2C_SMBUS: + return do_i2c_smbus_ioctl(fd, cmd, argp); + /* Not implemented in the native kernel */ + case RTC_IRQP_READ32: + case RTC_IRQP_SET32: + case RTC_EPOCH_READ32: + case RTC_EPOCH_SET32: + return rtc_ioctl(fd, cmd, argp); + + /* dvb */ + case VIDEO_GET_EVENT: + return do_video_get_event(fd, cmd, argp); + case VIDEO_STILLPICTURE: + return do_video_stillpicture(fd, cmd, argp); + case VIDEO_SET_SPU_PALETTE: + return do_video_set_spu_palette(fd, cmd, argp); + } + + /* + * These take an integer instead of a pointer as 'arg', + * so we must not do a compat_ptr() translation. + */ + switch (cmd) { + /* Big T */ + case TCSBRKP: + case TIOCMIWAIT: + case TIOCSCTTY: + /* RAID */ + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* Big K */ + case KDSIGACCEPT: + case KIOCSOUND: + case KDMKTONE: + case KDSETMODE: + case KDSKBMODE: + case KDSKBMETA: + case KDSKBLED: + case KDSETLED: + /* AUTOFS */ + case AUTOFS_IOC_READY: + case AUTOFS_IOC_FAIL: + /* NBD */ + case NBD_SET_SOCK: + case NBD_SET_BLKSIZE: + case NBD_SET_SIZE: + case NBD_SET_SIZE_BLOCKS: + return do_vfs_ioctl(file, fd, cmd, arg); + } + + return -ENOIOCTLCMD; } static void compat_ioctl_error(struct file *filp, unsigned int fd, @@ -2025,12 +1646,33 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd, free_page((unsigned long)path); } +static int compat_ioctl_check_table(unsigned int xcmd) +{ + int i; + const int max = ARRAY_SIZE(ioctl_pointer) - 1; + + BUILD_BUG_ON(max >= (1 << 16)); + + /* guess initial offset into table, assuming a + normalized distribution */ + i = ((xcmd >> 16) * max) >> 16; + + /* do linear search up first, until greater or equal */ + while (ioctl_pointer[i] < xcmd && i < max) + i++; + + /* then do linear search down */ + while (ioctl_pointer[i] > xcmd && i > 0) + i--; + + return ioctl_pointer[i] == xcmd; +} + asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { struct file *filp; int error = -EBADF; - struct ioctl_trans *t; int fput_needed; filp = fget_light(fd, &fput_needed); @@ -2058,7 +1700,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: - error = compat_ioctl_preallocate(filp, arg); + error = compat_ioctl_preallocate(filp, compat_ptr(arg)); goto out_fput; #else case FS_IOC_RESVSP: @@ -2087,12 +1729,11 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, break; } - for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) { - if (t->cmd == cmd) - goto found_handler; - } + if (compat_ioctl_check_table(XFORM(cmd))) + goto found_handler; - { + error = do_ioctl_trans(fd, cmd, arg, filp); + if (error == -ENOIOCTLCMD) { static int count; if (++count <= 50) @@ -2103,13 +1744,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, goto out_fput; found_handler: - if (t->handler) { - lock_kernel(); - error = t->handler(fd, cmd, arg, filp); - unlock_kernel(); - goto out_fput; - } - + arg = (unsigned long)compat_ptr(arg); do_ioctl: error = do_vfs_ioctl(filp, fd, cmd, arg); out_fput: @@ -2118,35 +1753,22 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, return error; } -static void ioctl32_insert_translation(struct ioctl_trans *trans) +static int __init init_sys32_ioctl_cmp(const void *p, const void *q) { - unsigned long hash; - struct ioctl_trans *t; - - hash = ioctl32_hash (trans->cmd); - if (!ioctl32_hash_table[hash]) - ioctl32_hash_table[hash] = trans; - else { - t = ioctl32_hash_table[hash]; - while (t->next) - t = t->next; - trans->next = NULL; - t->next = trans; - } + unsigned int a, b; + a = *(unsigned int *)p; + b = *(unsigned int *)q; + if (a > b) + return 1; + if (a < b) + return -1; + return 0; } static int __init init_sys32_ioctl(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(ioctl_start); i++) { - if (ioctl_start[i].next) { - printk("ioctl translation %d bad\n",i); - return -1; - } - - ioctl32_insert_translation(&ioctl_start[i]); - } + sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), + init_sys32_ioctl_cmp, NULL); return 0; } __initcall(init_sys32_ioctl); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index c8afa6b1d91d..32a5f46b1157 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -121,8 +121,10 @@ static int get_target(const char *symname, struct path *path, ret = -ENOENT; path_put(path); } - } else + } else { ret = -EPERM; + path_put(path); + } } return ret; diff --git a/fs/dcache.c b/fs/dcache.c index a100fa35a48f..953173a293a9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -978,6 +978,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) q.hash = full_name_hash(q.name, q.len); return d_alloc(parent, &q); } +EXPORT_SYMBOL(d_alloc_name); /* the caller must hold dcache_lock */ static void __d_instantiate(struct dentry *dentry, struct inode *inode) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 0d23b52dd22c..274ac865bae8 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -32,7 +32,9 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) +static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev, + void *data, const struct file_operations *fops) + { struct inode *inode = new_inode(sb); @@ -44,14 +46,18 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d init_special_inode(inode, mode, dev); break; case S_IFREG: - inode->i_fop = &debugfs_file_operations; + inode->i_fop = fops ? fops : &debugfs_file_operations; + inode->i_private = data; break; case S_IFLNK: inode->i_op = &debugfs_link_operations; + inode->i_fop = fops; + inode->i_private = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode->i_fop = fops ? fops : &simple_dir_operations; + inode->i_private = data; /* directory inodes start off with i_nlink == 2 * (for "." entry) */ @@ -64,7 +70,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d /* SMP-safe */ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t dev) + int mode, dev_t dev, void *data, + const struct file_operations *fops) { struct inode *inode; int error = -EPERM; @@ -72,7 +79,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, if (dentry->d_inode) return -EEXIST; - inode = debugfs_get_inode(dir->i_sb, mode, dev); + inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops); if (inode) { d_instantiate(dentry, inode); dget(dentry); @@ -81,12 +88,13 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, return error; } -static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) { int res; mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; - res = debugfs_mknod(dir, dentry, mode, 0); + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); if (!res) { inc_nlink(dir); fsnotify_mkdir(dir, dentry); @@ -94,18 +102,20 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return res; } -static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode) +static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) { mode = (mode & S_IALLUGO) | S_IFLNK; - return debugfs_mknod(dir, dentry, mode, 0); + return debugfs_mknod(dir, dentry, mode, 0, data, fops); } -static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode) +static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode, + void *data, const struct file_operations *fops) { int res; mode = (mode & S_IALLUGO) | S_IFREG; - res = debugfs_mknod(dir, dentry, mode, 0); + res = debugfs_mknod(dir, dentry, mode, 0, data, fops); if (!res) fsnotify_create(dir, dentry); return res; @@ -139,7 +149,9 @@ static struct file_system_type debug_fs_type = { static int debugfs_create_by_name(const char *name, mode_t mode, struct dentry *parent, - struct dentry **dentry) + struct dentry **dentry, + void *data, + const struct file_operations *fops) { int error = 0; @@ -148,15 +160,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * block. A pointer to that is in the struct vfsmount that we * have around. */ - if (!parent) { - if (debugfs_mount && debugfs_mount->mnt_sb) { - parent = debugfs_mount->mnt_sb->s_root; - } - } - if (!parent) { - pr_debug("debugfs: Ah! can not find a parent!\n"); - return -EFAULT; - } + if (!parent) + parent = debugfs_mount->mnt_sb->s_root; *dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); @@ -164,13 +169,16 @@ static int debugfs_create_by_name(const char *name, mode_t mode, if (!IS_ERR(*dentry)) { switch (mode & S_IFMT) { case S_IFDIR: - error = debugfs_mkdir(parent->d_inode, *dentry, mode); + error = debugfs_mkdir(parent->d_inode, *dentry, mode, + data, fops); break; case S_IFLNK: - error = debugfs_link(parent->d_inode, *dentry, mode); + error = debugfs_link(parent->d_inode, *dentry, mode, + data, fops); break; default: - error = debugfs_create(parent->d_inode, *dentry, mode); + error = debugfs_create(parent->d_inode, *dentry, mode, + data, fops); break; } dput(*dentry); @@ -221,19 +229,13 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode, if (error) goto exit; - error = debugfs_create_by_name(name, mode, parent, &dentry); + error = debugfs_create_by_name(name, mode, parent, &dentry, + data, fops); if (error) { dentry = NULL; simple_release_fs(&debugfs_mount, &debugfs_mount_count); goto exit; } - - if (dentry->d_inode) { - if (data) - dentry->d_inode->i_private = data; - if (fops) - dentry->d_inode->i_fop = fops; - } exit: return dentry; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index d5f8c96964be..8882ecc0f1bf 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -517,11 +517,23 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) { + struct dentry *dentry; + struct tty_struct *tty; + BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + /* Ensure dentry has not been deleted by devpts_pty_kill() */ + dentry = d_find_alias(pts_inode); + if (!dentry) + return NULL; + + tty = NULL; if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - return (struct tty_struct *)pts_inode->i_private; - return NULL; + tty = (struct tty_struct *)pts_inode->i_private; + + dput(dentry); + + return tty; } void devpts_pty_kill(struct tty_struct *tty) diff --git a/fs/direct-io.c b/fs/direct-io.c index b912270942fa..e82adc2debb7 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -53,13 +53,6 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. - * - * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. - * This determines whether we need to do the fancy locking which prevents - * direct-IO from being able to read uninitialised disk blocks. If its zero - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is - * not held for the entire direct write (taken briefly, initially, during a - * direct read though, but its never held for the duration of a direct-IO). */ struct dio { @@ -68,7 +61,7 @@ struct dio { struct inode *inode; int rw; loff_t i_size; /* i_size when submitted */ - int lock_type; /* doesn't change */ + int flags; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -104,6 +97,18 @@ struct dio { unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ sector_t cur_page_block; /* Where it starts */ + /* BIO completion state */ + spinlock_t bio_lock; /* protects BIO fields below */ + unsigned long refcount; /* direct_io_worker() and bios */ + struct bio *bio_list; /* singly linked via bi_private */ + struct task_struct *waiter; /* waiting task (NULL if none) */ + + /* AIO related stuff */ + struct kiocb *iocb; /* kiocb */ + int is_async; /* is IO async ? */ + int io_error; /* IO error in completion path */ + ssize_t result; /* IO result */ + /* * Page fetching state. These variables belong to dio_refill_pages(). */ @@ -115,22 +120,16 @@ struct dio { * Page queue. These variables belong to dio_refill_pages() and * dio_get_page(). */ - struct page *pages[DIO_PAGES]; /* page buffer */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ int page_errors; /* errno from get_user_pages() */ - /* BIO completion state */ - spinlock_t bio_lock; /* protects BIO fields below */ - unsigned long refcount; /* direct_io_worker() and bios */ - struct bio *bio_list; /* singly linked via bi_private */ - struct task_struct *waiter; /* waiting task (NULL if none) */ - - /* AIO related stuff */ - struct kiocb *iocb; /* kiocb */ - int is_async; /* is IO async ? */ - int io_error; /* IO error in completion path */ - ssize_t result; /* IO result */ + /* + * pages[] (and any fields placed after it) are not zeroed out at + * allocation time. Don't add new fields after pages[] unless you + * wish that they not be zeroed. + */ + struct page *pages[DIO_PAGES]; /* page buffer */ }; /* @@ -240,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (dio->end_io && dio->result) dio->end_io(dio->iocb, offset, transferred, dio->map_bh.b_private); - if (dio->lock_type == DIO_LOCKING) + + if (dio->flags & DIO_LOCKING) /* lockdep: non-owner release */ up_read_non_owner(&dio->inode->i_alloc_sem); @@ -515,21 +515,24 @@ static int get_more_blocks(struct dio *dio) map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; + /* + * For writes inside i_size on a DIO_SKIP_HOLES filesystem we + * forbid block creations: only overwrites are permitted. + * We will return early to the caller once we see an + * unmapped buffer head returned, and the caller will fall + * back to buffered I/O. + * + * Otherwise the decision is left to the get_blocks method, + * which may decide to handle it or also return an unmapped + * buffer head. + */ create = dio->rw & WRITE; - if (dio->lock_type == DIO_LOCKING) { + if (dio->flags & DIO_SKIP_HOLES) { if (dio->block_in_file < (i_size_read(dio->inode) >> dio->blkbits)) create = 0; - } else if (dio->lock_type == DIO_NO_LOCKING) { - create = 0; } - /* - * For writes inside i_size we forbid block creations: only - * overwrites are permitted. We fall back to buffered writes - * at a higher level for inside-i_size block-instantiating - * writes. - */ ret = (*dio->get_block)(dio->inode, fs_startblk, map_bh, create); } @@ -1039,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * we can let i_mutex go now that its achieved its purpose * of protecting us from looking up uninitialized blocks. */ - if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) + if (rw == READ && (dio->flags & DIO_LOCKING)) mutex_unlock(&dio->inode->i_mutex); /* @@ -1086,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, /* * This is a library function for use by filesystem drivers. - * The locking rules are governed by the dio_lock_type parameter. * - * DIO_NO_LOCKING (no locking, for raw block device access) - * For writes, i_mutex is not held on entry; it is never taken. + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. + * For writes this function is called under i_mutex and returns with + * i_mutex held, for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. + * For reads and writes i_alloc_sem is taken in shared mode and released + * on I/O completion (which may happen asynchronously after returning to + * the caller). * - * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_mutex and return with i_mutex held, even - * though it is internally dropped. - * For reads, i_mutex is not held on entry, but it is taken and dropped before - * returning. - * - * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of - * uninitialised data, allowing parallel direct readers and writers) - * For writes we are called without i_mutex, return without it, never touch it. - * For reads we are called under i_mutex and return with i_mutex held, even - * though it may be internally dropped. - * - * Additional i_alloc_sem locking requirements described inline below. + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * For reads and writes both i_mutex and i_alloc_sem are not held on + * entry and are never taken. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int dio_lock_type) + int flags) { int seg; size_t size; @@ -1120,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int release_i_mutex = 0; - int acquire_i_mutex = 0; if (rw & WRITE) rw = WRITE_ODIRECT_PLUG; @@ -1151,48 +1150,41 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } } - dio = kzalloc(sizeof(*dio), GFP_KERNEL); + dio = kmalloc(sizeof(*dio), GFP_KERNEL); retval = -ENOMEM; if (!dio) goto out; - /* - * For block device access DIO_NO_LOCKING is used, - * neither readers nor writers do any locking at all - * For regular files using DIO_LOCKING, - * readers need to grab i_mutex and i_alloc_sem - * writers need to grab i_alloc_sem only (i_mutex is already held) - * For regular files using DIO_OWN_LOCKING, - * neither readers nor writers take any locks here + * Believe it or not, zeroing out the page array caused a .5% + * performance regression in a database benchmark. So, we take + * care to only zero out what's needed. */ - dio->lock_type = dio_lock_type; - if (dio_lock_type != DIO_NO_LOCKING) { + memset(dio, 0, offsetof(struct dio, pages)); + + dio->flags = flags; + if (dio->flags & DIO_LOCKING) { /* watch out for a 0 len io from a tricksy fs */ if (rw == READ && end > offset) { - struct address_space *mapping; + struct address_space *mapping = + iocb->ki_filp->f_mapping; - mapping = iocb->ki_filp->f_mapping; - if (dio_lock_type != DIO_OWN_LOCKING) { - mutex_lock(&inode->i_mutex); - release_i_mutex = 1; - } + /* will be released by direct_io_worker */ + mutex_lock(&inode->i_mutex); retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) { + mutex_unlock(&inode->i_mutex); kfree(dio); goto out; } - - if (dio_lock_type == DIO_OWN_LOCKING) { - mutex_unlock(&inode->i_mutex); - acquire_i_mutex = 1; - } } - if (dio_lock_type == DIO_LOCKING) - /* lockdep: not the owner will release it */ - down_read_non_owner(&inode->i_alloc_sem); + /* + * Will be released at I/O completion, possibly in a + * different thread. + */ + down_read_non_owner(&inode->i_alloc_sem); } /* @@ -1210,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again for DIO_LOCKING. - * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by - * it's own meaner. + * + * NOTE: filesystems with their own locking have to handle this + * on their own. */ - if (unlikely(retval < 0 && (rw & WRITE))) { - loff_t isize = i_size_read(inode); - - if (end > isize && dio_lock_type == DIO_LOCKING) - vmtruncate(inode, isize); + if (flags & DIO_LOCKING) { + if (unlikely((rw & WRITE) && retval < 0)) { + loff_t isize = i_size_read(inode); + if (end > isize) + vmtruncate(inode, isize); + } } - if (rw == READ && dio_lock_type == DIO_LOCKING) - release_i_mutex = 0; - out: - if (release_i_mutex) - mutex_unlock(&inode->i_mutex); - else if (acquire_i_mutex) - mutex_lock(&inode->i_mutex); return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index fd9859f92fad..0df243850818 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -410,10 +410,10 @@ static struct config_group *make_cluster(struct config_group *g, struct dlm_comms *cms = NULL; void *gps = NULL; - cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL); - gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); - sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL); - cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL); + cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); + gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS); + sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); + cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); if (!cl || !gps || !sps || !cms) goto fail; @@ -482,9 +482,9 @@ static struct config_group *make_space(struct config_group *g, const char *name) struct dlm_nodes *nds = NULL; void *gps = NULL; - sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL); - gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); - nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL); + sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); + gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS); + nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); if (!sp || !gps || !nds) goto fail; @@ -536,7 +536,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name) { struct dlm_comm *cm; - cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL); + cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); if (!cm) return ERR_PTR(-ENOMEM); @@ -569,7 +569,7 @@ static struct config_item *make_node(struct config_group *g, const char *name) struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); struct dlm_node *nd; - nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); + nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); if (!nd) return ERR_PTR(-ENOMEM); @@ -705,7 +705,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) if (cm->addr_count >= DLM_MAX_ADDR_COUNT) return -ENOSPC; - addr = kzalloc(sizeof(*addr), GFP_KERNEL); + addr = kzalloc(sizeof(*addr), GFP_NOFS); if (!addr) return -ENOMEM; @@ -868,7 +868,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, ids_count = sp->members_count; - ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL); + ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); if (!ids) { rv = -ENOMEM; goto out; @@ -886,7 +886,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, if (!new_count) goto out_ids; - new = kcalloc(new_count, sizeof(int), GFP_KERNEL); + new = kcalloc(new_count, sizeof(int), GFP_NOFS); if (!new) { kfree(ids); rv = -ENOMEM; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 1c8bb8c3a82e..375a2359b3bf 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -404,7 +404,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) if (bucket >= ls->ls_rsbtbl_size) return NULL; - ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL); + ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS); if (!ri) return NULL; if (n == 0) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index c4dfa1dcc86f..7b84c1dbc82e 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) spin_unlock(&ls->ls_recover_list_lock); if (!found) - de = kzalloc(sizeof(struct dlm_direntry) + len, - ls->ls_allocation); + de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS); return de; } @@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls *ls) dlm_dir_clear(ls); - last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation); + last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS); if (!last_name) goto out; @@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, if (namelen > DLM_RESNAME_MAXLEN) return -EINVAL; - de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation); + de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS); if (!de) return -ENOMEM; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index d01ca0a711db..826d3dc6e0ab 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -473,7 +473,6 @@ struct dlm_ls { int ls_low_nodeid; int ls_total_weight; int *ls_node_array; - gfp_t ls_allocation; struct dlm_rsb ls_stub_rsb; /* for returning errors */ struct dlm_lkb ls_stub_lkb; /* for returning errors */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index eb507c453c5f..9c0c1db1e105 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2689,7 +2689,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len, pass into lowcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; @@ -4512,7 +4512,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } if (flags & DLM_LKF_VALBLK) { - ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); if (!ua->lksb.sb_lvbptr) { kfree(ua); __put_lkb(ls, lkb); @@ -4582,7 +4582,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua = lkb->lkb_ua; if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { - ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); + ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); if (!ua->lksb.sb_lvbptr) { error = -ENOMEM; goto out_put; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d489fcc86713..c010ecfc0d29 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -430,7 +430,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, error = -ENOMEM; - ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); + ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); if (!ls) goto out; memcpy(ls->ls_name, name, namelen); @@ -443,11 +443,6 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, if (flags & DLM_LSFL_TIMEWARN) set_bit(LSFL_TIMEWARN, &ls->ls_flags); - if (flags & DLM_LSFL_FS) - ls->ls_allocation = GFP_NOFS; - else - ls->ls_allocation = GFP_KERNEL; - /* ls_exflags are forced to match among nodes, and we don't need to require all nodes to have some flags set */ ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | @@ -456,7 +451,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; - ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS); if (!ls->ls_rsbtbl) goto out_lsfree; for (i = 0; i < size; i++) { @@ -468,7 +463,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, size = dlm_config.ci_lkbtbl_size; ls->ls_lkbtbl_size = size; - ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS); if (!ls->ls_lkbtbl) goto out_rsbfree; for (i = 0; i < size; i++) { @@ -480,7 +475,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, size = dlm_config.ci_dirtbl_size; ls->ls_dirtbl_size = size; - ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS); if (!ls->ls_dirtbl) goto out_lkbfree; for (i = 0; i < size; i++) { @@ -527,7 +522,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); - ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); + ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); if (!ls->ls_recover_buf) goto out_dirfree; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 70736eb4b516..52cab160893c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1060,7 +1060,7 @@ static void init_local(void) if (dlm_our_addr(&sas, i)) break; - addr = kmalloc(sizeof(*addr), GFP_KERNEL); + addr = kmalloc(sizeof(*addr), GFP_NOFS); if (!addr) break; memcpy(addr, &sas, sizeof(*addr)); @@ -1099,7 +1099,7 @@ static int sctp_listen_for_all(void) struct sockaddr_storage localaddr; struct sctp_event_subscribe subscribe; int result = -EINVAL, num = 1, i, addr_len; - struct connection *con = nodeid2con(0, GFP_KERNEL); + struct connection *con = nodeid2con(0, GFP_NOFS); int bufsize = NEEDED_RMEM; if (!con) @@ -1171,7 +1171,7 @@ out: static int tcp_listen_for_all(void) { struct socket *sock = NULL; - struct connection *con = nodeid2con(0, GFP_KERNEL); + struct connection *con = nodeid2con(0, GFP_NOFS); int result = -EINVAL; if (!con) diff --git a/fs/dlm/member.c b/fs/dlm/member.c index b128775913b2..84f70bfb0baf 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid) struct dlm_member *memb; int w, error; - memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); + memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); if (!memb) return -ENOMEM; @@ -143,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls) ls->ls_total_weight = total; - array = kmalloc(sizeof(int) * total, ls->ls_allocation); + array = kmalloc(sizeof(int) * total, GFP_NOFS); if (!array) return; @@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) continue; log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); - memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); + memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); if (!memb) return -ENOMEM; memb->nodeid = rv->new[i]; @@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls) int *ids = NULL, *new = NULL; int error, ids_count = 0, new_count = 0; - rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation); + rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); if (!rv) return -ENOMEM; diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index c1775b84ebab..8e0d00db004f 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls) { char *p; - p = kzalloc(ls->ls_lvblen, ls->ls_allocation); + p = kzalloc(ls->ls_lvblen, GFP_NOFS); return p; } @@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); - r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation); + r = kzalloc(sizeof(*r) + namelen, GFP_NOFS); return r; } @@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) { struct dlm_lkb *lkb; - lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation); + lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS); return lkb; } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 55ea369f43a9..052095cd592f 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -26,7 +26,7 @@ static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) struct sk_buff *skb; void *data; - skb = genlmsg_new(size, GFP_KERNEL); + skb = genlmsg_new(size, GFP_NOFS); if (!skb) return -ENOMEM; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index b540aa5d1f61..b5f89aef3b29 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -82,7 +82,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (!ls) return -EINVAL; - xop = kzalloc(sizeof(*xop), GFP_KERNEL); + xop = kzalloc(sizeof(*xop), GFP_NOFS); if (!xop) { rv = -ENOMEM; goto out; @@ -211,7 +211,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (!ls) return -EINVAL; - op = kzalloc(sizeof(*op), GFP_KERNEL); + op = kzalloc(sizeof(*op), GFP_NOFS); if (!op) { rv = -ENOMEM; goto out; @@ -266,7 +266,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (!ls) return -EINVAL; - op = kzalloc(sizeof(*op), GFP_KERNEL); + op = kzalloc(sizeof(*op), GFP_NOFS); if (!op) { rv = -ENOMEM; goto out; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 67522c268c14..3c83a49a48a3 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, char *mb; int mb_len = sizeof(struct dlm_rcom) + len; - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); if (!mh) { log_print("create_rcom to %d type %d len %d ENOBUFS", to_nodeid, type, len); diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 7a2307c08911..a44fa22890e1 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) struct rq_entry *e; int length = ms->m_header.h_length - sizeof(struct dlm_message); - e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation); + e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS); if (!e) { log_print("dlm_add_requestqueue: out of memory len %d", length); return; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index ebce994ab0b7..e73a4bb572aa 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -267,7 +267,7 @@ static int device_user_lock(struct dlm_user_proc *proc, goto out; } - ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; ua->proc = proc; @@ -307,7 +307,7 @@ static int device_user_unlock(struct dlm_user_proc *proc, if (!ls) return -ENOENT; - ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; ua->proc = proc; @@ -352,7 +352,7 @@ static int dlm_device_register(struct dlm_ls *ls, char *name) error = -ENOMEM; len = strlen(name) + strlen(name_prefix) + 2; - ls->ls_device.name = kzalloc(len, GFP_KERNEL); + ls->ls_device.name = kzalloc(len, GFP_NOFS); if (!ls->ls_device.name) goto fail; @@ -520,7 +520,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; - kbuf = kzalloc(count + 1, GFP_KERNEL); + kbuf = kzalloc(count + 1, GFP_NOFS); if (!kbuf) return -ENOMEM; @@ -546,7 +546,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, /* add 1 after namelen so that the name string is terminated */ kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, - GFP_KERNEL); + GFP_NOFS); if (!kbuf) { kfree(k32buf); return -ENOMEM; @@ -648,7 +648,7 @@ static int device_open(struct inode *inode, struct file *file) if (!ls) return -ENOENT; - proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); + proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS); if (!proc) { dlm_put_lockspace(ls); return -ENOMEM; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index fbb6e5eed697..7cb0a59f4b9d 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1748,7 +1748,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, char *cipher_name, size_t *key_size) { char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; - char *full_alg_name; + char *full_alg_name = NULL; int rc; *key_tfm = NULL; @@ -1763,7 +1763,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, if (rc) goto out; *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); - kfree(full_alg_name); if (IS_ERR(*key_tfm)) { rc = PTR_ERR(*key_tfm); printk(KERN_ERR "Unable to allocate crypto cipher with name " @@ -1786,6 +1785,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, goto out; } out: + kfree(full_alg_name); return rc; } diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 2dda5ade75bc..8f006a0d6076 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -62,7 +62,7 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *lower_inode = ecryptfs_inode_to_lower(dentry->d_inode); - fsstack_copy_attr_all(dentry->d_inode, lower_inode, NULL); + fsstack_copy_attr_all(dentry->d_inode, lower_inode); } out: return rc; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 9e944057001b..678172b61be2 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -158,7 +158,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + struct dentry *lower_dentry; struct ecryptfs_file_info *file_info; mount_crypt_stat = &ecryptfs_superblock_to_private( @@ -191,13 +191,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); - if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) - && !(file->f_flags & O_RDONLY)) { - rc = -EPERM; - printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " - "file must hence be opened RO\n", __func__); - goto out; - } if (!ecryptfs_inode_to_private(inode)->lower_file) { rc = ecryptfs_init_persistent_file(ecryptfs_dentry); if (rc) { @@ -208,6 +201,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file) goto out; } } + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) + && !(file->f_flags & O_RDONLY)) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out; + } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { @@ -299,7 +299,6 @@ static int ecryptfs_ioctl(struct inode *inode, struct file *file, const struct file_operations ecryptfs_dir_fops = { .readdir = ecryptfs_readdir, .ioctl = ecryptfs_ioctl, - .mmap = generic_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 056fed62d0de..4a430ab4115c 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -282,7 +282,8 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, goto out; } rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - ecryptfs_dir_inode->i_sb, 1); + ecryptfs_dir_inode->i_sb, + ECRYPTFS_INTERPOSE_FLAG_D_ADD); if (rc) { printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", __func__, rc); @@ -463,9 +464,6 @@ out_lock: unlock_dir(lower_dir_dentry); dput(lower_new_dentry); dput(lower_old_dentry); - d_drop(lower_old_dentry); - d_drop(new_dentry); - d_drop(old_dentry); return rc; } @@ -614,6 +612,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dentry; struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; + struct dentry *trap = NULL; lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); @@ -621,14 +620,24 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, dget(lower_new_dentry); lower_old_dir_dentry = dget_parent(lower_old_dentry); lower_new_dir_dentry = dget_parent(lower_new_dentry); - lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + /* source should not be ancestor of target */ + if (trap == lower_old_dentry) { + rc = -EINVAL; + goto out_lock; + } + /* target should not be ancestor of source */ + if (trap == lower_new_dentry) { + rc = -ENOTEMPTY; + goto out_lock; + } rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, lower_new_dir_dentry->d_inode, lower_new_dentry); if (rc) goto out_lock; - fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode, NULL); + fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); if (new_dir != old_dir) - fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode, NULL); + fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); out_lock: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_new_dentry->d_parent); @@ -715,31 +724,31 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) /* Released in ecryptfs_put_link(); only release here on error */ buf = kmalloc(len, GFP_KERNEL); if (!buf) { - rc = -ENOMEM; + buf = ERR_PTR(-ENOMEM); goto out; } old_fs = get_fs(); set_fs(get_ds()); rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); set_fs(old_fs); - if (rc < 0) - goto out_free; - else + if (rc < 0) { + kfree(buf); + buf = ERR_PTR(rc); + } else buf[rc] = '\0'; - rc = 0; - nd_set_link(nd, buf); - goto out; -out_free: - kfree(buf); out: - return ERR_PTR(rc); + nd_set_link(nd, buf); + return NULL; } static void ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) { - /* Free the char* */ - kfree(nd_get_link(nd)); + char *buf = nd_get_link(nd); + if (!IS_ERR(buf)) { + /* Free the char* */ + kfree(buf); + } } /** @@ -772,18 +781,23 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat, } /** - * ecryptfs_truncate + * truncate_upper * @dentry: The ecryptfs layer dentry - * @new_length: The length to expand the file to + * @ia: Address of the ecryptfs inode's attributes + * @lower_ia: Address of the lower inode's attributes * * Function to handle truncations modifying the size of the file. Note * that the file sizes are interpolated. When expanding, we are simply - * writing strings of 0's out. When truncating, we need to modify the - * underlying file size according to the page index interpolations. + * writing strings of 0's out. When truncating, we truncate the upper + * inode and update the lower_ia according to the page index + * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return, + * the caller must use lower_ia in a call to notify_change() to perform + * the truncation of the lower inode. * * Returns zero on success; non-zero otherwise */ -int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +static int truncate_upper(struct dentry *dentry, struct iattr *ia, + struct iattr *lower_ia) { int rc = 0; struct inode *inode = dentry->d_inode; @@ -794,8 +808,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) loff_t lower_size_before_truncate; loff_t lower_size_after_truncate; - if (unlikely((new_length == i_size))) + if (unlikely((ia->ia_size == i_size))) { + lower_ia->ia_valid &= ~ATTR_SIZE; goto out; + } crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; /* Set up a fake ecryptfs file, this is used to interface with * the file in the underlying filesystem so that the @@ -815,28 +831,30 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) &fake_ecryptfs_file, ecryptfs_inode_to_private(dentry->d_inode)->lower_file); /* Switch on growing or shrinking file */ - if (new_length > i_size) { + if (ia->ia_size > i_size) { char zero[] = { 0x00 }; + lower_ia->ia_valid &= ~ATTR_SIZE; /* Write a single 0 at the last position of the file; * this triggers code that will fill in 0's throughout * the intermediate portion of the previous end of the * file and the new and of the file */ rc = ecryptfs_write(&fake_ecryptfs_file, zero, - (new_length - 1), 1); - } else { /* new_length < i_size_read(inode) */ - /* We're chopping off all the pages down do the page - * in which new_length is located. Fill in the end of - * that page from (new_length & ~PAGE_CACHE_MASK) to + (ia->ia_size - 1), 1); + } else { /* ia->ia_size < i_size_read(inode) */ + /* We're chopping off all the pages down to the page + * in which ia->ia_size is located. Fill in the end of + * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to * PAGE_CACHE_SIZE with zeros. */ size_t num_zeros = (PAGE_CACHE_SIZE - - (new_length & ~PAGE_CACHE_MASK)); + - (ia->ia_size & ~PAGE_CACHE_MASK)); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = vmtruncate(inode, new_length); + rc = vmtruncate(inode, ia->ia_size); if (rc) goto out_free; - rc = vmtruncate(lower_dentry->d_inode, new_length); + lower_ia->ia_size = ia->ia_size; + lower_ia->ia_valid |= ATTR_SIZE; goto out_free; } if (num_zeros) { @@ -848,7 +866,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) goto out_free; } rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, - new_length, num_zeros); + ia->ia_size, num_zeros); kfree(zeros_virt); if (rc) { printk(KERN_ERR "Error attempting to zero out " @@ -857,7 +875,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) goto out_free; } } - vmtruncate(inode, new_length); + vmtruncate(inode, ia->ia_size); rc = ecryptfs_write_inode_size_to_metadata(inode); if (rc) { printk(KERN_ERR "Problem with " @@ -870,10 +888,12 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) lower_size_before_truncate = upper_size_to_lower_size(crypt_stat, i_size); lower_size_after_truncate = - upper_size_to_lower_size(crypt_stat, new_length); - if (lower_size_after_truncate < lower_size_before_truncate) - vmtruncate(lower_dentry->d_inode, - lower_size_after_truncate); + upper_size_to_lower_size(crypt_stat, ia->ia_size); + if (lower_size_after_truncate < lower_size_before_truncate) { + lower_ia->ia_size = lower_size_after_truncate; + lower_ia->ia_valid |= ATTR_SIZE; + } else + lower_ia->ia_valid &= ~ATTR_SIZE; } out_free: if (ecryptfs_file_to_private(&fake_ecryptfs_file)) @@ -883,6 +903,33 @@ out: return rc; } +/** + * ecryptfs_truncate + * @dentry: The ecryptfs layer dentry + * @new_length: The length to expand the file to + * + * Simple function that handles the truncation of an eCryptfs inode and + * its corresponding lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +{ + struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length }; + struct iattr lower_ia = { .ia_valid = 0 }; + int rc; + + rc = truncate_upper(dentry, &ia, &lower_ia); + if (!rc && lower_ia.ia_valid & ATTR_SIZE) { + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = notify_change(lower_dentry, &lower_ia); + mutex_unlock(&lower_dentry->d_inode->i_mutex); + } + return rc; +} + static int ecryptfs_permission(struct inode *inode, int mask) { @@ -905,6 +952,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) { int rc = 0; struct dentry *lower_dentry; + struct iattr lower_ia; struct inode *inode; struct inode *lower_inode; struct ecryptfs_crypt_stat *crypt_stat; @@ -943,15 +991,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } } mutex_unlock(&crypt_stat->cs_mutex); + memcpy(&lower_ia, ia, sizeof(lower_ia)); + if (ia->ia_valid & ATTR_FILE) + lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); if (ia->ia_valid & ATTR_SIZE) { - ecryptfs_printk(KERN_DEBUG, - "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n", - ia->ia_valid, ATTR_SIZE); - rc = ecryptfs_truncate(dentry, ia->ia_size); - /* ecryptfs_truncate handles resizing of the lower file */ - ia->ia_valid &= ~ATTR_SIZE; - ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n", - ia->ia_valid); + rc = truncate_upper(dentry, ia, &lower_ia); if (rc < 0) goto out; } @@ -960,14 +1004,29 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) * mode change is for clearing setuid/setgid bits. Allow lower fs * to interpret this in its own way. */ - if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) - ia->ia_valid &= ~ATTR_MODE; + if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + lower_ia.ia_valid &= ~ATTR_MODE; mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = notify_change(lower_dentry, ia); + rc = notify_change(lower_dentry, &lower_ia); mutex_unlock(&lower_dentry->d_inode->i_mutex); out: - fsstack_copy_attr_all(inode, lower_inode, NULL); + fsstack_copy_attr_all(inode, lower_inode); + return rc; +} + +int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kstat lower_stat; + int rc; + + rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), + ecryptfs_dentry_to_lower(dentry), &lower_stat); + if (!rc) { + generic_fillattr(dentry->d_inode, stat); + stat->blocks = lower_stat.blocks; + } return rc; } @@ -1100,6 +1159,7 @@ const struct inode_operations ecryptfs_dir_iops = { const struct inode_operations ecryptfs_main_iops = { .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr, .setxattr = ecryptfs_setxattr, .getxattr = ecryptfs_getxattr, .listxattr = ecryptfs_listxattr, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index c6ac85d6c701..ea2f92101dfe 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -35,7 +35,6 @@ #include <linux/key.h> #include <linux/parser.h> #include <linux/fs_stack.h> -#include <linux/ima.h> #include "ecryptfs_kernel.h" /** @@ -119,7 +118,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) const struct cred *cred = current_cred(); struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); - int opened_lower_file = 0; int rc = 0; mutex_lock(&inode_info->lower_file_mutex); @@ -136,12 +134,9 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", lower_dentry, lower_mnt, rc); inode_info->lower_file = NULL; - } else - opened_lower_file = 1; + } } mutex_unlock(&inode_info->lower_file_mutex); - if (opened_lower_file) - ima_counts_get(inode_info->lower_file); return rc; } @@ -194,7 +189,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - fsstack_copy_attr_all(inode, lower_inode, NULL); + fsstack_copy_attr_all(inode, lower_inode); /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); @@ -590,8 +585,8 @@ out: * with as much information as it can before needing * the lower filesystem. * ecryptfs_read_super(): this accesses the lower filesystem and uses - * ecryptfs_interpolate to perform most of the linking - * ecryptfs_interpolate(): links the lower filesystem into ecryptfs + * ecryptfs_interpose to perform most of the linking + * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) */ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, diff --git a/fs/eventfd.c b/fs/eventfd.c index 8b47e4200e65..7758cc382ef0 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) return events; } -static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) +static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= *cnt; +} + +/** + * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. + * @ctx: [in] Pointer to eventfd context. + * @wait: [in] Wait queue to be removed. + * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked. + * + * This is used to atomically remove a wait queue entry from the eventfd wait + * queue head, and read/reset the counter value. + */ +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + eventfd_ctx_do_read(ctx, cnt); + __remove_wait_queue(&ctx->wqh, wait); + if (*cnt != 0 && waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return *cnt != 0 ? 0 : -EAGAIN; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); + +/** + * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. + * @ctx: [in] Pointer to eventfd context. + * @no_wait: [in] Different from zero if the operation should not block. + * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked but @no_wait was nonzero. + * -ERESTARTSYS : A signal interrupted the wait operation. + * + * If @no_wait is zero, the function might sleep until the eventfd internal + * counter becomes greater than zero. + */ +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) { - struct eventfd_ctx *ctx = file->private_data; ssize_t res; - __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); - if (count < sizeof(ucnt)) - return -EINVAL; spin_lock_irq(&ctx->wqh.lock); + *cnt = 0; res = -EAGAIN; if (ctx->count > 0) - res = sizeof(ucnt); - else if (!(file->f_flags & O_NONBLOCK)) { + res = 0; + else if (!no_wait) { __add_wait_queue(&ctx->wqh, &wait); - for (res = 0;;) { + for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - res = sizeof(ucnt); + res = 0; break; } if (signal_pending(current)) { @@ -168,18 +213,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (likely(res > 0)) { - ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; - ctx->count -= ucnt; + if (likely(res == 0)) { + eventfd_ctx_do_read(ctx, cnt); if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); - if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) - return -EFAULT; return res; } +EXPORT_SYMBOL_GPL(eventfd_ctx_read); + +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 cnt; + + if (count < sizeof(cnt)) + return -EINVAL; + res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); + if (res < 0) + return res; + + return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt); +} static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -339,7 +398,7 @@ struct file *eventfd_file_create(unsigned int count, int flags) ctx->flags = flags; file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, - flags & EFD_SHARED_FCNTL_FLAGS); + O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) eventfd_free_ctx(ctx); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 366c503f9657..bd056a5b4efc 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1206,7 +1206,7 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * a file structure and a free file descriptor. */ error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, - flags & O_CLOEXEC); + O_RDWR | (flags & O_CLOEXEC)); if (error < 0) ep_free(ep); diff --git a/fs/exec.c b/fs/exec.c index c0c636e34f60..0790a107ff7e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -826,7 +826,9 @@ static int de_thread(struct task_struct *tsk) attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); + list_replace_rcu(&leader->tasks, &tsk->tasks); + list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; @@ -923,6 +925,15 @@ char *get_task_comm(char *buf, struct task_struct *tsk) void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + + /* + * Threads may access current->comm without holding + * the task lock, so write the string carefully. + * Readers without a lock may see incomplete new + * names but are safe from non-terminating string reads. + */ + memset(tsk->comm, 0, TASK_COMM_LEN); + wmb(); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk); @@ -930,9 +941,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) int flush_old_exec(struct linux_binprm * bprm) { - char * name; - int i, ch, retval; - char tcomm[sizeof(current->comm)]; + int retval; /* * Make sure we have a private signal table and that @@ -953,6 +962,25 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ + current->flags &= ~PF_RANDOMIZE; + flush_thread(); + current->personality &= ~bprm->per_clear; + + return 0; + +out: + return retval; +} +EXPORT_SYMBOL(flush_old_exec); + +void setup_new_exec(struct linux_binprm * bprm) +{ + int i, ch; + char * name; + char tcomm[sizeof(current->comm)]; + + arch_pick_mmap_layout(current->mm); + /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; @@ -974,9 +1002,6 @@ int flush_old_exec(struct linux_binprm * bprm) tcomm[i] = '\0'; set_task_comm(current, tcomm); - current->flags &= ~PF_RANDOMIZE; - flush_thread(); - /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc @@ -992,8 +1017,6 @@ int flush_old_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - current->personality &= ~bprm->per_clear; - /* * Flush performance counters when crossing a * security domain: @@ -1008,14 +1031,8 @@ int flush_old_exec(struct linux_binprm * bprm) flush_signal_handlers(current, 0); flush_old_files(current->files); - - return 0; - -out: - return retval; } - -EXPORT_SYMBOL(flush_old_exec); +EXPORT_SYMBOL(setup_new_exec); /* * Prepare credentials and lock ->cred_guard_mutex. @@ -1752,17 +1769,20 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; struct inode * inode; - struct file * file; const struct cred *old_cred; struct cred *cred; int retval = 0; int flag = 0; int ispipe = 0; - unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; char **helper_argv = NULL; int helper_argc = 0; int dump_count = 0; static atomic_t core_dump_count = ATOMIC_INIT(0); + struct coredump_params cprm = { + .signr = signr, + .regs = regs, + .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur, + }; audit_core_dumps(signr); @@ -1818,15 +1838,15 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(corename, signr); unlock_kernel(); - if ((!ispipe) && (core_limit < binfmt->min_coredump)) + if ((!ispipe) && (cprm.limit < binfmt->min_coredump)) goto fail_unlock; if (ispipe) { - if (core_limit == 0) { + if (cprm.limit == 0) { /* * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use - * core_limit of 0 here as a speacial value. Any + * cprm.limit of 0 here as a speacial value. Any * non-zero limit gets set to RLIM_INFINITY below, but * a limit of 0 skips the dump. This is a consistent * way to catch recursive crashes. We can still crash @@ -1859,25 +1879,25 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) goto fail_dropcount; } - core_limit = RLIM_INFINITY; + cprm.limit = RLIM_INFINITY; /* SIGPIPE can happen, but it's just never processed */ if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, - &file)) { + &cprm.file)) { printk(KERN_INFO "Core dump to %s pipe failed\n", corename); goto fail_dropcount; } } else - file = filp_open(corename, + cprm.file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); - if (IS_ERR(file)) + if (IS_ERR(cprm.file)) goto fail_dropcount; - inode = file->f_path.dentry->d_inode; + inode = cprm.file->f_path.dentry->d_inode; if (inode->i_nlink > 1) goto close_fail; /* multiple links - don't dump */ - if (!ispipe && d_unhashed(file->f_path.dentry)) + if (!ispipe && d_unhashed(cprm.file->f_path.dentry)) goto close_fail; /* AK: actually i see no reason to not allow this for named pipes etc., @@ -1890,21 +1910,22 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) */ if (inode->i_uid != current_fsuid()) goto close_fail; - if (!file->f_op) + if (!cprm.file->f_op) goto close_fail; - if (!file->f_op->write) + if (!cprm.file->f_op->write) goto close_fail; - if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0) + if (!ispipe && + do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0) goto close_fail; - retval = binfmt->core_dump(signr, regs, file, core_limit); + retval = binfmt->core_dump(&cprm); if (retval) current->signal->group_exit_code |= 0x80; close_fail: if (ispipe && core_pipe_limit) - wait_for_dump_helpers(file); - filp_close(file, NULL); + wait_for_dump_helpers(cprm.file); + filp_close(cprm.file, NULL); fail_dropcount: if (dump_count) atomic_dec(&core_dump_count); diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index cc2d22db119c..2d0f757fda3e 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -12,5 +12,5 @@ # Kbuild - Gets included from the Kernels Makefile and build system # -exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o +exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/common.h b/fs/exofs/common.h index c6718e4817fe..b1b178e61718 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -49,6 +49,7 @@ #define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ #define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ #define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ +#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */ #define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ /* exofs Application specific page/attribute */ @@ -78,17 +79,67 @@ enum { #define EXOFS_SUPER_MAGIC 0x5DF5 /* - * The file system control block - stored in an object's data (mainly, the one - * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored - * on disk. Right now it just has a magic value, which is basically a sanity - * check on our ability to communicate with the object store. + * The file system control block - stored in object EXOFS_SUPER_ID's data. + * This is where the in-memory superblock is stored on disk. */ +enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; struct exofs_fscb { __le64 s_nextid; /* Highest object ID used */ - __le32 s_numfiles; /* Number of files on fs */ + __le64 s_numfiles; /* Number of files on fs */ + __le32 s_version; /* == EXOFS_FSCB_VER */ __le16 s_magic; /* Magic signature */ __le16 s_newfs; /* Non-zero if this is a new fs */ -}; + + /* From here on it's a static part, only written by mkexofs */ + __le64 s_dev_table_oid; /* Resurved, not used */ + __le64 s_dev_table_count; /* == 0 means no dev_table */ +} __packed; + +/* + * Describes the raid used in the FS. It is part of the device table. + * This here is taken from the pNFS-objects definition. In exofs we + * use one raid policy through-out the filesystem. (NOTE: the funny + * alignment at begining. We take care of it at exofs_device_table. + */ +struct exofs_dt_data_map { + __le32 cb_num_comps; + __le64 cb_stripe_unit; + __le32 cb_group_width; + __le32 cb_group_depth; + __le32 cb_mirror_cnt; + __le32 cb_raid_algorithm; +} __packed; + +/* + * This is an osd device information descriptor. It is a single entry in + * the exofs device table. It describes an osd target lun which + * contains data belonging to this FS. (Same partition_id on all devices) + */ +struct exofs_dt_device_info { + __le32 systemid_len; + u8 systemid[OSD_SYSTEMID_LEN]; + __le64 long_name_offset; /* If !0 then offset-in-file */ + __le32 osdname_len; /* */ + u8 osdname[44]; /* Embbeded, Ususally an asci uuid */ +} __packed; + +/* + * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data. + * It contains the raid used for this multy-device FS and an array of + * participating devices. + */ +struct exofs_device_table { + __le32 dt_version; /* == EXOFS_DT_VER */ + struct exofs_dt_data_map dt_data_map; /* Raid policy to use */ + + /* Resurved space For future use. Total includeing this: + * (8 * sizeof(le64)) + */ + __le64 __Resurved[4]; + + __le64 dt_num_devices; /* Array size */ + struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */ +} __packed; /**************************************************************************** * inode-related things @@ -155,22 +206,4 @@ enum { (((name_len) + offsetof(struct exofs_dir_entry, name) + \ EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) -/************************* - * function declarations * - *************************/ -/* osd.c */ -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], - const struct osd_obj_id *obj); - -int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid); -static inline int exofs_check_ok(struct osd_request *or) -{ - return exofs_check_ok_resid(or, NULL, NULL); -} -int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred); -int exofs_async_op(struct osd_request *or, - osd_req_done_fn *async_done, void *caller_context, u8 *cred); - -int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr); - #endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 5ec72e020b22..c35fd4623986 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -30,13 +30,17 @@ * along with exofs; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef __EXOFS_H__ +#define __EXOFS_H__ #include <linux/fs.h> #include <linux/time.h> #include "common.h" -#ifndef __EXOFS_H__ -#define __EXOFS_H__ +/* FIXME: Remove once pnfs hits mainline + * #include <linux/exportfs/pnfs_osd_xdr.h> + */ +#include "pnfs.h" #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) @@ -55,7 +59,7 @@ * our extension to the in-memory superblock */ struct exofs_sb_info { - struct osd_dev *s_dev; /* returned by get_osd_dev */ + struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ osd_id s_pid; /* partition ID of file system*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ @@ -63,7 +67,11 @@ struct exofs_sb_info { spinlock_t s_next_gen_lock; /* spinlock for gen # update */ u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */ + uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ + + struct pnfs_osd_data_map data_map; /* Default raid to use */ + unsigned s_numdevs; /* Num of devices in array */ + struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */ }; /* @@ -79,6 +87,50 @@ struct exofs_i_info { struct inode vfs_inode; /* normal in-memory inode */ }; +static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) +{ + return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; +} + +struct exofs_io_state; +typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private); + +struct exofs_io_state { + struct kref kref; + + void *private; + exofs_io_done_fn done; + + struct exofs_sb_info *sbi; + struct osd_obj_id obj; + u8 *cred; + + /* Global read/write IO*/ + loff_t offset; + unsigned long length; + void *kern_buff; + struct bio *bio; + + /* Attributes */ + unsigned in_attr_len; + struct osd_attr *in_attr; + unsigned out_attr_len; + struct osd_attr *out_attr; + + /* Variable array of size numdevs */ + unsigned numdevs; + struct exofs_per_dev_state { + struct osd_request *or; + struct bio *bio; + } per_dev[]; +}; + +static inline unsigned exofs_io_state_size(unsigned numdevs) +{ + return sizeof(struct exofs_io_state) + + sizeof(struct exofs_per_dev_state) * numdevs; +} + /* * our inode flags */ @@ -130,6 +182,42 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) /************************* * function declarations * *************************/ + +/* ios.c */ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length); + +int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios); +void exofs_put_io_state(struct exofs_io_state *ios); + +int exofs_check_io(struct exofs_io_state *ios, u64 *resid); + +int exofs_sbi_create(struct exofs_io_state *ios); +int exofs_sbi_remove(struct exofs_io_state *ios); +int exofs_sbi_write(struct exofs_io_state *ios); +int exofs_sbi_read(struct exofs_io_state *ios); + +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr); + +int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len); +static inline int exofs_oi_write(struct exofs_i_info *oi, + struct exofs_io_state *ios) +{ + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + return exofs_sbi_write(ios); +} + +static inline int exofs_oi_read(struct exofs_i_info *oi, + struct exofs_io_state *ios) +{ + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + return exofs_sbi_read(ios); +} + /* inode.c */ void exofs_truncate(struct inode *inode); int exofs_setattr(struct dentry *, struct iattr *); @@ -169,6 +257,7 @@ extern const struct file_operations exofs_file_operations; /* inode.c */ extern const struct address_space_operations exofs_aops; +extern const struct osd_attr g_attr_logical_length; /* namei.c */ extern const struct inode_operations exofs_dir_inode_operations; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6c10f7476699..2afbcebeda71 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,15 +37,18 @@ #include "exofs.h" -#ifdef CONFIG_EXOFS_DEBUG -# define EXOFS_DEBUG_OBJ_ISIZE 1 -#endif +#define EXOFS_DBGMSG2(M...) do {} while (0) + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), +}; struct page_collect { struct exofs_sb_info *sbi; struct request_queue *req_q; struct inode *inode; unsigned expected_pages; + struct exofs_io_state *ios; struct bio *bio; unsigned nr_pages; @@ -54,22 +57,23 @@ struct page_collect { }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, - struct inode *inode) + struct inode *inode) { struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; pcol->sbi = sbi; - pcol->req_q = osd_request_queue(sbi->s_dev); + /* Create master bios on first Q, later on cloning, each clone will be + * allocated on it's destination Q + */ + pcol->req_q = osd_request_queue(sbi->s_ods[0]); pcol->inode = inode; pcol->expected_pages = expected_pages; + pcol->ios = NULL; pcol->bio = NULL; pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; - - EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino, - expected_pages); } static void _pcol_reset(struct page_collect *pcol) @@ -80,35 +84,49 @@ static void _pcol_reset(struct page_collect *pcol) pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; - EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n", - pcol->inode->i_ino, pcol->expected_pages); + pcol->ios = NULL; /* this is probably the end of the loop but in writes * it might not end here. don't be left with nothing */ if (!pcol->expected_pages) - pcol->expected_pages = 128; + pcol->expected_pages = BIO_MAX_PAGES_KMALLOC; } static int pcol_try_alloc(struct page_collect *pcol) { - int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES); + int pages = min_t(unsigned, pcol->expected_pages, + BIO_MAX_PAGES_KMALLOC); + + if (!pcol->ios) { /* First time allocate io_state */ + int ret = exofs_get_io_state(pcol->sbi, &pcol->ios); + + if (ret) + return ret; + } for (; pages; pages >>= 1) { - pcol->bio = bio_alloc(GFP_KERNEL, pages); + pcol->bio = bio_kmalloc(GFP_KERNEL, pages); if (likely(pcol->bio)) return 0; } - EXOFS_ERR("Failed to kcalloc expected_pages=%u\n", + EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n", pcol->expected_pages); return -ENOMEM; } static void pcol_free(struct page_collect *pcol) { - bio_put(pcol->bio); - pcol->bio = NULL; + if (pcol->bio) { + bio_put(pcol->bio); + pcol->bio = NULL; + } + + if (pcol->ios) { + exofs_put_io_state(pcol->ios); + pcol->ios = NULL; + } } static int pcol_add_page(struct page_collect *pcol, struct page *page, @@ -161,22 +179,17 @@ static void update_write_page(struct page *page, int ret) /* Called at the end of reads, to optionally unlock pages and update their * status. */ -static int __readpages_done(struct osd_request *or, struct page_collect *pcol, - bool do_unlock) +static int __readpages_done(struct page_collect *pcol, bool do_unlock) { struct bio_vec *bvec; int i; u64 resid; u64 good_bytes; u64 length = 0; - int ret = exofs_check_ok_resid(or, &resid, NULL); - - osd_end_request(or); + int ret = exofs_check_io(pcol->ios, &resid); if (likely(!ret)) good_bytes = pcol->length; - else if (!resid) - good_bytes = 0; else good_bytes = pcol->length - resid; @@ -198,7 +211,7 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol, else page_stat = ret; - EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n", + EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", inode->i_ino, page->index, page_stat ? "bad_bytes" : "good_bytes"); @@ -214,13 +227,13 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol, } /* callback of async reads */ -static void readpages_done(struct osd_request *or, void *p) +static void readpages_done(struct exofs_io_state *ios, void *p) { struct page_collect *pcol = p; - __readpages_done(or, pcol, true); + __readpages_done(pcol, true); atomic_dec(&pcol->sbi->s_curr_pending); - kfree(p); + kfree(pcol); } static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) @@ -238,17 +251,13 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) unlock_page(page); } - pcol_free(pcol); } static int read_exec(struct page_collect *pcol, bool is_sync) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct osd_obj_id obj = {pcol->sbi->s_pid, - pcol->inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or = NULL; + struct exofs_io_state *ios = pcol->ios; struct page_collect *pcol_copy = NULL; - loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; int ret; if (!pcol->bio) @@ -257,17 +266,13 @@ static int read_exec(struct page_collect *pcol, bool is_sync) /* see comment in _readpage() about sync reads */ WARN_ON(is_sync && (pcol->nr_pages != 1)); - or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - - osd_req_read(or, &obj, i_start, pcol->bio, pcol->length); + ios->bio = pcol->bio; + ios->length = pcol->length; + ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; if (is_sync) { - exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); - return __readpages_done(or, pcol, false); + exofs_oi_read(oi, pcol->ios); + return __readpages_done(pcol, false); } pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); @@ -277,14 +282,16 @@ static int read_exec(struct page_collect *pcol, bool is_sync) } *pcol_copy = *pcol; - ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred); + ios->done = readpages_done; + ios->private = pcol_copy; + ret = exofs_oi_read(oi, ios); if (unlikely(ret)) goto err; atomic_inc(&pcol->sbi->s_curr_pending); EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - obj.id, _LLU(i_start), pcol->length); + ios->obj.id, _LLU(ios->offset), pcol->length); /* pages ownership was passed to pcol_copy */ _pcol_reset(pcol); @@ -293,12 +300,10 @@ static int read_exec(struct page_collect *pcol, bool is_sync) err: if (!is_sync) _unlock_pcol_pages(pcol, ret, READ); - else /* Pages unlocked by caller in sync mode only free bio */ - pcol_free(pcol); + + pcol_free(pcol); kfree(pcol_copy); - if (or) - osd_end_request(or); return ret; } @@ -370,12 +375,12 @@ try_again: if (len != PAGE_CACHE_SIZE) zero_user(page, len, PAGE_CACHE_SIZE - len); - EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", + EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", inode->i_ino, page->index, len); ret = pcol_add_page(pcol, page, len); if (ret) { - EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p " + EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " "this_len=0x%zx nr_pages=%u length=0x%lx\n", page, len, pcol->nr_pages, pcol->length); @@ -419,9 +424,8 @@ static int _readpage(struct page *page, bool is_sync) _pcol_init(&pcol, 1, page->mapping->host); - /* readpage_strip might call read_exec(,async) inside at several places - * but this is safe for is_async=0 since read_exec will not do anything - * when we have a single page. + /* readpage_strip might call read_exec(,is_sync==false) at several + * places but not if we have a single page. */ ret = readpage_strip(&pcol, page); if (ret) { @@ -440,8 +444,8 @@ static int exofs_readpage(struct file *file, struct page *page) return _readpage(page, false); } -/* Callback for osd_write. All writes are asynchronouse */ -static void writepages_done(struct osd_request *or, void *p) +/* Callback for osd_write. All writes are asynchronous */ +static void writepages_done(struct exofs_io_state *ios, void *p) { struct page_collect *pcol = p; struct bio_vec *bvec; @@ -449,16 +453,12 @@ static void writepages_done(struct osd_request *or, void *p) u64 resid; u64 good_bytes; u64 length = 0; + int ret = exofs_check_io(ios, &resid); - int ret = exofs_check_ok_resid(or, NULL, &resid); - - osd_end_request(or); atomic_dec(&pcol->sbi->s_curr_pending); if (likely(!ret)) good_bytes = pcol->length; - else if (!resid) - good_bytes = 0; else good_bytes = pcol->length - resid; @@ -482,7 +482,7 @@ static void writepages_done(struct osd_request *or, void *p) update_write_page(page, page_stat); unlock_page(page); - EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n", + EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", inode->i_ino, page->index, page_stat); length += bvec->bv_len; @@ -496,23 +496,13 @@ static void writepages_done(struct osd_request *or, void *p) static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct osd_obj_id obj = {pcol->sbi->s_pid, - pcol->inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or = NULL; + struct exofs_io_state *ios = pcol->ios; struct page_collect *pcol_copy = NULL; - loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT; int ret; if (!pcol->bio) return 0; - or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("write_exec: Faild to osd_start_request()\n"); - ret = -ENOMEM; - goto err; - } - pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); if (!pcol_copy) { EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); @@ -523,16 +513,22 @@ static int write_exec(struct page_collect *pcol) *pcol_copy = *pcol; pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ - osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length); - ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); + + ios->bio = pcol_copy->bio; + ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; + ios->length = pcol_copy->length; + ios->done = writepages_done; + ios->private = pcol_copy; + + ret = exofs_oi_write(oi, ios); if (unlikely(ret)) { - EXOFS_ERR("write_exec: exofs_async_op() Faild\n"); + EXOFS_ERR("write_exec: exofs_oi_write() Faild\n"); goto err; } atomic_inc(&pcol->sbi->s_curr_pending); EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(i_start), + pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), pcol->length); /* pages ownership was passed to pcol_copy */ _pcol_reset(pcol); @@ -540,9 +536,9 @@ static int write_exec(struct page_collect *pcol) err: _unlock_pcol_pages(pcol, ret, WRITE); + pcol_free(pcol); kfree(pcol_copy); - if (or) - osd_end_request(or); + return ret; } @@ -586,6 +582,9 @@ static int writepage_strip(struct page *page, if (PageError(page)) ClearPageError(page); unlock_page(page); + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " + "outside the limits\n", + inode->i_ino, page->index); return 0; } } @@ -600,6 +599,9 @@ try_again: ret = write_exec(pcol); if (unlikely(ret)) goto fail; + + EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", + inode->i_ino, page->index); goto try_again; } @@ -609,7 +611,7 @@ try_again: goto fail; } - EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", + EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", inode->i_ino, page->index, len); ret = pcol_add_page(pcol, page, len); @@ -634,6 +636,8 @@ try_again: return 0; fail: + EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", + inode->i_ino, page->index, ret); set_bit(AS_EIO, &page->mapping->flags); unlock_page(page); return ret; @@ -652,14 +656,17 @@ static int exofs_writepages(struct address_space *mapping, wbc->range_end >> PAGE_CACHE_SHIFT; if (start || end) - expected_pages = min(end - start + 1, 32L); + expected_pages = end - start + 1; else expected_pages = mapping->nrpages; - EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx" - " m->nrpages=%lu start=0x%lx end=0x%lx\n", + if (expected_pages < 32L) + expected_pages = 32L; + + EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " + "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", mapping->host->i_ino, wbc->range_start, wbc->range_end, - mapping->nrpages, start, end); + mapping->nrpages, start, end, expected_pages); _pcol_init(&pcol, expected_pages, mapping->host); @@ -731,13 +738,28 @@ static int exofs_write_begin_export(struct file *file, fsdata); } +static int exofs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + /* According to comment in simple_write_end i_mutex is held */ + loff_t i_size = inode->i_size; + int ret; + + ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); + if (i_size != inode->i_size) + mark_inode_dirty(inode); + return ret; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, .writepage = exofs_writepage, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, - .write_end = simple_write_end, + .write_end = exofs_write_end, }; /****************************************************************************** @@ -771,19 +793,28 @@ static int exofs_get_block(struct inode *inode, sector_t iblock, const struct osd_attr g_attr_logical_length = ATTR_DEF( OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); +static int _do_truncate(struct inode *inode) +{ + struct exofs_i_info *oi = exofs_i(inode); + loff_t isize = i_size_read(inode); + int ret; + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); + + ret = exofs_oi_truncate(oi, (u64)isize); + EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize); + return ret; +} + /* * Truncate a file to the specified size - all we have to do is set the size * attribute. We make sure the object exists first. */ void exofs_truncate(struct inode *inode) { - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; struct exofs_i_info *oi = exofs_i(inode); - struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or; - struct osd_attr attr; - loff_t isize = i_size_read(inode); - __be64 newsize; int ret; if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) @@ -793,22 +824,6 @@ void exofs_truncate(struct inode *inode) return; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - - nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); - - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n"); - goto fail; - } - - osd_req_set_attributes(or, &obj); - - newsize = cpu_to_be64((u64)isize); - attr = g_attr_logical_length; - attr.val_ptr = &newsize; - osd_req_add_set_attr_list(or, &attr, 1); /* if we are about to truncate an object, and it hasn't been * created yet, wait @@ -816,8 +831,7 @@ void exofs_truncate(struct inode *inode) if (unlikely(wait_obj_created(oi))) goto fail; - ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); - osd_end_request(or); + ret = _do_truncate(inode); if (ret) goto fail; @@ -847,65 +861,62 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) /* * Read an inode from the OSD, and return it as is. We also return the size - * attribute in the 'sanity' argument if we got compiled with debugging turned - * on. + * attribute in the 'obj_size' argument. */ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, - struct exofs_fcb *inode, uint64_t *sanity) + struct exofs_fcb *inode, uint64_t *obj_size) { struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_request *or; - struct osd_attr attr; - struct osd_obj_id obj = {sbi->s_pid, - oi->vfs_inode.i_ino + EXOFS_OBJ_OFF}; + struct osd_attr attrs[2]; + struct exofs_io_state *ios; int ret; - exofs_make_credential(oi->i_cred, &obj); - - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n"); - return -ENOMEM; + *obj_size = ~0; + ret = exofs_get_io_state(sbi, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + return ret; } - osd_req_get_attributes(or, &obj); - /* we need the inode attribute */ - osd_req_add_get_attr_list(or, &g_attr_inode_data, 1); + ios->obj.id = exofs_oi_objno(oi); + exofs_make_credential(oi->i_cred, &ios->obj); + ios->cred = oi->i_cred; -#ifdef EXOFS_DEBUG_OBJ_ISIZE - /* we get the size attributes to do a sanity check */ - osd_req_add_get_attr_list(or, &g_attr_logical_length, 1); -#endif + attrs[0] = g_attr_inode_data; + attrs[1] = g_attr_logical_length; + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); + ret = exofs_sbi_read(ios); if (ret) goto out; - attr = g_attr_inode_data; - ret = extract_attr_from_req(or, &attr); + ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { - EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n"); + EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); goto out; } + WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); + memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); - WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE); - memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE); - -#ifdef EXOFS_DEBUG_OBJ_ISIZE - attr = g_attr_logical_length; - ret = extract_attr_from_req(or, &attr); + ret = extract_attr_from_ios(ios, &attrs[1]); if (ret) { - EXOFS_ERR("ERROR: extract attr from or failed\n"); + EXOFS_ERR("%s: extract_attr of logical_length failed\n", + __func__); goto out; } - *sanity = get_unaligned_be64(attr.val_ptr); -#endif + *obj_size = get_unaligned_be64(attrs[1].val_ptr); out: - osd_end_request(or); + exofs_put_io_state(ios); return ret; } +static void __oi_init(struct exofs_i_info *oi) +{ + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; +} /* * Fill in an inode read from the OSD and set it up for use */ @@ -914,7 +925,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) struct exofs_i_info *oi; struct exofs_fcb fcb; struct inode *inode; - uint64_t uninitialized_var(sanity); + uint64_t obj_size; int ret; inode = iget_locked(sb, ino); @@ -923,13 +934,13 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) if (!(inode->i_state & I_NEW)) return inode; oi = exofs_i(inode); + __oi_init(oi); /* read the inode from the osd */ - ret = exofs_get_inode(sb, oi, &fcb, &sanity); + ret = exofs_get_inode(sb, oi, &fcb, &obj_size); if (ret) goto bad_inode; - init_waitqueue_head(&oi->i_wq); set_obj_created(oi); /* copy stuff from on-disk struct to in-memory struct */ @@ -947,14 +958,12 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_blkbits = EXOFS_BLKSHIFT; inode->i_generation = le32_to_cpu(fcb.i_generation); -#ifdef EXOFS_DEBUG_OBJ_ISIZE - if ((inode->i_size != sanity) && + if ((inode->i_size != obj_size) && (!exofs_inode_is_fast_symlink(inode))) { - EXOFS_ERR("WARNING: Size of object from inode and " - "attributes differ (%lld != %llu)\n", - inode->i_size, _LLU(sanity)); + EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n", + inode->i_size, _LLU(obj_size)); + /* FIXME: call exofs_inode_recovery() */ } -#endif oi->i_dir_start_lookup = 0; @@ -1020,23 +1029,30 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi) * set the obj_created flag so that other methods know that the object exists on * the OSD. */ -static void create_done(struct osd_request *or, void *p) +static void create_done(struct exofs_io_state *ios, void *p) { struct inode *inode = p; struct exofs_i_info *oi = exofs_i(inode); struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; int ret; - ret = exofs_check_ok(or); - osd_end_request(or); + ret = exofs_check_io(ios, NULL); + exofs_put_io_state(ios); + atomic_dec(&sbi->s_curr_pending); if (unlikely(ret)) { EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", - _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF)); - make_bad_inode(inode); - } else - set_obj_created(oi); + _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid)); + /*TODO: When FS is corrupted creation can fail, object already + * exist. Get rid of this asynchronous creation, if exist + * increment the obj counter and try the next object. Until we + * succeed. All these dangling objects will be made into lost + * files by chkfs.exofs + */ + } + + set_obj_created(oi); atomic_dec(&inode->i_count); wake_up(&oi->i_wq); @@ -1051,8 +1067,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) struct inode *inode; struct exofs_i_info *oi; struct exofs_sb_info *sbi; - struct osd_request *or; - struct osd_obj_id obj; + struct exofs_io_state *ios; int ret; sb = dir->i_sb; @@ -1061,8 +1076,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) return ERR_PTR(-ENOMEM); oi = exofs_i(inode); + __oi_init(oi); - init_waitqueue_head(&oi->i_wq); set_obj_2bcreated(oi); sbi = sb->s_fs_info; @@ -1089,28 +1104,28 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) mark_inode_dirty(inode); - obj.partition = sbi->s_pid; - obj.id = inode->i_ino + EXOFS_OBJ_OFF; - exofs_make_credential(oi->i_cred, &obj); - - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_new_inode: osd_start_request failed\n"); - return ERR_PTR(-ENOMEM); + ret = exofs_get_io_state(sbi, &ios); + if (unlikely(ret)) { + EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); + return ERR_PTR(ret); } - osd_req_create_object(or, &obj); + ios->obj.id = exofs_oi_objno(oi); + exofs_make_credential(oi->i_cred, &ios->obj); /* increment the refcount so that the inode will still be around when we * reach the callback */ atomic_inc(&inode->i_count); - ret = exofs_async_op(or, create_done, inode, oi->i_cred); + ios->done = create_done; + ios->private = inode; + ios->cred = oi->i_cred; + ret = exofs_sbi_create(ios); if (ret) { atomic_dec(&inode->i_count); - osd_end_request(or); - return ERR_PTR(-EIO); + exofs_put_io_state(ios); + return ERR_PTR(ret); } atomic_inc(&sbi->s_curr_pending); @@ -1128,11 +1143,11 @@ struct updatei_args { /* * Callback function from exofs_update_inode(). */ -static void updatei_done(struct osd_request *or, void *p) +static void updatei_done(struct exofs_io_state *ios, void *p) { struct updatei_args *args = p; - osd_end_request(or); + exofs_put_io_state(ios); atomic_dec(&args->sbi->s_curr_pending); @@ -1148,8 +1163,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or; + struct exofs_io_state *ios; struct osd_attr attr; struct exofs_fcb *fcb; struct updatei_args *args; @@ -1186,18 +1200,16 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n"); - ret = -ENOMEM; + ret = exofs_get_io_state(sbi, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); goto free_args; } - osd_req_set_attributes(or, &obj); - attr = g_attr_inode_data; attr.val_ptr = fcb; - osd_req_add_set_attr_list(or, &attr, 1); + ios->out_attr_len = 1; + ios->out_attr = &attr; if (!obj_created(oi)) { EXOFS_DBGMSG("!obj_created\n"); @@ -1206,22 +1218,19 @@ static int exofs_update_inode(struct inode *inode, int do_sync) EXOFS_DBGMSG("wait_event done\n"); } - if (do_sync) { - ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); - osd_end_request(or); - goto free_args; - } else { + if (!do_sync) { args->sbi = sbi; + ios->done = updatei_done; + ios->private = args; + } - ret = exofs_async_op(or, updatei_done, args, oi->i_cred); - if (ret) { - osd_end_request(or); - goto free_args; - } + ret = exofs_oi_write(oi, ios); + if (!do_sync && !ret) { atomic_inc(&sbi->s_curr_pending); goto out; /* deallocation in updatei_done */ } + exofs_put_io_state(ios); free_args: kfree(args); out: @@ -1238,11 +1247,12 @@ int exofs_write_inode(struct inode *inode, int wait) * Callback function from exofs_delete_inode() - don't have much cleaning up to * do. */ -static void delete_done(struct osd_request *or, void *p) +static void delete_done(struct exofs_io_state *ios, void *p) { - struct exofs_sb_info *sbi; - osd_end_request(or); - sbi = p; + struct exofs_sb_info *sbi = p; + + exofs_put_io_state(ios); + atomic_dec(&sbi->s_curr_pending); } @@ -1256,8 +1266,7 @@ void exofs_delete_inode(struct inode *inode) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; - struct osd_request *or; + struct exofs_io_state *ios; int ret; truncate_inode_pages(&inode->i_data, 0); @@ -1274,25 +1283,26 @@ void exofs_delete_inode(struct inode *inode) clear_inode(inode); - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n"); + ret = exofs_get_io_state(sbi, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); return; } - osd_req_remove_object(or, &obj); - /* if we are deleting an obj that hasn't been created yet, wait */ if (!obj_created(oi)) { BUG_ON(!obj_2bcreated(oi)); wait_event(oi->i_wq, obj_created(oi)); } - ret = exofs_async_op(or, delete_done, sbi, oi->i_cred); + ios->obj.id = exofs_oi_objno(oi); + ios->done = delete_done; + ios->private = sbi; + ios->cred = oi->i_cred; + ret = exofs_sbi_remove(ios); if (ret) { - EXOFS_ERR( - "ERROR: @exofs_delete_inode exofs_async_op failed\n"); - osd_end_request(or); + EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); + exofs_put_io_state(ios); return; } atomic_inc(&sbi->s_curr_pending); diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c new file mode 100644 index 000000000000..5bad01fa1f9f --- /dev/null +++ b/fs/exofs/ios.c @@ -0,0 +1,421 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <scsi/scsi_device.h> + +#include "exofs.h" + +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length) +{ + struct osd_request *or = osd_start_request(od, GFP_KERNEL); +/* struct osd_sense_info osi = {.key = 0};*/ + int ret; + + if (unlikely(!or)) { + EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); + return -ENOMEM; + } + ret = osd_req_read_kern(or, obj, offset, p, length); + if (unlikely(ret)) { + EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); + goto out; + } + + ret = osd_finalize_request(or, 0, cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); + goto out; + } + + ret = osd_execute_request(or); + if (unlikely(ret)) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + +out: + osd_end_request(or); + return ret; +} + +int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios) +{ + struct exofs_io_state *ios; + + /*TODO: Maybe use kmem_cach per sbi of size + * exofs_io_state_size(sbi->s_numdevs) + */ + ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL); + if (unlikely(!ios)) { + *pios = NULL; + return -ENOMEM; + } + + ios->sbi = sbi; + ios->obj.partition = sbi->s_pid; + *pios = ios; + return 0; +} + +void exofs_put_io_state(struct exofs_io_state *ios) +{ + if (ios) { + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; + + if (per_dev->or) + osd_end_request(per_dev->or); + if (per_dev->bio) + bio_put(per_dev->bio); + } + + kfree(ios); + } +} + +static void _sync_done(struct exofs_io_state *ios, void *p) +{ + struct completion *waiting = p; + + complete(waiting); +} + +static void _last_io(struct kref *kref) +{ + struct exofs_io_state *ios = container_of( + kref, struct exofs_io_state, kref); + + ios->done(ios, ios->private); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct exofs_io_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +static int exofs_io_execute(struct exofs_io_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + bool sync = (ios->done == NULL); + int i, ret; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + ret = osd_finalize_request(or, 0, ios->cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", + ret); + return ret; + } + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + ret = 0; + + if (sync) { + wait_for_completion(&wait); + ret = exofs_check_io(ios, NULL); + } + return ret; +} + +int exofs_check_io(struct exofs_io_state *ios, u64 *resid) +{ + enum osd_err_priority acumulated_osd_err = 0; + int acumulated_lin_err = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi); + + if (likely(!ret)) + continue; + + if (unlikely(ret == -EFAULT)) { + EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__); + /*FIXME: All the pages in this device range should: + * clear_highpage(page); + */ + } + + if (osi.osd_err_pri >= acumulated_osd_err) { + acumulated_osd_err = osi.osd_err_pri; + acumulated_lin_err = ret; + } + } + + /* TODO: raid specific residual calculations */ + if (resid) { + if (likely(!acumulated_lin_err)) + *resid = 0; + else + *resid = ios->length; + } + + return acumulated_lin_err; +} + +int exofs_sbi_create(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->sbi->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_create_object(or, &ios->obj); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int exofs_sbi_remove(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->sbi->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_remove_object(or, &ios->obj); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int exofs_sbi_write(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->sbi->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + if (ios->bio) { + struct bio *bio; + + if (i != 0) { + bio = bio_kmalloc(GFP_KERNEL, + ios->bio->bi_max_vecs); + if (unlikely(!bio)) { + ret = -ENOMEM; + goto out; + } + + __bio_clone(bio, ios->bio); + bio->bi_bdev = NULL; + bio->bi_next = NULL; + ios->per_dev[i].bio = bio; + } else { + bio = ios->bio; + } + + osd_req_write(or, &ios->obj, ios->offset, bio, + ios->length); +/* EXOFS_DBGMSG("write sync=%d\n", sync);*/ + } else if (ios->kern_buff) { + osd_req_write_kern(or, &ios->obj, ios->offset, + ios->kern_buff, ios->length); +/* EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/ + } else { + osd_req_set_attributes(or, &ios->obj); +/* EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/ + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, + ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, + ios->in_attr_len); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int exofs_sbi_read(struct exofs_io_state *ios) +{ + int i, ret; + + for (i = 0; i < 1; i++) { + struct osd_request *or; + unsigned first_dev = (unsigned)ios->obj.id; + + first_dev %= ios->sbi->s_numdevs; + or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + if (ios->bio) { + osd_req_read(or, &ios->obj, ios->offset, ios->bio, + ios->length); +/* EXOFS_DBGMSG("read sync=%d\n", sync);*/ + } else if (ios->kern_buff) { + osd_req_read_kern(or, &ios->obj, ios->offset, + ios->kern_buff, ios->length); +/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/ + } else { + osd_req_get_attributes(or, &ios->obj); +/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/ + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, + ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, + ios->in_attr_len); + } + ret = exofs_io_execute(ios); + +out: + return ret; +} + +int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(ios->per_dev[0].or, + &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} + +int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) +{ + struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; + struct exofs_io_state *ios; + struct osd_attr attr; + __be64 newsize; + int i, ret; + + if (exofs_get_io_state(sbi, &ios)) + return -ENOMEM; + + ios->obj.id = exofs_oi_objno(oi); + ios->cred = oi->i_cred; + + newsize = cpu_to_be64(size); + attr = g_attr_logical_length; + attr.val_ptr = &newsize; + + for (i = 0; i < sbi->s_numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(sbi->s_ods[i], GFP_KERNEL); + if (unlikely(!or)) { + EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_set_attributes(or, &ios->obj); + osd_req_add_set_attr_list(or, &attr, 1); + } + ret = exofs_io_execute(ios); + +out: + exofs_put_io_state(ios); + return ret; +} diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c deleted file mode 100644 index 4372542df284..000000000000 --- a/fs/exofs/osd.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <scsi/scsi_device.h> -#include <scsi/osd_sense.h> - -#include "exofs.h" - -int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) -{ - struct osd_sense_info osi; - int ret = osd_req_decode_sense(or, &osi); - - if (ret) { /* translate to Linux codes */ - if (osi.additional_code == scsi_invalid_field_in_cdb) { - if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE) - ret = -EFAULT; - if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID) - ret = -ENOENT; - else - ret = -EINVAL; - } else if (osi.additional_code == osd_quota_error) - ret = -ENOSPC; - else - ret = -EIO; - } - - /* FIXME: should be include in osd_sense_info */ - if (in_resid) - *in_resid = or->in.req ? or->in.req->resid_len : 0; - - if (out_resid) - *out_resid = or->out.req ? or->out.req->resid_len : 0; - - return ret; -} - -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) -{ - osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); -} - -/* - * Perform a synchronous OSD operation. - */ -int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential) -{ - int ret; - - or->timeout = timeout; - ret = osd_finalize_request(or, 0, credential, NULL); - if (ret) { - EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); - return ret; - } - - ret = osd_execute_request(or); - - if (ret) - EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); - /* osd_req_decode_sense(or, ret); */ - return ret; -} - -/* - * Perform an asynchronous OSD operation. - */ -int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done, - void *caller_context, u8 *cred) -{ - int ret; - - ret = osd_finalize_request(or, 0, cred, NULL); - if (ret) { - EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret); - return ret; - } - - ret = osd_execute_request_async(or, async_done, caller_context); - - if (ret) - EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret); - return ret; -} - -int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr) -{ - struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ - void *iter = NULL; - int nelem; - - do { - nelem = 1; - osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter); - if ((cur_attr.attr_page == attr->attr_page) && - (cur_attr.attr_id == attr->attr_id)) { - attr->len = cur_attr.len; - attr->val_ptr = cur_attr.val_ptr; - return 0; - } - } while (iter); - - return -EIO; -} diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h new file mode 100644 index 000000000000..c52e9888b8ab --- /dev/null +++ b/fs/exofs/pnfs.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License version 2 as published by the Free + * Software Foundation. + * + */ + +/* FIXME: Remove this file once pnfs hits mainline */ + +#ifndef __EXOFS_PNFS_H__ +#define __EXOFS_PNFS_H__ + +#if ! defined(__PNFS_OSD_XDR_H__) + +enum pnfs_iomode { + IOMODE_READ = 1, + IOMODE_RW = 2, + IOMODE_ANY = 3, +}; + +/* Layout Structure */ +enum pnfs_osd_raid_algorithm4 { + PNFS_OSD_RAID_0 = 1, + PNFS_OSD_RAID_4 = 2, + PNFS_OSD_RAID_5 = 3, + PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ +}; + +struct pnfs_osd_data_map { + u32 odm_num_comps; + u64 odm_stripe_unit; + u32 odm_group_width; + u32 odm_group_depth; + u32 odm_mirror_cnt; + u32 odm_raid_algorithm; +}; + +#endif /* ! defined(__PNFS_OSD_XDR_H__) */ + +#endif /* __EXOFS_PNFS_H__ */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 9f500dec3b59..a1d1e77b12eb 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -203,49 +203,45 @@ int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; - struct osd_request *or; - struct osd_obj_id obj; + struct exofs_io_state *ios; int ret = -ENOMEM; - fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); - if (!fscb) { - EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); - return -ENOMEM; - } - lock_super(sb); sbi = sb->s_fs_info; + fscb = &sbi->s_fscb; + + ret = exofs_get_io_state(sbi, &ios); + if (ret) + goto out; + + /* Note: We only write the changing part of the fscb. .i.e upto the + * the fscb->s_dev_table_oid member. There is no read-modify-write + * here. + */ + ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); + memset(fscb, 0, ios->length); fscb->s_nextid = cpu_to_le64(sbi->s_nextid); fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); fscb->s_magic = cpu_to_le16(sb->s_magic); fscb->s_newfs = 0; + fscb->s_version = EXOFS_FSCB_VER; - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("exofs_write_super: osd_start_request failed.\n"); - goto out; - } + ios->obj.id = EXOFS_SUPER_ID; + ios->offset = 0; + ios->kern_buff = fscb; + ios->cred = sbi->s_cred; - obj.partition = sbi->s_pid; - obj.id = EXOFS_SUPER_ID; - ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb)); + ret = exofs_sbi_write(ios); if (unlikely(ret)) { - EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n"); - goto out; - } - - ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); - if (unlikely(ret)) { - EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n"); + EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); goto out; } sb->s_dirt = 0; out: - if (or) - osd_end_request(or); + EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); + exofs_put_io_state(ios); unlock_super(sb); - kfree(fscb); return ret; } @@ -257,6 +253,29 @@ static void exofs_write_super(struct super_block *sb) sb->s_dirt = 0; } +static void _exofs_print_device(const char *msg, const char *dev_path, + struct osd_dev *od, u64 pid) +{ + const struct osd_dev_info *odi = osduld_device_info(od); + + printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n", + msg, dev_path ?: "", odi->osdname, _LLU(pid)); +} + +void exofs_free_sbi(struct exofs_sb_info *sbi) +{ + while (sbi->s_numdevs) { + int i = --sbi->s_numdevs; + struct osd_dev *od = sbi->s_ods[i]; + + if (od) { + sbi->s_ods[i] = NULL; + osduld_put_device(od); + } + } + kfree(sbi); +} + /* * This function is called when the vfs is freeing the superblock. We just * need to free our own part. @@ -279,11 +298,182 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - osduld_put_device(sbi->s_dev); - kfree(sb->s_fs_info); + _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid); + + exofs_free_sbi(sbi); sb->s_fs_info = NULL; } +static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_device_table *dt) +{ + sbi->data_map.odm_num_comps = + le32_to_cpu(dt->dt_data_map.cb_num_comps); + sbi->data_map.odm_stripe_unit = + le64_to_cpu(dt->dt_data_map.cb_stripe_unit); + sbi->data_map.odm_group_width = + le32_to_cpu(dt->dt_data_map.cb_group_width); + sbi->data_map.odm_group_depth = + le32_to_cpu(dt->dt_data_map.cb_group_depth); + sbi->data_map.odm_mirror_cnt = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); + sbi->data_map.odm_raid_algorithm = + le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); + +/* FIXME: Hard coded mirror only for now. if not so do not mount */ + if ((sbi->data_map.odm_num_comps != numdevs) || + (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || + (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) || + (sbi->data_map.odm_mirror_cnt != (numdevs - 1))) + return -EINVAL; + else + return 0; +} + +/* @odi is valid only as long as @fscb_dev is valid */ +static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, + struct osd_dev_info *odi) +{ + odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); + memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len); + + odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); + odi->osdname = dt_dev->osdname; + + /* FIXME support long names. Will need a _put function */ + if (dt_dev->long_name_offset) + return -EINVAL; + + /* Make sure osdname is printable! + * mkexofs should give us space for a null-terminator else the + * device-table is invalid. + */ + if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname))) + odi->osdname_len = sizeof(dt_dev->osdname) - 1; + dt_dev->osdname[odi->osdname_len] = 0; + + /* If it's all zeros something is bad we read past end-of-obj */ + return !(odi->systemid_len || odi->osdname_len); +} + +static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, + unsigned table_count) +{ + struct exofs_sb_info *sbi = *psbi; + struct osd_dev *fscb_od; + struct osd_obj_id obj = {.partition = sbi->s_pid, + .id = EXOFS_DEVTABLE_ID}; + struct exofs_device_table *dt; + unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + + sizeof(*dt); + unsigned numdevs, i; + int ret; + + dt = kmalloc(table_bytes, GFP_KERNEL); + if (unlikely(!dt)) { + EXOFS_ERR("ERROR: allocating %x bytes for device table\n", + table_bytes); + return -ENOMEM; + } + + fscb_od = sbi->s_ods[0]; + sbi->s_ods[0] = NULL; + sbi->s_numdevs = 0; + ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: reading device table\n"); + goto out; + } + + numdevs = le64_to_cpu(dt->dt_num_devices); + if (unlikely(!numdevs)) { + ret = -EINVAL; + goto out; + } + WARN_ON(table_count != numdevs); + + ret = _read_and_match_data_map(sbi, numdevs, dt); + if (unlikely(ret)) + goto out; + + if (likely(numdevs > 1)) { + unsigned size = numdevs * sizeof(sbi->s_ods[0]); + + sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); + if (unlikely(!sbi)) { + ret = -ENOMEM; + goto out; + } + memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0])); + *psbi = sbi; + } + + for (i = 0; i < numdevs; i++) { + struct exofs_fscb fscb; + struct osd_dev_info odi; + struct osd_dev *od; + + if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) { + EXOFS_ERR("ERROR: Read all-zeros device entry\n"); + ret = -EINVAL; + goto out; + } + + printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", + i, odi.osdname); + + /* On all devices the device table is identical. The user can + * specify any one of the participating devices on the command + * line. We always keep them in device-table order. + */ + if (fscb_od && osduld_device_same(fscb_od, &odi)) { + sbi->s_ods[i] = fscb_od; + ++sbi->s_numdevs; + fscb_od = NULL; + continue; + } + + od = osduld_info_lookup(&odi); + if (unlikely(IS_ERR(od))) { + ret = PTR_ERR(od); + EXOFS_ERR("ERROR: device requested is not found " + "osd_name-%s =>%d\n", odi.osdname, ret); + goto out; + } + + sbi->s_ods[i] = od; + ++sbi->s_numdevs; + + /* Read the fscb of the other devices to make sure the FS + * partition is there. + */ + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, + sizeof(fscb)); + if (unlikely(ret)) { + EXOFS_ERR("ERROR: Malformed participating device " + "error reading fscb osd_name-%s\n", + odi.osdname); + goto out; + } + + /* TODO: verify other information is correct and FS-uuid + * matches. Benny what did you say about device table + * generation and old devices? + */ + } + +out: + kfree(dt); + if (unlikely(!ret && fscb_od)) { + EXOFS_ERR( + "ERROR: Bad device-table container device not present\n"); + osduld_put_device(fscb_od); + ret = -EINVAL; + } + + return ret; +} + /* * Read the superblock from the OSD and fill in the fields */ @@ -292,24 +482,25 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; struct exofs_mountopt *opts = data; struct exofs_sb_info *sbi; /*extended info */ + struct osd_dev *od; /* Master device */ struct exofs_fscb fscb; /*on-disk superblock info */ - struct osd_request *or = NULL; struct osd_obj_id obj; + unsigned table_count; int ret; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; - sb->s_fs_info = sbi; /* use mount options to fill superblock */ - sbi->s_dev = osduld_path_lookup(opts->dev_name); - if (IS_ERR(sbi->s_dev)) { - ret = PTR_ERR(sbi->s_dev); - sbi->s_dev = NULL; + od = osduld_path_lookup(opts->dev_name); + if (IS_ERR(od)) { + ret = PTR_ERR(od); goto free_sbi; } + sbi->s_ods[0] = od; + sbi->s_numdevs = 1; sbi->s_pid = opts->pid; sbi->s_timeout = opts->timeout; @@ -323,35 +514,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_bdev = NULL; sb->s_dev = 0; - /* read data from on-disk superblock object */ obj.partition = sbi->s_pid; obj.id = EXOFS_SUPER_ID; exofs_make_credential(sbi->s_cred, &obj); - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - if (!silent) - EXOFS_ERR( - "exofs_fill_super: osd_start_request failed.\n"); - ret = -ENOMEM; - goto free_sbi; - } - ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb)); - if (unlikely(ret)) { - if (!silent) - EXOFS_ERR( - "exofs_fill_super: osd_req_read_kern failed.\n"); - ret = -ENOMEM; - goto free_sbi; - } - - ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred); - if (unlikely(ret)) { - if (!silent) - EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n"); - ret = -EIO; + ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); + if (unlikely(ret)) goto free_sbi; - } sb->s_magic = le16_to_cpu(fscb.s_magic); sbi->s_nextid = le64_to_cpu(fscb.s_nextid); @@ -364,12 +533,26 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = -EINVAL; goto free_sbi; } + if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { + EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", + EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); + ret = -EINVAL; + goto free_sbi; + } /* start generation numbers from a random point */ get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + table_count = le64_to_cpu(fscb.s_dev_table_count); + if (table_count) { + ret = exofs_read_lookup_dev_table(&sbi, table_count); + if (unlikely(ret)) + goto free_sbi; + } + /* set up operation vectors */ + sb->s_fs_info = sbi; sb->s_op = &exofs_sops; sb->s_export_op = &exofs_export_ops; root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); @@ -395,16 +578,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - ret = 0; -out: - if (or) - osd_end_request(or); - return ret; + _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0], + sbi->s_pid); + return 0; free_sbi: - osduld_put_device(sbi->s_dev); /* NULL safe */ - kfree(sbi); - goto out; + EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", + opts->dev_name, sbi->s_pid, ret); + exofs_free_sbi(sbi); + return ret; } /* @@ -433,7 +615,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_obj_id obj = {sbi->s_pid, 0}; + struct exofs_io_state *ios; struct osd_attr attrs[] = { ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), @@ -442,32 +624,33 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) }; uint64_t capacity = ULLONG_MAX; uint64_t used = ULLONG_MAX; - struct osd_request *or; uint8_t cred_a[OSD_CAP_LEN]; int ret; - /* get used/capacity attributes */ - exofs_make_credential(cred_a, &obj); - - or = osd_start_request(sbi->s_dev, GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n"); - return -ENOMEM; + ret = exofs_get_io_state(sbi, &ios); + if (ret) { + EXOFS_DBGMSG("exofs_get_io_state failed.\n"); + return ret; } - osd_req_get_attributes(or, &obj); - osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs)); - ret = exofs_sync_op(or, sbi->s_timeout, cred_a); + exofs_make_credential(cred_a, &ios->obj); + ios->cred = sbi->s_cred; + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = exofs_sbi_read(ios); if (unlikely(ret)) goto out; - ret = extract_attr_from_req(or, &attrs[0]); - if (likely(!ret)) + ret = extract_attr_from_ios(ios, &attrs[0]); + if (likely(!ret)) { capacity = get_unaligned_be64(attrs[0].val_ptr); - else + if (unlikely(!capacity)) + capacity = ULLONG_MAX; + } else EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); - ret = extract_attr_from_req(or, &attrs[1]); + ret = extract_attr_from_ios(ios, &attrs[1]); if (likely(!ret)) used = get_unaligned_be64(attrs[1].val_ptr); else @@ -476,15 +659,15 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) /* fill in the stats buffer */ buf->f_type = EXOFS_SUPER_MAGIC; buf->f_bsize = EXOFS_BLKSIZE; - buf->f_blocks = (capacity >> EXOFS_BLKSHIFT); - buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT); + buf->f_blocks = capacity >> 9; + buf->f_bfree = (capacity - used) >> 9; buf->f_bavail = buf->f_bfree; buf->f_files = sbi->s_numfiles; buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; buf->f_namelen = EXOFS_NAME_LEN; out: - osd_end_request(or); + exofs_put_io_state(ios); return ret; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 197c7db583c7..e9e175949a63 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -6,7 +6,7 @@ * and for mapping back from file handles to dentries. * * For details on why we do all the strange and hairy things in here - * take a look at Documentation/filesystems/Exporting. + * take a look at Documentation/filesystems/nfs/Exporting. */ #include <linux/exportfs.h> #include <linux/fs.h> diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index a63d44256a70..a99e54318c3d 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -339,12 +339,12 @@ ext2_acl_chmod(struct inode *inode) * Extended attribut handlers */ static size_t -ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -352,12 +352,12 @@ ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, } static size_t -ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -365,15 +365,18 @@ ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - acl = ext2_get_acl(inode, type); + acl = ext2_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -385,33 +388,17 @@ ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) } static int -ext2_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext2_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext2_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!is_owner_or_cap(dentry->d_inode)) return -EPERM; if (value) { @@ -426,41 +413,25 @@ ext2_xattr_set_acl(struct inode *inode, int type, const void *value, } else acl = NULL; - error = ext2_set_acl(inode, type, acl); + error = ext2_set_acl(dentry->d_inode, type, acl); release_and_out: posix_acl_release(acl); return error; } -static int -ext2_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext2_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ext2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ext2_xattr_list_acl_access, - .get = ext2_xattr_get_acl_access, - .set = ext2_xattr_set_acl_access, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, }; struct xattr_handler ext2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ext2_xattr_list_acl_default, - .get = ext2_xattr_get_acl_default, - .set = ext2_xattr_set_acl_default, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, }; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 6cde970b0a1a..7516957273ed 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -353,8 +353,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) * ext2_find_entry() * * finds an entry in the specified directory with the wanted name. It - * returns the page in which the entry was found, and the entry itself - * (as a parameter - res_dir). Page is returned mapped and unlocked. + * returns the page in which the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. * Entry is guaranteed to be valid. */ struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, @@ -721,5 +721,5 @@ const struct file_operations ext2_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .fsync = simple_fsync, + .fsync = ext2_fsync, }; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 9a8a8e27a063..061914add3cf 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -142,7 +142,7 @@ struct dentry *ext2_get_parent(struct dentry *child); /* super.c */ extern void ext2_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext2_warning (struct super_block *, const char *, const char *, ...) +extern void ext2_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void ext2_update_dynamic_rev (struct super_block *sb); extern void ext2_write_super (struct super_block *); @@ -155,6 +155,7 @@ extern void ext2_write_super (struct super_block *); extern const struct file_operations ext2_dir_operations; /* file.c */ +extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; extern const struct file_operations ext2_xip_file_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a2f3afd1a1c1..586e3589d4c2 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -19,6 +19,7 @@ */ #include <linux/time.h> +#include <linux/pagemap.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -38,6 +39,22 @@ static int ext2_release_file (struct inode * inode, struct file * filp) return 0; } +int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + int ret; + struct super_block *sb = dentry->d_inode->i_sb; + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + + ret = simple_fsync(file, dentry, datasync); + if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { + /* We don't really know where the IO error happened... */ + ext2_error(sb, __func__, + "detected IO error when writing metadata buffers"); + ret = -EIO; + } + return ret; +} + /* * We have mostly NULL's here: the current defaults are ok for * the ext2 filesystem. @@ -55,7 +72,7 @@ const struct file_operations ext2_file_operations = { .mmap = generic_file_mmap, .open = generic_file_open, .release = ext2_release_file, - .fsync = simple_fsync, + .fsync = ext2_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; @@ -72,7 +89,7 @@ const struct file_operations ext2_xip_file_operations = { .mmap = xip_file_mmap, .open = generic_file_open, .release = ext2_release_file, - .fsync = simple_fsync, + .fsync = ext2_fsync, }; #endif diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index ade634076d0a..71b032c65a02 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -137,7 +137,8 @@ static int ext2_block_to_path(struct inode *inode, int final = 0; if (i_block < 0) { - ext2_warning (inode->i_sb, "ext2_block_to_path", "block < 0"); + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block < 0", __func__); } else if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; @@ -157,7 +158,8 @@ static int ext2_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext2_warning (inode->i_sb, "ext2_block_to_path", "block > big"); + ext2_msg(inode->i_sb, KERN_WARNING, + "warning: %s: block is too big", __func__); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1a9ffee47d56..f9cb54a585ce 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -58,27 +58,27 @@ void ext2_error (struct super_block * sb, const char * function, } va_start(args, fmt); - printk(KERN_CRIT "EXT2-fs error (device %s): %s: ",sb->s_id, function); + printk(KERN_CRIT "EXT2-fs (%s): error: %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); va_end(args); if (test_opt(sb, ERRORS_PANIC)) - panic("EXT2-fs panic from previous error\n"); + panic("EXT2-fs: panic from previous error\n"); if (test_opt(sb, ERRORS_RO)) { - printk("Remounting filesystem read-only\n"); + ext2_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); sb->s_flags |= MS_RDONLY; } } -void ext2_warning (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext2_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT2-fs warning (device %s): %s: ", - sb->s_id, function); + printk("%sEXT2-fs (%s): ", prefix, sb->s_id); vprintk(fmt, args); printk("\n"); va_end(args); @@ -91,9 +91,9 @@ void ext2_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV) return; - ext2_warning(sb, __func__, - "updating to rev %d because of new feature flag, " - "running e2fsck is recommended", + ext2_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", EXT2_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT2_GOOD_OLD_FIRST_INO); @@ -419,10 +419,10 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static int parse_options (char * options, - struct ext2_sb_info *sbi) +static int parse_options(char *options, struct super_block *sb) { - char * p; + char *p; + struct ext2_sb_info *sbi = EXT2_SB(sb); substring_t args[MAX_OPT_ARGS]; int option; @@ -505,7 +505,8 @@ static int parse_options (char * options, #else case Opt_user_xattr: case Opt_nouser_xattr: - printk("EXT2 (no)user_xattr options not supported\n"); + ext2_msg(sb, KERN_INFO, "(no)user_xattr options" + "not supported"); break; #endif #ifdef CONFIG_EXT2_FS_POSIX_ACL @@ -518,14 +519,15 @@ static int parse_options (char * options, #else case Opt_acl: case Opt_noacl: - printk("EXT2 (no)acl options not supported\n"); + ext2_msg(sb, KERN_INFO, + "(no)acl options not supported"); break; #endif case Opt_xip: #ifdef CONFIG_EXT2_FS_XIP set_opt (sbi->s_mount_opt, XIP); #else - printk("EXT2 xip option not supported\n"); + ext2_msg(sb, KERN_INFO, "xip option not supported"); #endif break; @@ -542,19 +544,18 @@ static int parse_options (char * options, case Opt_quota: case Opt_usrquota: case Opt_grpquota: - printk(KERN_ERR - "EXT2-fs: quota operations not supported.\n"); - + ext2_msg(sb, KERN_INFO, + "quota operations not supported"); break; #endif case Opt_reservation: set_opt(sbi->s_mount_opt, RESERVATION); - printk("reservations ON\n"); + ext2_msg(sb, KERN_INFO, "reservations ON"); break; case Opt_noreservation: clear_opt(sbi->s_mount_opt, RESERVATION); - printk("reservations OFF\n"); + ext2_msg(sb, KERN_INFO, "reservations OFF"); break; case Opt_ignore: break; @@ -573,34 +574,40 @@ static int ext2_setup_super (struct super_block * sb, struct ext2_sb_info *sbi = EXT2_SB(sb); if (le32_to_cpu(es->s_rev_level) > EXT2_MAX_SUPP_REV) { - printk ("EXT2-fs warning: revision level too high, " - "forcing read-only mode\n"); + ext2_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); res = MS_RDONLY; } if (read_only) return res; if (!(sbi->s_mount_state & EXT2_VALID_FS)) - printk ("EXT2-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); + ext2_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); else if ((sbi->s_mount_state & EXT2_ERROR_FS)) - printk ("EXT2-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); + ext2_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk ("EXT2-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + ext2_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && - (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk ("EXT2-fs warning: checktime reached, " - "running e2fsck is recommended\n"); + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext2_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); if (!le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext2_write_super(sb); if (test_opt (sb, DEBUG)) - printk ("[EXT II FS %s, %s, bs=%lu, fs=%lu, gc=%lu, " - "bpg=%lu, ipg=%lu, mo=%04lx]\n", + ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize, sbi->s_frag_size, sbi->s_groups_count, @@ -767,7 +774,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) */ blocksize = sb_min_blocksize(sb, BLOCK_SIZE); if (!blocksize) { - printk ("EXT2-fs: unable to set blocksize\n"); + ext2_msg(sb, KERN_ERR, "error: unable to set blocksize"); goto failed_sbi; } @@ -783,7 +790,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logic_sb_block))) { - printk ("EXT2-fs: unable to read superblock\n"); + ext2_msg(sb, KERN_ERR, "error: unable to read superblock"); goto failed_sbi; } /* @@ -826,7 +833,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, RESERVATION); - if (!parse_options ((char *) data, sbi)) + if (!parse_options((char *) data, sb)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -840,8 +847,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_INCOMPAT_FEATURE(sb, ~0U))) - printk("EXT2-fs warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended\n"); + ext2_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, @@ -849,16 +857,16 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) */ features = EXT2_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP); if (features) { - printk("EXT2-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext2_msg(sb, KERN_ERR, "error: couldn't mount because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); goto failed_mount; } if (!(sb->s_flags & MS_RDONLY) && (features = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))){ - printk("EXT2-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext2_msg(sb, KERN_ERR, "error: couldn't mount RDWR because of " + "unsupported optional features (%x)", + le32_to_cpu(features)); goto failed_mount; } @@ -866,7 +874,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { if (!silent) - printk("XIP: Unsupported blocksize\n"); + ext2_msg(sb, KERN_ERR, + "error: unsupported blocksize for xip"); goto failed_mount; } @@ -875,7 +884,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT2-fs: blocksize too small for device.\n"); + ext2_msg(sb, KERN_ERR, "error: blocksize is too small"); goto failed_sbi; } @@ -883,14 +892,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) offset = (sb_block*BLOCK_SIZE) % blocksize; bh = sb_bread(sb, logic_sb_block); if(!bh) { - printk("EXT2-fs: Couldn't read superblock on " - "2nd try.\n"); + ext2_msg(sb, KERN_ERR, "error: couldn't read" + "superblock on 2nd try"); goto failed_sbi; } es = (struct ext2_super_block *) (((char *)bh->b_data) + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) { - printk ("EXT2-fs: Magic mismatch, very weird !\n"); + ext2_msg(sb, KERN_ERR, "error: magic mismatch"); goto failed_mount; } } @@ -906,7 +915,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT2_GOOD_OLD_INODE_SIZE) || !is_power_of_2(sbi->s_inode_size) || (sbi->s_inode_size > blocksize)) { - printk ("EXT2-fs: unsupported inode size: %d\n", + ext2_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", sbi->s_inode_size); goto failed_mount; } @@ -943,29 +953,33 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_blocksize != bh->b_size) { if (!silent) - printk ("VFS: Unsupported blocksize on dev " - "%s.\n", sb->s_id); + ext2_msg(sb, KERN_ERR, "error: unsupported blocksize"); goto failed_mount; } if (sb->s_blocksize != sbi->s_frag_size) { - printk ("EXT2-fs: fragsize %lu != blocksize %lu (not supported yet)\n", + ext2_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %lu" + "(not supported yet)", sbi->s_frag_size, sb->s_blocksize); goto failed_mount; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { - printk ("EXT2-fs: #blocks per group too big: %lu\n", + ext2_msg(sb, KERN_ERR, + "error: #blocks per group too big: %lu", sbi->s_blocks_per_group); goto failed_mount; } if (sbi->s_frags_per_group > sb->s_blocksize * 8) { - printk ("EXT2-fs: #fragments per group too big: %lu\n", + ext2_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", sbi->s_frags_per_group); goto failed_mount; } if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { - printk ("EXT2-fs: #inodes per group too big: %lu\n", + ext2_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", sbi->s_inodes_per_group); goto failed_mount; } @@ -979,13 +993,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk ("EXT2-fs: not enough memory\n"); + ext2_msg(sb, KERN_ERR, "error: not enough memory"); goto failed_mount; } bgl_lock_init(sbi->s_blockgroup_lock); sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); if (!sbi->s_debts) { - printk ("EXT2-fs: not enough memory\n"); + ext2_msg(sb, KERN_ERR, "error: not enough memory"); goto failed_mount_group_desc; } for (i = 0; i < db_count; i++) { @@ -994,12 +1008,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->s_group_desc[i]) { for (j = 0; j < i; j++) brelse (sbi->s_group_desc[j]); - printk ("EXT2-fs: unable to read group descriptors\n"); + ext2_msg(sb, KERN_ERR, + "error: unable to read group descriptors"); goto failed_mount_group_desc; } } if (!ext2_check_descriptors (sb)) { - printk ("EXT2-fs: group descriptors corrupted!\n"); + ext2_msg(sb, KERN_ERR, "group descriptors corrupted"); goto failed_mount2; } sbi->s_gdb_count = db_count; @@ -1032,7 +1047,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_count_dirs(sb)); } if (err) { - printk(KERN_ERR "EXT2-fs: insufficient memory\n"); + ext2_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } /* @@ -1048,27 +1063,28 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); - printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n"); + ext2_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); goto failed_mount3; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { iput(root); - printk(KERN_ERR "EXT2-fs: get root inode failed\n"); + ext2_msg(sb, KERN_ERR, "error: get root inode failed"); ret = -ENOMEM; goto failed_mount3; } if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) - ext2_warning(sb, __func__, - "mounting ext3 filesystem as ext2"); + ext2_msg(sb, KERN_WARNING, + "warning: mounting ext3 filesystem as ext2"); ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); return 0; cantfind_ext2: if (!silent) - printk("VFS: Can't find an ext2 filesystem on dev %s.\n", - sb->s_id); + ext2_msg(sb, KERN_ERR, + "error: can't find an ext2 filesystem on dev %s.", + sb->s_id); goto failed_mount; failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); @@ -1089,9 +1105,30 @@ failed_sbi: return ret; } +static void ext2_clear_super_error(struct super_block *sb) +{ + struct buffer_head *sbh = EXT2_SB(sb)->s_sbh; + + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "EXT2-fs: %s previous I/O error to " + "superblock detected", sb->s_id); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } +} + static void ext2_commit_super (struct super_block * sb, struct ext2_super_block * es) { + ext2_clear_super_error(sb); es->s_wtime = cpu_to_le32(get_seconds()); mark_buffer_dirty(EXT2_SB(sb)->s_sbh); sb->s_dirt = 0; @@ -1099,6 +1136,7 @@ static void ext2_commit_super (struct super_block * sb, static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) { + ext2_clear_super_error(sb); es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); es->s_wtime = cpu_to_le32(get_seconds()); @@ -1121,8 +1159,24 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) static int ext2_sync_fs(struct super_block *sb, int wait) { struct ext2_super_block *es = EXT2_SB(sb)->s_es; + struct buffer_head *sbh = EXT2_SB(sb)->s_sbh; lock_kernel(); + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext2_msg(sb, KERN_ERR, + "previous I/O error to superblock detected\n"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { ext2_debug("setting valid to 0\n"); es->s_state &= cpu_to_le16(~EXT2_VALID_FS); @@ -1170,7 +1224,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options (data, sbi)) { + if (!parse_options(data, sb)) { err = -EINVAL; goto restore_opts; } @@ -1182,7 +1236,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) EXT2_MOUNT_XIP if not */ if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { - printk("XIP: Unsupported blocksize\n"); + ext2_msg(sb, KERN_WARNING, + "warning: unsupported blocksize for xip"); err = -EINVAL; goto restore_opts; } @@ -1191,8 +1246,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != (old_mount_opt & EXT2_MOUNT_XIP)) && invalidate_inodes(sb)) { - ext2_warning(sb, __func__, "refusing change of xip flag " - "with busy inodes while remounting"); + ext2_msg(sb, KERN_WARNING, "warning: refusing change of " + "xip flag with busy inodes while remounting"); sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; } @@ -1216,9 +1271,10 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP); if (ret) { - printk("EXT2-fs: %s: couldn't remount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(ret)); + ext2_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR because of " + "unsupported optional features (%x).", + le32_to_cpu(ret)); err = -EROFS; goto restore_opts; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 7913531ec6d5..904f00642f84 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -60,6 +60,7 @@ #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/rwsem.h> +#include <linux/security.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -249,8 +250,9 @@ cleanup: * used / required on success. */ static int -ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; struct ext2_xattr_entry *entry; char *end; @@ -300,9 +302,10 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", ext2_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) { error = -ERANGE; @@ -330,7 +333,7 @@ cleanup: ssize_t ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext2_xattr_list(dentry->d_inode, buffer, size); + return ext2_xattr_list(dentry, buffer, size); } /* diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 70c0dbdcdcb7..c8155845ac05 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -11,8 +11,8 @@ #include "xattr.h" static size_t -ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const int prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -26,22 +26,22 @@ ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } static int -ext2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index e8219f8eae9f..2a26d71f4771 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -13,8 +13,8 @@ #include "xattr.h" static size_t -ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -31,22 +31,22 @@ ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } static int -ext2_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 92495d28c62f..3f6caf3684b4 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -12,13 +12,13 @@ #include "xattr.h" static size_t -ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -30,27 +30,28 @@ ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size); + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, buffer, size); } static int -ext2_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, - value, size, flags); + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ext2_xattr_user_handler = { diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index c18fbf3e4068..322a56b2dfb1 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -69,8 +69,9 @@ void ext2_xip_verify_sb(struct super_block *sb) if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && !sb->s_bdev->bd_disk->fops->direct_access) { sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); - ext2_warning(sb, __func__, - "ignoring xip option - not supported by bdev"); + ext2_msg(sb, KERN_WARNING, + "warning: ignoring xip option - " + "not supported by bdev"); } } diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index c9b0df376b5f..82ba34158661 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -366,12 +366,12 @@ out: * Extended attribute handlers */ static size_t -ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -379,12 +379,12 @@ ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, } static size_t -ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -392,15 +392,18 @@ ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, } static int -ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - acl = ext3_get_acl(inode, type); + acl = ext3_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -412,31 +415,16 @@ ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) } static int -ext3_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext3_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext3_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; handle_t *handle; struct posix_acl *acl; int error, retries = 0; + if (strcmp(name, "") != 0) + return -EINVAL; if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -468,34 +456,18 @@ release_and_out: return error; } -static int -ext3_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext3_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ext3_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ext3_xattr_list_acl_access, - .get = ext3_xattr_get_acl_access, - .set = ext3_xattr_set_acl_access, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, }; struct xattr_handler ext3_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ext3_xattr_list_acl_default, - .get = ext3_xattr_get_acl_default, - .set = ext3_xattr_set_acl_default, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, }; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2db957778903..455e6e6e5cb9 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -970,7 +970,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; handle = ext3_journal_start(inode, DIO_CREDITS + - 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb)); + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -1151,6 +1151,16 @@ static int do_journal_get_write_access(handle_t *handle, return ext3_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext3_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1209,7 +1219,7 @@ write_begin_failed: unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1304,7 +1314,7 @@ static int ext3_ordered_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -1330,7 +1340,7 @@ static int ext3_writeback_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -1383,7 +1393,7 @@ static int ext3_journalled_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - ext3_truncate(inode); + ext3_truncate_failed_write(inode); return ret ? ret : copied; } @@ -3136,8 +3146,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -3229,7 +3239,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) #ifdef CONFIG_QUOTA /* We know that structure was already allocated during vfs_dq_init so * we will be updating only the data blocks + inodes */ - ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif return ret; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index aad6400c9b77..7b0e44f7d66f 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1699,7 +1699,7 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1733,7 +1733,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1769,7 +1769,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1920,7 +1920,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0, rc; - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_orphan_lock); if (!list_empty(&EXT3_I(inode)->i_orphan)) goto out_unlock; @@ -1929,9 +1929,13 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext3_orphan_add(). We block - * here (on lock_super()), so race with ext3_link() which might bump + * here (on s_orphan_lock), so race with ext3_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? */ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -1968,7 +1972,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); ext3_std_error(inode->i_sb, err); return err; } @@ -1986,11 +1990,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2040,7 +2042,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) out_err: ext3_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: @@ -2175,7 +2177,7 @@ static int ext3_symlink (struct inode * dir, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 8359e7b3dc89..54351ac7cef9 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -209,7 +209,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -266,7 +266,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(bh); + err = PTR_ERR(gdb); goto exit_bh; } ext3_journal_dirty_metadata(handle, gdb); @@ -324,7 +324,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -662,11 +662,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -825,7 +826,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); @@ -856,7 +857,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -900,12 +901,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. @@ -936,7 +937,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) ext3_journal_dirty_metadata(handle, sbi->s_sbh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -973,7 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); o_groups_count = EXT3_SB(sb)->s_groups_count; @@ -1045,11 +1046,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_resize_lock); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); err = -EBUSY; goto exit_put; @@ -1059,13 +1060,13 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, EXT3_SB(sb)->s_sbh))) { ext3_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 427496c4767c..afa2b569da10 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -135,12 +135,24 @@ void ext3_journal_abort_handle(const char *caller, const char *err_fn, if (is_handle_aborted(handle)) return; - printk(KERN_ERR "%s: aborting transaction: %s in %s\n", - caller, errstr, err_fn); + printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n", + caller, errstr, err_fn); journal_abort_handle(handle); } +void ext3_msg(struct super_block *sb, const char *prefix, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk("%sEXT3-fs (%s): ", prefix, sb->s_id); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -174,12 +186,13 @@ static void ext3_handle_error(struct super_block *sb) journal_abort(journal, -EIO); } if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT "Remounting filesystem read-only\n"); + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); sb->s_flags |= MS_RDONLY; } ext3_commit_super(sb, es, 1); if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs (device %s): panic forced after error\n", + panic("EXT3-fs (%s): panic forced after error\n", sb->s_id); } @@ -247,8 +260,7 @@ void __ext3_std_error (struct super_block * sb, const char * function, return; errstr = ext3_decode_error(sb, errno, nbuf); - printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", - sb->s_id, function, errstr); + ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr); ext3_handle_error(sb); } @@ -268,21 +280,20 @@ void ext3_abort (struct super_block * sb, const char * function, { va_list args; - printk (KERN_CRIT "ext3_abort called.\n"); - va_start(args, fmt); - printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function); + printk(KERN_CRIT "EXT3-fs (%s): error: %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); va_end(args); if (test_opt(sb, ERRORS_PANIC)) - panic("EXT3-fs panic from previous error\n"); + panic("EXT3-fs: panic from previous error\n"); if (sb->s_flags & MS_RDONLY) return; - printk(KERN_CRIT "Remounting filesystem read-only\n"); + ext3_msg(sb, KERN_CRIT, + "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; sb->s_flags |= MS_RDONLY; EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; @@ -296,7 +307,7 @@ void ext3_warning (struct super_block * sb, const char * function, va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ", + printk(KERN_WARNING "EXT3-fs (%s): warning: %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); @@ -310,10 +321,10 @@ void ext3_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) return; - ext3_warning(sb, __func__, - "updating to rev %d because of new feature flag, " - "running e2fsck is recommended", - EXT3_DYNAMIC_REV); + ext3_msg(sb, KERN_WARNING, + "warning: updating to rev %d because of " + "new feature flag, running e2fsck is recommended", + EXT3_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); @@ -331,7 +342,7 @@ void ext3_update_dynamic_rev(struct super_block *sb) /* * Open the external journal device */ -static struct block_device *ext3_blkdev_get(dev_t dev) +static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; char b[BDEVNAME_SIZE]; @@ -342,8 +353,9 @@ static struct block_device *ext3_blkdev_get(dev_t dev) return bdev; fail: - printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n", - __bdevname(dev, b), PTR_ERR(bdev)); + ext3_msg(sb, "error: failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; } @@ -378,13 +390,13 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) { struct list_head *l; - printk(KERN_ERR "sb orphan head is %d\n", + ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); - printk(KERN_ERR "sb_info orphan list:\n"); + ext3_msg(sb, KERN_ERR, "sb_info orphan list:"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); - printk(KERN_ERR " " + ext3_msg(sb, KERN_ERR, " " "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, @@ -527,9 +539,22 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl #if defined(CONFIG_QUOTA) struct ext3_sb_info *sbi = EXT3_SB(sb); - if (sbi->s_jquota_fmt) - seq_printf(seq, ",jqfmt=%s", - (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } if (sbi->s_qf_names[USRQUOTA]) seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); @@ -636,6 +661,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + ext3_show_quota_options(seq, sb); return 0; @@ -787,9 +815,9 @@ enum { Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, + Opt_usrquota, Opt_grpquota }; static const match_table_t tokens = { @@ -818,6 +846,7 @@ static const match_table_t tokens = { {Opt_reservation, "reservation"}, {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, @@ -836,6 +865,7 @@ static const match_table_t tokens = { {Opt_grpjquota, "grpjquota=%s"}, {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_grpquota, "grpquota"}, {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, @@ -845,7 +875,7 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; -static ext3_fsblk_t get_sb_block(void **data) +static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) { ext3_fsblk_t sb_block; char *options = (char *) *data; @@ -856,7 +886,7 @@ static ext3_fsblk_t get_sb_block(void **data) /*todo: use simple_strtoll with >32bit ext3 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - printk("EXT3-fs: Invalid sb specification: %s\n", + ext3_msg(sb, "error: invalid sb specification: %s", (char *) *data); return 1; } @@ -956,7 +986,8 @@ static int parse_options (char *options, struct super_block *sb, #else case Opt_user_xattr: case Opt_nouser_xattr: - printk("EXT3 (no)user_xattr options not supported\n"); + ext3_msg(sb, KERN_INFO, + "(no)user_xattr options not supported"); break; #endif #ifdef CONFIG_EXT3_FS_POSIX_ACL @@ -969,7 +1000,8 @@ static int parse_options (char *options, struct super_block *sb, #else case Opt_acl: case Opt_noacl: - printk("EXT3 (no)acl options not supported\n"); + ext3_msg(sb, KERN_INFO, + "(no)acl options not supported"); break; #endif case Opt_reservation: @@ -985,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb, user to specify an existing inode to be the journal file. */ if (is_remount) { - printk(KERN_ERR "EXT3-fs: cannot specify " - "journal on remount\n"); + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); return 0; } set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); break; case Opt_journal_inum: if (is_remount) { - printk(KERN_ERR "EXT3-fs: cannot specify " - "journal on remount\n"); + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); return 0; } if (match_int(&args[0], &option)) @@ -1003,8 +1035,8 @@ static int parse_options (char *options, struct super_block *sb, break; case Opt_journal_dev: if (is_remount) { - printk(KERN_ERR "EXT3-fs: cannot specify " - "journal on remount\n"); + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); return 0; } if (match_int(&args[0], &option)) @@ -1036,12 +1068,11 @@ static int parse_options (char *options, struct super_block *sb, if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) == data_opt) break; - printk(KERN_ERR - "EXT3-fs (device %s): Cannot change " + ext3_msg(sb, KERN_ERR, + "error: cannot change " "data mode on remount. The filesystem " "is mounted in data=%s mode and you " - "try to remount it in data=%s mode.\n", - sb->s_id, + "try to remount it in data=%s mode.", data_mode_string(sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS), data_mode_string(data_opt)); @@ -1066,31 +1097,31 @@ static int parse_options (char *options, struct super_block *sb, set_qf_name: if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { - printk(KERN_ERR - "EXT3-fs: Cannot change journaled " - "quota options when quota turned on.\n"); + ext3_msg(sb, KERN_ERR, + "error: cannot change journaled " + "quota options when quota turned on."); return 0; } qname = match_strdup(&args[0]); if (!qname) { - printk(KERN_ERR - "EXT3-fs: not enough memory for " - "storing quotafile name.\n"); + ext3_msg(sb, KERN_ERR, + "error: not enough memory for " + "storing quotafile name."); return 0; } if (sbi->s_qf_names[qtype] && strcmp(sbi->s_qf_names[qtype], qname)) { - printk(KERN_ERR - "EXT3-fs: %s quota file already " - "specified.\n", QTYPE2NAME(qtype)); + ext3_msg(sb, KERN_ERR, + "error: %s quota file already " + "specified.", QTYPE2NAME(qtype)); kfree(qname); return 0; } sbi->s_qf_names[qtype] = qname; if (strchr(sbi->s_qf_names[qtype], '/')) { - printk(KERN_ERR - "EXT3-fs: quotafile must be on " - "filesystem root.\n"); + ext3_msg(sb, KERN_ERR, + "error: quotafile must be on " + "filesystem root."); kfree(sbi->s_qf_names[qtype]); sbi->s_qf_names[qtype] = NULL; return 0; @@ -1105,9 +1136,9 @@ set_qf_name: clear_qf_name: if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { - printk(KERN_ERR "EXT3-fs: Cannot change " + ext3_msg(sb, KERN_ERR, "error: cannot change " "journaled quota options when " - "quota turned on.\n"); + "quota turned on."); return 0; } /* @@ -1121,12 +1152,15 @@ clear_qf_name: goto set_qf_format; case Opt_jqfmt_vfsv0: qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; set_qf_format: if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { - printk(KERN_ERR "EXT3-fs: Cannot change " + ext3_msg(sb, KERN_ERR, "error: cannot change " "journaled quota options when " - "quota turned on.\n"); + "quota turned on."); return 0; } sbi->s_jquota_fmt = qfmt; @@ -1142,8 +1176,8 @@ set_qf_format: break; case Opt_noquota: if (sb_any_quota_loaded(sb)) { - printk(KERN_ERR "EXT3-fs: Cannot change quota " - "options when quota turned on.\n"); + ext3_msg(sb, KERN_ERR, "error: cannot change " + "quota options when quota turned on."); return 0; } clear_opt(sbi->s_mount_opt, QUOTA); @@ -1154,8 +1188,8 @@ set_qf_format: case Opt_quota: case Opt_usrquota: case Opt_grpquota: - printk(KERN_ERR - "EXT3-fs: quota options not supported.\n"); + ext3_msg(sb, KERN_ERR, + "error: quota options not supported."); break; case Opt_usrjquota: case Opt_grpjquota: @@ -1163,9 +1197,10 @@ set_qf_format: case Opt_offgrpjquota: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: - printk(KERN_ERR - "EXT3-fs: journaled quota options not " - "supported.\n"); + case Opt_jqfmt_vfsv1: + ext3_msg(sb, KERN_ERR, + "error: journaled quota options not " + "supported."); break; case Opt_noquota: break; @@ -1185,8 +1220,9 @@ set_qf_format: break; case Opt_resize: if (!is_remount) { - printk("EXT3-fs: resize option only available " - "for remount\n"); + ext3_msg(sb, KERN_ERR, + "error: resize option only available " + "for remount"); return 0; } if (match_int(&args[0], &option) != 0) @@ -1200,9 +1236,9 @@ set_qf_format: clear_opt(sbi->s_mount_opt, NOBH); break; default: - printk (KERN_ERR - "EXT3-fs: Unrecognized mount option \"%s\" " - "or missing value\n", p); + ext3_msg(sb, KERN_ERR, + "error: unrecognized mount option \"%s\" " + "or missing value", p); return 0; } } @@ -1220,21 +1256,21 @@ set_qf_format: (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) || (sbi->s_qf_names[GRPQUOTA] && (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) { - printk(KERN_ERR "EXT3-fs: old and new quota " - "format mixing.\n"); + ext3_msg(sb, KERN_ERR, "error: old and new quota " + "format mixing."); return 0; } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journaled quota format " - "not specified.\n"); + ext3_msg(sb, KERN_ERR, "error: journaled quota format " + "not specified."); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journaled quota format " + ext3_msg(sb, KERN_ERR, "error: journaled quota format " "specified with no journaling " - "enabled.\n"); + "enabled."); return 0; } } @@ -1249,31 +1285,33 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, int res = 0; if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { - printk (KERN_ERR "EXT3-fs warning: revision level too high, " - "forcing read-only mode\n"); + ext3_msg(sb, KERN_ERR, + "error: revision level too high, " + "forcing read-only mode"); res = MS_RDONLY; } if (read_only) return res; if (!(sbi->s_mount_state & EXT3_VALID_FS)) - printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: mounting unchecked fs, " + "running e2fsck is recommended"); else if ((sbi->s_mount_state & EXT3_ERROR_FS)) - printk (KERN_WARNING - "EXT3-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk (KERN_WARNING - "EXT3-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk (KERN_WARNING - "EXT3-fs warning: checktime reached, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); #if 0 /* @@@ We _will_ want to clear the valid bit if we find inconsistencies, to force a fsck at reboot. But for @@ -1290,22 +1328,20 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, ext3_commit_super(sb, es, 1); if (test_opt(sb, DEBUG)) - printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, " - "bpg=%lu, ipg=%lu, mo=%04lx]\n", + ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]", sb->s_blocksize, sbi->s_groups_count, EXT3_BLOCKS_PER_GROUP(sb), EXT3_INODES_PER_GROUP(sb), sbi->s_mount_opt); - printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id); if (EXT3_SB(sb)->s_journal->j_inode == NULL) { char b[BDEVNAME_SIZE]; - - printk("external journal on %s\n", + ext3_msg(sb, KERN_INFO, "using external journal on %s", bdevname(EXT3_SB(sb)->s_journal->j_dev, b)); } else { - printk("internal journal\n"); + ext3_msg(sb, KERN_INFO, "using internal journal"); } return res; } @@ -1399,8 +1435,8 @@ static void ext3_orphan_cleanup (struct super_block * sb, } if (bdev_read_only(sb->s_bdev)) { - printk(KERN_ERR "EXT3-fs: write access " - "unavailable, skipping orphan cleanup.\n"); + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, skipping orphan cleanup."); return; } @@ -1414,8 +1450,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, } if (s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", - sb->s_id); + ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~MS_RDONLY; } #ifdef CONFIG_QUOTA @@ -1426,9 +1461,9 @@ static void ext3_orphan_cleanup (struct super_block * sb, if (EXT3_SB(sb)->s_qf_names[i]) { int ret = ext3_quota_on_mount(sb, i); if (ret < 0) - printk(KERN_ERR - "EXT3-fs: Cannot turn on journaled " - "quota: error %d\n", ret); + ext3_msg(sb, KERN_ERR, + "error: cannot turn on journaled " + "quota: %d", ret); } } #endif @@ -1466,11 +1501,11 @@ static void ext3_orphan_cleanup (struct super_block * sb, #define PLURAL(x) (x), ((x)==1) ? "" : "s" if (nr_orphans) - printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", - sb->s_id, PLURAL(nr_orphans)); + ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); if (nr_truncates) - printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", - sb->s_id, PLURAL(nr_truncates)); + ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { @@ -1554,7 +1589,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) struct ext3_super_block *es = NULL; struct ext3_sb_info *sbi; ext3_fsblk_t block; - ext3_fsblk_t sb_block = get_sb_block(&data); + ext3_fsblk_t sb_block = get_sb_block(&data, sb); ext3_fsblk_t logic_sb_block; unsigned long offset = 0; unsigned int journal_inum = 0; @@ -1590,7 +1625,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); if (!blocksize) { - printk(KERN_ERR "EXT3-fs: unable to set blocksize\n"); + ext3_msg(sb, KERN_ERR, "error: unable to set blocksize"); goto out_fail; } @@ -1606,7 +1641,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logic_sb_block))) { - printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); + ext3_msg(sb, KERN_ERR, "error: unable to read superblock"); goto out_fail; } /* @@ -1665,9 +1700,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) - printk(KERN_WARNING - "EXT3-fs warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended\n"); + ext3_msg(sb, KERN_WARNING, + "warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended"); /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, @@ -1675,25 +1710,25 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) */ features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP); if (features) { - printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "optional features (%x)", le32_to_cpu(features)); goto failed_mount; } features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP); if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + ext3_msg(sb, KERN_ERR, + "error: couldn't mount RDWR because of unsupported " + "optional features (%x)", le32_to_cpu(features)); goto failed_mount; } blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT3_MIN_BLOCK_SIZE || blocksize > EXT3_MAX_BLOCK_SIZE) { - printk(KERN_ERR - "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", - blocksize, sb->s_id); + ext3_msg(sb, KERN_ERR, + "error: couldn't mount because of unsupported " + "filesystem blocksize %d", blocksize); goto failed_mount; } @@ -1704,30 +1739,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) * than the hardware sectorsize for the machine. */ if (blocksize < hblock) { - printk(KERN_ERR "EXT3-fs: blocksize %d too small for " - "device blocksize %d.\n", blocksize, hblock); + ext3_msg(sb, KERN_ERR, + "error: fsblocksize %d too small for " + "hardware sectorsize %d", blocksize, hblock); goto failed_mount; } brelse (bh); if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT3-fs: bad blocksize %d.\n", - blocksize); + ext3_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); goto out_fail; } logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; bh = sb_bread(sb, logic_sb_block); if (!bh) { - printk(KERN_ERR - "EXT3-fs: Can't read superblock on 2nd try.\n"); + ext3_msg(sb, KERN_ERR, + "error: can't read superblock on 2nd try"); goto failed_mount; } es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { - printk (KERN_ERR - "EXT3-fs: Magic mismatch, very weird !\n"); + ext3_msg(sb, KERN_ERR, + "error: magic mismatch"); goto failed_mount; } } @@ -1743,8 +1779,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { - printk (KERN_ERR - "EXT3-fs: unsupported inode size: %d\n", + ext3_msg(sb, KERN_ERR, + "error: unsupported inode size: %d", sbi->s_inode_size); goto failed_mount; } @@ -1752,8 +1788,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << le32_to_cpu(es->s_log_frag_size); if (blocksize != sbi->s_frag_size) { - printk(KERN_ERR - "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", + ext3_msg(sb, KERN_ERR, + "error: fragsize %lu != blocksize %u (unsupported)", sbi->s_frag_size, blocksize); goto failed_mount; } @@ -1789,31 +1825,31 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT3-fs: #blocks per group too big: %lu\n", + ext3_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", sbi->s_blocks_per_group); goto failed_mount; } if (sbi->s_frags_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT3-fs: #fragments per group too big: %lu\n", + ext3_msg(sb, KERN_ERR, + "error: #fragments per group too big: %lu", sbi->s_frags_per_group); goto failed_mount; } if (sbi->s_inodes_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT3-fs: #inodes per group too big: %lu\n", + ext3_msg(sb, KERN_ERR, + "error: #inodes per group too big: %lu", sbi->s_inodes_per_group); goto failed_mount; } if (le32_to_cpu(es->s_blocks_count) > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT3-fs: filesystem on %s:" - " too large to mount safely\n", sb->s_id); + ext3_msg(sb, KERN_ERR, + "error: filesystem is too large to mount safely"); if (sizeof(sector_t) < 8) - printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not " - "enabled\n"); + ext3_msg(sb, KERN_ERR, + "error: CONFIG_LBDAF not enabled"); goto failed_mount; } @@ -1827,7 +1863,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk (KERN_ERR "EXT3-fs: not enough memory\n"); + ext3_msg(sb, KERN_ERR, + "error: not enough memory"); goto failed_mount; } @@ -1837,14 +1874,15 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) block = descriptor_loc(sb, logic_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); if (!sbi->s_group_desc[i]) { - printk (KERN_ERR "EXT3-fs: " - "can't read group descriptor %d\n", i); + ext3_msg(sb, KERN_ERR, + "error: can't read group descriptor %d", i); db_count = i; goto failed_mount2; } } if (!ext3_check_descriptors (sb)) { - printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n"); + ext3_msg(sb, KERN_ERR, + "error: group descriptors corrupted"); goto failed_mount2; } sbi->s_gdb_count = db_count; @@ -1862,7 +1900,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) ext3_count_dirs(sb)); } if (err) { - printk(KERN_ERR "EXT3-fs: insufficient memory\n"); + ext3_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } @@ -1890,6 +1928,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->dq_op = &ext3_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; @@ -1910,9 +1950,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount3; } else { if (!silent) - printk (KERN_ERR - "ext3: No journal on filesystem on %s\n", - sb->s_id); + ext3_msg(sb, KERN_ERR, + "error: no journal found. " + "mounting ext3 over ext2?"); goto failed_mount3; } @@ -1934,8 +1974,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) case EXT3_MOUNT_WRITEBACK_DATA: if (!journal_check_available_features (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { - printk(KERN_ERR "EXT3-fs: Journal does not support " - "requested data journaling mode\n"); + ext3_msg(sb, KERN_ERR, + "error: journal does not support " + "requested data journaling mode"); goto failed_mount4; } default: @@ -1944,8 +1985,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { - printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - " - "its supported only with writeback mode\n"); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring nobh option - " + "it is supported only with writeback mode"); clear_opt(sbi->s_mount_opt, NOBH); } } @@ -1956,39 +1998,32 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) root = ext3_iget(sb, EXT3_ROOT_INO); if (IS_ERR(root)) { - printk(KERN_ERR "EXT3-fs: get root inode failed\n"); + ext3_msg(sb, KERN_ERR, "error: get root inode failed"); ret = PTR_ERR(root); goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); - printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n"); + ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { - printk(KERN_ERR "EXT3-fs: get root dentry failed\n"); + ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); iput(root); ret = -ENOMEM; goto failed_mount4; } ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. - */ + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; if (needs_recovery) - printk (KERN_INFO "EXT3-fs: recovery complete.\n"); + ext3_msg(sb, KERN_INFO, "recovery complete"); ext3_mark_recovery_complete(sb, es); - printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", + ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); @@ -1998,7 +2033,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) cantfind_ext3: if (!silent) - printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n", + ext3_msg(sb, KERN_INFO, + "error: can't find ext3 filesystem on dev %s.", sb->s_id); goto failed_mount; @@ -2066,27 +2102,27 @@ static journal_t *ext3_get_journal(struct super_block *sb, journal_inode = ext3_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { - printk(KERN_ERR "EXT3-fs: no journal found.\n"); + ext3_msg(sb, KERN_ERR, "error: no journal found"); return NULL; } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); - printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); + ext3_msg(sb, KERN_ERR, "error: journal inode is deleted"); return NULL; } jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { - printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); + ext3_msg(sb, KERN_ERR, "error: invalid journal inode"); iput(journal_inode); return NULL; } journal = journal_init_inode(journal_inode); if (!journal) { - printk(KERN_ERR "EXT3-fs: Could not load journal inode\n"); + ext3_msg(sb, KERN_ERR, "error: could not load journal inode"); iput(journal_inode); return NULL; } @@ -2108,13 +2144,13 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, struct ext3_super_block * es; struct block_device *bdev; - bdev = ext3_blkdev_get(j_dev); + bdev = ext3_blkdev_get(j_dev, sb); if (bdev == NULL) return NULL; if (bd_claim(bdev, sb)) { - printk(KERN_ERR - "EXT3: failed to claim external journal device.\n"); + ext3_msg(sb, KERN_ERR, + "error: failed to claim external journal device"); blkdev_put(bdev, FMODE_READ|FMODE_WRITE); return NULL; } @@ -2122,8 +2158,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { - printk(KERN_ERR - "EXT3-fs: blocksize too small for journal device.\n"); + ext3_msg(sb, KERN_ERR, + "error: blocksize too small for journal device"); goto out_bdev; } @@ -2131,8 +2167,8 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, offset = EXT3_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); if (!(bh = __bread(bdev, sb_block, blocksize))) { - printk(KERN_ERR "EXT3-fs: couldn't read superblock of " - "external journal\n"); + ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of " + "external journal"); goto out_bdev; } @@ -2140,14 +2176,14 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { - printk(KERN_ERR "EXT3-fs: external journal has " - "bad superblock\n"); + ext3_msg(sb, KERN_ERR, "error: external journal has " + "bad superblock"); brelse(bh); goto out_bdev; } if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { - printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); + ext3_msg(sb, KERN_ERR, "error: journal UUID does not match"); brelse(bh); goto out_bdev; } @@ -2159,19 +2195,21 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, journal = journal_init_dev(bdev, sb->s_bdev, start, len, blocksize); if (!journal) { - printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); + ext3_msg(sb, KERN_ERR, + "error: failed to create device journal"); goto out_bdev; } journal->j_private = sb; ll_rw_block(READ, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { - printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); goto out_journal; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - printk(KERN_ERR "EXT3-fs: External journal has more than one " - "user (unsupported) - %d\n", + ext3_msg(sb, KERN_ERR, + "error: external journal has more than one " + "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } @@ -2197,8 +2235,8 @@ static int ext3_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { - printk(KERN_INFO "EXT3-fs: external journal device major/minor " - "numbers have changed\n"); + ext3_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); @@ -2213,21 +2251,21 @@ static int ext3_load_journal(struct super_block *sb, if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { if (sb->s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT3-fs: INFO: recovery " - "required on readonly filesystem.\n"); + ext3_msg(sb, KERN_INFO, + "recovery required on readonly filesystem"); if (really_read_only) { - printk(KERN_ERR "EXT3-fs: write access " - "unavailable, cannot proceed.\n"); + ext3_msg(sb, KERN_ERR, "error: write access " + "unavailable, cannot proceed"); return -EROFS; } - printk (KERN_INFO "EXT3-fs: write access will " - "be enabled during recovery.\n"); + ext3_msg(sb, KERN_INFO, + "write access will be enabled during recovery"); } } if (journal_inum && journal_dev) { - printk(KERN_ERR "EXT3-fs: filesystem has both journal " - "and inode journals!\n"); + ext3_msg(sb, KERN_ERR, "error: filesystem has both journal " + "and inode journals"); return -EINVAL; } @@ -2242,7 +2280,7 @@ static int ext3_load_journal(struct super_block *sb, if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { err = journal_update_format(journal); if (err) { - printk(KERN_ERR "EXT3-fs: error updating journal.\n"); + ext3_msg(sb, KERN_ERR, "error updating journal"); journal_destroy(journal); return err; } @@ -2254,7 +2292,7 @@ static int ext3_load_journal(struct super_block *sb, err = journal_load(journal); if (err) { - printk(KERN_ERR "EXT3-fs: error loading journal.\n"); + ext3_msg(sb, KERN_ERR, "error loading journal"); journal_destroy(journal); return err; } @@ -2273,16 +2311,17 @@ static int ext3_load_journal(struct super_block *sb, return 0; } -static int ext3_create_journal(struct super_block * sb, - struct ext3_super_block * es, +static int ext3_create_journal(struct super_block *sb, + struct ext3_super_block *es, unsigned int journal_inum) { journal_t *journal; int err; if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " - "create journal.\n"); + ext3_msg(sb, KERN_ERR, + "error: readonly filesystem when trying to " + "create journal"); return -EROFS; } @@ -2290,12 +2329,12 @@ static int ext3_create_journal(struct super_block * sb, if (!journal) return -EINVAL; - printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n", + ext3_msg(sb, KERN_INFO, "creating new journal on inode %u", journal_inum); err = journal_create(journal); if (err) { - printk(KERN_ERR "EXT3-fs: error creating journal.\n"); + ext3_msg(sb, KERN_ERR, "error creating journal"); journal_destroy(journal); return -EIO; } @@ -2359,13 +2398,11 @@ static void ext3_mark_recovery_complete(struct super_block * sb, if (journal_flush(journal) < 0) goto out; - lock_super(sb); if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ext3_commit_super(sb, es, 1); } - unlock_super(sb); out: journal_unlock_updates(journal); @@ -2376,8 +2413,8 @@ out: * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ -static void ext3_clear_journal_err(struct super_block * sb, - struct ext3_super_block * es) +static void ext3_clear_journal_err(struct super_block *sb, + struct ext3_super_block *es) { journal_t *journal; int j_errno; @@ -2557,21 +2594,15 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) (sbi->s_mount_state & EXT3_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - unlock_super(sb); ext3_mark_recovery_complete(sb, es); - lock_super(sb); } else { __le32 ret; if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))) { - printk(KERN_WARNING "EXT3-fs: %s: couldn't " - "remount RDWR because of unsupported " - "optional features (%x).\n", - sb->s_id, le32_to_cpu(ret)); + ext3_msg(sb, KERN_WARNING, + "warning: couldn't remount RDWR " + "because of unsupported optional " + "features (%x)", le32_to_cpu(ret)); err = -EROFS; goto restore_opts; } @@ -2582,11 +2613,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) * require a full umount/remount for now. */ if (es->s_last_orphan) { - printk(KERN_WARNING "EXT3-fs: %s: couldn't " + ext3_msg(sb, KERN_WARNING, "warning: couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead.\n", - sb->s_id); + "umount/remount instead."); err = -EINVAL; goto restore_opts; } @@ -2686,13 +2716,11 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_bsize = sb->s_blocksize; buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); - es->s_free_blocks_count = cpu_to_le32(buf->f_bfree); buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT3_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); @@ -2837,9 +2865,9 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, if (EXT3_SB(sb)->s_qf_names[type]) { /* Quotafile not of fs root? */ if (path.dentry->d_parent != sb->s_root) - printk(KERN_WARNING - "EXT3-fs: Quota file not on filesystem root. " - "Journaled quota will not work.\n"); + ext3_msg(sb, KERN_WARNING, + "warning: Quota file not on filesystem root. " + "Journaled quota will not work."); } /* @@ -2921,8 +2949,9 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, handle_t *handle = journal_current_handle(); if (!handle) { - printk(KERN_WARNING "EXT3-fs: Quota write (off=%Lu, len=%Lu)" - " cancelled because transaction is not started.\n", + ext3_msg(sb, KERN_WARNING, + "warning: quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started.", (unsigned long long)off, (unsigned long long)len); return -EIO; } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 545e37c4b91e..66895ccf76c7 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -99,7 +99,7 @@ static struct buffer_head *ext3_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext3_xattr_rehash(struct ext3_xattr_header *, struct ext3_xattr_entry *); -static int ext3_xattr_list(struct inode *inode, char *buffer, +static int ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); static struct mb_cache *ext3_xattr_cache; @@ -147,7 +147,7 @@ ext3_xattr_handler(int name_index) ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext3_xattr_list(dentry->d_inode, buffer, size); + return ext3_xattr_list(dentry, buffer, size); } static int @@ -332,7 +332,7 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name, } static int -ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, +ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; @@ -342,9 +342,10 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, ext3_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) return -ERANGE; @@ -357,8 +358,9 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, } static int -ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; @@ -383,7 +385,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) goto cleanup; } ext3_xattr_cache_insert(bh); - error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); @@ -392,8 +394,9 @@ cleanup: } static int -ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct ext3_xattr_ibody_header *header; struct ext3_inode *raw_inode; struct ext3_iloc iloc; @@ -411,7 +414,7 @@ ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) error = ext3_xattr_check_names(IFIRST(header), end); if (error) goto cleanup; - error = ext3_xattr_list_entries(inode, IFIRST(header), + error = ext3_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: @@ -430,12 +433,12 @@ cleanup: * used / required on success. */ static int -ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT3_I(inode)->xattr_sem); - i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size); + down_read(&EXT3_I(dentry->d_inode)->xattr_sem); + i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; } else { @@ -443,11 +446,11 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) buffer += i_error; buffer_size -= i_error; } - b_error = ext3_xattr_block_list(inode, buffer, buffer_size); + b_error = ext3_xattr_block_list(dentry, buffer, buffer_size); if (b_error < 0) i_error = 0; } - up_read(&EXT3_I(inode)->xattr_sem); + up_read(&EXT3_I(dentry->d_inode)->xattr_sem); return i_error + b_error; } @@ -960,6 +963,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; + error = ext3_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); @@ -985,9 +992,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } - error = ext3_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; if (!value) { if (!is.s.not_found) error = ext3_xattr_ibody_set(handle, inode, &i, &is); diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 37b81097bdf2..474348788dd9 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -12,8 +12,8 @@ #include "xattr.h" static size_t -ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -28,23 +28,23 @@ ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name, - buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, buffer, size); } static int -ext3_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name, - value, size, flags); + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, value, size, flags); } int diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index c7c41a410c4b..e5562845ed96 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -14,8 +14,8 @@ #include "xattr.h" static size_t -ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -32,22 +32,22 @@ ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, + name, buffer, size); } static int -ext3_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name, + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 430fe63b31b3..3bcfe9ee0a68 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -13,13 +13,13 @@ #include "xattr.h" static size_t -ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -31,26 +31,27 @@ ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, buffer, size); } static int -ext3_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name, - value, size, flags); + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ext3_xattr_user_handler = { diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 9f2d45d75b1a..9ed1bb1f319f 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -26,6 +26,17 @@ config EXT4_FS If unsure, say N. +config EXT4_USE_FOR_EXT23 + bool "Use ext4 for ext2/ext3 file systems" + depends on EXT4_FS + depends on EXT3_FS=n || EXT2_FS=n + default y + help + Allow the ext4 file system driver code to be used for ext2 or + ext3 file system mounts. This allows users to reduce their + compiled kernel size by using one file system driver for + ext2, ext3, and ext4 file systems. + config EXT4_FS_XATTR bool "Ext4 extended attributes" depends on EXT4_FS diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 0df88b2a69b0..8a2a29d35a6f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -364,12 +364,12 @@ out: * Extended attribute handlers */ static size_t -ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -377,12 +377,12 @@ ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, } static size_t -ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -390,15 +390,18 @@ ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, } static int -ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - acl = ext4_get_acl(inode, type); + acl = ext4_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -410,31 +413,16 @@ ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) } static int -ext4_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext4_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext4_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; handle_t *handle; struct posix_acl *acl; int error, retries = 0; + if (strcmp(name, "") != 0) + return -EINVAL; if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -466,34 +454,18 @@ release_and_out: return error; } -static int -ext4_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext4_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ext4_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl_access, - .set = ext4_xattr_set_acl_access, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, }; struct xattr_handler ext4_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl_default, - .set = ext4_xattr_set_acl_default, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, }; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1d0418980f8d..22bc7435d913 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -499,44 +499,6 @@ error_return: } /** - * ext4_free_blocks() -- Free given blocks and update quota - * @handle: handle for this transaction - * @inode: inode - * @block: start physical block to free - * @count: number of blocks to count - * @metadata: Are these metadata blocks - */ -void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata) -{ - struct super_block *sb; - unsigned long dquot_freed_blocks; - - /* this isn't the right place to decide whether block is metadata - * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - metadata = 1; - - /* We need to make sure we don't reuse - * block released untill the transaction commit. - * writeback mode have weak data consistency so - * don't force data as metadata when freeing block - * for writeback mode. - */ - if (metadata == 0 && !ext4_should_writeback_data(inode)) - metadata = 1; - - sb = inode->i_sb; - - ext4_mb_free_blocks(handle, inode, block, count, - metadata, &dquot_freed_blocks); - if (dquot_freed_blocks) - vfs_dq_free_block(inode, dquot_freed_blocks); - return; -} - -/** * ext4_has_free_blocks() * @sbi: in-core super block structure. * @nblocks: number of needed blocks @@ -761,7 +723,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; } /** diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 50784ef07563..a60ab9aad57d 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -16,7 +16,6 @@ #include <linux/module.h> #include <linux/swap.h> #include <linux/pagemap.h> -#include <linux/version.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "ext4.h" @@ -160,7 +159,7 @@ int ext4_setup_system_zone(struct super_block *sb) if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) add_system_zone(sbi, ext4_group_first_block_no(sb, i), - sbi->s_gdb_count + 1); + ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); if (ret) @@ -228,6 +227,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, struct rb_node *n = sbi->system_blks.rb_node; if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || (start_blk + count > ext4_blocks_count(sbi->s_es))) return 0; while (n) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8825515eeddd..874d169a193e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -361,14 +361,11 @@ struct ext4_new_group_data { so set the magic i_delalloc_reserve_flag after taking the inode allocation semaphore for */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 - /* Call ext4_da_update_reserve_space() after successfully - allocating the blocks */ -#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 /* caller is from the direct IO path, request to creation of an unitialized extents if not allocated, split the uninitialized extent if blocks has been preallocated already*/ -#define EXT4_GET_BLOCKS_DIO 0x0010 -#define EXT4_GET_BLOCKS_CONVERT 0x0020 +#define EXT4_GET_BLOCKS_DIO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) /* Convert extent to initialized after direct IO complete */ @@ -376,6 +373,12 @@ struct ext4_new_group_data { EXT4_GET_BLOCKS_DIO_CREATE_EXT) /* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 + +/* * ioctl commands */ #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS @@ -693,16 +696,29 @@ struct ext4_inode_info { unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; unsigned short i_delalloc_reserved_flag; + sector_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; /* on-disk additional length */ __u16 i_extra_isize; spinlock_t i_block_reservation_lock; +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif /* completed async DIOs that might need unwritten extents handling */ struct list_head i_aio_dio_complete_list; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* @@ -750,6 +766,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt #define set_opt(o, opt) o |= EXT4_MOUNT_##opt @@ -1324,8 +1341,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); -extern void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, int metadata); extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); @@ -1384,16 +1399,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *); extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); -extern void ext4_mb_free_blocks(handle_t *, struct inode *, - ext4_fsblk_t, unsigned long, int, unsigned long *); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); extern void ext4_mb_put_buddy_cache_lock(struct super_block *, ext4_group_t, int); /* inode.c */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, @@ -1424,8 +1438,10 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -extern qsize_t ext4_get_reserved_space(struct inode *inode); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int flush_aio_dio_completed_IO(struct inode *inode); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 2ca686454e87..bdb6ce7e2eb4 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } -extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + sector_t lblocks); extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 6a9409920dee..b57e5c711b6d 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -4,6 +4,8 @@ #include "ext4_jbd2.h" +#include <trace/events/ext4.h> + int __ext4_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { @@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, return err; } -int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh) +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + * + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. + */ +int __ext4_forget(const char *where, handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + ext4_fsblk_t blocknr) { - int err = 0; + int err; - if (ext4_handle_valid(handle)) { - err = jbd2_journal_forget(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); - } - else + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* In the no journal case, we can just do a bforget and return */ + if (!ext4_handle_valid(handle)) { bforget(bh); - return err; -} + return 0; + } -int __ext4_journal_revoke(const char *where, handle_t *handle, - ext4_fsblk_t blocknr, struct buffer_head *bh) -{ - int err = 0; + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ - if (ext4_handle_valid(handle)) { - err = jbd2_journal_revoke(handle, blocknr, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + return err; + } + return 0; } - else - bforget(bh); + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) { + ext4_journal_abort_handle(where, __func__, bh, handle, err); + ext4_abort(inode->i_sb, __func__, + "error %d when attempting revoke", err); + } + BUFFER_TRACE(bh, "exit"); return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a2865980342f..05eca817d704 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -49,7 +49,7 @@ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT4_XATTR_TRANS_BLOCKS - 2 + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* * Define the number of metadata blocks we need to account to modify data. @@ -57,7 +57,7 @@ * This include super block, inode block, quota blocks and xattr blocks */ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be @@ -92,6 +92,7 @@ * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) + #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) #else @@ -99,6 +100,9 @@ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) int ext4_mark_iloc_dirty(handle_t *handle, @@ -116,12 +120,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); /* - * Wrapper functions with which ext4 calls into JBD. The intent here is - * to allow these to be turned into appropriate stubs so ext4 can control - * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't - * been done yet. + * Wrapper functions with which ext4 calls into JBD. */ - void ext4_journal_abort_handle(const char *caller, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); @@ -131,13 +131,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle, int __ext4_journal_get_write_access(const char *where, handle_t *handle, struct buffer_head *bh); -/* When called with an invalid handle, this will still do a put on the BH */ -int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh); - -/* When called with an invalid handle, this will still do a put on the BH */ -int __ext4_journal_revoke(const char *where, handle_t *handle, - ext4_fsblk_t blocknr, struct buffer_head *bh); +int __ext4_forget(const char *where, handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh); @@ -149,12 +145,11 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, __ext4_journal_get_undo_access(__func__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, (handle), (bh)) -#define ext4_journal_revoke(handle, blocknr, bh) \ - __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ + (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, (handle), (bh)) -#define ext4_journal_forget(handle, bh) \ - __ext4_journal_forget(__func__, (handle), (bh)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) @@ -254,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) return 0; } +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + /* super.c */ int ext4_force_commit(struct super_block *sb); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 715264b4bae4..765a4826b118 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -296,29 +296,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) * to allocate @blocks * Worse case is one block per extent */ -int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) { - int lcap, icap, rcap, leafs, idxs, num; - int newextents = blocks; - - rcap = ext4_ext_space_root_idx(inode, 0); - lcap = ext4_ext_space_block(inode, 0); - icap = ext4_ext_space_block_idx(inode, 0); + struct ext4_inode_info *ei = EXT4_I(inode); + int idxs, num = 0; - /* number of new leaf blocks needed */ - num = leafs = (newextents + lcap - 1) / lcap; + idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx)); /* - * Worse case, we need separate index block(s) - * to link all new leaf blocks + * If the new delayed allocation block is contiguous with the + * previous da block, it can share index blocks with the + * previous block, so we only need to allocate a new index + * block every idxs leaf blocks. At ldxs**2 blocks, we need + * an additional index block, and at ldxs**3 blocks, yet + * another index blocks. */ - idxs = (leafs + icap - 1) / icap; - do { - num += idxs; - idxs = (idxs + icap - 1) / icap; - } while (idxs > rcap); + if (ei->i_da_metadata_calc_len && + ei->i_da_metadata_calc_last_lblock+1 == lblock) { + if ((ei->i_da_metadata_calc_len % idxs) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { + num++; + ei->i_da_metadata_calc_len = 0; + } else + ei->i_da_metadata_calc_len++; + ei->i_da_metadata_calc_last_lblock++; + return num; + } - return num; + /* + * In the worst case we need a new set of index blocks at + * every level of the inode's extent tree. + */ + ei->i_da_metadata_calc_len = 1; + ei->i_da_metadata_calc_last_lblock = lblock; + return ext_depth(inode) + 1; } static int @@ -1007,7 +1022,8 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext4_free_blocks(handle, inode, ablocks[i], 1, 1); + ext4_free_blocks(handle, inode, 0, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); @@ -1761,7 +1777,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, while (block < last && block != EXT_MAX_BLOCK) { num = last - block; /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -1957,7 +1975,6 @@ errout: static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { - struct buffer_head *bh; int err; ext4_fsblk_t leaf; @@ -1973,9 +1990,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); - bh = sb_find_get_block(inode->i_sb, leaf); - ext4_forget(handle, 1, inode, bh, leaf); - ext4_free_blocks(handle, inode, leaf, 1, 1); + ext4_free_blocks(handle, inode, 0, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; } @@ -2042,12 +2058,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, ext4_lblk_t from, ext4_lblk_t to) { - struct buffer_head *bh; unsigned short ee_len = ext4_ext_get_actual_len(ex); - int i, metadata = 0; + int flags = EXT4_FREE_BLOCKS_FORGET; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - metadata = 1; + flags |= EXT4_FREE_BLOCKS_METADATA; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2072,11 +2087,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, num = le32_to_cpu(ex->ee_block) + ee_len - from; start = ext_pblock(ex) + ee_len - num; ext_debug("free last %u blocks starting %llu\n", num, start); - for (i = 0; i < num; i++) { - bh = sb_find_get_block(inode->i_sb, start + i); - ext4_forget(handle, 0, inode, bh, start + i); - } - ext4_free_blocks(handle, inode, start, num, metadata); + ext4_free_blocks(handle, inode, 0, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", @@ -2167,7 +2178,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, correct_index = 1; credits += (ext_depth(inode)) + 1; } - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) @@ -3027,6 +3038,14 @@ out: return err; } +static void unmap_underlying_metadata_blocks(struct block_device *bdev, + sector_t block, int count) +{ + int i; + for (i = 0; i < count; i++) + unmap_underlying_metadata(bdev, block + i); +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, @@ -3064,6 +3083,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ret = ext4_convert_unwritten_extents_dio(handle, inode, path); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); goto out2; } /* buffered IO case */ @@ -3091,6 +3112,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_ext_convert_to_initialized(handle, inode, path, iblock, max_blocks); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out: if (ret <= 0) { err = ret; @@ -3098,6 +3121,30 @@ out: } else allocated = ret; set_buffer_new(bh_result); + /* + * if we allocated more blocks than requested + * we need to make sure we unmap the extra block + * allocated. The actual needed block will get + * unmapped later when we find the buffer_head marked + * new. + */ + if (allocated > max_blocks) { + unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, + newblock + max_blocks, + allocated - max_blocks); + allocated = max_blocks; + } + + /* + * If we have done fallocate with the offset that is already + * delayed allocated, we would have block reservation + * and quota reservation done in the delayed write path. + * But fallocate would have already updated quota and block + * count for this offset. So cancel these reservation + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 0); + map_out: set_buffer_mapped(bh_result); out1: @@ -3190,7 +3237,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_ext_find_extent() */ - BUG_ON(path[depth].p_ext == NULL && depth != 0); + if (path[depth].p_ext == NULL && depth != 0) { + ext4_error(inode->i_sb, __func__, "bad extent address " + "inode: %lu, iblock: %d, depth: %d", + inode->i_ino, iblock, depth); + err = -EIO; + goto out2; + } eh = path[depth].p_hdr; ex = path[depth].p_ext; @@ -3319,20 +3372,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), + ext4_ext_get_actual_len(&newex), 0); goto out2; } /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); + if (allocated > max_blocks) + allocated = max_blocks; set_buffer_new(bh_result); - /* Cache only when it is _not_ an uninitialized extent */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 1); + + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. + */ + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); + ext4_update_inode_fsync_trans(handle, inode, 1); + } else + ext4_update_inode_fsync_trans(handle, inode, 0); out: if (allocated > max_blocks) allocated = max_blocks; @@ -3720,10 +3788,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information. * ext4_ext_fiemap_cb will push extents back to user. */ - down_read(&EXT4_I(inode)->i_data_sem); error = ext4_ext_walk_space(inode, start_blk, len_blks, ext4_ext_fiemap_cb, fieinfo); - up_read(&EXT4_I(inode)->i_data_sem); } return error; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 2b1531266ee2..98bd140aad01 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -51,25 +51,30 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; + struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int err, ret = 0; + int ret; + tid_t commit_tid; J_ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file(file, dentry, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + ret = flush_aio_dio_completed_IO(inode); if (ret < 0) - goto out; + return ret; + + if (!journal) + return simple_fsync(file, dentry, datasync); + /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for proper transaction to + * commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -79,32 +84,25 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - goto out; - } + if (ext4_should_journal_data(inode)) + return ext4_force_commit(inode->i_sb); - if (!journal) - ret = sync_mapping_buffers(inode->i_mapping); - - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; - - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. - */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - err = sync_inode(inode, &wbc); - if (ret == 0) - ret = err; - } -out: - if (journal && (journal->j_flags & JBD2_BARRIER)) + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + if (jbd2_log_start_commit(journal, commit_tid)) { + /* + * When the journal is on a different device than the + * fs data disk, we need to issue the barrier in + * writeback mode. (In ordered mode, the jbd2 layer + * will take care of issuing the barrier. In + * data=journal, all of the data blocks are written to + * the journal device.) + */ + if (ext4_should_writeback_data(inode) && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + jbd2_log_wait_commit(journal, commit_tid); + } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4e8e2f15b8bd..e11952404e02 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -71,58 +71,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * The ext4 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. - * - * "bh" may be NULL: a metadata block may have been freed from memory - * but there may still be a record of it in the journal, and that record - * still needs to be revoked. - * - * If the handle isn't valid we're not journaling, but we still need to - * call into ext4_journal_revoke() to put the buffer head. - */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr) -{ - int err; - - might_sleep(); - - BUFFER_TRACE(bh, "enter"); - - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %x\n", - bh, is_metadata, inode->i_mode, - test_opt(inode->i_sb, DATA_FLAGS)); - - /* Never use the revoke function if we are doing full data - * journaling: there is no need to, and a V1 superblock won't - * support it. Otherwise, only skip the revoke on un-journaled - * data blocks. */ - - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (!is_metadata && !ext4_should_journal_data(inode))) { - if (bh) { - BUFFER_TRACE(bh, "call jbd2_journal_forget"); - return ext4_journal_forget(handle, bh); - } - return 0; - } - - /* - * data!=journal && (is_metadata || should_journal_data(inode)) - */ - BUFFER_TRACE(bh, "call ext4_journal_revoke"); - err = ext4_journal_revoke(handle, blocknr, bh); - if (err) - ext4_abort(inode->i_sb, __func__, - "error %d when attempting revoke", err); - BUFFER_TRACE(bh, "exit"); - return err; -} - -/* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ @@ -721,7 +669,7 @@ allocated: return ret; failed_out: for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); return ret; } @@ -817,14 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return err; failed: /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); for (i = 1; i <= n ; i++) { - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, branch[i].bh); + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); } - for (i = 0; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); - ext4_free_blocks(handle, inode, new_blocks[i], num, 0); + ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); return err; } @@ -903,12 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, err_out: for (i = 1; i <= num; i++) { - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, where[i].bh); - ext4_free_blocks(handle, inode, - le32_to_cpu(where[i-1].key), 1, 0); + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), + blks, 0); return err; } @@ -1021,10 +979,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - else + if (err) goto cleanup; set_buffer_new(bh_result); + + ext4_update_inode_fsync_trans(handle, inode, 1); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) @@ -1043,92 +1003,120 @@ out: return err; } -qsize_t ext4_get_reserved_space(struct inode *inode) +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) { - unsigned long long total; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + - EXT4_I(inode)->i_reserved_meta_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - - return total; + return &EXT4_I(inode)->i_reserved_quota; } +#endif + /* * Calculate the number of metadata blocks need to reserve - * to allocate @blocks for non extent file based file + * to allocate a new block at @lblocks for non extent file based file */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +static int ext4_indirect_calc_metadata_amount(struct inode *inode, + sector_t lblock) { - int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ind_blks, dind_blks, tind_blks; - - /* number of new indirect blocks needed */ - ind_blks = (blocks + icap - 1) / icap; + struct ext4_inode_info *ei = EXT4_I(inode); + int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; + int blk_bits; - dind_blks = (ind_blks + icap - 1) / icap; + if (lblock < EXT4_NDIR_BLOCKS) + return 0; - tind_blks = 1; + lblock -= EXT4_NDIR_BLOCKS; - return ind_blks + dind_blks + tind_blks; + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = roundup_pow_of_two(lblock + 1); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; } /* * Calculate the number of metadata blocks need to reserve - * to allocate given number of blocks + * to allocate a block located at @lblock */ -static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) { - if (!blocks) - return 0; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_calc_metadata_amount(inode, blocks); + return ext4_ext_calc_metadata_amount(inode, lblock); - return ext4_indirect_calc_metadata_amount(inode, blocks); + return ext4_indirect_calc_metadata_amount(inode, lblock); } -static void ext4_da_update_reserve_space(struct inode *inode, int used) +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. + */ +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - used; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - if (mdb_free) { - /* Account for allocated meta_blocks */ - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; - - /* update fs dirty blocks counter */ + struct ext4_inode_info *ei = EXT4_I(inode); + int mdb_free = 0, allocated_meta_blocks = 0; + + spin_lock(&ei->i_block_reservation_lock); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " + "with only %d reserved data blocks\n", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; + } + + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + used += ei->i_allocated_meta_blocks; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; + allocated_meta_blocks = ei->i_allocated_meta_blocks; + ei->i_allocated_meta_blocks = 0; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + */ + mdb_free = ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); - EXT4_I(inode)->i_allocated_meta_blocks = 0; - EXT4_I(inode)->i_reserved_meta_blocks = mdb; } - - /* update per-inode reservations */ - BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= used; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - /* - * free those over-booking quota for metadata blocks - */ - if (mdb_free) - vfs_dq_release_reservation_block(inode, mdb_free); + /* Update quota subsystem */ + if (quota_claim) { + vfs_dq_claim_block(inode, used); + if (mdb_free) + vfs_dq_release_reservation_block(inode, mdb_free); + } else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not update the quota for allocated blocks. But then + * converting an fallocate region to initialized region would + * have caused a metadata allocation. So claim quota for + * that + */ + if (allocated_meta_blocks) + vfs_dq_claim_block(inode, allocated_meta_blocks); + vfs_dq_release_reservation_block(inode, mdb_free + used); + } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ - if (!total && (atomic_read(&inode->i_writecount) == 0)) + if ((ei->i_reserved_data_blocks == 0) && + (atomic_read(&inode->i_writecount) == 0)) ext4_discard_preallocations(inode); } @@ -1320,18 +1308,20 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, */ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; } - } + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. We don't + * support fallocate for non extent files. So we can update + * reserve space here. + */ + if ((retval > 0) && + (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); + } if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 0; - /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. - */ - if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) - ext4_da_update_reserve_space(inode, retval); - up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { int ret = check_block_validity(inode, "file system " @@ -1534,6 +1524,16 @@ static int do_journal_get_write_access(handle_t *handle, return ext4_journal_get_write_access(handle, bh); } +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1599,7 +1599,7 @@ retry: ext4_journal_stop(handle); if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to @@ -1709,7 +1709,7 @@ static int ext4_ordered_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1751,7 +1751,7 @@ static int ext4_writeback_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1814,7 +1814,7 @@ static int ext4_journalled_write_end(struct file *file, if (!ret) ret = ret2; if (pos + len > inode->i_size) { - ext4_truncate(inode); + ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode @@ -1827,11 +1827,15 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } -static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +/* + * Reserve a single block located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned long md_needed, mdblocks, total = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long md_needed, md_reserved; /* * recalculate the amount of metadata blocks to reserve @@ -1839,86 +1843,78 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) * worse case is one extent per block */ repeat: - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; - mdblocks = ext4_calc_metadata_amount(inode, total); - BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); - - md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; - total = md_needed + nrblocks; + spin_lock(&ei->i_block_reservation_lock); + md_reserved = ei->i_reserved_meta_blocks; + md_needed = ext4_calc_metadata_amount(inode, lblock); + spin_unlock(&ei->i_block_reservation_lock); /* * Make quota reservation here to prevent quota overflow * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + if (vfs_dq_reserve_block(inode, md_needed + 1)) return -EDQUOT; - } - if (ext4_claim_free_blocks(sbi, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, total); + if (ext4_claim_free_blocks(sbi, md_needed + 1)) { + vfs_dq_release_reservation_block(inode, md_needed + 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; } return -ENOSPC; } - EXT4_I(inode)->i_reserved_data_blocks += nrblocks; - EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return 0; /* success */ } static void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free, release; + struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - if (!EXT4_I(inode)->i_reserved_data_blocks) { + if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* - * if there is no reserved blocks, but we try to free some - * then the counter is messed up somewhere. - * but since this function is called from invalidate - * page, it's harmless to return without any action + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. Since this + * function is called from invalidate page, it's + * harmless to return without any action. */ - printk(KERN_INFO "ext4 delalloc try to release %d reserved " - "blocks for inode %lu, but there is no reserved " - "data blocks\n", to_free, inode->i_ino); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return; + ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks\n", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; } + ei->i_reserved_data_blocks -= to_free; - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - to_free; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - release = to_free + mdb_free; - - /* update fs dirty blocks counter for truncate case */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + */ + to_free += ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } - /* update per-inode reservations */ - BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= to_free; + /* update fs dirty blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - EXT4_I(inode)->i_reserved_meta_blocks = mdb; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, release); + vfs_dq_release_reservation_block(inode, to_free); } static void ext4_da_page_release_reservation(struct page *page, @@ -2223,10 +2219,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * variables are updated after the blocks have been allocated. */ new.b_state = 0; - get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | - EXT4_GET_BLOCKS_DELALLOC_RESERVE); + get_blocks_flags = EXT4_GET_BLOCKS_CREATE; if (mpd->b_state & (1 << BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, &new, get_blocks_flags); if (blks < 0) { @@ -2524,7 +2520,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - ret = ext4_da_reserve_space(inode, 1); + ret = ext4_da_reserve_space(inode, iblock); if (ret) /* not enough space to reserve */ return ret; @@ -2600,7 +2596,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) } static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc, unsigned int len) { struct address_space *mapping = page->mapping; @@ -2758,7 +2753,7 @@ static int ext4_writepage(struct page *page, * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc, len); + return __ext4_journalled_writepage(page, len); } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) @@ -2788,7 +2783,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * number of contiguous block. So we will limit * number of contiguous block to a sane value */ - if (!(inode->i_flags & EXT4_EXTENTS_FL) && + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && (max_blocks > EXT4_MAX_TRANS_DATA)) max_blocks = EXT4_MAX_TRANS_DATA; @@ -2999,8 +2994,7 @@ retry: out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; - if (wbc->nr_to_write > nr_to_writebump) - wbc->nr_to_write -= nr_to_writebump; + wbc->nr_to_write -= nr_to_writebump; wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; @@ -3025,11 +3019,18 @@ static int ext4_nonda_switch(struct super_block *sb) if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { /* - * free block count is less that 150% of dirty blocks - * or free blocks is less that watermark + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark */ return 1; } + /* + * Even if we don't switch but are nearing capacity, + * start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (free_blocks < 2 * dirty_blocks) + writeback_inodes_sb_if_idle(sb); + return 0; } @@ -3037,7 +3038,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - int ret, retries = 0; + int ret, retries = 0, quota_retries = 0; struct page *page; pgoff_t index; unsigned from, to; @@ -3091,11 +3092,27 @@ retry: * i_size_read because we hold i_mutex. */ if (pos + len > inode->i_size) - ext4_truncate(inode); + ext4_truncate_failed_write(inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + + if ((ret == -EDQUOT) && + EXT4_I(inode)->i_reserved_meta_blocks && + (quota_retries++ < 3)) { + /* + * Since we often over-estimate the number of meta + * data blocks required, we may sometimes get a + * spurios out of quota error even though there would + * be enough space once we write the data blocks and + * find out how many meta data blocks were _really_ + * required. So try forcing the inode write to see if + * that helps. + */ + write_inode_now(inode, (quota_retries == 3)); + goto retry; + } out: return ret; } @@ -4120,6 +4137,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); @@ -4134,27 +4156,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, } } - /* - * Any buffers which are on the journal will be in memory. We - * find them on the hash table so jbd2_journal_revoke() will - * run jbd2_journal_forget() on them. We've already detached - * each block from the file, so bforget() in - * jbd2_journal_forget() should be safe. - * - * AKPM: turn on bforget in jbd2_journal_forget()!!! - */ - for (p = first; p < last; p++) { - u32 nr = le32_to_cpu(*p); - if (nr) { - struct buffer_head *tbh; - - *p = 0; - tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, 0, inode, tbh, nr); - } - } + for (p = first; p < last; p++) + *p = 0; - ext4_free_blocks(handle, inode, block_to_free, count, 0); + ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); } /** @@ -4342,7 +4347,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, blocks_for_truncate(inode)); } - ext4_free_blocks(handle, inode, nr, 1, 1); + ext4_free_blocks(handle, inode, 0, nr, 1, + EXT4_FREE_BLOCKS_METADATA); if (parent_bh) { /* @@ -4781,8 +4787,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; - struct buffer_head *bh; struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; @@ -4793,11 +4799,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); + iloc.bh = 0; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) goto bad_inode; - bh = iloc.bh; raw_inode = ext4_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); @@ -4820,7 +4826,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (inode->i_mode == 0 || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { /* this inode is deleted */ - brelse(bh); ret = -ESTALE; goto bad_inode; } @@ -4837,6 +4842,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; @@ -4848,11 +4856,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + spin_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb)) { - brelse(bh); ret = -EIO; goto bad_inode; } @@ -4884,10 +4916,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && - ((ei->i_file_acl < - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + - EXT4_SB(sb)->s_gdb_count)) || - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { ext4_error(sb, __func__, "bad extended attribute block %llu in inode #%lu", ei->i_file_acl, inode->i_ino); @@ -4905,10 +4934,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* Validate block references which are part of inode */ ret = ext4_check_inode_blockref(inode); } - if (ret) { - brelse(bh); + if (ret) goto bad_inode; - } if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; @@ -4936,7 +4963,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { - brelse(bh); ret = -EIO; ext4_error(inode->i_sb, __func__, "bogus i_mode (%o) for inode=%lu", @@ -4949,6 +4975,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; bad_inode: + brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } @@ -5108,6 +5135,7 @@ static int ext4_do_update_inode(handle_t *handle, err = rc; ei->i_state &= ~EXT4_STATE_NEW; + ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: brelse(bh); ext4_std_error(inode->i_sb, err); @@ -5227,8 +5255,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c1cdf613e725..b63d193126db 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -221,31 +221,38 @@ setversion_out: struct file *donor_filp; int err; + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; + me.moved_len = 0; donor_filp = fget(me.donor_fd); if (!donor_filp) return -EBADF; - if (!capable(CAP_DAC_OVERRIDE)) { - if ((current->real_cred->fsuid != inode->i_uid) || - !(inode->i_mode & S_IRUSR) || - !(donor_filp->f_dentry->d_inode->i_mode & - S_IRUSR)) { - fput(donor_filp); - return -EACCES; - } + if (!(donor_filp->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; } + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto mext_out; + err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); - fput(donor_filp); + mnt_drop_write(filp->f_path.mnt); + if (me.moved_len > 0) + file_remove_suid(donor_filp); if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) - return -EFAULT; - + err = -EFAULT; +mext_out: + fput(donor_filp); return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 74e495dabe09..d34afad3e137 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2529,7 +2529,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) struct ext4_group_info *db; int err, count = 0, count2 = 0; struct ext4_free_data *entry; - ext4_fsblk_t discard_block; struct list_head *l, *ltmp; list_for_each_safe(l, ltmp, &txn->t_private_list) { @@ -2559,13 +2558,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) page_cache_release(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->group); - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) - + entry->start_blk - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, - entry->count); - sb_issue_discard(sb, discard_block, entry->count); - + if (test_opt(sb, DISCARD)) { + ext4_fsblk_t discard_block; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + discard_block = (ext4_fsblk_t)entry->group * + EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(es->s_first_data_block); + trace_ext4_discard_blocks(sb, + (unsigned long long)discard_block, + entry->count); + sb_issue_discard(sb, discard_block, entry->count); + } kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); } @@ -2750,12 +2755,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); - else { - percpu_counter_sub(&sbi->s_dirtyblocks_counter, - ac->ac_b_ex.fe_len); - /* convert reserved quota blocks to real quota blocks */ - vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len); - } if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, @@ -3006,6 +3005,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) } /* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + int len; + + if (pa && pa->pa_type == MB_INODE_PA) { + len = ac->ac_b_ex.fe_len; + pa->pa_free += len; + } + +} + +/* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, @@ -3932,7 +3949,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) * per cpu locality group is to reduce the contention between block * request from multiple CPUs. */ - ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id()); + ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; @@ -4290,6 +4307,7 @@ repeat: ac->ac_status = AC_STATUS_CONTINUE; goto repeat; } else if (*errp) { + ext4_discard_allocated_blocks(ac); ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); @@ -4422,18 +4440,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, return 0; } -/* - * Main entry point into mballoc to free blocks +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @metadata: Are these metadata blocks */ -void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata, unsigned long *freed) +void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; struct ext4_super_block *es; + unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4443,13 +4467,16 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; - *freed = 0; + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > ext4_blocks_count(es)) { + if (!ext4_data_block_valid(sbi, block, count)) { ext4_error(sb, __func__, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); @@ -4457,7 +4484,32 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, } ext4_debug("freeing block %llu\n", block); - trace_ext4_free_blocks(inode, block, count, metadata); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; i < count; i++) { + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { @@ -4533,7 +4585,8 @@ do_more: err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_return; - if (metadata && ext4_handle_valid(handle)) { + + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { struct ext4_free_data *new_entry; /* * blocks being freed are metadata. these blocks shouldn't @@ -4572,7 +4625,7 @@ do_more: ext4_mb_release_desc(&e4b); - *freed += count; + freed += count; /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -4592,6 +4645,8 @@ do_more: } sb->s_dirt = 1; error_return: + if (freed) + vfs_dq_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); if (ac) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 0ca811061bc7..436521cae456 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -17,7 +17,6 @@ #include <linux/proc_fs.h> #include <linux/pagemap.h> #include <linux/seq_file.h> -#include <linux/version.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "ext4_jbd2.h" diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a93d5b80f3e2..81415814b00b 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) * So allocate a credit of 3. We may update * quota (user and group). */ - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); if (ext4_journal_extend(handle, needed) != 0) retval = ext4_journal_restart(handle, needed); @@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle, for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(tmp_idata[i]), 1, 1); + ext4_free_blocks(handle, inode, 0, + le32_to_cpu(tmp_idata[i]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(i_data[0]), 1, 1); + ext4_free_blocks(handle, inode, 0, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } /* ei->i_data[EXT4_DIND_BLOCK] */ @@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, block, 1, 1); + ext4_free_blocks(handle, inode, 0, block, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return retval; } @@ -477,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode) handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 25b6b1457360..82c415be87a4 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -77,12 +77,14 @@ static int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { + struct ext4_extent_header *eh; int ppos, leaf_ppos = path->p_depth; ppos = leaf_ppos; if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { /* leaf block */ *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext_pblock(path[ppos].p_ext); return 0; } @@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, ext_block_hdr(path[cur_ppos+1].p_bh); } + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + /* leaf block */ path[leaf_ppos].p_ext = *extent = EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext_pblock(path[leaf_ppos].p_ext); return 0; } } @@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2, } /** - * mext_double_down_read - Acquire two inodes' read semaphore - * - * @orig_inode: original inode structure - * @donor_inode: donor inode structure - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. - */ -static void -mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) -{ - struct inode *first = orig_inode, *second = donor_inode; - - /* - * Use the inode number to provide the stable locking order instead - * of its address, because the C language doesn't guarantee you can - * compare pointers that don't come from the same array. - */ - if (donor_inode->i_ino < orig_inode->i_ino) { - first = donor_inode; - second = orig_inode; - } - - down_read(&EXT4_I(first)->i_data_sem); - down_read(&EXT4_I(second)->i_data_sem); -} - -/** - * mext_double_down_write - Acquire two inodes' write semaphore + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem * * @orig_inode: original inode structure * @donor_inode: donor inode structure - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by + * i_ino order. */ static void -mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { struct inode *first = orig_inode, *second = donor_inode; @@ -203,32 +189,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) } down_write(&EXT4_I(first)->i_data_sem); - down_write(&EXT4_I(second)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); } /** - * mext_double_up_read - Release two inodes' read semaphore + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second - * Release read semaphore of two inodes (orig and donor). + * Release write lock of i_data_sem of two inodes (orig and donor). */ static void -mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) -{ - up_read(&EXT4_I(orig_inode)->i_data_sem); - up_read(&EXT4_I(donor_inode)->i_data_sem); -} - -/** - * mext_double_up_write - Release two inodes' write semaphore - * - * @orig_inode: original inode structure to be released its lock first - * @donor_inode: donor inode structure to be released its lock second - * Release write semaphore of two inodes (orig and donor). - */ -static void -mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); @@ -596,7 +568,7 @@ out: * @tmp_oext: the extent that will belong to the donor inode * @orig_off: block offset of original inode * @donor_off: block offset of donor inode - * @max_count: the maximun length of extents + * @max_count: the maximum length of extents * * Return 0 on success, or a negative error value on failure. */ @@ -661,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * @donor_inode: donor inode * @from: block offset of orig_inode * @count: block count to be replaced + * @err: pointer to save return value * * Replace original inode extents and donor inode extents page by page. * We implement this replacement in the following three steps: @@ -671,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * 3. Change the block information of donor inode to point at the saved * original inode blocks in the dummy extents. * - * Return 0 on success, or a negative error value on failure. + * Return replaced block count. */ static int mext_replace_branches(handle_t *handle, struct inode *orig_inode, struct inode *donor_inode, ext4_lblk_t from, - ext4_lblk_t count) + ext4_lblk_t count, int *err) { struct ext4_ext_path *orig_path = NULL; struct ext4_ext_path *donor_path = NULL; struct ext4_extent *oext, *dext; struct ext4_extent tmp_dext, tmp_oext; ext4_lblk_t orig_off = from, donor_off = from; - int err = 0; int depth; int replaced_count = 0; int dext_alen; - mext_double_down_write(orig_inode, donor_inode); + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); /* Get the original extent for the block "orig_off" */ - err = get_ext_path(orig_inode, orig_off, &orig_path); - if (err) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; /* Get the donor extent for the head */ - err = get_ext_path(donor_inode, donor_off, &donor_path); - if (err) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; @@ -707,9 +680,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, dext = donor_path[depth].p_ext; tmp_dext = *dext; - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count); - if (err) + if (*err) goto out; /* Loop for the donor extents */ @@ -718,7 +691,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (!dext) { ext4_error(donor_inode->i_sb, __func__, "The extent for donor must be found"); - err = -EIO; + *err = -EIO; goto out; } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { ext4_error(donor_inode->i_sb, __func__, @@ -726,20 +699,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, "extent(%u) should be equal", donor_off, le32_to_cpu(tmp_dext.ee_block)); - err = -EIO; + *err = -EIO; goto out; } /* Set donor extent to orig extent */ - err = mext_leaf_block(handle, orig_inode, + *err = mext_leaf_block(handle, orig_inode, orig_path, &tmp_dext, &orig_off); - if (err < 0) + if (*err) goto out; /* Set orig extent to donor extent */ - err = mext_leaf_block(handle, donor_inode, + *err = mext_leaf_block(handle, donor_inode, donor_path, &tmp_oext, &donor_off); - if (err < 0) + if (*err) goto out; dext_alen = ext4_ext_get_actual_len(&tmp_dext); @@ -753,35 +726,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (orig_path) ext4_ext_drop_refs(orig_path); - err = get_ext_path(orig_inode, orig_off, &orig_path); - if (err) + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; - if (le32_to_cpu(oext->ee_block) + - ext4_ext_get_actual_len(oext) <= orig_off) { - err = 0; - goto out; - } tmp_oext = *oext; if (donor_path) ext4_ext_drop_refs(donor_path); - err = get_ext_path(donor_inode, donor_off, &donor_path); - if (err) + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) goto out; depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; - if (le32_to_cpu(dext->ee_block) + - ext4_ext_get_actual_len(dext) <= donor_off) { - err = 0; - goto out; - } tmp_dext = *dext; - err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count - replaced_count); - if (err) + if (*err) goto out; } @@ -795,8 +758,12 @@ out: kfree(donor_path); } - mext_double_up_write(orig_inode, donor_inode); - return err; + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + double_up_write_data_sem(orig_inode, donor_inode); + + return replaced_count; } /** @@ -808,16 +775,17 @@ out: * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @uninit: orig extent is uninitialized or not + * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents * with donor inode extents by calling mext_replace_branches(). - * Finally, write out the saved data in new original inode blocks. Return 0 - * on success, or a negative error value on failure. + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit) + int block_len_in_page, int uninit, int *err) { struct inode *orig_inode = o_filp->f_dentry->d_inode; struct address_space *mapping = orig_inode->i_mapping; @@ -829,9 +797,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; - unsigned int tmp_data_len, data_len; + unsigned int tmp_data_size, data_size, replaced_size; void *fsdata; - int ret, i, jblocks; + int i, jblocks; + int err2 = 0; + int replaced_count = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; /* @@ -841,8 +811,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, jblocks); if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; + *err = PTR_ERR(handle); + return 0; } if (segment_eq(get_fs(), KERNEL_DS)) @@ -858,39 +828,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, * Just swap data blocks between orig and donor. */ if (uninit) { - ret = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page); - - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); goto out2; } offs = (long long)orig_blk_offset << orig_inode->i_blkbits; - /* Calculate data_len */ + /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { /* Replace the last block */ - tmp_data_len = orig_inode->i_size & (blocksize - 1); + tmp_data_size = orig_inode->i_size & (blocksize - 1); /* - * If data_len equal zero, it shows data_len is multiples of + * If data_size equal zero, it shows data_size is multiples of * blocksize. So we set appropriate value. */ - if (tmp_data_len == 0) - tmp_data_len = blocksize; + if (tmp_data_size == 0) + tmp_data_size = blocksize; - data_len = tmp_data_len + + data_size = tmp_data_size + ((block_len_in_page - 1) << orig_inode->i_blkbits); - } else { - data_len = block_len_in_page << orig_inode->i_blkbits; - } + } else + data_size = block_len_in_page << orig_inode->i_blkbits; + + replaced_size = data_size; - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, &page, &fsdata); - if (unlikely(ret < 0)) + if (unlikely(*err < 0)) goto out; if (!PageUptodate(page)) { @@ -911,14 +878,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* Release old bh and drop refs */ try_to_release_page(page, 0); - ret = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, block_len_in_page); - if (ret < 0) - goto out; - - /* Clear the inode cache not to refer to the old data */ - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); + if (err2) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto out; + } if (!page_has_buffers(page)) create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); @@ -928,16 +898,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { - ret = ext4_get_block(orig_inode, + *err = ext4_get_block(orig_inode, (sector_t)(orig_blk_offset + i), bh, 0); - if (ret < 0) + if (*err < 0) goto out; if (bh->b_this_page != NULL) bh = bh->b_this_page; } - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, page, fsdata); page = NULL; @@ -951,7 +921,10 @@ out: out2: ext4_journal_stop(handle); - return ret < 0 ? ret : 0; + if (err2) + *err = err2; + + return replaced_count; } /** @@ -962,7 +935,6 @@ out2: * @orig_start: logical start offset in block for orig * @donor_start: logical start offset in block for donor * @len: the number of blocks to be moved - * @moved_len: moved block length * * Check the arguments of ext4_move_extents() whether the files can be * exchanged with each other. @@ -970,8 +942,8 @@ out2: */ static int mext_check_arguments(struct inode *orig_inode, - struct inode *donor_inode, __u64 orig_start, - __u64 donor_start, __u64 *len, __u64 moved_len) + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) { ext4_lblk_t orig_blocks, donor_blocks; unsigned int blkbits = orig_inode->i_blkbits; @@ -985,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* Ext4 move extent does not support swapfile */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should " @@ -1025,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if (moved_len) { - ext4_debug("ext4 move extent: moved_len should be 0 " - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, - donor_inode->i_ino); - return -EINVAL; - } - if ((orig_start > EXT_MAX_BLOCK) || (donor_start > EXT_MAX_BLOCK) || (*len > EXT_MAX_BLOCK) || @@ -1088,7 +1060,7 @@ mext_check_arguments(struct inode *orig_inode, } if (!*len) { - ext4_debug("ext4 move extent: len shoudld not be 0 " + ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; @@ -1232,16 +1204,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } - /* protect orig and donor against a truncate */ + /* Protect orig and donor inodes against a truncate */ ret1 = mext_inode_double_lock(orig_inode, donor_inode); if (ret1 < 0) return ret1; - mext_double_down_read(orig_inode, donor_inode); + /* Protect extent tree against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, - donor_start, &len, *moved_len); - mext_double_up_read(orig_inode, donor_inode); + donor_start, &len); if (ret1) goto out; @@ -1355,36 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, seq_start = le32_to_cpu(ext_cur->ee_block); rest_blocks = seq_blocks; - /* Discard preallocations of two inodes */ - down_write(&EXT4_I(orig_inode)->i_data_sem); - ext4_discard_preallocations(orig_inode); - up_write(&EXT4_I(orig_inode)->i_data_sem); - - down_write(&EXT4_I(donor_inode)->i_data_sem); - ext4_discard_preallocations(donor_inode); - up_write(&EXT4_I(donor_inode)->i_data_sem); + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->readpage, ->write_begin, and ext4_get_block + * in move_extent_per_page + */ + double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { /* Swap original branches with new branches */ - ret1 = move_extent_per_page(o_filp, donor_inode, + block_len_in_page = move_extent_per_page( + o_filp, donor_inode, orig_page_offset, data_offset_in_page, - block_len_in_page, uninit); - if (ret1 < 0) - goto out; - orig_page_offset++; + block_len_in_page, uninit, + &ret1); + /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; + if (ret1 < 0) + break; if (*moved_len > len) { ext4_error(orig_inode->i_sb, __func__, "We replaced blocks too much! " "sum of replaced: %llu requested: %llu", *moved_len, len); ret1 = -EIO; - goto out; + break; } + orig_page_offset++; data_offset_in_page = 0; rest_blocks -= block_len_in_page; if (rest_blocks > blocks_per_page) @@ -1393,6 +1368,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } + double_down_write_data_sem(orig_inode, donor_inode); + if (ret1 < 0) + break; + /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); @@ -1414,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, } out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + if (orig_path) { ext4_ext_drop_refs(orig_path); kfree(orig_path); @@ -1422,7 +1406,7 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } - + double_up_write_data_sem(orig_inode, donor_inode); ret2 = mext_inode_double_unlock(orig_inode, donor_inode); if (ret1) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6d2c1b897fc7..17a17e10dd60 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1292,9 +1292,6 @@ errout: * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. - * - * NOTE! bh is NOT released in the case where ENOSPC is returned. In - * all other cases bh is released. */ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, @@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, - bh, offset)) { - brelse(bh); + bh, offset)) return -EIO; - } - if (ext4_match(namelen, name, de)) { - brelse(bh); + if (ext4_match(namelen, name, de)) return -EEXIST; - } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); if ((de->inode? rlen - nlen: rlen) >= reclen) @@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_journal_get_write_access(handle, bh); if (err) { ext4_std_error(dir->i_sb, err); - brelse(bh); return err; } @@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, err = ext4_handle_dirty_metadata(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); - brelse(bh); return 0; } @@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (!(de)) return retval; - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if(!bh) return retval; retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) + if (retval != -ENOSPC) { + brelse(bh); return retval; + } if (blocks == 1 && !dx_fallback && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) @@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto journal_error; err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) { - bh = NULL; + if (err != -ENOSPC) goto cleanup; - } /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", @@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; goto cleanup; journal_error: @@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2259,7 +2253,7 @@ static int ext4_symlink(struct inode *dir, retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3cfc343c41b5..3b2c5541d8a6 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; if (IS_ERR(gdb = bclean(handle, sb, block))) { - err = PTR_ERR(bh); + err = PTR_ERR(gdb); goto exit_bh; } ext4_handle_dirty_metadata(handle, NULL, gdb); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d4ca92aab514..735c20d5fd56 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -603,10 +603,6 @@ static void ext4_put_super(struct super_block *sb) if (sb->s_dirt) ext4_commit_super(sb, 1); - ext4_release_system_zone(sb); - ext4_mb_release(sb); - ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -614,6 +610,12 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, __func__, "Couldn't clean up the journal"); } + + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -700,10 +702,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ei->cur_aio_dio = NULL; + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; return &ei->vfs_inode; } @@ -765,9 +773,22 @@ static inline void ext4_show_quota_options(struct seq_file *seq, #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sbi->s_jquota_fmt) - seq_printf(seq, ",jqfmt=%s", - (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0"); + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } if (sbi->s_qf_names[USRQUOTA]) seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); @@ -899,6 +920,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NO_AUTO_DA_ALLOC)) seq_puts(seq, ",noauto_da_alloc"); + if (test_opt(sb, DISCARD)) + seq_puts(seq, ",discard"); + + if (test_opt(sb, NOLOAD)) + seq_puts(seq, ",norecovery"); + ext4_show_quota_options(seq, sb); return 0; @@ -991,7 +1018,9 @@ static const struct dquot_operations ext4_quota_operations = { .reserve_space = dquot_reserve_space, .claim_space = dquot_claim_space, .release_rsv = dquot_release_reserved_space, +#ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, +#endif .alloc_inode = dquot_alloc_inode, .free_space = dquot_free_space, .free_inode = dquot_free_inode, @@ -1074,12 +1103,13 @@ enum { Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_block_validity, Opt_noblock_validity, - Opt_inode_readahead_blks, Opt_journal_ioprio + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_discard, Opt_nodiscard, }; static const match_table_t tokens = { @@ -1104,6 +1134,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_noload, "noload"}, + {Opt_noload, "norecovery"}, {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, @@ -1125,6 +1156,7 @@ static const match_table_t tokens = { {Opt_grpjquota, "grpjquota=%s"}, {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_grpquota, "grpquota"}, {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, @@ -1144,6 +1176,8 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc=%u"}, {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL}, }; @@ -1425,6 +1459,9 @@ clear_qf_name: goto set_qf_format; case Opt_jqfmt_vfsv0: qfmt = QFMT_VFS_V0; + goto set_qf_format; + case Opt_jqfmt_vfsv1: + qfmt = QFMT_VFS_V1; set_qf_format: if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { @@ -1467,6 +1504,7 @@ set_qf_format: case Opt_offgrpjquota: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: ext4_msg(sb, KERN_ERR, "journaled quota options not supported"); break; @@ -1565,6 +1603,12 @@ set_qf_format: else set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; + case Opt_discard: + set_opt(sbi->s_mount_opt, DISCARD); + break; + case Opt_nodiscard: + clear_opt(sbi->s_mount_opt, DISCARD); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " @@ -1673,14 +1717,14 @@ static int ext4_fill_flex_info(struct super_block *sb) size_t size; int i; - if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + if (groups_per_flex < 2) { sbi->s_log_groups_per_flex = 0; return 1; } - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << @@ -2099,11 +2143,8 @@ static int parse_strtoul(const char *buf, { char *endp; - while (*buf && isspace(*buf)) - buf++; - *value = simple_strtoul(buf, &endp, 0); - while (*endp && isspace(*endp)) - endp++; + *value = simple_strtoul(skip_spaces(buf), &endp, 0); + endp = skip_spaces(endp); if (*endp || *value > max) return -EINVAL; @@ -2134,9 +2175,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, struct super_block *sb = sbi->s_buddy_cache->i_sb; return snprintf(buf, PAGE_SIZE, "%llu\n", - sbi->s_kbytes_written + + (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); + EXT4_SB(sb)->s_sectors_written_start) >> 1))); } static ssize_t inode_readahead_blks_store(struct ext4_attr *a, @@ -2721,26 +2762,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext4_load_journal(sb, es, journal_devnum)) goto failed_mount3; - if (!(sb->s_flags & MS_RDONLY) && - EXT4_SB(sb)->s_journal->j_failed_commit) { - ext4_msg(sb, KERN_CRIT, "error: " - "ext4_fill_super: Journal transaction " - "%u is corrupt", - EXT4_SB(sb)->s_journal->j_failed_commit); - if (test_opt(sb, ERRORS_RO)) { - ext4_msg(sb, KERN_CRIT, - "Mounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - } - if (test_opt(sb, ERRORS_PANIC)) { - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, 1); - goto failed_mount4; - } - } } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { ext4_msg(sb, KERN_ERR, "required journal recovery " @@ -3668,13 +3689,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); - ext4_free_blocks_count_set(es, buf->f_bfree); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); @@ -3966,6 +3985,60 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } +#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static inline void register_as_ext2(void) +{ + int err = register_filesystem(&ext2_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext2 (%d)\n", err); +} + +static inline void unregister_as_ext2(void) +{ + unregister_filesystem(&ext2_fs_type); +} +MODULE_ALIAS("ext2"); +#else +static inline void register_as_ext2(void) { } +static inline void unregister_as_ext2(void) { } +#endif + +#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static inline void register_as_ext3(void) +{ + int err = register_filesystem(&ext3_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext3 (%d)\n", err); +} + +static inline void unregister_as_ext3(void) +{ + unregister_filesystem(&ext3_fs_type); +} +MODULE_ALIAS("ext3"); +#else +static inline void register_as_ext3(void) { } +static inline void unregister_as_ext3(void) { } +#endif + static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", @@ -3995,11 +4068,15 @@ static int __init init_ext4_fs(void) err = init_inodecache(); if (err) goto out1; + register_as_ext2(); + register_as_ext3(); err = register_filesystem(&ext4_fs_type); if (err) goto out; return 0; out: + unregister_as_ext2(); + unregister_as_ext3(); destroy_inodecache(); out1: exit_ext4_xattr(); @@ -4015,6 +4092,8 @@ out4: static void __exit exit_ext4_fs(void) { + unregister_as_ext2(); + unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); exit_ext4_xattr(); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fed5b01d7a8d..f3a2f7ed45aa 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -92,7 +92,7 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); -static int ext4_xattr_list(struct inode *inode, char *buffer, +static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); static struct mb_cache *ext4_xattr_cache; @@ -140,7 +140,7 @@ ext4_xattr_handler(int name_index) ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext4_xattr_list(dentry->d_inode, buffer, size); + return ext4_xattr_list(dentry, buffer, size); } static int @@ -325,7 +325,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, } static int -ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, +ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; @@ -335,9 +335,10 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, ext4_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) return -ERANGE; @@ -350,8 +351,9 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, } static int -ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; @@ -376,7 +378,7 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) goto cleanup; } ext4_xattr_cache_insert(bh); - error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); @@ -385,8 +387,9 @@ cleanup: } static int -ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -404,7 +407,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) error = ext4_xattr_check_names(IFIRST(header), end); if (error) goto cleanup; - error = ext4_xattr_list_entries(inode, IFIRST(header), + error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: @@ -423,12 +426,12 @@ cleanup: * used / required on success. */ static int -ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT4_I(inode)->xattr_sem); - i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size); + down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; } else { @@ -436,11 +439,11 @@ ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) buffer += i_error; buffer_size -= i_error; } - b_error = ext4_xattr_block_list(inode, buffer, buffer_size); + b_error = ext4_xattr_block_list(dentry, buffer, buffer_size); if (b_error < 0) i_error = 0; } - up_read(&EXT4_I(inode)->xattr_sem); + up_read(&EXT4_I(dentry->d_inode)->xattr_sem); return i_error + b_error; } @@ -482,9 +485,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ea_bdebug(bh, "refcount now=0; freeing"); if (ce) mb_cache_entry_free(ce); - ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); get_bh(bh); - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); error = ext4_handle_dirty_metadata(handle, inode, bh); @@ -832,7 +836,8 @@ inserted: new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: - ext4_free_blocks(handle, inode, block, 1, 1); + ext4_free_blocks(handle, inode, 0, block, 1, + EXT4_FREE_BLOCKS_METADATA); error = -EIO; goto cleanup; } @@ -988,6 +993,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -1013,9 +1022,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); @@ -1326,6 +1332,8 @@ retry: goto cleanup; kfree(b_entry_name); kfree(buffer); + b_entry_name = NULL; + buffer = NULL; brelse(is->iloc.bh); kfree(is); kfree(bs); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index ca5f89fc6cae..983c253999a7 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -12,8 +12,8 @@ #include "xattr.h" static size_t -ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; const size_t total_len = prefix_len + name_len + 1; @@ -28,23 +28,23 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name, - buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); } static int -ext4_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); } int diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index ac1a52cf2a37..15b50edc6587 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -14,8 +14,8 @@ #include "xattr.h" static size_t -ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -32,23 +32,23 @@ ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, buffer, size); } static int -ext4_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, value, size, flags); } struct xattr_handler ext4_xattr_trusted_handler = { diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index d91aa61b42aa..c4ce05746ce1 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -13,13 +13,13 @@ #include "xattr.h" static size_t -ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -31,26 +31,27 @@ ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, buffer, size); } static int -ext4_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ext4_xattr_user_handler = { diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 7db0979c6b72..e6efdfa0f6db 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -44,7 +44,8 @@ struct fat_mount_options { nocase:1, /* Does this need case conversion? 0=need case conversion*/ usefree:1, /* Use free_clusters for FAT32 */ tz_utc:1, /* Filesystem timestamps are in UTC */ - rodir:1; /* allow ATTR_RO for directory */ + rodir:1, /* allow ATTR_RO for directory */ + discard:1; /* Issue discard requests on deletions */ }; #define FAT_HASH_BITS 8 diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index a81037721a6f..81184d3b75a3 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -566,16 +566,21 @@ int fat_free_clusters(struct inode *inode, int cluster) goto error; } - /* - * Issue discard for the sectors we no longer care about, - * batching contiguous clusters into one request - */ - if (cluster != fatent.entry + 1) { - int nr_clus = fatent.entry - first_cl + 1; - - sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), - nr_clus * sbi->sec_per_clus); - first_cl = cluster; + if (sbi->options.discard) { + /* + * Issue discard for the sectors we no longer + * care about, batching contiguous clusters + * into one request + */ + if (cluster != fatent.entry + 1) { + int nr_clus = fatent.entry - first_cl + 1; + + sb_issue_discard(sb, + fat_clus_to_blknr(sbi, first_cl), + nr_clus * sbi->sec_per_clus); + + first_cl = cluster; + } } ops->ent_put(&fatent, FAT_ENT_FREE); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 76b7961ab663..14da530b05ca 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -858,6 +858,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",errors=panic"); else seq_puts(m, ",errors=remount-ro"); + if (opts->discard) + seq_puts(m, ",discard"); return 0; } @@ -871,7 +873,7 @@ enum { Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, - Opt_err_panic, Opt_err_ro, Opt_err, + Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err, }; static const match_table_t fat_tokens = { @@ -899,6 +901,7 @@ static const match_table_t fat_tokens = { {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, + {Opt_discard, "discard"}, {Opt_obsolate, "conv=binary"}, {Opt_obsolate, "conv=text"}, {Opt_obsolate, "conv=auto"}, @@ -1136,6 +1139,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, case Opt_rodir: opts->rodir = 1; break; + case Opt_discard: + opts->discard = 1; + break; /* obsolete mount options */ case Opt_obsolate: diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 0f55f5cb732f..d3da05f26465 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/buffer_head.h> +#include <linux/time.h> #include "fat.h" /* @@ -157,10 +158,6 @@ extern struct timezone sys_tz; #define SECS_PER_MIN 60 #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) -#define UNIX_SECS_1980 315532800L -#if BITS_PER_LONG == 64 -#define UNIX_SECS_2108 4354819200L -#endif /* days between 1.1.70 and 1.1.80 (2 leap days) */ #define DAYS_DELTA (365 * 10 + 2) /* 120 (2100 - 1980) isn't leap year */ @@ -213,58 +210,35 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, __le16 *time, __le16 *date, u8 *time_cs) { - time_t second = ts->tv_sec; - time_t day, leap_day, month, year; + struct tm tm; + time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 : + -sys_tz.tz_minuteswest * 60, &tm); - if (!sbi->options.tz_utc) - second -= sys_tz.tz_minuteswest * SECS_PER_MIN; - - /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */ - if (second < UNIX_SECS_1980) { + /* FAT can only support year between 1980 to 2107 */ + if (tm.tm_year < 1980 - 1900) { *time = 0; *date = cpu_to_le16((0 << 9) | (1 << 5) | 1); if (time_cs) *time_cs = 0; return; } -#if BITS_PER_LONG == 64 - if (second >= UNIX_SECS_2108) { + if (tm.tm_year > 2107 - 1900) { *time = cpu_to_le16((23 << 11) | (59 << 5) | 29); *date = cpu_to_le16((127 << 9) | (12 << 5) | 31); if (time_cs) *time_cs = 199; return; } -#endif - day = second / SECS_PER_DAY - DAYS_DELTA; - year = day / 365; - leap_day = (year + 3) / 4; - if (year > YEAR_2100) /* 2100 isn't leap year */ - leap_day--; - if (year * 365 + leap_day > day) - year--; - leap_day = (year + 3) / 4; - if (year > YEAR_2100) /* 2100 isn't leap year */ - leap_day--; - day -= year * 365 + leap_day; - - if (IS_LEAP_YEAR(year) && day == days_in_year[3]) { - month = 2; - } else { - if (IS_LEAP_YEAR(year) && day > days_in_year[3]) - day--; - for (month = 1; month < 12; month++) { - if (days_in_year[month + 1] > day) - break; - } - } - day -= days_in_year[month]; + /* from 1900 -> from 1980 */ + tm.tm_year -= 80; + /* 0~11 -> 1~12 */ + tm.tm_mon++; + /* 0~59 -> 0~29(2sec counts) */ + tm.tm_sec >>= 1; - *time = cpu_to_le16(((second / SECS_PER_HOUR) % 24) << 11 - | ((second / SECS_PER_MIN) % 60) << 5 - | (second % SECS_PER_MIN) >> 1); - *date = cpu_to_le16((year << 9) | (month << 5) | (day + 1)); + *time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec); + *date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday); if (time_cs) *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000; } @@ -285,4 +259,3 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) } return err; } - diff --git a/fs/fcntl.c b/fs/fcntl.c index 2cf93ec40a67..5ef953e6f908 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -199,7 +199,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg) static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - write_lock_irq(&filp->f_owner.lock); + unsigned long flags; + + write_lock_irqsave(&filp->f_owner.lock, flags); if (force || !filp->f_owner.pid) { put_pid(filp->f_owner.pid); filp->f_owner.pid = get_pid(pid); @@ -211,7 +213,7 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, filp->f_owner.euid = cred->euid; } } - write_unlock_irq(&filp->f_owner.lock); + write_unlock_irqrestore(&filp->f_owner.lock, flags); } int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, @@ -618,60 +620,90 @@ static DEFINE_RWLOCK(fasync_lock); static struct kmem_cache *fasync_cache __read_mostly; /* - * fasync_helper() is used by almost all character device drivers - * to set up the fasync queue. It returns negative on error, 0 if it did - * no changes and positive if it added/deleted the entry. + * Remove a fasync entry. If successfully removed, return + * positive and clear the FASYNC flag. If no entry exists, + * do nothing and return 0. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + * + * We always take the 'filp->f_lock', in since fasync_lock + * needs to be irq-safe. */ -int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; - struct fasync_struct *new = NULL; int result = 0; - if (on) { - new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); - if (!new) - return -ENOMEM; + spin_lock(&filp->f_lock); + write_lock_irq(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + *fp = fa->fa_next; + kmem_cache_free(fasync_cache, fa); + filp->f_flags &= ~FASYNC; + result = 1; + break; } + write_unlock_irq(&fasync_lock); + spin_unlock(&filp->f_lock); + return result; +} + +/* + * Add a fasync entry. Return negative on error, positive if + * added, and zero if did nothing but change an existing one. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + */ +static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *new, *fa, **fp; + int result = 0; + + new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); + if (!new) + return -ENOMEM; - /* - * We need to take f_lock first since it's not an IRQ-safe - * lock. - */ spin_lock(&filp->f_lock); write_lock_irq(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { - if (fa->fa_file == filp) { - if(on) { - fa->fa_fd = fd; - kmem_cache_free(fasync_cache, new); - } else { - *fp = fa->fa_next; - kmem_cache_free(fasync_cache, fa); - result = 1; - } - goto out; - } + if (fa->fa_file != filp) + continue; + fa->fa_fd = fd; + kmem_cache_free(fasync_cache, new); + goto out; } - if (on) { - new->magic = FASYNC_MAGIC; - new->fa_file = filp; - new->fa_fd = fd; - new->fa_next = *fapp; - *fapp = new; - result = 1; - } + new->magic = FASYNC_MAGIC; + new->fa_file = filp; + new->fa_fd = fd; + new->fa_next = *fapp; + *fapp = new; + result = 1; + filp->f_flags |= FASYNC; + out: - if (on) - filp->f_flags |= FASYNC; - else - filp->f_flags &= ~FASYNC; write_unlock_irq(&fasync_lock); spin_unlock(&filp->f_lock); return result; } +/* + * fasync_helper() is used by almost all character device drivers + * to set up the fasync queue, and for regular files by the file + * lease code. It returns negative on error, 0 if it did no changes + * and positive if it added/deleted the entry. + */ +int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +{ + if (!on) + return fasync_remove_entry(filp, fapp); + return fasync_add_entry(fd, filp, fapp); +} + EXPORT_SYMBOL(fasync_helper); void __kill_fasync(struct fasync_struct *fa, int sig, int band) diff --git a/fs/file_table.c b/fs/file_table.c index 4bef4c01ec6f..69652c5bd5f0 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -21,9 +21,12 @@ #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> +#include <linux/ima.h> #include <asm/atomic.h> +#include "internal.h" + /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE @@ -147,8 +150,6 @@ fail: return NULL; } -EXPORT_SYMBOL(get_empty_filp); - /** * alloc_file - allocate and initialize a 'struct file' * @mnt: the vfsmount on which the file will reside @@ -164,8 +165,8 @@ EXPORT_SYMBOL(get_empty_filp); * If all the callers of init_file() are eliminated, its * code should be moved into this function. */ -struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, - fmode_t mode, const struct file_operations *fop) +struct file *alloc_file(struct path *path, fmode_t mode, + const struct file_operations *fop) { struct file *file; @@ -173,35 +174,8 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, if (!file) return NULL; - init_file(file, mnt, dentry, mode, fop); - return file; -} -EXPORT_SYMBOL(alloc_file); - -/** - * init_file - initialize a 'struct file' - * @file: the already allocated 'struct file' to initialized - * @mnt: the vfsmount on which the file resides - * @dentry: the dentry representing this file - * @mode: the mode the file is opened with - * @fop: the 'struct file_operations' for this file - * - * Use this instead of setting the members directly. Doing so - * avoids making mistakes like forgetting the mntget() or - * forgetting to take a write on the mnt. - * - * Note: This is a crappy interface. It is here to make - * merging with the existing users of get_empty_filp() - * who have complex failure logic easier. All users - * of this should be moving to alloc_file(). - */ -int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, - fmode_t mode, const struct file_operations *fop) -{ - int error = 0; - file->f_path.dentry = dentry; - file->f_path.mnt = mntget(mnt); - file->f_mapping = dentry->d_inode->i_mapping; + file->f_path = *path; + file->f_mapping = path->dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; @@ -211,14 +185,14 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, * visible. We do this for consistency, and so * that we can do debugging checks at __fput() */ - if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { + if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { file_take_write(file); - error = mnt_clone_write(mnt); - WARN_ON(error); + WARN_ON(mnt_clone_write(path->mnt)); } - return error; + ima_counts_get(file); + return file; } -EXPORT_SYMBOL(init_file); +EXPORT_SYMBOL(alloc_file); void fput(struct file *file) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 49bc1b8e8f19..1a7c42c64ff4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -242,6 +242,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, /** * bdi_start_writeback - start writeback * @bdi: the backing device to write from + * @sb: write inodes from this super_block * @nr_pages: the number of pages to write * * Description: @@ -1187,6 +1188,23 @@ void writeback_inodes_sb(struct super_block *sb) EXPORT_SYMBOL(writeback_inodes_sb); /** + * writeback_inodes_sb_if_idle - start writeback if none underway + * @sb: the superblock + * + * Invoke writeback_inodes_sb if no writeback is currently underway. + * Returns 1 if writeback was started, 0 if not. + */ +int writeback_inodes_sb_if_idle(struct super_block *sb) +{ + if (!writeback_in_progress(sb->s_bdi)) { + writeback_inodes_sb(sb); + return 1; + } else + return 0; +} +EXPORT_SYMBOL(writeback_inodes_sb_if_idle); + +/** * sync_inodes_sb - sync sb inode pages * @sb: the superblock * diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index e590242fa41a..3221a0c7944e 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL(fscache_object_destroy); */ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) { - struct fscache_object *pobj, *obj, *minobj = NULL; + struct fscache_object *pobj, *obj = NULL, *minobj = NULL; struct rb_node *p; unsigned long pos; diff --git a/fs/generic_acl.c b/fs/generic_acl.c index e0b53aa7bbec..55458031e501 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -1,62 +1,58 @@ /* - * fs/generic_acl.c - * * (C) 2005 Andreas Gruenbacher <agruen@suse.de> * * This file is released under the GPL. + * + * Generic ACL support for in-memory filesystems. */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/generic_acl.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> -/** - * generic_acl_list - Generic xattr_handler->list() operation - * @ops: Filesystem specific getacl and setacl callbacks - */ -size_t -generic_acl_list(struct inode *inode, struct generic_acl_operations *ops, - int type, char *list, size_t list_size) + +static size_t +generic_acl_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { struct posix_acl *acl; - const char *name; + const char *xname; size_t size; - acl = ops->getacl(inode, type); + acl = get_cached_acl(dentry->d_inode, type); if (!acl) return 0; posix_acl_release(acl); - switch(type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - - default: - return 0; + switch (type) { + case ACL_TYPE_ACCESS: + xname = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xname = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return 0; } - size = strlen(name) + 1; + size = strlen(xname) + 1; if (list && size <= list_size) - memcpy(list, name, size); + memcpy(list, xname, size); return size; } -/** - * generic_acl_get - Generic xattr_handler->get() operation - * @ops: Filesystem specific getacl and setacl callbacks - */ -int -generic_acl_get(struct inode *inode, struct generic_acl_operations *ops, - int type, void *buffer, size_t size) +static int +generic_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - acl = ops->getacl(inode, type); + if (strcmp(name, "") != 0) + return -EINVAL; + + acl = get_cached_acl(dentry->d_inode, type); if (!acl) return -ENODATA; error = posix_acl_to_xattr(acl, buffer, size); @@ -65,17 +61,16 @@ generic_acl_get(struct inode *inode, struct generic_acl_operations *ops, return error; } -/** - * generic_acl_set - Generic xattr_handler->set() operation - * @ops: Filesystem specific getacl and setacl callbacks - */ -int -generic_acl_set(struct inode *inode, struct generic_acl_operations *ops, - int type, const void *value, size_t size) +static int +generic_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl = NULL; int error; + if (strcmp(name, "") != 0) + return -EINVAL; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -91,28 +86,27 @@ generic_acl_set(struct inode *inode, struct generic_acl_operations *ops, error = posix_acl_valid(acl); if (error) goto failed; - switch(type) { - case ACL_TYPE_ACCESS: - mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) - goto failed; - inode->i_mode = mode; - if (error == 0) { - posix_acl_release(acl); - acl = NULL; - } - break; - - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - error = -EINVAL; - goto failed; - } - break; + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + goto failed; + inode->i_mode = mode; + if (error == 0) { + posix_acl_release(acl); + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + error = -EINVAL; + goto failed; + } + break; } } - ops->setacl(inode, type, acl); + set_cached_acl(inode, type, acl); error = 0; failed: posix_acl_release(acl); @@ -121,14 +115,12 @@ failed: /** * generic_acl_init - Take care of acl inheritance at @inode create time - * @ops: Filesystem specific getacl and setacl callbacks * * Files created inside a directory with a default ACL inherit the * directory's default ACL. */ int -generic_acl_init(struct inode *inode, struct inode *dir, - struct generic_acl_operations *ops) +generic_acl_init(struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; mode_t mode = inode->i_mode; @@ -136,7 +128,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) - acl = ops->getacl(dir, ACL_TYPE_DEFAULT); + acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); if (acl) { struct posix_acl *clone; @@ -145,7 +137,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, error = -ENOMEM; if (!clone) goto cleanup; - ops->setacl(inode, ACL_TYPE_DEFAULT, clone); + set_cached_acl(inode, ACL_TYPE_DEFAULT, clone); posix_acl_release(clone); } clone = posix_acl_clone(acl, GFP_KERNEL); @@ -156,7 +148,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, if (error >= 0) { inode->i_mode = mode; if (error > 0) - ops->setacl(inode, ACL_TYPE_ACCESS, clone); + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); } posix_acl_release(clone); } @@ -169,20 +161,19 @@ cleanup: /** * generic_acl_chmod - change the access acl of @inode upon chmod() - * @ops: FIlesystem specific getacl and setacl callbacks * * A chmod also changes the permissions of the owner, group/mask, and * other ACL entries. */ int -generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops) +generic_acl_chmod(struct inode *inode) { struct posix_acl *acl, *clone; int error = 0; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - acl = ops->getacl(inode, ACL_TYPE_ACCESS); + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { clone = posix_acl_clone(acl, GFP_KERNEL); posix_acl_release(acl); @@ -190,8 +181,37 @@ generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops) return -ENOMEM; error = posix_acl_chmod_masq(clone, inode->i_mode); if (!error) - ops->setacl(inode, ACL_TYPE_ACCESS, clone); + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); posix_acl_release(clone); } return error; } + +int +generic_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + return -EAGAIN; +} + +struct xattr_handler generic_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; + +struct xattr_handler generic_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3eb1ea846173..87ee309d4c24 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -126,7 +126,7 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) error = posix_acl_to_xattr(acl, data, len); if (error < 0) goto out; - error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0); + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); if (!error) set_cached_acl(inode, type, acl); out: @@ -232,9 +232,10 @@ static int gfs2_acl_type(const char *name) return -EINVAL; } -static int gfs2_xattr_system_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int xtype) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl; int type; int error; @@ -255,9 +256,11 @@ static int gfs2_xattr_system_get(struct inode *inode, const char *name, return error; } -static int gfs2_xattr_system_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int xtype) { + struct inode *inode = dentry->d_inode; struct gfs2_sbd *sdp = GFS2_SB(inode); struct posix_acl *acl = NULL; int error = 0, type; @@ -319,7 +322,7 @@ static int gfs2_xattr_system_set(struct inode *inode, const char *name, } set_acl: - error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0); + error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); if (!error) { if (acl) set_cached_acl(inode, type, acl); @@ -334,6 +337,7 @@ out: struct xattr_handler gfs2_xattr_system_handler = { .prefix = XATTR_SYSTEM_PREFIX, + .flags = GFS2_EATYPE_SYS, .get = gfs2_xattr_system_get, .set = gfs2_xattr_system_set, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4eb308aa3234..a6abbae8a278 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -569,6 +569,40 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) return ret; } +/** + * gfs2_file_aio_write - Perform a write to a file + * @iocb: The io context + * @iov: The data to write + * @nr_segs: Number of @iov segments + * @pos: The file position + * + * We have to do a lock/unlock here to refresh the inode size for + * O_APPEND writes, otherwise we can land up writing at the wrong + * offset. There is still a race, but provided the app is using its + * own file locking, this will make O_APPEND work as expected. + * + */ + +static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + + if (file->f_flags & O_APPEND) { + struct dentry *dentry = file->f_dentry; + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder gh; + int ret; + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + return ret; + gfs2_glock_dq_uninit(&gh); + } + + return generic_file_aio_write(iocb, iov, nr_segs, pos); +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -711,7 +745,7 @@ const struct file_operations gfs2_file_fops = { .read = do_sync_read, .aio_read = generic_file_aio_read, .write = do_sync_write, - .aio_write = generic_file_aio_write, + .aio_write = gfs2_file_aio_write, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, @@ -741,7 +775,7 @@ const struct file_operations gfs2_file_fops_nolock = { .read = do_sync_read, .aio_read = generic_file_aio_read, .write = do_sync_write, - .aio_write = generic_file_aio_write, + .aio_write = gfs2_file_aio_write, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 26ba2a4c4a2d..6e220f4eee7d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -125,7 +125,7 @@ static struct inode *gfs2_iget_skip(struct super_block *sb, * directory entry when gfs2_inode_lookup() is invoked. Part of the code * segment inside gfs2_inode_lookup code needs to get moved around. * - * Clean up I_LOCK and I_NEW as well. + * Clears I_NEW as well. **/ void gfs2_set_iop(struct inode *inode) @@ -801,7 +801,8 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) return err; } - err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0); + err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, + GFS2_EATYPE_SECURITY); kfree(value); kfree(name); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index cb8d7a93d5ec..6f68a5f18eb8 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -121,7 +121,7 @@ struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp) if (aspace) { mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS); aspace->i_mapping->a_ops = &aspace_aops; - aspace->i_size = ~0ULL; + aspace->i_size = MAX_LFS_FILESIZE; ip = GFS2_I(aspace); clear_bit(GIF_USER, &ip->i_flags); insert_inode_hash(aspace); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 247436c10deb..84350e1be66d 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -748,7 +748,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - int alloc_required; + int alloc_required = 0; unsigned int x; int error; @@ -867,7 +867,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; } - alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + if (nip == NULL) + alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + error = alloc_required; if (error < 0) goto out_gunlock; error = 0; @@ -1086,7 +1088,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) error = vfs_follow_link(nd, buf); if (buf != array) kfree(buf); - } + } else + path_put(&nd->path); return ERR_PTR(error); } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 0608f490c295..503b842f3ba2 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -591,11 +591,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip) u64 rgrp_count = ip->i_disksize; int error; - if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { - gfs2_consist_inode(ip); - return -EIO; - } - + do_div(rgrp_count, sizeof(struct gfs2_rindex)); clear_rgrpdi(sdp); file_ra_state_init(&ra_state, inode->i_mapping); @@ -915,7 +911,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) { BUG_ON(ip->i_alloc != NULL); - ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL); + ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); return ip->i_alloc; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index c5dad1eb7b91..0dc34621f6a6 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -85,11 +85,7 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) buf[0] = '\0'; if (!gfs2_uuid_valid(uuid)) return 0; - return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-" - "%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", - uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], - uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11], - uuid[12], uuid[13], uuid[14], uuid[15]); + return snprintf(buf, PAGE_SIZE, "%pUB\n", uuid); } static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) @@ -575,14 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); if (!sdp->sd_args.ar_spectator) add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); - if (gfs2_uuid_valid(uuid)) { - add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" - "%02X%02X-%02X%02X%02X%02X%02X%02X", - uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], - uuid[5], uuid[6], uuid[7], uuid[8], uuid[9], - uuid[10], uuid[11], uuid[12], uuid[13], - uuid[14], uuid[15]); - } + if (gfs2_uuid_valid(uuid)) + add_uevent_var(env, "UUID=%pUB", uuid); return 0; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 912f5cbc4740..c2ebdf2c01d4 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -567,18 +567,17 @@ out: /** * gfs2_xattr_get - Get a GFS2 extended attribute * @inode: The inode - * @type: The type of extended attribute * @name: The name of the extended attribute * @buffer: The buffer to write the result into * @size: The size of the buffer + * @type: The type of extended attribute * * Returns: actual size of data on success, -errno on error */ - -int gfs2_xattr_get(struct inode *inode, int type, const char *name, - void *buffer, size_t size) +static int gfs2_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); struct gfs2_ea_location el; int error; @@ -1119,7 +1118,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) /** * gfs2_xattr_remove - Remove a GFS2 extended attribute - * @inode: The inode + * @ip: The inode * @type: The type of the extended attribute * @name: The name of the extended attribute * @@ -1130,9 +1129,8 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) * Returns: 0, or errno on failure */ -static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) +static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name) { - struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; @@ -1156,24 +1154,24 @@ static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) } /** - * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute - * @inode: The inode - * @type: The type of the extended attribute + * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @ip: The inode * @name: The name of the extended attribute * @value: The value of the extended attribute (NULL for remove) * @size: The size of the @value argument * @flags: Create or Replace + * @type: The type of the extended attribute * * See gfs2_xattr_remove() for details of the removal of xattrs. * * Returns: 0 or errno on failure */ -int gfs2_xattr_set(struct inode *inode, int type, const char *name, - const void *value, size_t size, int flags) +int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags, int type) { - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_ea_location el; unsigned int namel = strlen(name); int error; @@ -1184,7 +1182,7 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name, return -ERANGE; if (value == NULL) - return gfs2_xattr_remove(inode, type, name); + return gfs2_xattr_remove(ip, type, name); if (ea_check_size(sdp, namel, size)) return -ERANGE; @@ -1224,6 +1222,13 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name, return error; } +static int gfs2_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + return __gfs2_xattr_set(dentry->d_inode, name, value, + size, flags, type); +} + static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, char *data) { @@ -1291,6 +1296,7 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_location el; struct buffer_head *dibh; int error; @@ -1300,16 +1306,17 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) return error; if (GFS2_EA_IS_STUFFED(el.el_ea)) { - error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); - if (error) - return error; - - gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); - memcpy(GFS2_EA2DATA(el.el_ea), data, - GFS2_EA_DATA_LEN(el.el_ea)); - } else + error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); + if (error == 0) { + gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); + memcpy(GFS2_EA2DATA(el.el_ea), data, + GFS2_EA_DATA_LEN(el.el_ea)); + } + } else { error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); + } + brelse(el.el_bh); if (error) return error; @@ -1322,8 +1329,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) brelse(dibh); } - gfs2_trans_end(GFS2_SB(&ip->i_inode)); - + gfs2_trans_end(sdp); return error; } @@ -1529,40 +1535,18 @@ out_alloc: return error; } -static int gfs2_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size); -} - -static int gfs2_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags); -} - -static int gfs2_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size); -} - -static int gfs2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags); -} - static struct xattr_handler gfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .get = gfs2_xattr_user_get, - .set = gfs2_xattr_user_set, + .flags = GFS2_EATYPE_USR, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, }; static struct xattr_handler gfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = gfs2_xattr_security_get, - .set = gfs2_xattr_security_set, + .flags = GFS2_EATYPE_SECURITY, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, }; struct xattr_handler *gfs2_xattr_handlers[] = { diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index 8d6ae5813c4d..d392f8358f2f 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -53,10 +53,9 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; -extern int gfs2_xattr_get(struct inode *inode, int type, const char *name, - void *buffer, size_t size); -extern int gfs2_xattr_set(struct inode *inode, int type, const char *name, - const void *value, size_t size, int flags); +extern int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type); extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); extern int gfs2_ea_dealloc(struct gfs2_inode *ip); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 6d98f116ca03..424b0337f524 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -289,6 +289,10 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, err = hfs_brec_find(&src_fd); if (err) goto out; + if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) { + err = -EIO; + goto out; + } hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, src_fd.entrylength); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 7c69b98a2e45..2b3b8611b41b 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -79,6 +79,11 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) filp->f_pos++; /* fall through */ case 1: + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { printk(KERN_ERR "hfs: bad catalog folder thread\n"); @@ -109,6 +114,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } + + if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { + err = -EIO; + goto out; + } + hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = entry.type; len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index f7fcbe49da72..5ed7252b7b23 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -409,8 +409,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) /* try to get the root inode */ hfs_find_init(HFS_SB(sb)->cat_tree, &fd); res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); - if (!res) + if (!res) { + if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + res = -EIO; + goto bail; + } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + } if (res) { hfs_find_exit(&fd); goto bail_no_root; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index f2feaa06bf26..cadc4ce48656 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -14,6 +14,7 @@ #include <linux/magic.h> #include <linux/sched.h> #include <linux/smp_lock.h> +#include <linux/bitmap.h> /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ @@ -115,15 +116,13 @@ static void hpfs_put_super(struct super_block *s) unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) { struct quad_buffer_head qbh; - unsigned *bits; - unsigned i, count; - if (!(bits = hpfs_map_4sectors(s, secno, &qbh, 4))) return 0; - count = 0; - for (i = 0; i < 2048 / sizeof(unsigned); i++) { - unsigned b; - if (!bits[i]) continue; - for (b = bits[i]; b; b>>=1) count += b & 1; - } + unsigned long *bits; + unsigned count; + + bits = hpfs_map_4sectors(s, secno, &qbh, 4); + if (!bits) + return 0; + count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); hpfs_brelse4(&qbh); return count; } diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index a5089a6dd67a..7239efc690d8 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -646,22 +646,27 @@ static const struct super_operations hppfs_sbops = { static int hppfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct dentry *proc_dentry; - - proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, buflen); } static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct dentry *proc_dentry; - - proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); } +static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + + if (proc_dentry->d_inode->i_op->put_link) + proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie); +} + static const struct inode_operations hppfs_dir_iops = { .lookup = hppfs_lookup, }; @@ -669,6 +674,7 @@ static const struct inode_operations hppfs_dir_iops = { static const struct inode_operations hppfs_link_iops = { .readlink = hppfs_readlink, .follow_link = hppfs_follow_link, + .put_link = hppfs_put_link, }; static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87a1258953b8..a0bbd3d1b41a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -30,7 +30,6 @@ #include <linux/dnotify.h> #include <linux/statfs.h> #include <linux/security.h> -#include <linux/ima.h> #include <linux/magic.h> #include <asm/uaccess.h> @@ -922,7 +921,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, int error = -ENOMEM; struct file *file; struct inode *inode; - struct dentry *dentry, *root; + struct path path; + struct dentry *root; struct qstr quick_string; *user = NULL; @@ -944,10 +944,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, quick_string.name = name; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; - dentry = d_alloc(root, &quick_string); - if (!dentry) + path.dentry = d_alloc(root, &quick_string); + if (!path.dentry) goto out_shm_unlock; + path.mnt = mntget(hugetlbfs_vfsmount); error = -ENOSPC; inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(), current_fsgid(), S_IFREG | S_IRWXUGO, 0); @@ -960,24 +961,22 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, acctflag)) goto out_inode; - d_instantiate(dentry, inode); + d_instantiate(path.dentry, inode); inode->i_size = size; inode->i_nlink = 0; error = -ENFILE; - file = alloc_file(hugetlbfs_vfsmount, dentry, - FMODE_WRITE | FMODE_READ, + file = alloc_file(&path, FMODE_WRITE | FMODE_READ, &hugetlbfs_file_operations); if (!file) goto out_dentry; /* inode is already attached */ - ima_counts_get(file); return file; out_inode: iput(inode); out_dentry: - dput(dentry); + path_put(&path); out_shm_unlock: if (*user) { user_shm_unlock(size, *user); diff --git a/fs/inode.c b/fs/inode.c index 06c1f02de611..03dfeb2e3928 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -113,7 +113,7 @@ static void wake_up_inode(struct inode *inode) * Prevent speculative execution through spin_unlock(&inode_lock); */ smp_mb(); - wake_up_bit(&inode->i_state, __I_LOCK); + wake_up_bit(&inode->i_state, __I_NEW); } /** @@ -690,17 +690,17 @@ void unlock_new_inode(struct inode *inode) } #endif /* - * This is special! We do not need the spinlock when clearing I_LOCK, + * This is special! We do not need the spinlock when clearing I_NEW, * because we're guaranteed that nobody else tries to do anything about * the state of the inode when it is locked, as we just created it (so - * there can be no old holders that haven't tested I_LOCK). + * there can be no old holders that haven't tested I_NEW). * However we must emit the memory barrier so that other CPUs reliably - * see the clearing of I_LOCK after the other inode initialisation has + * see the clearing of I_NEW after the other inode initialisation has * completed. */ smp_mb(); - WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); - inode->i_state &= ~(I_LOCK|I_NEW); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; wake_up_inode(inode); } EXPORT_SYMBOL(unlock_new_inode); @@ -731,7 +731,7 @@ static struct inode *get_new_inode(struct super_block *sb, goto set_failed; __inode_add_to_lists(sb, head, inode); - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -778,7 +778,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, if (!old) { inode->i_ino = ino; __inode_add_to_lists(sb, head, inode); - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -1083,7 +1083,7 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - inode->i_state |= I_LOCK|I_NEW; + inode->i_state |= I_NEW; while (1) { struct hlist_node *node; struct inode *old = NULL; @@ -1120,7 +1120,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - inode->i_state |= I_LOCK|I_NEW; + inode->i_state |= I_NEW; while (1) { struct hlist_node *node; @@ -1510,7 +1510,7 @@ EXPORT_SYMBOL(inode_wait); * until the deletion _might_ have completed. Callers are responsible * to recheck inode state. * - * It doesn't matter if I_LOCK is not set initially, a call to + * It doesn't matter if I_NEW is not set initially, a call to * wake_up_inode() after removing from the hash list will DTRT. * * This is called with inode_lock held. @@ -1518,8 +1518,8 @@ EXPORT_SYMBOL(inode_wait); static void __wait_on_freeing_inode(struct inode *inode) { wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); - wq = bit_waitqueue(&inode->i_state, __I_LOCK); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode_lock); schedule(); diff --git a/fs/internal.h b/fs/internal.h index 515175b8b72e..e96a1667d749 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -79,8 +79,16 @@ extern void chroot_fs_refs(struct path *, struct path *); * file_table.c */ extern void mark_files_ro(struct super_block *); +extern struct file *get_empty_filp(void); /* * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); + +/* + * open.c + */ +struct nameidata; +extern struct file *nameidata_to_filp(struct nameidata *); +extern void release_open_intent(struct nameidata *); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index defb932eee9a..0b3fa7974fa8 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -36,286 +36,323 @@ static void *zisofs_zlib_workspace; static DEFINE_MUTEX(zisofs_zlib_lock); /* - * When decompressing, we typically obtain more than one page - * per reference. We inject the additional pages into the page - * cache as a form of readahead. + * Read data of @inode from @block_start to @block_end and uncompress + * to one zisofs block. Store the data in the @pages array with @pcount + * entries. Start storing at offset @poffset of the first page. */ -static int zisofs_readpage(struct file *file, struct page *page) +static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, + loff_t block_end, int pcount, + struct page **pages, unsigned poffset, + int *errp) { - struct inode *inode = file->f_path.dentry->d_inode; - struct address_space *mapping = inode->i_mapping; - unsigned int maxpage, xpage, fpage, blockindex; - unsigned long offset; - unsigned long blockptr, blockendptr, cstart, cend, csize; - struct buffer_head *bh, *ptrbh[2]; - unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); - unsigned int bufshift = ISOFS_BUFFER_BITS(inode); - unsigned long bufmask = bufsize - 1; - int err = -EIO; - int i; - unsigned int header_size = ISOFS_I(inode)->i_format_parm[0]; unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; - /* unsigned long zisofs_block_size = 1UL << zisofs_block_shift; */ - unsigned int zisofs_block_page_shift = zisofs_block_shift-PAGE_CACHE_SHIFT; - unsigned long zisofs_block_pages = 1UL << zisofs_block_page_shift; - unsigned long zisofs_block_page_mask = zisofs_block_pages-1; - struct page *pages[zisofs_block_pages]; - unsigned long index = page->index; - int indexblocks; - - /* We have already been given one page, this is the one - we must do. */ - xpage = index & zisofs_block_page_mask; - pages[xpage] = page; - - /* The remaining pages need to be allocated and inserted */ - offset = index & ~zisofs_block_page_mask; - blockindex = offset >> zisofs_block_page_shift; - maxpage = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - /* - * If this page is wholly outside i_size we just return zero; - * do_generic_file_read() will handle this for us - */ - if (page->index >= maxpage) { - SetPageUptodate(page); - unlock_page(page); + unsigned int bufsize = ISOFS_BUFFER_SIZE(inode); + unsigned int bufshift = ISOFS_BUFFER_BITS(inode); + unsigned int bufmask = bufsize - 1; + int i, block_size = block_end - block_start; + z_stream stream = { .total_out = 0, + .avail_in = 0, + .avail_out = 0, }; + int zerr; + int needblocks = (block_size + (block_start & bufmask) + bufmask) + >> bufshift; + int haveblocks; + blkcnt_t blocknum; + struct buffer_head *bhs[needblocks + 1]; + int curbh, curpage; + + if (block_size > deflateBound(1UL << zisofs_block_shift)) { + *errp = -EIO; return 0; } - - maxpage = min(zisofs_block_pages, maxpage-offset); - - for ( i = 0 ; i < maxpage ; i++, offset++ ) { - if ( i != xpage ) { - pages[i] = grab_cache_page_nowait(mapping, offset); - } - page = pages[i]; - if ( page ) { - ClearPageError(page); - kmap(page); + /* Empty block? */ + if (block_size == 0) { + for ( i = 0 ; i < pcount ; i++ ) { + if (!pages[i]) + continue; + memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE); + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); } + return ((loff_t)pcount) << PAGE_CACHE_SHIFT; } - /* This is the last page filled, plus one; used in case of abort. */ - fpage = 0; + /* Because zlib is not thread-safe, do all the I/O at the top. */ + blocknum = block_start >> bufshift; + memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *)); + haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); + ll_rw_block(READ, haveblocks, bhs); - /* Find the pointer to this specific chunk */ - /* Note: we're not using isonum_731() here because the data is known aligned */ - /* Note: header_size is in 32-bit words (4 bytes) */ - blockptr = (header_size + blockindex) << 2; - blockendptr = blockptr + 4; + curbh = 0; + curpage = 0; + /* + * First block is special since it may be fractional. We also wait for + * it before grabbing the zlib mutex; odds are that the subsequent + * blocks are going to come in in short order so we don't hold the zlib + * mutex longer than necessary. + */ - indexblocks = ((blockptr^blockendptr) >> bufshift) ? 2 : 1; - ptrbh[0] = ptrbh[1] = NULL; + if (!bhs[0]) + goto b_eio; - if ( isofs_get_blocks(inode, blockptr >> bufshift, ptrbh, indexblocks) != indexblocks ) { - if ( ptrbh[0] ) brelse(ptrbh[0]); - printk(KERN_DEBUG "zisofs: Null buffer on reading block table, inode = %lu, block = %lu\n", - inode->i_ino, blockptr >> bufshift); - goto eio; - } - ll_rw_block(READ, indexblocks, ptrbh); - - bh = ptrbh[0]; - if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) { - printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n", - inode->i_ino, blockptr >> bufshift); - if ( ptrbh[1] ) - brelse(ptrbh[1]); - goto eio; - } - cstart = le32_to_cpu(*(__le32 *)(bh->b_data + (blockptr & bufmask))); - - if ( indexblocks == 2 ) { - /* We just crossed a block boundary. Switch to the next block */ - brelse(bh); - bh = ptrbh[1]; - if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) { - printk(KERN_DEBUG "zisofs: Failed to read block table, inode = %lu, block = %lu\n", - inode->i_ino, blockendptr >> bufshift); - goto eio; - } + wait_on_buffer(bhs[0]); + if (!buffer_uptodate(bhs[0])) { + *errp = -EIO; + goto b_eio; } - cend = le32_to_cpu(*(__le32 *)(bh->b_data + (blockendptr & bufmask))); - brelse(bh); - if (cstart > cend) - goto eio; + stream.workspace = zisofs_zlib_workspace; + mutex_lock(&zisofs_zlib_lock); - csize = cend-cstart; - - if (csize > deflateBound(1UL << zisofs_block_shift)) - goto eio; - - /* Now page[] contains an array of pages, any of which can be NULL, - and the locks on which we hold. We should now read the data and - release the pages. If the pages are NULL the decompressed data - for that particular page should be discarded. */ - - if ( csize == 0 ) { - /* This data block is empty. */ - - for ( fpage = 0 ; fpage < maxpage ; fpage++ ) { - if ( (page = pages[fpage]) != NULL ) { - memset(page_address(page), 0, PAGE_CACHE_SIZE); - - flush_dcache_page(page); - SetPageUptodate(page); - kunmap(page); - unlock_page(page); - if ( fpage == xpage ) - err = 0; /* The critical page */ - else - page_cache_release(page); + zerr = zlib_inflateInit(&stream); + if (zerr != Z_OK) { + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else + *errp = -EIO; + printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n", + zerr); + goto z_eio; + } + + while (curpage < pcount && curbh < haveblocks && + zerr != Z_STREAM_END) { + if (!stream.avail_out) { + if (pages[curpage]) { + stream.next_out = page_address(pages[curpage]) + + poffset; + stream.avail_out = PAGE_CACHE_SIZE - poffset; + poffset = 0; + } else { + stream.next_out = (void *)&zisofs_sink_page; + stream.avail_out = PAGE_CACHE_SIZE; } } - } else { - /* This data block is compressed. */ - z_stream stream; - int bail = 0, left_out = -1; - int zerr; - int needblocks = (csize + (cstart & bufmask) + bufmask) >> bufshift; - int haveblocks; - struct buffer_head *bhs[needblocks+1]; - struct buffer_head **bhptr; - - /* Because zlib is not thread-safe, do all the I/O at the top. */ - - blockptr = cstart >> bufshift; - memset(bhs, 0, (needblocks+1)*sizeof(struct buffer_head *)); - haveblocks = isofs_get_blocks(inode, blockptr, bhs, needblocks); - ll_rw_block(READ, haveblocks, bhs); - - bhptr = &bhs[0]; - bh = *bhptr++; - - /* First block is special since it may be fractional. - We also wait for it before grabbing the zlib - mutex; odds are that the subsequent blocks are - going to come in in short order so we don't hold - the zlib mutex longer than necessary. */ - - if ( !bh || (wait_on_buffer(bh), !buffer_uptodate(bh)) ) { - printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n", - fpage, xpage, csize); - goto b_eio; - } - stream.next_in = bh->b_data + (cstart & bufmask); - stream.avail_in = min(bufsize-(cstart & bufmask), csize); - csize -= stream.avail_in; - - stream.workspace = zisofs_zlib_workspace; - mutex_lock(&zisofs_zlib_lock); - - zerr = zlib_inflateInit(&stream); - if ( zerr != Z_OK ) { - if ( err && zerr == Z_MEM_ERROR ) - err = -ENOMEM; - printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n", - zerr); - goto z_eio; + if (!stream.avail_in) { + wait_on_buffer(bhs[curbh]); + if (!buffer_uptodate(bhs[curbh])) { + *errp = -EIO; + break; + } + stream.next_in = bhs[curbh]->b_data + + (block_start & bufmask); + stream.avail_in = min_t(unsigned, bufsize - + (block_start & bufmask), + block_size); + block_size -= stream.avail_in; + block_start = 0; } - while ( !bail && fpage < maxpage ) { - page = pages[fpage]; - if ( page ) - stream.next_out = page_address(page); - else - stream.next_out = (void *)&zisofs_sink_page; - stream.avail_out = PAGE_CACHE_SIZE; - - while ( stream.avail_out ) { - int ao, ai; - if ( stream.avail_in == 0 && left_out ) { - if ( !csize ) { - printk(KERN_WARNING "zisofs: ZF read beyond end of input\n"); - bail = 1; - break; - } else { - bh = *bhptr++; - if ( !bh || - (wait_on_buffer(bh), !buffer_uptodate(bh)) ) { - /* Reached an EIO */ - printk(KERN_DEBUG "zisofs: Hit null buffer, fpage = %d, xpage = %d, csize = %ld\n", - fpage, xpage, csize); - - bail = 1; - break; - } - stream.next_in = bh->b_data; - stream.avail_in = min(csize,bufsize); - csize -= stream.avail_in; - } - } - ao = stream.avail_out; ai = stream.avail_in; - zerr = zlib_inflate(&stream, Z_SYNC_FLUSH); - left_out = stream.avail_out; - if ( zerr == Z_BUF_ERROR && stream.avail_in == 0 ) - continue; - if ( zerr != Z_OK ) { - /* EOF, error, or trying to read beyond end of input */ - if ( err && zerr == Z_MEM_ERROR ) - err = -ENOMEM; - if ( zerr != Z_STREAM_END ) - printk(KERN_DEBUG "zisofs: zisofs_inflate returned %d, inode = %lu, index = %lu, fpage = %d, xpage = %d, avail_in = %d, avail_out = %d, ai = %d, ao = %d\n", - zerr, inode->i_ino, index, - fpage, xpage, - stream.avail_in, stream.avail_out, - ai, ao); - bail = 1; - break; + while (stream.avail_out && stream.avail_in) { + zerr = zlib_inflate(&stream, Z_SYNC_FLUSH); + if (zerr == Z_BUF_ERROR && stream.avail_in == 0) + break; + if (zerr == Z_STREAM_END) + break; + if (zerr != Z_OK) { + /* EOF, error, or trying to read beyond end of input */ + if (zerr == Z_MEM_ERROR) + *errp = -ENOMEM; + else { + printk(KERN_DEBUG + "zisofs: zisofs_inflate returned" + " %d, inode = %lu," + " page idx = %d, bh idx = %d," + " avail_in = %d," + " avail_out = %d\n", + zerr, inode->i_ino, curpage, + curbh, stream.avail_in, + stream.avail_out); + *errp = -EIO; } + goto inflate_out; } + } - if ( stream.avail_out && zerr == Z_STREAM_END ) { - /* Fractional page written before EOF. This may - be the last page in the file. */ - memset(stream.next_out, 0, stream.avail_out); - stream.avail_out = 0; + if (!stream.avail_out) { + /* This page completed */ + if (pages[curpage]) { + flush_dcache_page(pages[curpage]); + SetPageUptodate(pages[curpage]); } + curpage++; + } + if (!stream.avail_in) + curbh++; + } +inflate_out: + zlib_inflateEnd(&stream); - if ( !stream.avail_out ) { - /* This page completed */ - if ( page ) { - flush_dcache_page(page); - SetPageUptodate(page); - kunmap(page); - unlock_page(page); - if ( fpage == xpage ) - err = 0; /* The critical page */ - else - page_cache_release(page); - } - fpage++; - } +z_eio: + mutex_unlock(&zisofs_zlib_lock); + +b_eio: + for (i = 0; i < haveblocks; i++) + brelse(bhs[i]); + return stream.total_out; +} + +/* + * Uncompress data so that pages[full_page] is fully uptodate and possibly + * fills in other pages if we have data for them. + */ +static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, + struct page **pages) +{ + loff_t start_off, end_off; + loff_t block_start, block_end; + unsigned int header_size = ISOFS_I(inode)->i_format_parm[0]; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int blockptr; + loff_t poffset = 0; + blkcnt_t cstart_block, cend_block; + struct buffer_head *bh; + unsigned int blkbits = ISOFS_BUFFER_BITS(inode); + unsigned int blksize = 1 << blkbits; + int err; + loff_t ret; + + BUG_ON(!pages[full_page]); + + /* + * We want to read at least 'full_page' page. Because we have to + * uncompress the whole compression block anyway, fill the surrounding + * pages with the data we have anyway... + */ + start_off = page_offset(pages[full_page]); + end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size); + + cstart_block = start_off >> zisofs_block_shift; + cend_block = (end_off + (1 << zisofs_block_shift) - 1) + >> zisofs_block_shift; + + WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) != + ((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK)); + + /* Find the pointer to this specific chunk */ + /* Note: we're not using isonum_731() here because the data is known aligned */ + /* Note: header_size is in 32-bit words (4 bytes) */ + blockptr = (header_size + cstart_block) << 2; + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + block_start = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + + while (cstart_block < cend_block && pcount > 0) { + /* Load end of the compressed block in the file */ + blockptr += 4; + /* Traversed to next block? */ + if (!(blockptr & (blksize - 1))) { + brelse(bh); + + bh = isofs_bread(inode, blockptr >> blkbits); + if (!bh) + return -EIO; + } + block_end = le32_to_cpu(*(__le32 *) + (bh->b_data + (blockptr & (blksize - 1)))); + if (block_start > block_end) { + brelse(bh); + return -EIO; + } + err = 0; + ret = zisofs_uncompress_block(inode, block_start, block_end, + pcount, pages, poffset, &err); + poffset += ret; + pages += poffset >> PAGE_CACHE_SHIFT; + pcount -= poffset >> PAGE_CACHE_SHIFT; + full_page -= poffset >> PAGE_CACHE_SHIFT; + poffset &= ~PAGE_CACHE_MASK; + + if (err) { + brelse(bh); + /* + * Did we finish reading the page we really wanted + * to read? + */ + if (full_page < 0) + return 0; + return err; } - zlib_inflateEnd(&stream); - z_eio: - mutex_unlock(&zisofs_zlib_lock); + block_start = block_end; + cstart_block++; + } + + if (poffset && *pages) { + memset(page_address(*pages) + poffset, 0, + PAGE_CACHE_SIZE - poffset); + flush_dcache_page(*pages); + SetPageUptodate(*pages); + } + return 0; +} - b_eio: - for ( i = 0 ; i < haveblocks ; i++ ) { - if ( bhs[i] ) - brelse(bhs[i]); +/* + * When decompressing, we typically obtain more than one page + * per reference. We inject the additional pages into the page + * cache as a form of readahead. + */ +static int zisofs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + int err; + int i, pcount, full_page; + unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1]; + unsigned int zisofs_pages_per_cblock = + PAGE_CACHE_SHIFT <= zisofs_block_shift ? + (1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0; + struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)]; + pgoff_t index = page->index, end_index; + + end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* + * If this page is wholly outside i_size we just return zero; + * do_generic_file_read() will handle this for us + */ + if (index >= end_index) { + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + if (PAGE_CACHE_SHIFT <= zisofs_block_shift) { + /* We have already been given one page, this is the one + we must do. */ + full_page = index & (zisofs_pages_per_cblock - 1); + pcount = min_t(int, zisofs_pages_per_cblock, + end_index - (index & ~(zisofs_pages_per_cblock - 1))); + index -= full_page; + } else { + full_page = 0; + pcount = 1; + } + pages[full_page] = page; + + for (i = 0; i < pcount; i++, index++) { + if (i != full_page) + pages[i] = grab_cache_page_nowait(mapping, index); + if (pages[i]) { + ClearPageError(pages[i]); + kmap(pages[i]); } } -eio: + err = zisofs_fill_pages(inode, full_page, pcount, pages); /* Release any residual pages, do not SetPageUptodate */ - while ( fpage < maxpage ) { - page = pages[fpage]; - if ( page ) { - flush_dcache_page(page); - if ( fpage == xpage ) - SetPageError(page); - kunmap(page); - unlock_page(page); - if ( fpage != xpage ) - page_cache_release(page); + for (i = 0; i < pcount; i++) { + if (pages[i]) { + flush_dcache_page(pages[i]); + if (i == full_page && err) + SetPageError(pages[i]); + kunmap(pages[i]); + unlock_page(pages[i]); + if (i != full_page) + page_cache_release(pages[i]); } - fpage++; } /* At this point, err contains 0 or -EIO depending on the "critical" page */ diff --git a/fs/isofs/export.c b/fs/isofs/export.c index e81a30593ba9..ed752cb38474 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -9,7 +9,7 @@ * * The following files are helpful: * - * Documentation/filesystems/Exporting + * Documentation/filesystems/nfs/Exporting * fs/exportfs/expfs.c. */ diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index c2fb2dd0131f..96a685c550fd 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -518,8 +518,7 @@ repeat: if (algo == SIG('p', 'z')) { int block_shift = isonum_711(&rr->u.ZF.parms[1]); - if (block_shift < PAGE_CACHE_SHIFT - || block_shift > 17) { + if (block_shift > 17) { printk(KERN_WARNING "isofs: " "Can't handle ZF block " "size of 2^%d\n", diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 4160afad6d00..bd224eec9b07 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1913,7 +1913,7 @@ static void __init jbd_create_debugfs_entry(void) { jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); if (jbd_debugfs_dir) - jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO, + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, jbd_debugfs_dir, &journal_enable_debug); } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index ca0f5eb62b20..886849370950 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -22,6 +22,7 @@ #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include <trace/events/jbd2.h> /* @@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal) journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); + + /* + * If there is an external journal, we need to make sure that + * any data blocks that were recently written out --- perhaps + * by jbd2_log_do_checkpoint() --- are flushed out before we + * drop the transactions from the external journal. It's + * unlikely this will be necessary, especially with a + * appropriately sized journal, but we need this to guarantee + * correctness. Fortunately jbd2_cleanup_journal_tail() + * doesn't get called all that often. + */ + if ((journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, NULL); if (!(journal->j_flags & JBD2_ABORT)) jbd2_journal_update_superblock(journal, 1); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index d4cfd6d2779e..1bc74b6f26d2 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); + commit_transaction->t_flushed_data_blocks = 1; jinode->i_flags &= ~JI_COMMIT_RUNNING; wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -286,7 +287,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, if (err) { /* * Because AS_EIO is cleared by - * wait_on_page_writeback_range(), set it again so + * filemap_fdatawait_range(), set it again so * that user process can get -EIO from fsync(). */ set_bit(AS_EIO, @@ -636,6 +637,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); flags = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); + if (flags < 0) { + jbd2_journal_abort(journal, flags); + continue; + } set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); wbuf[bufs++] = jh2bh(new_jh); @@ -704,8 +709,17 @@ start_journal_io: } } - /* Done it all: now write the commit record asynchronously. */ + /* + * If the journal is not located on the file system device, + * then we must flush the file system device before we issue + * the commit record + */ + if (commit_transaction->t_flushed_data_blocks && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, NULL); + /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, @@ -716,13 +730,6 @@ start_journal_io: blkdev_issue_flush(journal->j_dev, NULL); } - /* - * This is the right place to wait for data buffers both for ASYNC - * and !ASYNC commit. If commit is ASYNC, we need to wait only after - * the commit block went to disk (which happens above). If commit is - * SYNC, we need to wait for data buffers before we start writing - * commit block, which happens below in such setting. - */ err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { printk(KERN_WARNING diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fed85388ee86..ac0d027595d0 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno); EXPORT_SYMBOL(jbd2_journal_ack_err); EXPORT_SYMBOL(jbd2_journal_clear_err); EXPORT_SYMBOL(jbd2_log_wait_commit); +EXPORT_SYMBOL(jbd2_log_start_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); @@ -358,6 +359,10 @@ repeat: jbd_unlock_bh_state(bh_in); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); + if (!tmp) { + jbd2_journal_put_journal_head(new_jh); + return -ENOMEM; + } jbd_lock_bh_state(bh_in); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); @@ -809,7 +814,7 @@ static journal_t * journal_init_common (void) journal_t *journal; int err; - journal = kzalloc(sizeof(*journal), GFP_KERNEL|__GFP_NOFAIL); + journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) goto fail; @@ -1248,6 +1253,13 @@ int jbd2_journal_load(journal_t *journal) if (jbd2_journal_recover(journal)) goto recovery_error; + if (journal->j_failed_commit) { + printk(KERN_ERR "JBD2: journal transaction %u on %s " + "is corrupt.\n", journal->j_failed_commit, + journal->j_devname); + return -EIO; + } + /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ @@ -2103,7 +2115,8 @@ static void __init jbd2_create_debugfs_entry(void) { jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); if (jbd2_debugfs_dir) - jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO, + jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, + S_IRUGO | S_IWUSR, jbd2_debugfs_dir, &jbd2_journal_enable_debug); } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 7edb62e97419..7cdc3196476a 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -350,8 +350,8 @@ int jffs2_acl_chmod(struct inode *inode) return rc; } -static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); @@ -360,8 +360,8 @@ static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t return retlen; } -static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); @@ -370,12 +370,16 @@ static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_ return retlen; } -static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size) +static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { struct posix_acl *acl; int rc; - acl = jffs2_get_acl(inode, type); + if (name[0] != '\0') + return -EINVAL; + + acl = jffs2_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) @@ -386,26 +390,15 @@ static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_ return rc; } -static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size) +static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { struct posix_acl *acl; int rc; - if (!is_owner_or_cap(inode)) + if (name[0] != '\0') + return -EINVAL; + if (!is_owner_or_cap(dentry->d_inode)) return -EPERM; if (value) { @@ -420,38 +413,24 @@ static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, } else { acl = NULL; } - rc = jffs2_set_acl(inode, type, acl); + rc = jffs2_set_acl(dentry->d_inode, type, acl); out: posix_acl_release(acl); return rc; } -static int jffs2_acl_access_setxattr(struct inode *inode, const char *name, - const void *buffer, size_t size, int flags) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int jffs2_acl_default_setxattr(struct inode *inode, const char *name, - const void *buffer, size_t size, int flags) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size); -} - struct xattr_handler jffs2_acl_access_xattr_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_access_listxattr, - .get = jffs2_acl_access_getxattr, - .set = jffs2_acl_access_setxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, }; struct xattr_handler jffs2_acl_default_xattr_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_default_listxattr, - .get = jffs2_acl_default_getxattr, - .set = jffs2_acl_default_setxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, }; diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 090c556ffed2..3b6f2fa12cff 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -700,7 +700,8 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_ struct jffs2_raw_inode ri; struct jffs2_node_frag *last_frag; union jffs2_device_node dev; - char *mdata = NULL, mdatalen = 0; + char *mdata = NULL; + int mdatalen = 0; uint32_t alloclen, ilen; int ret; diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 378991cfe40f..e22de8397b74 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1284,7 +1284,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, f->target = NULL; mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); - return -ret; + return ret; } f->target[je32_to_cpu(latest_node->csize)] = '\0'; diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 02c39c64ecb3..eaccee058583 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -44,26 +44,28 @@ int jffs2_init_security(struct inode *inode, struct inode *dir) } /* ---- XATTR Handler for "security.*" ----------------- */ -static int jffs2_security_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) +static int jffs2_security_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size); + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size); } -static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +static int jffs2_security_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size, flags); } -static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index 6caf1e1ee26d..800171dca53b 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -23,7 +23,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c) { - uint32_t sum_size = max_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE); + uint32_t sum_size = min_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE); c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 4b107881acd5..9e75c62c85d6 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -990,9 +990,11 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) if (!xhandle) continue; if (buffer) { - rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len); + rc = xhandle->list(dentry, buffer+len, size-len, + xd->xname, xd->name_len, xd->flags); } else { - rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len); + rc = xhandle->list(dentry, NULL, 0, xd->xname, + xd->name_len, xd->flags); } if (rc < 0) goto out; diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 8ec5765ef348..3e5a5e356e05 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -16,24 +16,26 @@ #include <linux/mtd/mtd.h> #include "nodelist.h" -static int jffs2_trusted_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) +static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size); + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size); } -static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size, flags); } -static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 8bbeab90ada1..8544af67dffe 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -16,24 +16,26 @@ #include <linux/mtd/mtd.h> #include "nodelist.h" -static int jffs2_user_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) +static int jffs2_user_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size); + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + name, buffer, size); } -static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +static int jffs2_user_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + name, buffer, size, flags); } -static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index f26e4d03ada5..d945ea76b445 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1292,7 +1292,7 @@ int txCommit(tid_t tid, /* transaction identifier */ */ /* * I believe this code is no longer needed. Splitting I_LOCK - * into two bits, I_LOCK and I_SYNC should prevent this + * into two bits, I_NEW and I_SYNC should prevent this * deadlock as well. But since I don't have a JFS testload * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. * Joern diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2234c73fc577..d929a822a74e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -524,7 +524,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) * Page cache is indexed by long. * I would use MAX_LFS_FILESIZE, but it's only half as big */ - sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes); + sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes); #endif sb->s_time_gran = 1; return 0; diff --git a/fs/libfs.c b/fs/libfs.c index 219576c52d80..6e8d17e1dc4c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -848,7 +848,6 @@ EXPORT_SYMBOL(simple_write_end); EXPORT_SYMBOL(simple_dir_inode_operations); EXPORT_SYMBOL(simple_dir_operations); EXPORT_SYMBOL(simple_empty); -EXPORT_SYMBOL(d_alloc_name); EXPORT_SYMBOL(simple_fill_super); EXPORT_SYMBOL(simple_getattr); EXPORT_SYMBOL(simple_link); diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index bd173a6ca3b1..a7966eed3c17 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -11,10 +11,6 @@ #include <linux/time.h> #include <linux/slab.h> #include <linux/smp_lock.h> -#include <linux/in.h> -#include <linux/sunrpc/svc.h> -#include <linux/sunrpc/clnt.h> -#include <linux/nfsd/nfsd.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index e1d28ddd2169..56c9519d900a 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -11,10 +11,6 @@ #include <linux/time.h> #include <linux/slab.h> #include <linux/smp_lock.h> -#include <linux/in.h> -#include <linux/sunrpc/svc.h> -#include <linux/sunrpc/clnt.h> -#include <linux/nfsd/nfsd.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> diff --git a/fs/namei.c b/fs/namei.c index d11f404667e9..94a5e60779f9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -35,7 +35,7 @@ #include <linux/fs_struct.h> #include <asm/uaccess.h> -#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) +#include "internal.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) @@ -108,8 +108,6 @@ * any extra contention... */ -static int __link_path_walk(const char *name, struct nameidata *nd); - /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. @@ -234,6 +232,7 @@ int generic_permission(struct inode *inode, int mask, /* * Searching includes executable on directories, else just read. */ + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) if (capable(CAP_DAC_READ_SEARCH)) return 0; @@ -414,36 +413,55 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) } /* - * Internal lookup() using the new generic dcache. - * SMP-safe + * force_reval_path - force revalidation of a dentry + * + * In some situations the path walking code will trust dentries without + * revalidating them. This causes problems for filesystems that depend on + * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set + * (which indicates that it's possible for the dentry to go stale), force + * a d_revalidate call before proceeding. + * + * Returns 0 if the revalidation was successful. If the revalidation fails, + * either return the error returned by d_revalidate or -ESTALE if the + * revalidation it just returned 0. If d_revalidate returns 0, we attempt to + * invalidate the dentry. It's up to the caller to handle putting references + * to the path if necessary. */ -static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) +static int +force_reval_path(struct path *path, struct nameidata *nd) { - struct dentry * dentry = __d_lookup(parent, name); + int status; + struct dentry *dentry = path->dentry; - /* lockess __d_lookup may fail due to concurrent d_move() - * in some unrelated directory, so try with d_lookup + /* + * only check on filesystems where it's possible for the dentry to + * become stale. It's assumed that if this flag is set then the + * d_revalidate op will also be defined. */ - if (!dentry) - dentry = d_lookup(parent, name); + if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) + return 0; - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) - dentry = do_revalidate(dentry, nd); + status = dentry->d_op->d_revalidate(dentry, nd); + if (status > 0) + return 0; - return dentry; + if (!status) { + d_invalidate(dentry); + status = -ESTALE; + } + return status; } /* - * Short-cut version of permission(), for calling by - * path_walk(), when dcache lock is held. Combines parts - * of permission() and generic_permission(), and tests ONLY for - * MAY_EXEC permission. + * Short-cut version of permission(), for calling on directories + * during pathname resolution. Combines parts of permission() + * and generic_permission(), and tests ONLY for MAY_EXEC permission. * * If appropriate, check DAC only. If not appropriate, or - * short-cut DAC fails, then call permission() to do more + * short-cut DAC fails, then call ->permission() to do more * complete permission check. */ -static int exec_permission_lite(struct inode *inode) +static int exec_permission(struct inode *inode) { int ret; @@ -465,99 +483,6 @@ ok: return security_inode_permission(inode, MAY_EXEC); } -/* - * This is called when everything else fails, and we actually have - * to go to the low-level filesystem to find out what we should do.. - * - * We get the directory semaphore, and after getting that we also - * make sure that nobody added the entry to the dcache in the meantime.. - * SMP-safe - */ -static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) -{ - struct dentry * result; - struct inode *dir = parent->d_inode; - - mutex_lock(&dir->i_mutex); - /* - * First re-do the cached lookup just in case it was created - * while we waited for the directory semaphore.. - * - * FIXME! This could use version numbering or similar to - * avoid unnecessary cache lookups. - * - * The "dcache_lock" is purely to protect the RCU list walker - * from concurrent renames at this point (we mustn't get false - * negatives from the RCU list walk here, unlike the optimistic - * fast walk). - * - * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup - */ - result = d_lookup(parent, name); - if (!result) { - struct dentry *dentry; - - /* Don't create child dentry for a dead directory. */ - result = ERR_PTR(-ENOENT); - if (IS_DEADDIR(dir)) - goto out_unlock; - - dentry = d_alloc(parent, name); - result = ERR_PTR(-ENOMEM); - if (dentry) { - result = dir->i_op->lookup(dir, dentry, nd); - if (result) - dput(dentry); - else - result = dentry; - } -out_unlock: - mutex_unlock(&dir->i_mutex); - return result; - } - - /* - * Uhhuh! Nasty case: the cache was re-populated while - * we waited on the semaphore. Need to revalidate. - */ - mutex_unlock(&dir->i_mutex); - if (result->d_op && result->d_op->d_revalidate) { - result = do_revalidate(result, nd); - if (!result) - result = ERR_PTR(-ENOENT); - } - return result; -} - -/* - * Wrapper to retry pathname resolution whenever the underlying - * file system returns an ESTALE. - * - * Retry the whole path once, forcing real lookup requests - * instead of relying on the dcache. - */ -static __always_inline int link_path_walk(const char *name, struct nameidata *nd) -{ - struct path save = nd->path; - int result; - - /* make sure the stuff we saved doesn't go away */ - path_get(&save); - - result = __link_path_walk(name, nd); - if (result == -ESTALE) { - /* nd->path had been dropped */ - nd->path = save; - path_get(&nd->path); - nd->flags |= LOOKUP_REVAL; - result = __link_path_walk(name, nd); - } - - path_put(&save); - - return result; -} - static __always_inline void set_root(struct nameidata *nd) { if (!nd->root.mnt) { @@ -569,6 +494,8 @@ static __always_inline void set_root(struct nameidata *nd) } } +static int link_path_walk(const char *, struct nameidata *); + static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { int res = 0; @@ -634,6 +561,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata dget(dentry); } mntget(path->mnt); + nd->last_type = LAST_BIND; cookie = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(cookie); if (!IS_ERR(cookie)) { @@ -641,11 +569,14 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata error = 0; if (s) error = __vfs_follow_link(nd, s); + else if (nd->last_type == LAST_BIND) { + error = force_reval_path(&nd->path, nd); + if (error) + path_put(&nd->path); + } if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, nd, cookie); } - path_put(path); - return error; } @@ -672,6 +603,7 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd) current->total_link_count++; nd->depth++; err = __do_follow_link(path, nd); + path_put(path); current->link_count--; nd->depth--; return err; @@ -797,8 +729,19 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, struct path *path) { struct vfsmount *mnt = nd->path.mnt; - struct dentry *dentry = __d_lookup(nd->path.dentry, name); + struct dentry *dentry, *parent; + struct inode *dir; + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { + int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name); + if (err < 0) + return err; + } + dentry = __d_lookup(nd->path.dentry, name); if (!dentry) goto need_lookup; if (dentry->d_op && dentry->d_op->d_revalidate) @@ -810,7 +753,59 @@ done: return 0; need_lookup: - dentry = real_lookup(nd->path.dentry, name, nd); + parent = nd->path.dentry; + dir = parent->d_inode; + + mutex_lock(&dir->i_mutex); + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. + * + * FIXME! This could use version numbering or similar to + * avoid unnecessary cache lookups. + * + * The "dcache_lock" is purely to protect the RCU list walker + * from concurrent renames at this point (we mustn't get false + * negatives from the RCU list walk here, unlike the optimistic + * fast walk). + * + * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup + */ + dentry = d_lookup(parent, name); + if (!dentry) { + struct dentry *new; + + /* Don't create child dentry for a dead directory. */ + dentry = ERR_PTR(-ENOENT); + if (IS_DEADDIR(dir)) + goto out_unlock; + + new = d_alloc(parent, name); + dentry = ERR_PTR(-ENOMEM); + if (new) { + dentry = dir->i_op->lookup(dir, new, nd); + if (dentry) + dput(new); + else + dentry = new; + } +out_unlock: + mutex_unlock(&dir->i_mutex); + if (IS_ERR(dentry)) + goto fail; + goto done; + } + + /* + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. + */ + mutex_unlock(&dir->i_mutex); + if (dentry->d_op && dentry->d_op->d_revalidate) { + dentry = do_revalidate(dentry, nd); + if (!dentry) + dentry = ERR_PTR(-ENOENT); + } if (IS_ERR(dentry)) goto fail; goto done; @@ -835,7 +830,7 @@ fail: * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ -static int __link_path_walk(const char *name, struct nameidata *nd) +static int link_path_walk(const char *name, struct nameidata *nd) { struct path next; struct inode *inode; @@ -858,7 +853,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd) unsigned int c; nd->flags |= LOOKUP_CONTINUE; - err = exec_permission_lite(inode); + err = exec_permission(inode); if (err) break; @@ -898,16 +893,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) case 1: continue; } - /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { - err = nd->path.dentry->d_op->d_hash(nd->path.dentry, - &this); - if (err < 0) - break; - } /* This does the actual lookups.. */ err = do_lookup(nd, &this, &next); if (err) @@ -953,12 +938,6 @@ last_component: case 1: goto return_reval; } - if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { - err = nd->path.dentry->d_op->d_hash(nd->path.dentry, - &this); - if (err < 0) - break; - } err = do_lookup(nd, &this, &next); if (err) break; @@ -1017,8 +996,27 @@ return_err: static int path_walk(const char *name, struct nameidata *nd) { + struct path save = nd->path; + int result; + current->total_link_count = 0; - return link_path_walk(name, nd); + + /* make sure the stuff we saved doesn't go away */ + path_get(&save); + + result = link_path_walk(name, nd); + if (result == -ESTALE) { + /* nd->path had been dropped */ + current->total_link_count = 0; + nd->path = save; + path_get(&nd->path); + nd->flags |= LOOKUP_REVAL; + result = link_path_walk(name, nd); + } + + path_put(&save); + + return result; } static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) @@ -1141,36 +1139,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, return retval; } -/** - * path_lookup_open - lookup a file path with open intent - * @dfd: the directory to use as base, or AT_FDCWD - * @name: pointer to file name - * @lookup_flags: lookup intent flags - * @nd: pointer to nameidata - * @open_flags: open intent flags - */ -static int path_lookup_open(int dfd, const char *name, - unsigned int lookup_flags, struct nameidata *nd, int open_flags) -{ - struct file *filp = get_empty_filp(); - int err; - - if (filp == NULL) - return -ENFILE; - nd->intent.open.file = filp; - nd->intent.open.flags = open_flags; - nd->intent.open.create_mode = 0; - err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd); - if (IS_ERR(nd->intent.open.file)) { - if (err == 0) { - err = PTR_ERR(nd->intent.open.file); - path_put(&nd->path); - } - } else if (err != 0) - release_open_intent(nd); - return err; -} - static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { @@ -1191,7 +1159,17 @@ static struct dentry *__lookup_hash(struct qstr *name, goto out; } - dentry = cached_lookup(base, name, nd); + dentry = __d_lookup(base, name); + + /* lockess __d_lookup may fail due to concurrent d_move() + * in some unrelated directory, so try with d_lookup + */ + if (!dentry) + dentry = d_lookup(base, name); + + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) + dentry = do_revalidate(dentry, nd); + if (!dentry) { struct dentry *new; @@ -1223,7 +1201,7 @@ static struct dentry *lookup_hash(struct nameidata *nd) { int err; - err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); + err = exec_permission(nd->path.dentry->d_inode); if (err) return ERR_PTR(err); return __lookup_hash(&nd->last, nd->path.dentry, nd); @@ -1273,29 +1251,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (err) return ERR_PTR(err); - err = inode_permission(base->d_inode, MAY_EXEC); - if (err) - return ERR_PTR(err); - return __lookup_hash(&this, base, NULL); -} - -/** - * lookup_one_noperm - bad hack for sysfs - * @name: pathname component to lookup - * @base: base directory to lookup from - * - * This is a variant of lookup_one_len that doesn't perform any permission - * checks. It's a horrible hack to work around the braindead sysfs - * architecture and should not be used anywhere else. - * - * DON'T USE THIS FUNCTION EVER, thanks. - */ -struct dentry *lookup_one_noperm(const char *name, struct dentry *base) -{ - int err; - struct qstr this; - - err = __lookup_one_len(name, &this, base, strlen(name)); + err = exec_permission(base->d_inode); if (err) return ERR_PTR(err); return __lookup_hash(&this, base, NULL); @@ -1533,69 +1489,45 @@ int may_open(struct path *path, int acc_mode, int flag) if (error) return error; - error = ima_path_check(path, acc_mode ? - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : - ACC_MODE(flag) & (MAY_READ | MAY_WRITE), - IMA_COUNT_UPDATE); - - if (error) - return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { - error = -EPERM; if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) - goto err_out; + return -EPERM; if (flag & O_TRUNC) - goto err_out; + return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME) - if (!is_owner_or_cap(inode)) { - error = -EPERM; - goto err_out; - } + if (flag & O_NOATIME && !is_owner_or_cap(inode)) + return -EPERM; /* * Ensure there are no outstanding leases on the file. */ - error = break_lease(inode, flag); - if (error) - goto err_out; - - if (flag & O_TRUNC) { - error = get_write_access(inode); - if (error) - goto err_out; - - /* - * Refuse to truncate files with mandatory locks held on them. - */ - error = locks_verify_locked(inode); - if (!error) - error = security_path_truncate(path, 0, - ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); - if (!error) { - vfs_dq_init(inode); - - error = do_truncate(dentry, 0, - ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, - NULL); - } - put_write_access(inode); - if (error) - goto err_out; - } else - if (flag & FMODE_WRITE) - vfs_dq_init(inode); + return break_lease(inode, flag); +} - return 0; -err_out: - ima_counts_put(path, acc_mode ? - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : - ACC_MODE(flag) & (MAY_READ | MAY_WRITE)); +static int handle_truncate(struct path *path) +{ + struct inode *inode = path->dentry->d_inode; + int error = get_write_access(inode); + if (error) + return error; + /* + * Refuse to truncate files with mandatory locks held on them. + */ + error = locks_verify_locked(inode); + if (!error) + error = security_path_truncate(path, 0, + ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); + if (!error) { + error = do_truncate(path->dentry, 0, + ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, + NULL); + } + put_write_access(inode); return error; } @@ -1650,7 +1582,7 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int open_will_write_to_fs(int flag, struct inode *inode) +static int open_will_truncate(int flag, struct inode *inode) { /* * We'll never write to the fs underlying @@ -1675,11 +1607,21 @@ struct file *do_filp_open(int dfd, const char *pathname, struct path path; struct dentry *dir; int count = 0; - int will_write; + int will_truncate; int flag = open_to_namei_flags(open_flag); + int force_reval = 0; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (open_flag & __O_SYNC) + open_flag |= O_DSYNC; if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(flag); + acc_mode = MAY_OPEN | ACC_MODE(open_flag); /* O_TRUNC implies we need access checks for write permissions */ if (flag & O_TRUNC) @@ -1694,8 +1636,23 @@ struct file *do_filp_open(int dfd, const char *pathname, * The simplest case - just a plain lookup. */ if (!(flag & O_CREAT)) { - error = path_lookup_open(dfd, pathname, lookup_flags(flag), - &nd, flag); + filp = get_empty_filp(); + + if (filp == NULL) + return ERR_PTR(-ENFILE); + nd.intent.open.file = filp; + filp->f_flags = open_flag; + nd.intent.open.flags = flag; + nd.intent.open.create_mode = 0; + error = do_path_lookup(dfd, pathname, + lookup_flags(flag)|LOOKUP_OPEN, &nd); + if (IS_ERR(nd.intent.open.file)) { + if (error == 0) { + error = PTR_ERR(nd.intent.open.file); + path_put(&nd.path); + } + } else if (error) + release_open_intent(&nd); if (error) return ERR_PTR(error); goto ok; @@ -1704,9 +1661,12 @@ struct file *do_filp_open(int dfd, const char *pathname, /* * Create - we need to know the parent. */ +reval: error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); + if (force_reval) + nd.flags |= LOOKUP_REVAL; error = path_walk(pathname, &nd); if (error) { if (nd.root.mnt) @@ -1730,6 +1690,7 @@ struct file *do_filp_open(int dfd, const char *pathname, if (filp == NULL) goto exit_parent; nd.intent.open.file = filp; + filp->f_flags = open_flag; nd.intent.open.flags = flag; nd.intent.open.create_mode = mode; dir = nd.path.dentry; @@ -1770,14 +1731,18 @@ do_last: mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); - if (IS_ERR(filp)) - ima_counts_put(&nd.path, - acc_mode & (MAY_READ | MAY_WRITE | - MAY_EXEC)); + filp = nameidata_to_filp(&nd); mnt_drop_write(nd.path.mnt); if (nd.root.mnt) path_put(&nd.root); + if (!IS_ERR(filp)) { + error = ima_path_check(&filp->f_path, filp->f_mode & + (MAY_READ | MAY_WRITE | MAY_EXEC)); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } return filp; } @@ -1805,7 +1770,7 @@ do_last: path_to_nameidata(&path, &nd); error = -EISDIR; - if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) + if (S_ISDIR(path.dentry->d_inode->i_mode)) goto exit; ok: /* @@ -1818,28 +1783,45 @@ ok: * be avoided. Taking this mnt write here * ensures that (2) can not occur. */ - will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode); - if (will_write) { + will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode); + if (will_truncate) { error = mnt_want_write(nd.path.mnt); if (error) goto exit; } error = may_open(&nd.path, acc_mode, flag); if (error) { - if (will_write) + if (will_truncate) mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); - if (IS_ERR(filp)) - ima_counts_put(&nd.path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + filp = nameidata_to_filp(&nd); + if (!IS_ERR(filp)) { + error = ima_path_check(&filp->f_path, filp->f_mode & + (MAY_READ | MAY_WRITE | MAY_EXEC)); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (acc_mode & MAY_WRITE) + vfs_dq_init(nd.path.dentry->d_inode); + + if (will_truncate) { + error = handle_truncate(&nd.path); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } /* * It is now safe to drop the mnt write * because the filp has had a write taken * on its behalf. */ - if (will_write) + if (will_truncate) mnt_drop_write(nd.path.mnt); if (nd.root.mnt) path_put(&nd.root); @@ -1877,6 +1859,7 @@ do_link: if (error) goto exit_dput; error = __do_follow_link(&path, &nd); + path_put(&path); if (error) { /* Does someone understand code flow here? Or it is only * me so stupid? Anathema to whoever designed this non-sense @@ -1885,6 +1868,10 @@ do_link: release_open_intent(&nd); if (nd.root.mnt) path_put(&nd.root); + if (error == -ESTALE && !force_reval) { + force_reval = 1; + goto reval; + } return ERR_PTR(error); } nd.flags &= ~LOOKUP_PARENT; diff --git a/fs/namespace.c b/fs/namespace.c index 7d70d63ceb29..c768f733c8d6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -965,10 +965,12 @@ EXPORT_SYMBOL(may_umount_tree); int may_umount(struct vfsmount *mnt) { int ret = 1; + down_read(&namespace_sem); spin_lock(&vfsmount_lock); if (propagate_mount_busy(mnt, 2)) ret = 0; spin_unlock(&vfsmount_lock); + up_read(&namespace_sem); return ret; } @@ -1352,12 +1354,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (err) goto out_cleanup_ids; + spin_lock(&vfsmount_lock); + if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } - - spin_lock(&vfsmount_lock); if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); @@ -1534,8 +1536,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = change_mount_flags(path->mnt, flags); else err = do_remount_sb(sb, flags, data, 0); - if (!err) + if (!err) { + spin_lock(&vfsmount_lock); + mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK; path->mnt->mnt_flags = mnt_flags; + spin_unlock(&vfsmount_lock); + } up_write(&sb->s_umount); if (!err) { security_sb_post_remount(path->mnt, flags, data); @@ -1665,6 +1671,8 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, { int err; + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD); + down_write(&namespace_sem); /* Something was mounted here while we slept */ while (d_mountpoint(path->dentry) && diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 2a77bc25d5af..59e5673b4597 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -90,7 +90,7 @@ config ROOT_NFS If you want your system to mount its root file system via NFS, choose Y here. This is common practice for managing systems without local permanent storage. For details, read - <file:Documentation/filesystems/nfsroot.txt>. + <file:Documentation/filesystems/nfs/nfsroot.txt>. Most people say N here. diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 293fa0528a6e..73ab220354df 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -78,11 +78,6 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - /* - * FIXME: do we really need to run this under the BKL? If so, please - * add a comment about what it's intended to protect. - */ - lock_kernel(); while (!kthread_should_stop()) { /* * Listen for a request on the socket @@ -104,7 +99,6 @@ nfs4_callback_svc(void *vrqstp) preverr = err; svc_process(rqstp); } - unlock_kernel(); return 0; } @@ -160,11 +154,6 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - /* - * FIXME: do we really need to run this under the BKL? If so, please - * add a comment about what it's intended to protect. - */ - lock_kernel(); while (!kthread_should_stop()) { prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -183,7 +172,6 @@ nfs41_callback_svc(void *vrqstp) } finish_wait(&serv->sv_cb_waitq, &wq); } - unlock_kernel(); return 0; } @@ -397,6 +385,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) */ static struct svc_version *nfs4_callback_version[] = { [1] = &nfs4_callback_version1, + [4] = &nfs4_callback_version4, }; static struct svc_stat nfs4_callback_stats; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 07baa8254ca1..d4036be0b589 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -106,6 +106,19 @@ struct cb_sequenceres { extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res); +extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); + +#define RCA4_TYPE_MASK_RDATA_DLG 0 +#define RCA4_TYPE_MASK_WDATA_DLG 1 + +struct cb_recallanyargs { + struct sockaddr *craa_addr; + uint32_t craa_objs_to_keep; + uint32_t craa_type_mask; +}; + +extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); #endif /* CONFIG_NFS_V4_1 */ extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); @@ -114,8 +127,9 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); #ifdef CONFIG_NFS_V4 extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); extern void nfs_callback_down(int minorversion); +extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); #endif /* CONFIG_NFS_V4 */ - /* * nfs41: Callbacks are expected to not cause substantial latency, * so we limit their concurrency to 1 by setting up the maximum number diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index b7da1f54da68..defa9b4c470e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -61,6 +61,16 @@ out: return res->status; } +static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) +{ +#if defined(CONFIG_NFS_V4_1) + if (clp->cl_minorversion > 0) + return nfs41_validate_delegation_stateid; +#endif + return nfs4_validate_delegation_stateid; +} + + __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) { struct nfs_client *clp; @@ -81,7 +91,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) inode = nfs_delegation_find_inode(clp, &args->fh); if (inode != NULL) { /* Set up a helper thread to actually return the delegation */ - switch(nfs_async_inode_return_delegation(inode, &args->stateid)) { + switch (nfs_async_inode_return_delegation(inode, &args->stateid, + nfs_validate_delegation_stateid(clp))) { case 0: res = 0; break; @@ -102,8 +113,31 @@ out: return res; } +int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, + sizeof(delegation->stateid.data)) != 0) + return 0; + return 1; +} + #if defined(CONFIG_NFS_V4_1) +int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +{ + if (delegation == NULL) + return 0; + + /* seqid is 4-bytes long */ + if (((u32 *) &stateid->data)[0] != 0) + return 0; + if (memcmp(&delegation->stateid.data[4], &stateid->data[4], + sizeof(stateid->data)-4)) + return 0; + + return 1; +} + /* * Validate the sequenceID sent by the server. * Return success if the sequenceID is one more than what we last saw on @@ -227,4 +261,32 @@ out: return res->csr_status; } +unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) +{ + struct nfs_client *clp; + int status; + fmode_t flags = 0; + + status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->craa_addr, 4); + if (clp == NULL) + goto out; + + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; + + if (flags) + nfs_expire_all_delegation_types(clp, flags); + status = htonl(NFS4_OK); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 76b0aa0f73bf..8e1a2511c8be 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -23,6 +23,7 @@ #if defined(CONFIG_NFS_V4_1) #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 4 + 1 + 3) +#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -326,6 +327,25 @@ out_free: goto out; } +static unsigned decode_recallany_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallanyargs *args) +{ + uint32_t *p; + + args->craa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_objs_to_keep = ntohl(*p++); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_type_mask = ntohl(*p); + + return 0; +} + #endif /* CONFIG_NFS_V4_1 */ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) @@ -533,6 +553,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_GETATTR: case OP_CB_RECALL: case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: *op = &callback_ops[op_nr]; break; @@ -540,7 +561,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_NOTIFY_DEVICEID: case OP_CB_NOTIFY: case OP_CB_PUSH_DELEG: - case OP_CB_RECALL_ANY: case OP_CB_RECALLABLE_OBJ_AVAIL: case OP_CB_RECALL_SLOT: case OP_CB_WANTS_CANCELLED: @@ -688,6 +708,11 @@ static struct callback_op callback_ops[] = { .encode_res = (callback_encode_res_t)encode_cb_sequence_res, .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ, }, + [OP_CB_RECALL_ANY] = { + .process_op = (callback_process_op_t)nfs4_callback_recallany, + .decode_args = (callback_decode_arg_t)decode_recallany_args, + .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, + }, #endif /* CONFIG_NFS_V4_1 */ }; @@ -718,3 +743,10 @@ struct svc_version nfs4_callback_version1 = { .vs_dispatch = NULL, }; +struct svc_version nfs4_callback_version4 = { + .vs_vers = 4, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, +}; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 99ea196f071f..ee77713ce68b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1260,10 +1260,20 @@ error: static void nfs4_session_set_rwsize(struct nfs_server *server) { #ifdef CONFIG_NFS_V4_1 + struct nfs4_session *sess; + u32 server_resp_sz; + u32 server_rqst_sz; + if (!nfs4_has_session(server->nfs_client)) return; - server->rsize = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - server->wsize = server->nfs_client->cl_session->fc_attrs.max_rqst_sz; + sess = server->nfs_client->cl_session; + server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; + server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; + + if (server->rsize > server_resp_sz) + server->rsize = server_resp_sz; + if (server->wsize > server_rqst_sz) + server->wsize = server_rqst_sz; #endif /* CONFIG_NFS_V4_1 */ } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 6dd48a4405b4..2563bebc4c67 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -92,7 +92,7 @@ out: return status; } -static void nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) +static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; @@ -116,10 +116,11 @@ again: err = nfs_delegation_claim_locks(ctx, state); put_nfs_open_context(ctx); if (err != 0) - return; + return err; goto again; } spin_unlock(&inode->i_lock); + return 0; } /* @@ -261,30 +262,34 @@ static void nfs_msync_inode(struct inode *inode) /* * Basic procedure for returning a delegation to the server */ -static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation) +static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) { struct nfs_inode *nfsi = NFS_I(inode); + int err; - nfs_msync_inode(inode); /* * Guard against new delegated open/lock/unlock calls and against * state recovery */ down_write(&nfsi->rwsem); - nfs_delegation_claim_opens(inode, &delegation->stateid); + err = nfs_delegation_claim_opens(inode, &delegation->stateid); up_write(&nfsi->rwsem); - nfs_msync_inode(inode); + if (err) + goto out; - return nfs_do_return_delegation(inode, delegation, 1); + err = nfs_do_return_delegation(inode, delegation, issync); +out: + return err; } /* * Return all delegations that have been marked for return */ -void nfs_client_return_marked_delegations(struct nfs_client *clp) +int nfs_client_return_marked_delegations(struct nfs_client *clp) { struct nfs_delegation *delegation; struct inode *inode; + int err = 0; restart: rcu_read_lock(); @@ -298,12 +303,18 @@ restart: delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); spin_unlock(&clp->cl_lock); rcu_read_unlock(); - if (delegation != NULL) - __nfs_inode_return_delegation(inode, delegation); + if (delegation != NULL) { + filemap_flush(inode->i_mapping); + err = __nfs_inode_return_delegation(inode, delegation, 0); + } iput(inode); - goto restart; + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; } rcu_read_unlock(); + return 0; } /* @@ -338,8 +349,10 @@ int nfs_inode_return_delegation(struct inode *inode) spin_lock(&clp->cl_lock); delegation = nfs_detach_delegation_locked(nfsi, NULL); spin_unlock(&clp->cl_lock); - if (delegation != NULL) - err = __nfs_inode_return_delegation(inode, delegation); + if (delegation != NULL) { + nfs_msync_inode(inode); + err = __nfs_inode_return_delegation(inode, delegation, 1); + } } return err; } @@ -368,33 +381,47 @@ void nfs_super_return_all_delegations(struct super_block *sb) spin_unlock(&delegation->lock); } rcu_read_unlock(); - nfs_client_return_marked_delegations(clp); + if (nfs_client_return_marked_delegations(clp) != 0) + nfs4_schedule_state_manager(clp); } -static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +static +void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) { struct nfs_delegation *delegation; rcu_read_lock(); list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) + continue; + if (delegation->type & flags) + nfs_mark_return_delegation(clp, delegation); } rcu_read_unlock(); } +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + static void nfs_delegation_run_state_manager(struct nfs_client *clp) { if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) nfs4_schedule_state_manager(clp); } -void nfs_expire_all_delegations(struct nfs_client *clp) +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) { - nfs_client_mark_return_all_delegations(clp); + nfs_client_mark_return_all_delegation_types(clp, flags); nfs_delegation_run_state_manager(clp); } +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +} + /* * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. */ @@ -413,8 +440,7 @@ static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *c list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + nfs_mark_return_delegation(clp, delegation); } rcu_read_unlock(); } @@ -428,18 +454,21 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) /* * Asynchronous delegation recall! */ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, + int (*validate_stateid)(struct nfs_delegation *delegation, + const nfs4_stateid *stateid)) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_delegation *delegation; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, - sizeof(delegation->stateid.data)) != 0) { + + if (!validate_stateid(delegation, stateid)) { rcu_read_unlock(); return -ENOENT; } + nfs_mark_return_delegation(clp, delegation); rcu_read_unlock(); nfs_delegation_run_state_manager(clp); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 09f383795174..944b627ec6e1 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -34,15 +34,18 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); int nfs_inode_return_delegation(struct inode *inode); -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, + int (*validate_stateid)(struct nfs_delegation *delegation, + const nfs4_stateid *stateid)); void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); void nfs_super_return_all_delegations(struct super_block *sb); void nfs_expire_all_delegations(struct nfs_client *clp); +void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); void nfs_handle_cb_pathdown(struct nfs_client *clp); -void nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_client_return_marked_delegations(struct nfs_client *clp); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7cb298525eef..3c7f03b669fb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1579,55 +1579,47 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *dentry = NULL, *rehash = NULL; int error = -EBUSY; - /* - * To prevent any new references to the target during the rename, - * we unhash the dentry and free the inode in advance. - */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - rehash = new_dentry; - } - dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name, atomic_read(&new_dentry->d_count)); /* - * First check whether the target is busy ... we can't - * safely do _any_ rename if the target is in use. - * - * For files, make a copy of the dentry and then do a - * silly-rename. If the silly-rename succeeds, the - * copied dentry is hashed and becomes the new target. + * For non-directories, check whether the target is busy and if so, + * make a copy of the dentry and then do a silly-rename. If the + * silly-rename succeeds, the copied dentry is hashed and becomes + * the new target. */ - if (!new_inode) - goto go_ahead; - if (S_ISDIR(new_inode->i_mode)) { - error = -EISDIR; - if (!S_ISDIR(old_inode->i_mode)) - goto out; - } else if (atomic_read(&new_dentry->d_count) > 2) { - int err; - /* copy the target dentry's name */ - dentry = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!dentry) - goto out; + if (new_inode && !S_ISDIR(new_inode->i_mode)) { + /* + * To prevent any new references to the target during the + * rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + rehash = new_dentry; + } + + if (atomic_read(&new_dentry->d_count) > 2) { + int err; + + /* copy the target dentry's name */ + dentry = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!dentry) + goto out; - /* silly-rename the existing target ... */ - err = nfs_sillyrename(new_dir, new_dentry); - if (!err) { - new_dentry = rehash = dentry; + /* silly-rename the existing target ... */ + err = nfs_sillyrename(new_dir, new_dentry); + if (err) + goto out; + + new_dentry = dentry; + rehash = NULL; new_inode = NULL; - /* instantiate the replacement target */ - d_instantiate(new_dentry, NULL); - } else if (atomic_read(&new_dentry->d_count) > 1) - /* dentry still busy? */ - goto out; + } } -go_ahead: /* * ... prune child dentries and writebacks if needed. */ diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index f4d54ba97cc6..95e1ca765d47 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -146,7 +146,7 @@ static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, return 0; } -struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, +static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, struct nfs_dns_ent *key) { struct cache_head *ch; @@ -159,7 +159,7 @@ struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, return container_of(ch, struct nfs_dns_ent, h); } -struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, +static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, struct nfs_dns_ent *new, struct nfs_dns_ent *key) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f5fdd39e037a..6b891328f332 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -581,7 +581,7 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode) { struct nfs_open_context *ctx; - if (IS_SYNC(inode) || (filp->f_flags & O_SYNC)) + if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) return 1; ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) @@ -622,7 +622,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); result = generic_file_aio_write(iocb, iov, nr_segs, pos); - /* Return error values for O_SYNC and IS_SYNC() */ + /* Return error values for O_DSYNC and IS_SYNC() */ if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); if (err < 0) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e21b1bb9972f..29e464d23b32 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -30,6 +30,15 @@ static inline int nfs4_has_session(const struct nfs_client *clp) return 0; } +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; @@ -156,6 +165,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); @@ -177,24 +187,14 @@ extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); extern struct rpc_procinfo nfs3_procedures[]; extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); -/* nfs4proc.c */ -static inline void nfs4_restart_rpc(struct rpc_task *task, - const struct nfs_client *clp) -{ -#ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(clp) && - test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { - rpc_restart_call_prepare(task); - return; - } -#endif /* CONFIG_NFS_V4_1 */ - rpc_restart_call(task); -} - /* nfs4xdr.c */ #ifdef CONFIG_NFS_V4 extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); #endif +#ifdef CONFIG_NFS_V4_1 +extern const u32 nfs41_maxread_overhead; +extern const u32 nfs41_maxwrite_overhead; +#endif /* nfs4proc.c */ #ifdef CONFIG_NFS_V4 @@ -273,20 +273,6 @@ extern int _nfs4_call_sync_session(struct nfs_server *server, struct nfs4_sequence_res *res, int cache_reply); -#ifdef CONFIG_NFS_V4_1 -extern void nfs41_sequence_free_slot(const struct nfs_client *, - struct nfs4_sequence_res *res); -#endif /* CONFIG_NFS_V4_1 */ - -static inline void nfs4_sequence_free_slot(const struct nfs_client *clp, - struct nfs4_sequence_res *res) -{ -#ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(clp)) - nfs41_sequence_free_slot(clp, res); -#endif /* CONFIG_NFS_V4_1 */ -} - /* * Determine the device name as a string */ @@ -380,3 +366,15 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } + +/* + * Helper for restarting RPC calls in the possible presence of NFSv4.1 + * sessions. + */ +static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + rpc_restart_call_prepare(task); + else + rpc_restart_call(task); +} diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index ceda50aad73c..46d779abafd3 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -25,13 +25,7 @@ struct nfs_iostats { static inline void nfs_inc_server_stats(const struct nfs_server *server, enum nfs_stat_eventcounters stat) { - struct nfs_iostats *iostats; - int cpu; - - cpu = get_cpu(); - iostats = per_cpu_ptr(server->io_stats, cpu); - iostats->events[stat]++; - put_cpu(); + this_cpu_inc(server->io_stats->events[stat]); } static inline void nfs_inc_stats(const struct inode *inode, @@ -44,13 +38,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) { - struct nfs_iostats *iostats; - int cpu; - - cpu = get_cpu(); - iostats = per_cpu_ptr(server->io_stats, cpu); - iostats->bytes[stat] += addend; - put_cpu(); + this_cpu_add(server->io_stats->bytes[stat], addend); } static inline void nfs_add_stats(const struct inode *inode, @@ -65,13 +53,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode, enum nfs_stat_fscachecounters stat, unsigned long addend) { - struct nfs_iostats *iostats; - int cpu; - - cpu = get_cpu(); - iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); - iostats->fscache[stat] += addend; - put_cpu(); + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); } #endif diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 6ea07a3c75d4..865265bdca03 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -44,7 +44,8 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_REBOOT, NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, - NFS4CLNT_SESSION_SETUP, + NFS4CLNT_SESSION_RESET, + NFS4CLNT_SESSION_DRAINING, }; /* @@ -107,6 +108,10 @@ enum { NFS_OWNER_RECLAIM_NOGRACE }; +#define NFS_LOCK_NEW 0 +#define NFS_LOCK_RECLAIM 1 +#define NFS_LOCK_EXPIRED 2 + /* * struct nfs4_state maintains the client-side state for a given * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). @@ -180,6 +185,7 @@ struct nfs4_state_recovery_ops { int (*recover_lock)(struct nfs4_state *, struct file_lock *); int (*establish_clid)(struct nfs_client *, struct rpc_cred *); struct rpc_cred * (*get_clid_cred)(struct nfs_client *); + int (*reclaim_complete)(struct nfs_client *); }; struct nfs4_state_maintenance_ops { @@ -200,9 +206,11 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); /* nfs4proc.c */ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); @@ -218,9 +226,11 @@ extern int nfs4_setup_sequence(struct nfs_client *clp, int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); -extern int nfs4_proc_create_session(struct nfs_client *, int reset); +extern int nfs4_proc_create_session(struct nfs_client *); extern int nfs4_proc_destroy_session(struct nfs4_session *); extern int nfs4_init_session(struct nfs_server *server); +extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); #else /* CONFIG_NFS_v4_1 */ static inline int nfs4_setup_sequence(struct nfs_client *clp, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -267,6 +277,7 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs4_schedule_state_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); @@ -275,6 +286,7 @@ extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); extern const nfs4_stateid zero_stateid; @@ -287,6 +299,7 @@ struct nfs4_mount_data; /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; #else diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 741a562177fc..198d51d17c13 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -64,6 +64,7 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); +static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); @@ -270,11 +271,18 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); + nfs4_schedule_state_recovery(clp); exception->retry = 1; - /* FALLTHROUGH */ + break; #endif /* !defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: + if (exception->timeout > HZ) { + /* We have retried a decent amount, time to + * fail + */ + ret = -EBUSY; + break; + } case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: ret = nfs4_delay(server->client, &exception->timeout); @@ -311,48 +319,67 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * so we need to scan down from highest_used_slotid to 0 looking for the now * highest slotid in use. * If none found, highest_used_slotid is set to -1. + * + * Must be called while holding tbl->slot_tbl_lock */ static void nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) { int slotid = free_slotid; - spin_lock(&tbl->slot_tbl_lock); /* clear used bit in bitmap */ __clear_bit(slotid, tbl->used_slots); /* update highest_used_slotid when it is freed */ if (slotid == tbl->highest_used_slotid) { slotid = find_last_bit(tbl->used_slots, tbl->max_slots); - if (slotid >= 0 && slotid < tbl->max_slots) + if (slotid < tbl->max_slots) tbl->highest_used_slotid = slotid; else tbl->highest_used_slotid = -1; } - rpc_wake_up_next(&tbl->slot_tbl_waitq); - spin_unlock(&tbl->slot_tbl_lock); dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__, free_slotid, tbl->highest_used_slotid); } -void nfs41_sequence_free_slot(const struct nfs_client *clp, - struct nfs4_sequence_res *res) +/* + * Signal state manager thread if session is drained + */ +static void nfs41_check_drain_session_complete(struct nfs4_session *ses) { - struct nfs4_slot_table *tbl; + struct rpc_task *task; - if (!nfs4_has_session(clp)) { - dprintk("%s: No session\n", __func__); + if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); return; } + + if (ses->fc_slot_table.highest_used_slotid != -1) + return; + + dprintk("%s COMPLETE: Session Drained\n", __func__); + complete(&ses->complete); +} + +static void nfs41_sequence_free_slot(const struct nfs_client *clp, + struct nfs4_sequence_res *res) +{ + struct nfs4_slot_table *tbl; + tbl = &clp->cl_session->fc_slot_table; if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { - dprintk("%s: No slot\n", __func__); /* just wake up the next guy waiting since * we may have not consumed a slot after all */ - rpc_wake_up_next(&tbl->slot_tbl_waitq); + dprintk("%s: No slot\n", __func__); return; } + + spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slotid); + nfs41_check_drain_session_complete(clp->cl_session); + spin_unlock(&tbl->slot_tbl_lock); res->sr_slotid = NFS4_MAX_SLOT_TABLE; } @@ -377,10 +404,10 @@ static void nfs41_sequence_done(struct nfs_client *clp, if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) goto out; - tbl = &clp->cl_session->fc_slot_table; - slot = tbl->slots + res->sr_slotid; - + /* Check the SEQUENCE operation status */ if (res->sr_status == 0) { + tbl = &clp->cl_session->fc_slot_table; + slot = tbl->slots + res->sr_slotid; /* Update the slot's sequence and clientid lease timer */ ++slot->seq_nr; timestamp = res->sr_renewal_time; @@ -388,7 +415,8 @@ static void nfs41_sequence_done(struct nfs_client *clp, if (time_before(clp->cl_last_renewal, timestamp)) clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); - return; + /* Check sequence flags */ + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); } out: /* The session may be reset by one of the error handlers. */ @@ -407,7 +435,7 @@ out: * Note: must be called with under the slot_tbl_lock. */ static u8 -nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task) +nfs4_find_slot(struct nfs4_slot_table *tbl) { int slotid; u8 ret_id = NFS4_MAX_SLOT_TABLE; @@ -429,24 +457,6 @@ out: return ret_id; } -static int nfs4_recover_session(struct nfs4_session *session) -{ - struct nfs_client *clp = session->clp; - unsigned int loop; - int ret; - - for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { - ret = nfs4_wait_clnt_recover(clp); - if (ret != 0) - break; - if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) - break; - nfs4_schedule_state_manager(clp); - ret = -EIO; - } - return ret; -} - static int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -455,7 +465,6 @@ static int nfs41_setup_sequence(struct nfs4_session *session, { struct nfs4_slot *slot; struct nfs4_slot_table *tbl; - int status = 0; u8 slotid; dprintk("--> %s\n", __func__); @@ -468,24 +477,27 @@ static int nfs41_setup_sequence(struct nfs4_session *session, tbl = &session->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4CLNT_SESSION_SETUP, &session->clp->cl_state)) { - if (tbl->highest_used_slotid != -1) { - rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); - spin_unlock(&tbl->slot_tbl_lock); - dprintk("<-- %s: Session reset: draining\n", __func__); - return -EAGAIN; - } + if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. + * Schedule the reset thread + */ + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s Schedule Session Reset\n", __func__); + return -EAGAIN; + } - /* The slot table is empty; start the reset thread */ - dprintk("%s Session Reset\n", __func__); + if (!rpc_queue_empty(&tbl->slot_tbl_waitq) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); - status = nfs4_recover_session(session); - if (status) - return status; - spin_lock(&tbl->slot_tbl_lock); + dprintk("%s enforce FIFO order\n", __func__); + return -EAGAIN; } - slotid = nfs4_find_slot(tbl, task); + slotid = nfs4_find_slot(tbl); if (slotid == NFS4_MAX_SLOT_TABLE) { rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); @@ -494,6 +506,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, } spin_unlock(&tbl->slot_tbl_lock); + rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); slot = tbl->slots + slotid; args->sa_session = session; args->sa_slotid = slotid; @@ -527,7 +540,7 @@ int nfs4_setup_sequence(struct nfs_client *clp, goto out; ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, task); - if (ret != -EAGAIN) { + if (ret && ret != -EAGAIN) { /* terminate rpc task */ task->tk_status = ret; task->tk_action = NULL; @@ -556,12 +569,17 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) rpc_call_start(task); } +static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs41_call_sync_prepare(task, calldata); +} + static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); - nfs41_sequence_free_slot(data->clp, data->seq_res); } struct rpc_call_ops nfs41_call_sync_ops = { @@ -569,12 +587,18 @@ struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; +struct rpc_call_ops nfs41_call_priv_sync_ops = { + .rpc_call_prepare = nfs41_call_priv_sync_prepare, + .rpc_call_done = nfs41_call_sync_done, +}; + static int nfs4_call_sync_sequence(struct nfs_client *clp, struct rpc_clnt *clnt, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply) + int cache_reply, + int privileged) { int ret; struct rpc_task *task; @@ -592,6 +616,8 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp, }; res->sr_slotid = NFS4_MAX_SLOT_TABLE; + if (privileged) + task_setup.callback_ops = &nfs41_call_priv_sync_ops; task = rpc_run_task(&task_setup); if (IS_ERR(task)) ret = PTR_ERR(task); @@ -609,7 +635,7 @@ int _nfs4_call_sync_session(struct nfs_server *server, int cache_reply) { return nfs4_call_sync_sequence(server->nfs_client, server->client, - msg, args, res, cache_reply); + msg, args, res, cache_reply, 0); } #endif /* CONFIG_NFS_V4_1 */ @@ -637,15 +663,6 @@ static void nfs4_sequence_done(const struct nfs_server *server, #endif /* CONFIG_NFS_V4_1 */ } -/* no restart, therefore free slot here */ -static void nfs4_sequence_done_free_slot(const struct nfs_server *server, - struct nfs4_sequence_res *res, - int rpc_status) -{ - nfs4_sequence_done(server, res, rpc_status); - nfs4_sequence_free_slot(server->nfs_client, res); -} - static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { struct nfs_inode *nfsi = NFS_I(dir); @@ -720,9 +737,15 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; if (flags & O_EXCL) { - u32 *s = (u32 *) p->o_arg.u.verifier.data; - s[0] = jiffies; - s[1] = current->pid; + if (nfs4_has_persistent_session(server->nfs_client)) { + /* GUARDED */ + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); + } else { /* EXCLUSIVE4_1 */ + u32 *s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; + } } else if (flags & O_CREAT) { p->o_arg.u.attrs = &p->attrs; memcpy(&p->attrs, attrs, sizeof(p->attrs)); @@ -776,13 +799,16 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode goto out; switch (mode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: - ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 + && state->n_rdonly != 0; break; case FMODE_WRITE: - ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 + && state->n_wronly != 0; break; case FMODE_READ|FMODE_WRITE: - ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 + && state->n_rdwr != 0; } out: return ret; @@ -1047,7 +1073,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); - ret = _nfs4_proc_open(opendata); + ret = _nfs4_recover_proc_open(opendata); if (ret != 0) return ret; newstate = nfs4_opendata_to_nfs4_state(opendata); @@ -1183,6 +1209,14 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -ENOENT: case -ESTALE: goto out; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_schedule_state_recovery( + server->nfs_client); + goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -1330,14 +1364,20 @@ out_no_action: } +static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs4_open_prepare(task, calldata); +} + static void nfs4_open_done(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; data->rpc_status = task->tk_status; - nfs4_sequence_done_free_slot(data->o_arg.server, &data->o_res.seq_res, - task->tk_status); + nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, + task->tk_status); if (RPC_ASSASSINATED(task)) return; @@ -1388,10 +1428,13 @@ static const struct rpc_call_ops nfs4_open_ops = { .rpc_release = nfs4_open_release, }; -/* - * Note: On error, nfs4_proc_open will free the struct nfs4_opendata - */ -static int _nfs4_proc_open(struct nfs4_opendata *data) +static const struct rpc_call_ops nfs4_recover_open_ops = { + .rpc_call_prepare = nfs4_recover_open_prepare, + .rpc_call_done = nfs4_open_done, + .rpc_release = nfs4_open_release, +}; + +static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) { struct inode *dir = data->dir->d_inode; struct nfs_server *server = NFS_SERVER(dir); @@ -1418,21 +1461,57 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) data->rpc_done = 0; data->rpc_status = 0; data->cancelled = 0; + if (isrecover) + task_setup_data.callback_ops = &nfs4_recover_open_ops; task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) { - data->cancelled = 1; - smp_wmb(); - } else - status = data->rpc_status; - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); + + return status; +} + +static int _nfs4_recover_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 1); if (status != 0 || !data->rpc_done) return status; - if (o_res->fh.size == 0) - _nfs4_proc_lookup(dir, o_arg->name, &o_res->fh, o_res->f_attr); + nfs_refresh_inode(dir, o_res->dir_attr); + + if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(data); + if (status != 0) + return status; + } + + return status; +} + +/* + * Note: On error, nfs4_proc_open will free the struct nfs4_opendata + */ +static int _nfs4_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 0); + if (status != 0 || !data->rpc_done) + return status; if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); @@ -1488,7 +1567,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s return ret; } -static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) +static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_exception exception = { }; @@ -1496,10 +1575,16 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4 do { err = _nfs4_open_expired(ctx, state); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } } while (exception.retry); +out: return err; } @@ -1712,6 +1797,18 @@ static void nfs4_free_closedata(void *data) kfree(calldata); } +static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, + fmode_t fmode) +{ + spin_lock(&state->owner->so_lock); + if (!(fmode & FMODE_READ)) + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + if (!(fmode & FMODE_WRITE)) + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + spin_unlock(&state->owner->so_lock); +} + static void nfs4_close_done(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; @@ -1728,6 +1825,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case 0: nfs_set_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); + nfs4_close_clear_stateid_flags(state, + calldata->arg.fmode); break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: @@ -1736,12 +1835,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) break; default: - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { - nfs4_restart_rpc(task, server->nfs_client); - return; - } + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + rpc_restart_call_prepare(task); } - nfs4_sequence_free_slot(server->nfs_client, &calldata->res.seq_res); + nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); } @@ -1749,38 +1846,39 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; - int clear_rd, clear_wr, clear_rdwr; + int call_close = 0; if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; - clear_rd = clear_wr = clear_rdwr = 0; + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; + calldata->arg.fmode = FMODE_READ|FMODE_WRITE; spin_lock(&state->owner->so_lock); /* Calculate the change in open mode */ if (state->n_rdwr == 0) { if (state->n_rdonly == 0) { - clear_rd |= test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + calldata->arg.fmode &= ~FMODE_READ; } if (state->n_wronly == 0) { - clear_wr |= test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + calldata->arg.fmode &= ~FMODE_WRITE; } } spin_unlock(&state->owner->so_lock); - if (!clear_rd && !clear_wr && !clear_rdwr) { + + if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ task->tk_action = NULL; return; } + + if (calldata->arg.fmode == 0) + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + nfs_fattr_init(calldata->res.fattr); - if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) { - task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.fmode = FMODE_READ; - } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) { - task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.fmode = FMODE_WRITE; - } calldata->timestamp = jiffies; if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, &calldata->arg.seq_args, &calldata->res.seq_res, @@ -1832,8 +1930,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); calldata->arg.stateid = &state->open_stateid; - if (nfs4_has_session(server->nfs_client)) - memset(calldata->arg.stateid->data, 0, 4); /* clear seqid */ /* Serialization for the sequence id */ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); if (calldata->arg.seqid == NULL) @@ -1981,7 +2077,7 @@ out_drop: return 0; } -void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) +static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) { if (ctx->state == NULL) return; @@ -2532,7 +2628,6 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) return 0; - nfs4_sequence_free_slot(res->server->nfs_client, &res->seq_res); update_changeattr(dir, &res->cinfo); nfs_post_op_update_inode(dir, &res->dir_attr); return 1; @@ -2971,11 +3066,10 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) dprintk("--> %s\n", __func__); - /* nfs4_sequence_free_slot called in the read rpc_call_done */ nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { - nfs4_restart_rpc(task, server->nfs_client); + nfs_restart_rpc(task, server->nfs_client); return -EAGAIN; } @@ -2995,12 +3089,11 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - /* slot is freed in nfs_writeback_done */ nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, task->tk_status); if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { - nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); return -EAGAIN; } if (task->tk_status >= 0) { @@ -3028,11 +3121,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, task->tk_status); if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { - nfs4_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); return -EAGAIN; } - nfs4_sequence_free_slot(NFS_SERVER(inode)->nfs_client, - &data->res.seq_res); nfs_refresh_inode(inode, data->res.fattr); return 0; } @@ -3350,7 +3441,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); + nfs4_schedule_state_recovery(clp); task->tk_status = 0; return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ @@ -3483,12 +3574,23 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; - nfs4_sequence_done_free_slot(data->res.server, &data->res.seq_res, - task->tk_status); + nfs4_sequence_done(data->res.server, &data->res.seq_res, + task->tk_status); - data->rpc_status = task->tk_status; - if (data->rpc_status == 0) + switch (task->tk_status) { + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + case 0: renew_lease(data->res.server, data->timestamp); + break; + default: + if (nfs4_async_handle_error(task, data->res.server, NULL) == + -EAGAIN) { + nfs_restart_rpc(task, data->res.server->nfs_client); + return; + } + } + data->rpc_status = task->tk_status; } static void nfs4_delegreturn_release(void *calldata) @@ -3741,11 +3843,9 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) break; default: if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) - nfs4_restart_rpc(task, - calldata->server->nfs_client); + nfs_restart_rpc(task, + calldata->server->nfs_client); } - nfs4_sequence_free_slot(calldata->server->nfs_client, - &calldata->res.seq_res); } static void nfs4_locku_prepare(struct rpc_task *task, void *data) @@ -3921,14 +4021,20 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } +static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs4_lock_prepare(task, calldata); +} + static void nfs4_lock_done(struct rpc_task *task, void *calldata) { struct nfs4_lockdata *data = calldata; dprintk("%s: begin!\n", __func__); - nfs4_sequence_done_free_slot(data->server, &data->res.seq_res, - task->tk_status); + nfs4_sequence_done(data->server, &data->res.seq_res, + task->tk_status); data->rpc_status = task->tk_status; if (RPC_ASSASSINATED(task)) @@ -3976,7 +4082,13 @@ static const struct rpc_call_ops nfs4_lock_ops = { .rpc_release = nfs4_lock_release, }; -static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim) +static const struct rpc_call_ops nfs4_recover_lock_ops = { + .rpc_call_prepare = nfs4_recover_lock_prepare, + .rpc_call_done = nfs4_lock_done, + .rpc_release = nfs4_lock_release, +}; + +static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type) { struct nfs4_lockdata *data; struct rpc_task *task; @@ -4000,8 +4112,11 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f return -ENOMEM; if (IS_SETLKW(cmd)) data->arg.block = 1; - if (reclaim != 0) - data->arg.reclaim = 1; + if (recovery_type > NFS_LOCK_NEW) { + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + task_setup_data.callback_ops = &nfs4_recover_lock_ops; + } msg.rpc_argp = &data->arg, msg.rpc_resp = &data->res, task_setup_data.callback_data = data; @@ -4028,7 +4143,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request /* Cache the lock if possible... */ if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; - err = _nfs4_do_setlk(state, F_SETLK, request, 1); + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -4048,11 +4163,17 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request do { if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; - err = _nfs4_do_setlk(state, F_SETLK, request, 0); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } } while (exception.retry); +out: return err; } @@ -4078,7 +4199,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(request->fl_file, request); goto out_unlock; } - status = _nfs4_do_setlk(state, cmd, request, 0); + status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); if (status != 0) goto out_unlock; /* Note: we always want to sleep here! */ @@ -4161,7 +4282,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) if (err != 0) goto out; do { - err = _nfs4_do_setlk(state, F_SETLK, fl, 0); + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); switch (err) { default: printk(KERN_ERR "%s: unhandled error %d.\n", @@ -4172,6 +4293,11 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_EXPIRED: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: nfs4_schedule_state_recovery(server->nfs_client); goto out; case -ERESTARTSYS: @@ -4296,7 +4422,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, * NFS4ERR_BADSESSION in the sequence operation, and will therefore * be in some phase of session reset. */ -static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) { nfs4_verifier verifier; struct nfs41_exchange_id_args args = { @@ -4318,6 +4444,9 @@ static int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) dprintk("--> %s\n", __func__); BUG_ON(clp == NULL); + /* Remove server-only flags */ + args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R; + p = (u32 *)verifier.data; *p++ = htonl((u32)clp->cl_boot_time.tv_sec); *p = htonl((u32)clp->cl_boot_time.tv_nsec); @@ -4361,11 +4490,12 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task, (struct nfs4_get_lease_time_data *)calldata; dprintk("--> %s\n", __func__); + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); /* just setup sequence, do not trigger session recovery since we're invoked within one */ ret = nfs41_setup_sequence(data->clp->cl_session, - &data->args->la_seq_args, - &data->res->lr_seq_res, 0, task); + &data->args->la_seq_args, + &data->res->lr_seq_res, 0, task); BUG_ON(ret == -EAGAIN); rpc_call_start(task); @@ -4389,10 +4519,9 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; - nfs4_restart_rpc(task, data->clp); + nfs_restart_rpc(task, data->clp); return; } - nfs41_sequence_free_slot(data->clp, &data->res->lr_seq_res); dprintk("<-- %s\n", __func__); } @@ -4465,7 +4594,6 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots, spin_lock(&tbl->slot_tbl_lock); for (i = 0; i < max_slots; ++i) tbl->slots[i].seq_nr = ivalue; - tbl->highest_used_slotid = -1; spin_unlock(&tbl->slot_tbl_lock); dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, tbl, tbl->slots, tbl->max_slots); @@ -4515,7 +4643,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session) static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, int max_slots, int ivalue) { - int i; struct nfs4_slot *slot; int ret = -ENOMEM; @@ -4526,18 +4653,9 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); if (!slot) goto out; - for (i = 0; i < max_slots; ++i) - slot[i].seq_nr = ivalue; ret = 0; spin_lock(&tbl->slot_tbl_lock); - if (tbl->slots != NULL) { - spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s: slot table already initialized. tbl=%p slots=%p\n", - __func__, tbl, tbl->slots); - WARN_ON(1); - goto out_free; - } tbl->max_slots = max_slots; tbl->slots = slot; tbl->highest_used_slotid = -1; /* no slot is currently used */ @@ -4547,10 +4665,6 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, out: dprintk("<-- %s: return %d\n", __func__, ret); return ret; - -out_free: - kfree(slot); - goto out; } /* @@ -4558,17 +4672,24 @@ out_free: */ static int nfs4_init_slot_tables(struct nfs4_session *session) { - int status; + struct nfs4_slot_table *tbl; + int status = 0; - status = nfs4_init_slot_table(&session->fc_slot_table, - session->fc_attrs.max_reqs, 1); - if (status) - return status; + tbl = &session->fc_slot_table; + if (tbl->slots == NULL) { + status = nfs4_init_slot_table(tbl, + session->fc_attrs.max_reqs, 1); + if (status) + return status; + } - status = nfs4_init_slot_table(&session->bc_slot_table, - session->bc_attrs.max_reqs, 0); - if (status) - nfs4_destroy_slot_tables(session); + tbl = &session->bc_slot_table; + if (tbl->slots == NULL) { + status = nfs4_init_slot_table(tbl, + session->bc_attrs.max_reqs, 0); + if (status) + nfs4_destroy_slot_tables(session); + } return status; } @@ -4582,7 +4703,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) if (!session) return NULL; - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); /* * The create session reply races with the server back * channel probe. Mark the client NFS_CS_SESSION_INITING @@ -4590,12 +4710,15 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) * nfs_client struct */ clp->cl_cons_state = NFS_CS_SESSION_INITING; + init_completion(&session->complete); tbl = &session->fc_slot_table; + tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); tbl = &session->bc_slot_table; + tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); @@ -4747,11 +4870,10 @@ static int _nfs4_proc_create_session(struct nfs_client *clp) * It is the responsibility of the caller to verify the session is * expired before calling this routine. */ -int nfs4_proc_create_session(struct nfs_client *clp, int reset) +int nfs4_proc_create_session(struct nfs_client *clp) { int status; unsigned *ptr; - struct nfs_fsinfo fsinfo; struct nfs4_session *session = clp->cl_session; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); @@ -4760,35 +4882,19 @@ int nfs4_proc_create_session(struct nfs_client *clp, int reset) if (status) goto out; - /* Init or reset the fore channel */ - if (reset) - status = nfs4_reset_slot_tables(session); - else - status = nfs4_init_slot_tables(session); - dprintk("fore channel slot table initialization returned %d\n", status); + /* Init and reset the fore channel */ + status = nfs4_init_slot_tables(session); + dprintk("slot table initialization returned %d\n", status); + if (status) + goto out; + status = nfs4_reset_slot_tables(session); + dprintk("slot table reset returned %d\n", status); if (status) goto out; ptr = (unsigned *)&session->sess_id.data[0]; dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); - - if (reset) - /* Lease time is aleady set */ - goto out; - - /* Get the lease time */ - status = nfs4_proc_get_lease_time(clp, &fsinfo); - if (status == 0) { - /* Update lease time and schedule renewal */ - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = jiffies; - clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - spin_unlock(&clp->cl_lock); - - nfs4_schedule_state_renewal(clp); - } out: dprintk("<-- %s\n", __func__); return status; @@ -4827,13 +4933,24 @@ int nfs4_proc_destroy_session(struct nfs4_session *session) int nfs4_init_session(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; + struct nfs4_session *session; + unsigned int rsize, wsize; int ret; if (!nfs4_has_session(clp)) return 0; - clp->cl_session->fc_attrs.max_rqst_sz = server->wsize; - clp->cl_session->fc_attrs.max_resp_sz = server->rsize; + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; + wsize = server->wsize; + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + + session = clp->cl_session; + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + ret = nfs4_recover_expired_lease(server); if (!ret) ret = nfs4_check_client_ready(clp); @@ -4858,7 +4975,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) args.sa_cache_this = 0; return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, - &res, 0); + &res, args.sa_cache_this, 1); } void nfs41_sequence_call_done(struct rpc_task *task, void *data) @@ -4872,11 +4989,10 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data) if (_nfs4_async_handle_error(task, NULL, clp, NULL) == -EAGAIN) { - nfs4_restart_rpc(task, clp); + nfs_restart_rpc(task, clp); return; } } - nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp); dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); kfree(task->tk_msg.rpc_argp); @@ -4931,6 +5047,110 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, &nfs41_sequence_ops, (void *)clp); } +struct nfs4_reclaim_complete_data { + struct nfs_client *clp; + struct nfs41_reclaim_complete_args arg; + struct nfs41_reclaim_complete_res res; +}; + +static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, + &calldata->res.seq_res, 0, task)) + return; + + rpc_call_start(task); +} + +static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); + nfs41_sequence_done(clp, res, task->tk_status); + switch (task->tk_status) { + case 0: + case -NFS4ERR_COMPLETE_ALREADY: + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + /* + * Handle the session error, but do not retry the operation, as + * we have no way of telling whether the clientid had to be + * reset before we got our reply. If reset, a new wave of + * reclaim operations will follow, containing their own reclaim + * complete. We don't want our retry to get on the way of + * recovery by incorrectly indicating to the server that we're + * done reclaiming state since the process had to be restarted. + */ + _nfs4_async_handle_error(task, NULL, clp, NULL); + break; + default: + if (_nfs4_async_handle_error( + task, NULL, clp, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + + dprintk("<-- %s\n", __func__); +} + +static void nfs4_free_reclaim_complete_data(void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + kfree(calldata); +} + +static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = { + .rpc_call_prepare = nfs4_reclaim_complete_prepare, + .rpc_call_done = nfs4_reclaim_complete_done, + .rpc_release = nfs4_free_reclaim_complete_data, +}; + +/* + * Issue a global reclaim complete. + */ +static int nfs41_proc_reclaim_complete(struct nfs_client *clp) +{ + struct nfs4_reclaim_complete_data *calldata; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_reclaim_complete_call_ops, + .flags = RPC_TASK_ASYNC, + }; + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); + calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); + if (calldata == NULL) + goto out; + calldata->clp = clp; + calldata->arg.one_fs = 0; + calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; + + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + status = PTR_ERR(task); + rpc_put_task(task); +out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { @@ -4948,8 +5168,9 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, .recover_open = nfs4_open_reclaim, .recover_lock = nfs4_lock_reclaim, - .establish_clid = nfs4_proc_exchange_id, + .establish_clid = nfs41_init_clientid, .get_clid_cred = nfs4_get_exchange_id_cred, + .reclaim_complete = nfs41_proc_reclaim_complete, }; #endif /* CONFIG_NFS_V4_1 */ @@ -4968,7 +5189,7 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, .recover_open = nfs4_open_expired, .recover_lock = nfs4_lock_expired, - .establish_clid = nfs4_proc_exchange_id, + .establish_clid = nfs41_init_clientid, .get_clid_cred = nfs4_get_exchange_id_cred, }; #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2ef4fecf3984..6d263ed79e92 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -116,6 +116,79 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) #if defined(CONFIG_NFS_V4_1) +static int nfs41_setup_state_renewal(struct nfs_client *clp) +{ + int status; + struct nfs_fsinfo fsinfo; + + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = jiffies; + spin_unlock(&clp->cl_lock); + + nfs4_schedule_state_renewal(clp); + } + + return status; +} + +static void nfs4_end_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int max_slots; + + if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { + struct rpc_task *task; + + task = rpc_wake_up_next(&ses->fc_slot_table. + slot_tbl_waitq); + if (!task) + break; + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + } + spin_unlock(&ses->fc_slot_table.slot_tbl_lock); + } +} + +static int nfs4_begin_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + struct nfs4_slot_table *tbl = &ses->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); + if (tbl->highest_used_slotid != -1) { + INIT_COMPLETION(ses->complete); + spin_unlock(&tbl->slot_tbl_lock); + return wait_for_completion_interruptible(&ses->complete); + } + spin_unlock(&tbl->slot_tbl_lock); + return 0; +} + +int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + + nfs4_begin_drain_session(clp); + status = nfs4_proc_exchange_id(clp, cred); + if (status != 0) + goto out; + status = nfs4_proc_create_session(clp); + if (status != 0) + goto out; + nfs41_setup_state_renewal(clp); + nfs_mark_client_ready(clp, NFS_CS_READY); +out: + return status; +} + struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) { struct rpc_cred *cred; @@ -693,16 +766,21 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) return new; } -void nfs_free_seqid(struct nfs_seqid *seqid) +void nfs_release_seqid(struct nfs_seqid *seqid) { if (!list_empty(&seqid->list)) { struct rpc_sequence *sequence = seqid->sequence->sequence; spin_lock(&sequence->lock); - list_del(&seqid->list); + list_del_init(&seqid->list); spin_unlock(&sequence->lock); rpc_wake_up(&sequence->wait); } +} + +void nfs_free_seqid(struct nfs_seqid *seqid) +{ + nfs_release_seqid(seqid); kfree(seqid); } @@ -877,6 +955,10 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out; default: printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", @@ -959,6 +1041,10 @@ restart: case -NFS4ERR_NO_GRACE: nfs4_state_mark_reclaim_nograce(sp->so_client, state); case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out_err; } nfs4_put_open_state(state); @@ -1011,6 +1097,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); } +static void nfs4_reclaim_complete(struct nfs_client *clp, + const struct nfs4_state_recovery_ops *ops) +{ + /* Notify the server we're done reclaiming our state */ + if (ops->reclaim_complete) + (void)ops->reclaim_complete(clp); +} + static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { struct nfs4_state_owner *sp; @@ -1020,6 +1114,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) return; + nfs4_reclaim_complete(clp, + nfs4_reboot_recovery_ops[clp->cl_minorversion]); + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); spin_lock(&sp->so_lock); @@ -1046,25 +1143,25 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } -static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp) -{ - clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); -} - -static void nfs4_recovery_handle_error(struct nfs_client *clp, int error) +static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { case -NFS4ERR_CB_PATH_DOWN: nfs_handle_cb_pathdown(clp); - break; + return 0; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + return 0; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_end_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_EXPIRED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); nfs4_state_start_reclaim_nograce(clp); + break; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: @@ -1072,8 +1169,11 @@ static void nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_SEQ_FALSE_RETRY: case -NFS4ERR_SEQ_MISORDERED: - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + return 0; } + return error; } static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) @@ -1093,8 +1193,7 @@ restart: if (status < 0) { set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); - nfs4_recovery_handle_error(clp, status); - return status; + return nfs4_recovery_handle_error(clp, status); } nfs4_put_state_owner(sp); goto restart; @@ -1124,8 +1223,7 @@ static int nfs4_check_lease(struct nfs_client *clp) status = ops->renew_lease(clp, cred); put_rpccred(cred); out: - nfs4_recovery_handle_error(clp, status); - return status; + return nfs4_recovery_handle_error(clp, status); } static int nfs4_reclaim_lease(struct nfs_client *clp) @@ -1151,55 +1249,59 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) } #ifdef CONFIG_NFS_V4_1 -static void nfs4_session_recovery_handle_error(struct nfs_client *clp, int err) +void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { - switch (err) { - case -NFS4ERR_STALE_CLIENTID: + if (!flags) + return; + else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) { set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); - } + nfs4_state_start_reclaim_reboot(clp); + nfs4_schedule_state_recovery(clp); + } else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + SEQ4_STATUS_ADMIN_STATE_REVOKED | + SEQ4_STATUS_RECALLABLE_STATE_REVOKED | + SEQ4_STATUS_LEASE_MOVED)) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + nfs4_schedule_state_recovery(clp); + } else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + SEQ4_STATUS_BACKCHANNEL_FAULT | + SEQ4_STATUS_CB_PATH_DOWN_SESSION)) + nfs_expire_all_delegations(clp); } static int nfs4_reset_session(struct nfs_client *clp) { int status; + nfs4_begin_drain_session(clp); status = nfs4_proc_destroy_session(clp->cl_session); if (status && status != -NFS4ERR_BADSESSION && status != -NFS4ERR_DEADSESSION) { - nfs4_session_recovery_handle_error(clp, status); + status = nfs4_recovery_handle_error(clp, status); goto out; } memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); - status = nfs4_proc_create_session(clp, 1); + status = nfs4_proc_create_session(clp); if (status) - nfs4_session_recovery_handle_error(clp, status); - /* fall through*/ -out: - /* Wake up the next rpc task even on error */ - rpc_wake_up_next(&clp->cl_session->fc_slot_table.slot_tbl_waitq); - return status; -} + status = nfs4_recovery_handle_error(clp, status); -static int nfs4_initialize_session(struct nfs_client *clp) -{ - int status; +out: + /* + * Let the state manager reestablish state + */ + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + status == 0) + nfs41_setup_state_renewal(clp); - status = nfs4_proc_create_session(clp, 0); - if (!status) { - nfs_mark_client_ready(clp, NFS_CS_READY); - } else if (status == -NFS4ERR_STALE_CLIENTID) { - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - set_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state); - } else { - nfs_mark_client_ready(clp, status); - } return status; } + #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } -static int nfs4_initialize_session(struct nfs_client *clp) { return 0; } +static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } #endif /* CONFIG_NFS_V4_1 */ /* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors @@ -1234,7 +1336,8 @@ static void nfs4_state_manager(struct nfs_client *clp) status = nfs4_reclaim_lease(clp); if (status) { nfs4_set_lease_expired(clp, status); - if (status == -EAGAIN) + if (test_bit(NFS4CLNT_LEASE_EXPIRED, + &clp->cl_state)) continue; if (clp->cl_cons_state == NFS_CS_SESSION_INITING) @@ -1242,57 +1345,54 @@ static void nfs4_state_manager(struct nfs_client *clp) goto out_error; } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); } if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { status = nfs4_check_lease(clp); - if (status != 0) + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) continue; + if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN) + goto out_error; } + /* Initialize or reset the session */ - if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state) + if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) && nfs4_has_session(clp)) { - if (clp->cl_cons_state == NFS_CS_SESSION_INITING) - status = nfs4_initialize_session(clp); - else - status = nfs4_reset_session(clp); - if (status) { - if (status == -NFS4ERR_STALE_CLIENTID) - continue; + status = nfs4_reset_session(clp); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + continue; + if (status < 0) goto out_error; - } } + /* First recover reboot state... */ - if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { status = nfs4_do_reclaim(clp, nfs4_reboot_recovery_ops[clp->cl_minorversion]); - if (status == -NFS4ERR_STALE_CLIENTID) - continue; - if (test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) continue; nfs4_state_end_reclaim_reboot(clp); - continue; + if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; } /* Now recover expired state... */ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { status = nfs4_do_reclaim(clp, nfs4_nograce_recovery_ops[clp->cl_minorversion]); - if (status < 0) { - set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); - if (status == -NFS4ERR_STALE_CLIENTID) - continue; - if (status == -NFS4ERR_EXPIRED) - continue; - if (test_bit(NFS4CLNT_SESSION_SETUP, - &clp->cl_state)) - continue; + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + continue; + if (status < 0) goto out_error; - } else - nfs4_state_end_reclaim_nograce(clp); - continue; } + nfs4_end_drain_session(clp); if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); continue; @@ -1309,8 +1409,7 @@ static void nfs4_state_manager(struct nfs_client *clp) out_error: printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" " with error %d\n", clp->cl_hostname, -status); - if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) - nfs4_state_end_reclaim_reboot(clp); + nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 20b4e30e6c82..e437fd6a819f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -46,11 +46,13 @@ #include <linux/proc_fs.h> #include <linux/kdev_t.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/msg_prot.h> #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> #include "nfs4_fs.h" +#include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -134,7 +136,7 @@ static int nfs4_stat_to_errno(int); #define decode_lookup_maxsz (op_decode_hdr_maxsz) #define encode_share_access_maxsz \ (2) -#define encode_createmode_maxsz (1 + encode_attrs_maxsz) +#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz) #define encode_opentype_maxsz (1 + encode_createmode_maxsz) #define encode_claim_null_maxsz (1 + nfs4_name_maxsz) #define encode_open_maxsz (op_encode_hdr_maxsz + \ @@ -299,6 +301,8 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4) #define decode_sequence_maxsz (op_decode_hdr_maxsz + \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) +#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) +#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -676,6 +680,25 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putrootfh_maxsz + \ decode_fsinfo_maxsz) +#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_reclaim_complete_maxsz) +#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) + +const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + + encode_sequence_maxsz + + encode_putfh_maxsz + + encode_getattr_maxsz) * + XDR_UNIT); + +const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz) * + XDR_UNIT); #endif /* CONFIG_NFS_V4_1 */ static const umode_t nfs_type2fmt[] = { @@ -1140,6 +1163,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { __be32 *p; + struct nfs_client *clp; p = reserve_space(xdr, 4); switch(arg->open_flags & O_EXCL) { @@ -1148,8 +1172,23 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op encode_attrs(xdr, arg->u.attrs, arg->server); break; default: - *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); - encode_nfs4_verifier(xdr, &arg->u.verifier); + clp = arg->server->nfs_client; + if (clp->cl_minorversion > 0) { + if (nfs4_has_persistent_session(clp)) { + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); + } else { + struct iattr dummy; + + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); + encode_nfs4_verifier(xdr, &arg->u.verifier); + dummy.ia_valid = 0; + encode_attrs(xdr, &dummy, arg->server); + } + } else { + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, &arg->u.verifier); + } } } @@ -1592,6 +1631,19 @@ static void encode_destroy_session(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_destroy_session_maxsz; } + +static void encode_reclaim_complete(struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE); + *p++ = cpu_to_be32(args->one_fs); + hdr->nops++; + hdr->replen += decode_reclaim_complete_maxsz; +} #endif /* CONFIG_NFS_V4_1 */ static void encode_sequence(struct xdr_stream *xdr, @@ -2096,7 +2148,7 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, encode_compound_hdr(&xdr, req, &hdr); encode_sequence(&xdr, &args->seq_args, &hdr); encode_putfh(&xdr, args->fh, &hdr); - replen = hdr.replen + nfs4_fattr_bitmap_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, @@ -2420,6 +2472,26 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, encode_nops(&hdr); return 0; } + +/* + * a RECLAIM_COMPLETE request + */ +static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, + struct nfs41_reclaim_complete_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args) + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_reclaim_complete(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -4528,6 +4600,11 @@ static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) { return decode_op_hdr(xdr, OP_DESTROY_SESSION); } + +static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE); +} #endif /* CONFIG_NFS_V4_1 */ static int decode_sequence(struct xdr_stream *xdr, @@ -4583,8 +4660,8 @@ static int decode_sequence(struct xdr_stream *xdr, dummy = be32_to_cpup(p++); /* target highest slot id - currently not processed */ dummy = be32_to_cpup(p++); - /* result flags - currently not processed */ - dummy = be32_to_cpup(p); + /* result flags */ + res->sr_status_flags = be32_to_cpup(p); status = 0; out_err: res->sr_status = status; @@ -5309,7 +5386,7 @@ out: } /* - * FSINFO request + * Decode FSINFO response */ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_res *res) @@ -5330,7 +5407,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, } /* - * PATHCONF request + * Decode PATHCONF response */ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs4_pathconf_res *res) @@ -5351,7 +5428,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, } /* - * STATFS request + * Decode STATFS response */ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs4_statfs_res *res) @@ -5372,7 +5449,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, } /* - * GETATTR_BITMAP request + * Decode GETATTR_BITMAP response */ static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) { @@ -5411,7 +5488,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) } /* - * a SETCLIENTID request + * Decode SETCLIENTID response */ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) @@ -5428,7 +5505,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, } /* - * a SETCLIENTID_CONFIRM request + * Decode SETCLIENTID_CONFIRM response */ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) { @@ -5448,7 +5525,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str } /* - * DELEGRETURN request + * Decode DELEGRETURN response */ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) { @@ -5474,7 +5551,7 @@ out: } /* - * FS_LOCATIONS request + * Decode FS_LOCATIONS response */ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_res *res) @@ -5504,7 +5581,7 @@ out: #if defined(CONFIG_NFS_V4_1) /* - * EXCHANGE_ID request + * Decode EXCHANGE_ID response */ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, void *res) @@ -5521,7 +5598,7 @@ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, } /* - * a CREATE_SESSION request + * Decode CREATE_SESSION response */ static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, struct nfs41_create_session_res *res) @@ -5538,7 +5615,7 @@ static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, } /* - * a DESTROY_SESSION request + * Decode DESTROY_SESSION response */ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, void *dummy) @@ -5555,7 +5632,7 @@ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, } /* - * a SEQUENCE request + * Decode SEQUENCE response */ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_sequence_res *res) @@ -5572,7 +5649,7 @@ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, } /* - * a GET_LEASE_TIME request + * Decode GET_LEASE_TIME response */ static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_get_lease_time_res *res) @@ -5591,6 +5668,25 @@ static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, status = decode_fsinfo(&xdr, res->lr_fsinfo); return status; } + +/* + * Decode RECLAIM_COMPLETE response + */ +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs41_reclaim_complete_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (!status) + status = decode_reclaim_complete(&xdr, (void *)NULL); + return status; +} #endif /* CONFIG_NFS_V4_1 */ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) @@ -5767,6 +5863,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), PROC(SEQUENCE, enc_sequence, dec_sequence), PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 12c9e66d3f1d..db9b360ae19d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -356,25 +356,19 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data struct nfs_readres *resp = &data->res; if (resp->eof || resp->count == argp->count) - goto out; + return; /* This is a short read! */ nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? */ if (resp->count == 0) - goto out; + return; /* Yes, so retry the read at the end of the data */ argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; - nfs4_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); - return; -out: - nfs4_sequence_free_slot(NFS_SERVER(data->inode)->nfs_client, - &data->res.seq_res); - return; - + nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); } /* diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 90be551b80c1..ce907efc5508 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -175,14 +175,16 @@ static const match_table_t nfs_mount_option_tokens = { }; enum { - Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma, + Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, Opt_xprt_err }; static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_udp, "udp" }, + { Opt_xprt_udp6, "udp6" }, { Opt_xprt_tcp, "tcp" }, + { Opt_xprt_tcp6, "tcp6" }, { Opt_xprt_rdma, "rdma" }, { Opt_xprt_err, NULL } @@ -492,6 +494,45 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) return sec_flavours[i].str; } +static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address; + + seq_printf(m, ",mountproto="); + switch (sap->sa_family) { + case AF_INET: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + case AF_INET6: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP6); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP6); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } +} + static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) { @@ -505,7 +546,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, } case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - seq_printf(m, ",mountaddr=%pI6", &sin6->sin6_addr); + seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr); break; } default: @@ -518,17 +559,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->mountd_port || showdefaults) seq_printf(m, ",mountport=%u", nfss->mountd_port); - switch (nfss->mountd_protocol) { - case IPPROTO_UDP: - seq_printf(m, ",mountproto=udp"); - break; - case IPPROTO_TCP: - seq_printf(m, ",mountproto=tcp"); - break; - default: - if (showdefaults) - seq_printf(m, ",mountproto=auto"); - } + nfs_show_mountd_netid(m, nfss, showdefaults); } /* @@ -578,7 +609,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_puts(m, nfs_infop->nostr); } seq_printf(m, ",proto=%s", - rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); + rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); if (version == 4) { if (nfss->port != NFS_PORT) seq_printf(m, ",port=%u", nfss->port); @@ -714,8 +745,6 @@ static void nfs_umount_begin(struct super_block *sb) struct nfs_server *server; struct rpc_clnt *rpc; - lock_kernel(); - server = NFS_SB(sb); /* -EIO all pending I/O */ rpc = server->client_acl; @@ -724,8 +753,6 @@ static void nfs_umount_begin(struct super_block *sb) rpc = server->client; if (!IS_ERR(rpc)) rpc_killall_tasks(rpc); - - unlock_kernel(); } static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) @@ -734,8 +761,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { - data->rsize = NFS_MAX_FILE_IO_SIZE; - data->wsize = NFS_MAX_FILE_IO_SIZE; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; @@ -887,6 +912,8 @@ static int nfs_parse_mount_options(char *raw, { char *p, *string, *secdata; int rc, sloppy = 0, invalid_option = 0; + unsigned short protofamily = AF_UNSPEC; + unsigned short mountfamily = AF_UNSPEC; if (!raw) { dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); @@ -1232,12 +1259,17 @@ static int nfs_parse_mount_options(char *raw, token = match_token(string, nfs_xprt_protocol_tokens, args); + protofamily = AF_INET; switch (token) { + case Opt_xprt_udp6: + protofamily = AF_INET6; case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; kfree(string); break; + case Opt_xprt_tcp6: + protofamily = AF_INET6; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; @@ -1265,10 +1297,15 @@ static int nfs_parse_mount_options(char *raw, nfs_xprt_protocol_tokens, args); kfree(string); + mountfamily = AF_INET; switch (token) { + case Opt_xprt_udp6: + mountfamily = AF_INET6; case Opt_xprt_udp: mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; break; + case Opt_xprt_tcp6: + mountfamily = AF_INET6; case Opt_xprt_tcp: mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; break; @@ -1367,8 +1404,33 @@ static int nfs_parse_mount_options(char *raw, if (!sloppy && invalid_option) return 0; + /* + * verify that any proto=/mountproto= options match the address + * familiies in the addr=/mountaddr= options. + */ + if (protofamily != AF_UNSPEC && + protofamily != mnt->nfs_server.address.ss_family) + goto out_proto_mismatch; + + if (mountfamily != AF_UNSPEC) { + if (mnt->mount_server.addrlen) { + if (mountfamily != mnt->mount_server.address.ss_family) + goto out_mountproto_mismatch; + } else { + if (mountfamily != mnt->nfs_server.address.ss_family) + goto out_mountproto_mismatch; + } + } + return 1; +out_mountproto_mismatch: + printk(KERN_INFO "NFS: mount server address does not match mountproto= " + "option\n"); + return 0; +out_proto_mismatch: + printk(KERN_INFO "NFS: server address does not match proto= option\n"); + return 0; out_invalid_address: printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); return 0; @@ -1881,7 +1943,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) if (data == NULL) return -ENOMEM; - lock_kernel(); /* fill out struct with values from existing mount */ data->flags = nfss->flags; data->rsize = nfss->rsize; @@ -1907,7 +1968,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) error = nfs_compare_remount_data(nfss, data); out: kfree(data); - unlock_kernel(); return error; } diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 1064c91ae810..6da3d3ff6edd 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -83,7 +83,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) struct inode *dir = data->dir; if (!NFS_PROTO(dir)->unlink_done(task, dir)) - nfs4_restart_rpc(task, NFS_SERVER(dir)->nfs_client); + nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); } /** diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c84b5cc1a943..d171696017f4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -774,7 +774,7 @@ int nfs_updatepage(struct file *file, struct page *page, */ if (nfs_write_pageuptodate(page, inode) && inode->i_flock == NULL && - !(file->f_flags & O_SYNC)) { + !(file->f_flags & O_DSYNC)) { count = max(count + offset, nfs_page_length(page)); offset = 0; } @@ -1216,7 +1216,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ argp->stable = NFS_FILE_SYNC; } - nfs4_restart_rpc(task, server->nfs_client); + nfs_restart_rpc(task, server->nfs_client); return -EAGAIN; } if (time_before(complain, jiffies)) { @@ -1228,7 +1228,6 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Can't do anything about it except throw an error. */ task->tk_status = -EIO; } - nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res); return 0; } @@ -1612,15 +1611,16 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, if (ret) goto out_unlock; page_cache_get(newpage); + spin_lock(&mapping->host->i_lock); req->wb_page = newpage; SetPagePrivate(newpage); - set_page_private(newpage, page_private(page)); + set_page_private(newpage, (unsigned long)req); ClearPagePrivate(page); set_page_private(page, 0); + spin_unlock(&mapping->host->i_lock); page_cache_release(page); out_unlock: nfs_clear_page_tag_locked(req); - nfs_release_request(req); out: return ret; } diff --git a/fs/nfsctl.c b/fs/nfsctl.c index 8f9a20556f79..d3854d94b7cf 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -7,8 +7,6 @@ #include <linux/types.h> #include <linux/file.h> #include <linux/fs.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> #include <linux/nfsd/syscall.h> #include <linux/cred.h> #include <linux/sched.h> diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 36fcabbf5186..79717a40daba 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -1,15 +1,7 @@ -/* - * linux/fs/nfsd/auth.c - * - * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> - */ +/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/types.h> #include <linux/sched.h> -#include <linux/sunrpc/svc.h> -#include <linux/sunrpc/svcauth.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/export.h> +#include "nfsd.h" #include "auth.h" int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h new file mode 100644 index 000000000000..d892be61016c --- /dev/null +++ b/fs/nfsd/cache.h @@ -0,0 +1,83 @@ +/* + * Request reply cache. This was heavily inspired by the + * implementation in 4.3BSD/4.4BSD. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef NFSCACHE_H +#define NFSCACHE_H + +#include <linux/sunrpc/svc.h> + +/* + * Representation of a reply cache entry. + */ +struct svc_cacherep { + struct hlist_node c_hash; + struct list_head c_lru; + + unsigned char c_state, /* unused, inprog, done */ + c_type, /* status, buffer */ + c_secure : 1; /* req came from port < 1024 */ + struct sockaddr_in c_addr; + __be32 c_xid; + u32 c_prot; + u32 c_proc; + u32 c_vers; + unsigned long c_timestamp; + union { + struct kvec u_vec; + __be32 u_status; + } c_u; +}; + +#define c_replvec c_u.u_vec +#define c_replstat c_u.u_status + +/* cache entry states */ +enum { + RC_UNUSED, + RC_INPROG, + RC_DONE +}; + +/* return values */ +enum { + RC_DROPIT, + RC_REPLY, + RC_DOIT, + RC_INTR +}; + +/* + * Cache types. + * We may want to add more types one day, e.g. for diropres and + * attrstat replies. Using cache entries with fixed length instead + * of buffer pointers may be more efficient. + */ +enum { + RC_NOCACHE, + RC_REPLSTAT, + RC_REPLBUFF, +}; + +/* + * If requests are retransmitted within this interval, they're dropped. + */ +#define RC_DELAY (HZ/5) + +int nfsd_reply_cache_init(void); +void nfsd_reply_cache_shutdown(void); +int nfsd_cache_lookup(struct svc_rqst *, int); +void nfsd_cache_update(struct svc_rqst *, int, __be32 *); + +#ifdef CONFIG_NFSD_V4 +void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); +#else /* CONFIG_NFSD_V4 */ +static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) +{ +} +#endif /* CONFIG_NFSD_V4 */ + +#endif /* NFSCACHE_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c1c9e035d4a4..c487810a2366 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1,7 +1,5 @@ #define MSNFS /* HACK HACK */ /* - * linux/fs/nfsd/export.c - * * NFS exporting and validation. * * We maintain a list of clients, each of which has a list of @@ -14,29 +12,16 @@ * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> */ -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/stat.h> -#include <linux/in.h> -#include <linux/seq_file.h> -#include <linux/syscalls.h> -#include <linux/rwsem.h> -#include <linux/dcache.h> #include <linux/namei.h> -#include <linux/mount.h> -#include <linux/hash.h> #include <linux/module.h> #include <linux/exportfs.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/nfsfh.h> #include <linux/nfsd/syscall.h> -#include <linux/lockd/bind.h> -#include <linux/sunrpc/msg_prot.h> -#include <linux/sunrpc/gss_api.h> #include <net/ipv6.h> +#include "nfsd.h" +#include "nfsfh.h" + #define NFSDDBG_FACILITY NFSDDBG_EXPORT typedef struct auth_domain svc_client; @@ -369,16 +354,25 @@ static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); -static int check_export(struct inode *inode, int flags, unsigned char *uuid) +static int check_export(struct inode *inode, int *flags, unsigned char *uuid) { - /* We currently export only dirs and regular files. - * This is what umountd does. + /* + * We currently export only dirs, regular files, and (for v4 + * pseudoroot) symlinks. */ if (!S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode) && !S_ISREG(inode->i_mode)) return -ENOTDIR; + /* + * Mountd should never pass down a writeable V4ROOT export, but, + * just to make sure: + */ + if (*flags & NFSEXP_V4ROOT) + *flags |= NFSEXP_READONLY; + /* There are two requirements on a filesystem to be exportable. * 1: We must be able to identify the filesystem from a number. * either a device number (so FS_REQUIRES_DEV needed) @@ -387,7 +381,7 @@ static int check_export(struct inode *inode, int flags, unsigned char *uuid) * This means that s_export_op must be set. */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && - !(flags & NFSEXP_FSID) && + !(*flags & NFSEXP_FSID) && uuid == NULL) { dprintk("exp_export: export of non-dev fs without fsid\n"); return -EINVAL; @@ -602,7 +596,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out4; } - err = check_export(exp.ex_path.dentry->d_inode, exp.ex_flags, + err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; @@ -1041,7 +1035,7 @@ exp_export(struct nfsctl_export *nxp) goto finish; } - err = check_export(path.dentry->d_inode, nxp->ex_flags, NULL); + err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL); if (err) goto finish; err = -ENOMEM; @@ -1320,6 +1314,23 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) return exp; } +static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) +{ + struct svc_export *exp; + u32 fsidv[2]; + + mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); + + exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); + /* + * We shouldn't have accepting an nfsv4 request at all if we + * don't have a pseudoexport!: + */ + if (IS_ERR(exp) && PTR_ERR(exp) == -ENOENT) + exp = ERR_PTR(-ESERVERFAULT); + return exp; +} + /* * Called when we need the filehandle for the root of the pseudofs, * for a given NFSv4 client. The root is defined to be the @@ -1330,11 +1341,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) { struct svc_export *exp; __be32 rv; - u32 fsidv[2]; - mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); - - exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); + exp = find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); @@ -1425,6 +1433,7 @@ static struct flags { { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_V4ROOT, {"v4root", ""}}, #ifdef MSNFS { NFSEXP_MSNFS, {"msnfs", ""}}, #endif diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index b2786a5f9afe..0c6d81670137 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/lockd.c - * * This file contains all the stubs needed when communicating with lockd. * This level of indirection is necessary so we can run nfsd+lockd without * requiring the nfs client to be compiled in/loaded, and vice versa. @@ -8,14 +6,10 @@ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ -#include <linux/types.h> -#include <linux/fs.h> #include <linux/file.h> -#include <linux/mount.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> #include <linux/lockd/bind.h> +#include "nfsd.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_LOCKD diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 4e3219e84116..f20589d2ae27 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -1,19 +1,15 @@ /* - * linux/fs/nfsd/nfs2acl.c - * * Process version 2 NFSACL requests. * * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> */ -#include <linux/sunrpc/svc.h> -#include <linux/nfs.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr.h> -#include <linux/nfsd/xdr3.h> -#include <linux/posix_acl.h> +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ #include <linux/nfsacl.h> +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC #define RETURN_STATUS(st) { resp->status = (st); return (st); } @@ -217,6 +213,16 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, * XDR encode functions */ +/* + * There must be an encoding function for void results so svc_process + * will work properly. + */ +int +nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + /* GETACL */ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclres *resp) @@ -308,7 +314,6 @@ static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p, } #define nfsaclsvc_decode_voidargs NULL -#define nfsaclsvc_encode_voidres NULL #define nfsaclsvc_release_void NULL #define nfsd3_fhandleargs nfsd_fhandle #define nfsd3_attrstatres nfsd_attrstat @@ -346,5 +351,5 @@ struct svc_version nfsd_acl_version2 = { .vs_proc = nfsd_acl_procedures2, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 1, + .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9981dbb377a3..e0c4846bad92 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -1,18 +1,15 @@ /* - * linux/fs/nfsd/nfs3acl.c - * * Process version 3 NFSACL requests. * * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> */ -#include <linux/sunrpc/svc.h> -#include <linux/nfs3.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr3.h> -#include <linux/posix_acl.h> +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ #include <linux/nfsacl.h> +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" #define RETURN_STATUS(st) { resp->status = (st); return (st); } @@ -264,6 +261,6 @@ struct svc_version nfsd_acl_version3 = { .vs_proc = nfsd_acl_procedures3, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 1, + .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index a713c418a922..3d68f45a37b9 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -1,30 +1,16 @@ /* - * linux/fs/nfsd/nfs3proc.c - * * Process version 3 NFS requests. * * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/linkage.h> -#include <linux/time.h> -#include <linux/errno.h> #include <linux/fs.h> #include <linux/ext2_fs.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/net.h> -#include <linux/in.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/major.h> #include <linux/magic.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr3.h> -#include <linux/nfs3.h> +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index d0a2ce1b4324..2a533a0af2a9 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfs3xdr.c - * * XDR support for nfsd/protocol version 3. * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> @@ -8,19 +6,8 @@ * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()! */ -#include <linux/types.h> -#include <linux/time.h> -#include <linux/nfs3.h> -#include <linux/list.h> -#include <linux/spinlock.h> -#include <linux/dcache.h> #include <linux/namei.h> -#include <linux/mm.h> -#include <linux/vfs.h> -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/xdr3.h> +#include "xdr3.h" #include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 725d02f210e2..88150685df34 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -1,6 +1,4 @@ /* - * fs/nfs4acl/acl.c - * * Common NFSv4 ACL handling code. * * Copyright (c) 2002, 2003 The Regents of the University of Michigan. @@ -36,15 +34,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/module.h> #include <linux/nfs_fs.h> -#include <linux/posix_acl.h> -#include <linux/nfs4.h> #include <linux/nfs4_acl.h> @@ -389,7 +379,7 @@ sort_pacl(struct posix_acl *pacl) sort_pacl_range(pacl, 1, i-1); BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); - j = i++; + j = ++i; while (pacl->a_entries[j].e_tag == ACL_GROUP) j++; sort_pacl_range(pacl, i, j-1); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 24e8d78f8dde..c6eed2a3b093 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfs4callback.c - * * Copyright (c) 2001 The Regents of the University of Michigan. * All rights reserved. * @@ -33,22 +31,9 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <linux/module.h> -#include <linux/list.h> -#include <linux/inet.h> -#include <linux/errno.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/kthread.h> -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svcsock.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/state.h> -#include <linux/sunrpc/sched.h> -#include <linux/nfs4.h> -#include <linux/sunrpc/xprtsock.h> +#include "nfsd.h" +#include "state.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index ba2c199592fd..6e2983b27f3c 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -1,6 +1,4 @@ /* - * fs/nfsd/nfs4idmap.c - * * Mapping of UID/GIDs to name and vice versa. * * Copyright (c) 2002, 2003 The Regents of the University of @@ -35,22 +33,9 @@ */ #include <linux/module.h> -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/sunrpc/clnt.h> -#include <linux/nfs.h> -#include <linux/nfs4.h> -#include <linux/nfs_fs.h> -#include <linux/nfs_page.h> -#include <linux/sunrpc/cache.h> #include <linux/nfsd_idmap.h> -#include <linux/list.h> -#include <linux/time.h> #include <linux/seq_file.h> -#include <linux/sunrpc/svcauth.h> +#include <linux/sched.h> /* * Cache entry diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index bebc0c2e1b0a..37514c469846 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1,6 +1,4 @@ /* - * fs/nfsd/nfs4proc.c - * * Server-side procedures for NFSv4. * * Copyright (c) 2002 The Regents of the University of Michigan. @@ -34,20 +32,11 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -#include <linux/param.h> -#include <linux/major.h> -#include <linux/slab.h> #include <linux/file.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfs4.h> -#include <linux/nfsd/state.h> -#include <linux/nfsd/xdr4.h> -#include <linux/nfs4_acl.h> -#include <linux/sunrpc/gss_api.h> +#include "cache.h" +#include "xdr4.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -170,7 +159,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); - if (open->op_share_deny & NFS4_SHARE_DENY_WRITE) + if (open->op_share_deny & NFS4_SHARE_DENY_READ) accmode |= NFSD_MAY_WRITE; status = fh_verify(rqstp, current_fh, S_IFREG, accmode); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index b5348405046b..5a754f7b71ed 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -1,6 +1,4 @@ /* -* linux/fs/nfsd/nfs4recover.c -* * Copyright (c) 2004 The Regents of the University of Michigan. * All rights reserved. * @@ -33,20 +31,14 @@ * */ -#include <linux/err.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfs4.h> -#include <linux/nfsd/state.h> -#include <linux/nfsd/xdr4.h> -#include <linux/param.h> #include <linux/file.h> #include <linux/namei.h> -#include <asm/uaccess.h> -#include <linux/scatterlist.h> #include <linux/crypto.h> #include <linux/sched.h> -#include <linux/mount.h> + +#include "nfsd.h" +#include "state.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2153f9bdbebd..f19ed866c95f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1,6 +1,4 @@ /* -* linux/fs/nfsd/nfs4state.c -* * Copyright (c) 2001 The Regents of the University of Michigan. * All rights reserved. * @@ -34,28 +32,14 @@ * */ -#include <linux/param.h> -#include <linux/major.h> -#include <linux/slab.h> - -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> #include <linux/file.h> -#include <linux/mount.h> -#include <linux/workqueue.h> #include <linux/smp_lock.h> -#include <linux/kthread.h> -#include <linux/nfs4.h> -#include <linux/nfsd/state.h> -#include <linux/nfsd/xdr4.h> #include <linux/namei.h> #include <linux/swap.h> -#include <linux/mutex.h> -#include <linux/lockd/bind.h> -#include <linux/module.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/clnt.h> +#include "xdr4.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -477,13 +461,14 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) /* * fchan holds the client values on input, and the server values on output + * sv_max_mesg is the maximum payload plus one page for overhead. */ static int init_forechannel_attrs(struct svc_rqst *rqstp, struct nfsd4_channel_attrs *session_fchan, struct nfsd4_channel_attrs *fchan) { int status = 0; - __u32 maxcount = svc_max_payload(rqstp); + __u32 maxcount = nfsd_serv->sv_max_mesg; /* headerpadsz set to zero in encode routine */ @@ -523,6 +508,15 @@ free_session_slots(struct nfsd4_session *ses) kfree(ses->se_slots[i]); } +/* + * We don't actually need to cache the rpc and session headers, so we + * can allocate a little less for each slot: + */ +static inline int slot_bytes(struct nfsd4_channel_attrs *ca) +{ + return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; +} + static int alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) @@ -554,7 +548,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, memcpy(new, &tmp, sizeof(*new)); /* allocate each struct nfsd4_slot and data cache in one piece */ - cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + cachesize = slot_bytes(&new->se_fchannel); for (i = 0; i < new->se_fchannel.maxreqs; i++) { sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); if (!sp) @@ -628,10 +622,12 @@ void free_session(struct kref *kref) { struct nfsd4_session *ses; + int mem; ses = container_of(kref, struct nfsd4_session, se_ref); spin_lock(&nfsd_drc_lock); - nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE; + mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel); + nfsd_drc_mem_used -= mem; spin_unlock(&nfsd_drc_lock); free_session_slots(ses); kfree(ses); @@ -2404,11 +2400,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); - dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n", - dp->dl_stateid.si_boot, - dp->dl_stateid.si_stateownerid, - dp->dl_stateid.si_fileid, - dp->dl_stateid.si_generation); + dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", + STATEID_VAL(&dp->dl_stateid)); out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE @@ -2498,9 +2491,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs_ok; - dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n", - stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, - stp->st_stateid.si_fileid, stp->st_stateid.si_generation); + dprintk("%s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(&stp->st_stateid)); out: if (fp) put_nfs4_file(fp); @@ -2666,9 +2658,8 @@ STALE_STATEID(stateid_t *stateid) { if (time_after((unsigned long)boot_time, (unsigned long)stateid->si_boot)) { - dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); + dprintk("NFSD: stale stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); return 1; } return 0; @@ -2680,9 +2671,8 @@ EXPIRED_STATEID(stateid_t *stateid) if (time_before((unsigned long)boot_time, ((unsigned long)stateid->si_boot)) && time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) { - dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); + dprintk("NFSD: expired stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); return 1; } return 0; @@ -2696,9 +2686,8 @@ stateid_error_map(stateid_t *stateid) if (EXPIRED_STATEID(stateid)) return nfserr_expired; - dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); + dprintk("NFSD: bad stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); return nfserr_bad_stateid; } @@ -2884,10 +2873,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, struct svc_fh *current_fh = &cstate->current_fh; __be32 status; - dprintk("NFSD: preprocess_seqid_op: seqid=%d " - "stateid = (%08x/%08x/%08x/%08x)\n", seqid, - stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid, - stateid->si_generation); + dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, + seqid, STATEID_VAL(stateid)); *stpp = NULL; *sopp = NULL; @@ -3019,12 +3006,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, sop->so_confirmed = 1; update_stateid(&stp->st_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); - dprintk("NFSD: nfsd4_open_confirm: success, seqid=%d " - "stateid=(%08x/%08x/%08x/%08x)\n", oc->oc_seqid, - stp->st_stateid.si_boot, - stp->st_stateid.si_stateownerid, - stp->st_stateid.si_fileid, - stp->st_stateid.si_generation); + dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); nfsd4_create_clid_dir(sop->so_client); out: @@ -3283,9 +3266,8 @@ find_delegation_stateid(struct inode *ino, stateid_t *stid) struct nfs4_file *fp; struct nfs4_delegation *dl; - dprintk("NFSD:find_delegation_stateid stateid=(%08x/%08x/%08x/%08x)\n", - stid->si_boot, stid->si_stateownerid, - stid->si_fileid, stid->si_generation); + dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(stid)); fp = find_file(ino); if (!fp) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0fbd50cee1f6..a8587e90fd5a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -40,24 +40,16 @@ * at the end of nfs4svc_decode_compoundargs. */ -#include <linux/param.h> -#include <linux/smp.h> -#include <linux/fs.h> #include <linux/namei.h> -#include <linux/vfs.h> +#include <linux/statfs.h> #include <linux/utsname.h> -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/svc.h> -#include <linux/sunrpc/clnt.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/state.h> -#include <linux/nfsd/xdr4.h> #include <linux/nfsd_idmap.h> -#include <linux/nfs4.h> #include <linux/nfs4_acl.h> -#include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/svcauth_gss.h> +#include "xdr4.h" +#include "vfs.h" + #define NFSDDBG_FACILITY NFSDDBG_XDR /* @@ -2204,11 +2196,14 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, * we will not follow the cross mount and will fill the attribtutes * directly from the mountpoint dentry. */ - if (d_mountpoint(dentry) && !attributes_need_mount(cd->rd_bmval)) - ignore_crossmnt = 1; - else if (d_mountpoint(dentry)) { + if (nfsd_mountpoint(dentry, exp)) { int err; + if (!(exp->ex_flags & NFSEXP_V4ROOT) + && !attributes_need_mount(cd->rd_bmval)) { + ignore_crossmnt = 1; + goto out_encode; + } /* * Why the heck aren't we just using nfsd_lookup?? * Different "."/".." handling? Something else? @@ -2224,6 +2219,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, goto out_put; } +out_encode: nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, cd->rd_rqstp, ignore_crossmnt); out_put: diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 4638635c5d87..da08560c4818 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfscache.c - * * Request reply cache. This is currently a global cache, but this may * change in the future and be a per-client cache. * @@ -10,16 +8,8 @@ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/spinlock.h> -#include <linux/list.h> - -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> +#include "nfsd.h" +#include "cache.h" /* Size of reply cache. Common values are: * 4.3BSD: 128 diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5c01fc148ce8..2604c3e70ea5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1,46 +1,20 @@ /* - * linux/fs/nfsd/nfsctl.c - * * Syscall interface to knfsd. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/module.h> - -#include <linux/linkage.h> -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/fs.h> #include <linux/namei.h> -#include <linux/fcntl.h> -#include <linux/net.h> -#include <linux/in.h> -#include <linux/syscalls.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/pagemap.h> -#include <linux/init.h> -#include <linux/inet.h> -#include <linux/string.h> #include <linux/ctype.h> -#include <linux/nfs.h> #include <linux/nfsd_idmap.h> -#include <linux/lockd/bind.h> -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr.h> #include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> -#include <asm/uaccess.h> -#include <net/ipv6.h> +#include "nfsd.h" +#include "cache.h" /* * We have a single directory with 9 nodes in it. @@ -55,6 +29,7 @@ enum { NFSD_Getfd, NFSD_Getfs, NFSD_List, + NFSD_Export_features, NFSD_Fh, NFSD_FO_UnlockIP, NFSD_FO_UnlockFS, @@ -173,6 +148,24 @@ static const struct file_operations exports_operations = { .owner = THIS_MODULE, }; +static int export_features_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS); + return 0; +} + +static int export_features_open(struct inode *inode, struct file *file) +{ + return single_open(file, export_features_show, NULL); +} + +static struct file_operations export_features_operations = { + .open = export_features_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); @@ -1330,6 +1323,8 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, + [NFSD_Export_features] = {"export_features", + &export_features_operations, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_FO_UnlockFS] = {"unlock_filesystem", diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h new file mode 100644 index 000000000000..e942a1aaac92 --- /dev/null +++ b/fs/nfsd/nfsd.h @@ -0,0 +1,338 @@ +/* + * Hodge-podge collection of knfsd-related stuff. + * I will sort this out later. + * + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef LINUX_NFSD_NFSD_H +#define LINUX_NFSD_NFSD_H + +#include <linux/types.h> +#include <linux/mount.h> + +#include <linux/nfsd/debug.h> +#include <linux/nfsd/export.h> +#include <linux/nfsd/stats.h> +/* + * nfsd version + */ +#define NFSD_SUPPORTED_MINOR_VERSION 1 + +struct readdir_cd { + __be32 err; /* 0, nfserr, or nfserr_eof */ +}; + + +extern struct svc_program nfsd_program; +extern struct svc_version nfsd_version2, nfsd_version3, + nfsd_version4; +extern u32 nfsd_supported_minorversion; +extern struct mutex nfsd_mutex; +extern struct svc_serv *nfsd_serv; +extern spinlock_t nfsd_drc_lock; +extern unsigned int nfsd_drc_max_mem; +extern unsigned int nfsd_drc_mem_used; + +extern const struct seq_operations nfs_exports_op; + +/* + * Function prototypes. + */ +int nfsd_svc(unsigned short port, int nrservs); +int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); + +int nfsd_nrthreads(void); +int nfsd_nrpools(void); +int nfsd_get_nrthreads(int n, int *); +int nfsd_set_nrthreads(int n, int *); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +#ifdef CONFIG_NFSD_V2_ACL +extern struct svc_version nfsd_acl_version2; +#else +#define nfsd_acl_version2 NULL +#endif +#ifdef CONFIG_NFSD_V3_ACL +extern struct svc_version nfsd_acl_version3; +#else +#define nfsd_acl_version3 NULL +#endif +#endif + +enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; +int nfsd_vers(int vers, enum vers_op change); +int nfsd_minorversion(u32 minorversion, enum vers_op change); +void nfsd_reset_versions(void); +int nfsd_create_serv(void); + +extern int nfsd_max_blksize; + +static inline int nfsd_v4client(struct svc_rqst *rq) +{ + return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; +} + +/* + * NFSv4 State + */ +#ifdef CONFIG_NFSD_V4 +extern unsigned int max_delegations; +int nfs4_state_init(void); +void nfsd4_free_slabs(void); +int nfs4_state_start(void); +void nfs4_state_shutdown(void); +time_t nfs4_lease_time(void); +void nfs4_reset_lease(time_t leasetime); +int nfs4_reset_recoverydir(char *recdir); +#else +static inline int nfs4_state_init(void) { return 0; } +static inline void nfsd4_free_slabs(void) { } +static inline int nfs4_state_start(void) { return 0; } +static inline void nfs4_state_shutdown(void) { } +static inline time_t nfs4_lease_time(void) { return 0; } +static inline void nfs4_reset_lease(time_t leasetime) { } +static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } +#endif + +/* + * lockd binding + */ +void nfsd_lockd_init(void); +void nfsd_lockd_shutdown(void); + + +/* + * These macros provide pre-xdr'ed values for faster operation. + */ +#define nfs_ok cpu_to_be32(NFS_OK) +#define nfserr_perm cpu_to_be32(NFSERR_PERM) +#define nfserr_noent cpu_to_be32(NFSERR_NOENT) +#define nfserr_io cpu_to_be32(NFSERR_IO) +#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) +#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) +#define nfserr_acces cpu_to_be32(NFSERR_ACCES) +#define nfserr_exist cpu_to_be32(NFSERR_EXIST) +#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) +#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) +#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) +#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) +#define nfserr_inval cpu_to_be32(NFSERR_INVAL) +#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) +#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) +#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) +#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) +#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) +#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) +#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) +#define nfserr_stale cpu_to_be32(NFSERR_STALE) +#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) +#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) +#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) +#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) +#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) +#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) +#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) +#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) +#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) +#define nfserr_denied cpu_to_be32(NFSERR_DENIED) +#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) +#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) +#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_same cpu_to_be32(NFSERR_SAME) +#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) +#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE) +#define nfserr_moved cpu_to_be32(NFSERR_MOVED) +#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) +#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) +#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) +#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) +#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) +#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) +#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) +#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) +#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) +#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) +#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) +#define nfserr_grace cpu_to_be32(NFSERR_GRACE) +#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) +#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) +#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) +#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) +#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) +#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT) +#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) +#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION) +#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT) +#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY) +#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION) +#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED) +#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY) +#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER) +#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE) +#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT) +#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT) +#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE) +#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED) +#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS) +#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG) +#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG) +#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE) +#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP) +#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND) +#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS) +#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION) +#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP) +#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY) +#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE) +#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY) +#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT) +#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION) +#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP) +#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT) +#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP) +#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED) +#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE) +#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL) +#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) +#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) +#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) + +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it gets dropped. + * Client should resend eventually + */ +#define nfserr_dropit cpu_to_be32(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof cpu_to_be32(30001) +/* replay detected */ +#define nfserr_replay_me cpu_to_be32(11001) +/* nfs41 replay detected */ +#define nfserr_replay_cache cpu_to_be32(11002) + +/* Check for dir entries '.' and '..' */ +#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) + +/* + * Time of server startup + */ +extern struct timeval nfssvc_boot; + +#ifdef CONFIG_NFSD_V4 + +/* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. otherwise, + * we might process an operation with side effects, and be unable to + * tell the client that the operation succeeded. + * + * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an "ordinary" _successful_ operation. (GETATTR, + * READ, READDIR, and READLINK have their own buffer checks.) if we + * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. + * + * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an operation which has failed with NFS4ERR_RESOURCE. + * care is taken to ensure that we never fall below this level for any + * reason. + */ +#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ +#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +#define NFSD_LEASE_TIME (nfs4_lease_time()) +#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ + +/* + * The following attributes are currently not supported by the NFSv4 server: + * ARCHIVE (deprecated anyway) + * HIDDEN (unlikely to be supported any time soon) + * MIMETYPE (unlikely to be supported any time soon) + * QUOTA_* (will be supported in a forthcoming patch) + * SYSTEM (unlikely to be supported any time soon) + * TIME_BACKUP (unlikely to be supported any time soon) + * TIME_CREATE (unlikely to be supported any time soon) + */ +#define NFSD4_SUPPORTED_ATTRS_WORD0 \ +(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ + | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ + | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ + | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ + | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ + | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ + | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) + +#define NFSD4_SUPPORTED_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ + | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ + | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ + | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ + | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) + +#define NFSD4_SUPPORTED_ATTRS_WORD2 0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) + +static inline u32 nfsd_suppattrs0(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 + : NFSD4_SUPPORTED_ATTRS_WORD0; +} + +static inline u32 nfsd_suppattrs1(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 + : NFSD4_SUPPORTED_ATTRS_WORD1; +} + +static inline u32 nfsd_suppattrs2(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 + : NFSD4_SUPPORTED_ATTRS_WORD2; +} + +/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ +#define NFSD_WRITEONLY_ATTRS_WORD1 \ +(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ +#define NFSD_WRITEABLE_ATTRS_WORD0 \ +(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEABLE_ATTRS_WORD2 0 + +#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ + NFSD_WRITEABLE_ATTRS_WORD0 +/* + * we currently store the exclusive create verifier in the v_{a,m}time + * attributes so the client can't set these at create time using EXCLUSIVE4_1 + */ +#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \ + (NFSD_WRITEABLE_ATTRS_WORD1 & \ + ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)) +#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ + NFSD_WRITEABLE_ATTRS_WORD2 + +#endif /* CONFIG_NFSD_V4 */ + +#endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 01965b2f3a76..55c8e63af0be 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfsfh.c - * * NFS server file handle treatment. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> @@ -9,19 +7,11 @@ * ... and again Southern-Winter 2001 to support export_operations */ -#include <linux/slab.h> -#include <linux/fs.h> -#include <linux/unistd.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/dcache.h> #include <linux/exportfs.h> -#include <linux/mount.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcauth_gss.h> -#include <linux/nfsd/nfsd.h> +#include "nfsd.h" +#include "vfs.h" #include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_FH @@ -96,8 +86,10 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, struct svc_export *exp) { + int flags = nfsexp_flags(rqstp, exp); + /* Check if the request originated from a secure port. */ - if (!rqstp->rq_secure && EX_SECURE(exp)) { + if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk(KERN_WARNING "nfsd: request from insecure port %s!\n", @@ -109,6 +101,36 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, return nfserrno(nfsd_setuser(rqstp, exp)); } +static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, + struct dentry *dentry, struct svc_export *exp) +{ + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return nfs_ok; + /* + * v2/v3 clients have no need for the V4ROOT export--they use + * the mount protocl instead; also, further V4ROOT checks may be + * in v4-specific code, in which case v2/v3 clients could bypass + * them. + */ + if (!nfsd_v4client(rqstp)) + return nfserr_stale; + /* + * We're exposing only the directories and symlinks that have to be + * traversed on the way to real exports: + */ + if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) && + !S_ISLNK(dentry->d_inode->i_mode))) + return nfserr_stale; + /* + * A pseudoroot export gives permission to access only one + * single directory; the kernel has to make another upcall + * before granting access to anything else under it: + */ + if (unlikely(dentry != exp->ex_path.dentry)) + return nfserr_stale; + return nfs_ok; +} + /* * Use the given filehandle to look up the corresponding export and * dentry. On success, the results are used to set fh_export and @@ -232,14 +254,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) goto out; } - if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { - error = nfsd_setuser_and_check_port(rqstp, exp); - if (error) { - dput(dentry); - goto out; - } - } - if (S_ISDIR(dentry->d_inode->i_mode) && (dentry->d_flags & DCACHE_DISCONNECTED)) { printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", @@ -294,28 +308,32 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) error = nfsd_set_fh_dentry(rqstp, fhp); if (error) goto out; - dentry = fhp->fh_dentry; - exp = fhp->fh_export; - } else { - /* - * just rechecking permissions - * (e.g. nfsproc_create calls fh_verify, then nfsd_create - * does as well) - */ - dprintk("nfsd: fh_verify - just checking\n"); - dentry = fhp->fh_dentry; - exp = fhp->fh_export; - /* - * Set user creds for this exportpoint; necessary even - * in the "just checking" case because this may be a - * filehandle that was created by fh_compose, and that - * is about to be used in another nfsv4 compound - * operation. - */ - error = nfsd_setuser_and_check_port(rqstp, exp); - if (error) - goto out; } + dentry = fhp->fh_dentry; + exp = fhp->fh_export; + /* + * We still have to do all these permission checks, even when + * fh_dentry is already set: + * - fh_verify may be called multiple times with different + * "access" arguments (e.g. nfsd_proc_create calls + * fh_verify(...,NFSD_MAY_EXEC) first, then later (in + * nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE). + * - in the NFSv4 case, the filehandle may have been filled + * in by fh_compose, and given a dentry, but further + * compound operations performed with that filehandle + * still need permissions checks. In the worst case, a + * mountpoint crossing may have changed the export + * options, and we may now need to use a different uid + * (for example, if different id-squashing options are in + * effect on the new filesystem). + */ + error = check_pseudo_root(rqstp, dentry, exp); + if (error) + goto out; + + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); if (error) diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h new file mode 100644 index 000000000000..cdfb8c6a4206 --- /dev/null +++ b/fs/nfsd/nfsfh.h @@ -0,0 +1,208 @@ +/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ + +#ifndef _LINUX_NFSD_FH_INT_H +#define _LINUX_NFSD_FH_INT_H + +#include <linux/nfsd/nfsfh.h> + +enum nfsd_fsid { + FSID_DEV = 0, + FSID_NUM, + FSID_MAJOR_MINOR, + FSID_ENCODE_DEV, + FSID_UUID4_INUM, + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, +}; + +enum fsid_source { + FSIDSOURCE_DEV, + FSIDSOURCE_FSID, + FSIDSOURCE_UUID, +}; +extern enum fsid_source fsid_source(struct svc_fh *fhp); + + +/* This might look a little large to "inline" but in all calls except + * one, 'vers' is constant so moste of the function disappears. + */ +static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, + u32 fsid, unsigned char *uuid) +{ + u32 *up; + switch(vers) { + case FSID_DEV: + fsidv[0] = htonl((MAJOR(dev)<<16) | + MINOR(dev)); + fsidv[1] = ino_t_to_u32(ino); + break; + case FSID_NUM: + fsidv[0] = fsid; + break; + case FSID_MAJOR_MINOR: + fsidv[0] = htonl(MAJOR(dev)); + fsidv[1] = htonl(MINOR(dev)); + fsidv[2] = ino_t_to_u32(ino); + break; + + case FSID_ENCODE_DEV: + fsidv[0] = new_encode_dev(dev); + fsidv[1] = ino_t_to_u32(ino); + break; + + case FSID_UUID4_INUM: + /* 4 byte fsid and inode number */ + up = (u32*)uuid; + fsidv[0] = ino_t_to_u32(ino); + fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3]; + break; + + case FSID_UUID8: + /* 8 byte fsid */ + up = (u32*)uuid; + fsidv[0] = up[0] ^ up[2]; + fsidv[1] = up[1] ^ up[3]; + break; + + case FSID_UUID16: + /* 16 byte fsid - NFSv3+ only */ + memcpy(fsidv, uuid, 16); + break; + + case FSID_UUID16_INUM: + /* 8 byte inode and 16 byte fsid */ + *(u64*)fsidv = (u64)ino; + memcpy(fsidv+2, uuid, 16); + break; + default: BUG(); + } +} + +static inline int key_len(int type) +{ + switch(type) { + case FSID_DEV: return 8; + case FSID_NUM: return 4; + case FSID_MAJOR_MINOR: return 12; + case FSID_ENCODE_DEV: return 8; + case FSID_UUID4_INUM: return 8; + case FSID_UUID8: return 8; + case FSID_UUID16: return 16; + case FSID_UUID16_INUM: return 24; + default: return 0; + } +} + +/* + * Shorthand for dprintk()'s + */ +extern char * SVCFH_fmt(struct svc_fh *fhp); + +/* + * Function prototypes + */ +__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int); +__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); +__be32 fh_update(struct svc_fh *); +void fh_put(struct svc_fh *); + +static __inline__ struct svc_fh * +fh_copy(struct svc_fh *dst, struct svc_fh *src) +{ + WARN_ON(src->fh_dentry || src->fh_locked); + + *dst = *src; + return dst; +} + +static inline void +fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +{ + dst->fh_size = src->fh_size; + memcpy(&dst->fh_base, &src->fh_base, src->fh_size); +} + +static __inline__ struct svc_fh * +fh_init(struct svc_fh *fhp, int maxsize) +{ + memset(fhp, 0, sizeof(*fhp)); + fhp->fh_maxsize = maxsize; + return fhp; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Fill in the pre_op attr for the wcc data + */ +static inline void +fill_pre_wcc(struct svc_fh *fhp) +{ + struct inode *inode; + + inode = fhp->fh_dentry->d_inode; + if (!fhp->fh_pre_saved) { + fhp->fh_pre_mtime = inode->i_mtime; + fhp->fh_pre_ctime = inode->i_ctime; + fhp->fh_pre_size = inode->i_size; + fhp->fh_pre_change = inode->i_version; + fhp->fh_pre_saved = 1; + } +} + +extern void fill_post_wcc(struct svc_fh *); +#else +#define fill_pre_wcc(ignored) +#define fill_post_wcc(notused) +#endif /* CONFIG_NFSD_V3 */ + + +/* + * Lock a file handle/inode + * NOTE: both fh_lock and fh_unlock are done "by hand" in + * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once + * so, any changes here should be reflected there. + */ + +static inline void +fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) +{ + struct dentry *dentry = fhp->fh_dentry; + struct inode *inode; + + BUG_ON(!dentry); + + if (fhp->fh_locked) { + printk(KERN_WARNING "fh_lock: %s/%s already locked!\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + return; + } + + inode = dentry->d_inode; + mutex_lock_nested(&inode->i_mutex, subclass); + fill_pre_wcc(fhp); + fhp->fh_locked = 1; +} + +static inline void +fh_lock(struct svc_fh *fhp) +{ + fh_lock_nested(fhp, I_MUTEX_NORMAL); +} + +/* + * Unlock a file handle/inode + */ +static inline void +fh_unlock(struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_dentry); + + if (fhp->fh_locked) { + fill_post_wcc(fhp); + mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); + fhp->fh_locked = 0; + } +} + +#endif /* _LINUX_NFSD_FH_INT_H */ diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0eb9c820b7a6..a047ad6111ef 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -1,29 +1,14 @@ /* - * nfsproc2.c Process version 2 NFS requests. - * linux/fs/nfsd/nfs2proc.c - * * Process version 2 NFS requests. * * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/linkage.h> -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/net.h> -#include <linux/in.h> #include <linux/namei.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr.h> +#include "cache.h" +#include "xdr.h" +#include "vfs.h" typedef struct svc_rqst svc_rqst; typedef struct svc_buf svc_buf; @@ -758,6 +743,7 @@ nfserrno (int errno) { nfserr_io, -ETXTBSY }, { nfserr_notsupp, -EOPNOTSUPP }, { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, }; int i; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 67ea83eedd43..171699eb07c8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfssvc.c - * * Central processing for nfsd. * * Authors: Olaf Kirch (okir@monad.swb.de) @@ -8,33 +6,19 @@ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/module.h> #include <linux/sched.h> -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/nfs.h> -#include <linux/in.h> -#include <linux/uio.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/smp.h> #include <linux/freezer.h> #include <linux/fs_struct.h> -#include <linux/kthread.h> #include <linux/swap.h> -#include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> -#include <linux/sunrpc/cache.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/stats.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/syscall.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/seq_file.h> +#include "nfsd.h" +#include "cache.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_SVC diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index afd08e2c90a5..4ce005dbf3e6 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -1,20 +1,10 @@ /* - * linux/fs/nfsd/nfsxdr.c - * * XDR support for nfsd * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/types.h> -#include <linux/time.h> -#include <linux/nfs.h> -#include <linux/vfs.h> -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/xdr.h> -#include <linux/mm.h> +#include "xdr.h" #include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h new file mode 100644 index 000000000000..fefeae27f25e --- /dev/null +++ b/fs/nfsd/state.h @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith <kmsmith@umich.edu> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _NFSD4_STATE_H +#define _NFSD4_STATE_H + +#include <linux/nfsd/nfsfh.h> +#include "nfsfh.h" + +typedef struct { + u32 cl_boot; + u32 cl_id; +} clientid_t; + +typedef struct { + u32 so_boot; + u32 so_stateownerid; + u32 so_fileid; +} stateid_opaque_t; + +typedef struct { + u32 si_generation; + stateid_opaque_t si_opaque; +} stateid_t; +#define si_boot si_opaque.so_boot +#define si_stateownerid si_opaque.so_stateownerid +#define si_fileid si_opaque.so_fileid + +#define STATEID_FMT "(%08x/%08x/%08x/%08x)" +#define STATEID_VAL(s) \ + (s)->si_boot, \ + (s)->si_stateownerid, \ + (s)->si_fileid, \ + (s)->si_generation + +struct nfsd4_cb_sequence { + /* args/res */ + u32 cbs_minorversion; + struct nfs4_client *cbs_clp; +}; + +struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; + struct list_head dl_recall_lru; /* delegation recalled */ + atomic_t dl_count; /* ref count */ + struct nfs4_client *dl_client; + struct nfs4_file *dl_file; + struct file_lock *dl_flock; + struct file *dl_vfs_file; + u32 dl_type; + time_t dl_time; +/* For recall: */ + u32 dl_ident; + stateid_t dl_stateid; + struct knfsd_fh dl_fh; + int dl_retries; +}; + +/* client delegation callback info */ +struct nfs4_cb_conn { + /* SETCLIENTID info */ + struct sockaddr_storage cb_addr; + size_t cb_addrlen; + u32 cb_prog; + u32 cb_minorversion; + u32 cb_ident; /* minorversion 0 only */ + /* RPC client info */ + atomic_t cb_set; /* successful CB_NULL call */ + struct rpc_clnt * cb_client; +}; + +/* Maximum number of slots per session. 160 is useful for long haul TCP */ +#define NFSD_MAX_SLOTS_PER_SESSION 160 +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 16 +/* Maximum session per slot cache size */ +#define NFSD_SLOT_CACHE_SIZE 1024 +/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ +#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 +#define NFSD_MAX_MEM_PER_SESSION \ + (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) + +struct nfsd4_slot { + bool sl_inuse; + bool sl_cachethis; + u16 sl_opcnt; + u32 sl_seqid; + __be32 sl_status; + u32 sl_datalen; + char sl_data[]; +}; + +struct nfsd4_channel_attrs { + u32 headerpadsz; + u32 maxreq_sz; + u32 maxresp_sz; + u32 maxresp_cached; + u32 maxops; + u32 maxreqs; + u32 nr_rdma_attrs; + u32 rdma_attrs; +}; + +struct nfsd4_create_session { + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + u32 uid; + u32 gid; +}; + +/* The single slot clientid cache structure */ +struct nfsd4_clid_slot { + u32 sl_seqid; + __be32 sl_status; + struct nfsd4_create_session sl_cr_ses; +}; + +struct nfsd4_session { + struct kref se_ref; + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; + u32 se_flags; + struct nfs4_client *se_client; /* for expire_client */ + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; + struct nfsd4_slot *se_slots[]; /* forward channel slots */ +}; + +static inline void +nfsd4_put_session(struct nfsd4_session *ses) +{ + extern void free_session(struct kref *kref); + kref_put(&ses->se_ref, free_session); +} + +static inline void +nfsd4_get_session(struct nfsd4_session *ses) +{ + kref_get(&ses->se_ref); +} + +/* formatted contents of nfs4_sessionid */ +struct nfsd4_sessionid { + clientid_t clientid; + u32 sequence; + u32 reserved; +}; + +#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ + +/* + * struct nfs4_client - one per client. Clientids live here. + * o Each nfs4_client is hashed by clientid. + * + * o Each nfs4_clients is also hashed by name + * (the opaque quantity initially sent by the client to identify itself). + * + * o cl_perclient list is used to ensure no dangling stateowner references + * when we expire the nfs4_client + */ +struct nfs4_client { + struct list_head cl_idhash; /* hash by cl_clientid.id */ + struct list_head cl_strhash; /* hash by cl_name */ + struct list_head cl_openowners; + struct list_head cl_delegations; + struct list_head cl_lru; /* tail queue */ + struct xdr_netobj cl_name; /* id generated by client */ + char cl_recdir[HEXDIR_LEN]; /* recovery dir */ + nfs4_verifier cl_verifier; /* generated by client */ + time_t cl_time; /* time of last lease renewal */ + struct sockaddr_storage cl_addr; /* client ipaddress */ + u32 cl_flavor; /* setclientid pseudoflavor */ + char *cl_principal; /* setclientid principal name */ + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ + struct nfs4_cb_conn cl_cb_conn; /* callback info */ + atomic_t cl_count; /* ref count */ + u32 cl_firststate; /* recovery dir creation */ + + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + struct nfs4_sessionid cl_sessionid; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + u32 cl_cb_seq_nr; + struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */ + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ +}; + +/* struct nfs4_client_reset + * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state + * from non-volitile storage) upon reboot. + */ +struct nfs4_client_reclaim { + struct list_head cr_strhash; /* hash by cr_name */ + char cr_recdir[HEXDIR_LEN]; /* recover dir */ +}; + +static inline void +update_stateid(stateid_t *stateid) +{ + stateid->si_generation++; +} + +/* A reasonable value for REPLAY_ISIZE was estimated as follows: + * The OPEN response, typically the largest, requires + * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + + * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) + + * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes + */ + +#define NFSD4_REPLAY_ISIZE 112 + +/* + * Replay buffer, where the result of the last seqid-mutating operation + * is cached. + */ +struct nfs4_replay { + __be32 rp_status; + unsigned int rp_buflen; + char *rp_buf; + unsigned intrp_allocated; + struct knfsd_fh rp_openfh; + char rp_ibuf[NFSD4_REPLAY_ISIZE]; +}; + +/* +* nfs4_stateowner can either be an open_owner, or a lock_owner +* +* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] +* for lock_owner +* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] +* for lock_owner +* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client +* struct is reaped. +* so_perfilestate: heads the list of nfs4_stateid (either open or lock) +* and is used to ensure no dangling nfs4_stateid references when we +* release a stateowner. +* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when +* close is called to reap associated byte-range locks +* so_close_lru: (open) stateowner is placed on this list instead of being +* reaped (when so_perfilestate is empty) to hold the last close replay. +* reaped by laundramat thread after lease period. +*/ +struct nfs4_stateowner { + struct kref so_ref; + struct list_head so_idhash; /* hash by so_id */ + struct list_head so_strhash; /* hash by op_name */ + struct list_head so_perclient; + struct list_head so_stateids; + struct list_head so_perstateid; /* for lockowners only */ + struct list_head so_close_lru; /* tail queue */ + time_t so_time; /* time of placement on so_close_lru */ + int so_is_open_owner; /* 1=openowner,0=lockowner */ + u32 so_id; + struct nfs4_client * so_client; + /* after increment in ENCODE_SEQID_OP_TAIL, represents the next + * sequence id expected from the client: */ + u32 so_seqid; + struct xdr_netobj so_owner; /* open owner name */ + int so_confirmed; /* successful OPEN_CONFIRM? */ + struct nfs4_replay so_replay; +}; + +/* +* nfs4_file: a file opened by some number of (open) nfs4_stateowners. +* o fi_perfile list is used to search for conflicting +* share_acces, share_deny on the file. +*/ +struct nfs4_file { + atomic_t fi_ref; + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; +}; + +/* +* nfs4_stateid can either be an open stateid or (eventually) a lock stateid +* +* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file +* +* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry +* st_perfile: file_hashtbl[] entry. +* st_perfile_state: nfs4_stateowner->so_perfilestate +* st_perlockowner: (open stateid) list of lock nfs4_stateowners +* st_access_bmap: used only for open stateid +* st_deny_bmap: used only for open stateid +* st_openstp: open stateid lock stateid was derived from +* +* XXX: open stateids and lock stateids have diverged sufficiently that +* we should consider defining separate structs for the two cases. +*/ + +struct nfs4_stateid { + struct list_head st_hash; + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; + struct file * st_vfs_file; + unsigned long st_access_bmap; + unsigned long st_deny_bmap; + struct nfs4_stateid * st_openstp; +}; + +/* flags for preprocess_seqid_op() */ +#define HAS_SESSION 0x00000001 +#define CONFIRM 0x00000002 +#define OPEN_STATE 0x00000004 +#define LOCK_STATE 0x00000008 +#define RD_STATE 0x00000010 +#define WR_STATE 0x00000020 +#define CLOSE_STATE 0x00000040 + +#define seqid_mutating_err(err) \ + (((err) != nfserr_stale_clientid) && \ + ((err) != nfserr_bad_seqid) && \ + ((err) != nfserr_stale_stateid) && \ + ((err) != nfserr_bad_stateid)) + +struct nfsd4_compound_state; + +extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filp); +extern void nfs4_lock_state(void); +extern void nfs4_unlock_state(void); +extern int nfs4_in_grace(void); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid); +extern void put_nfs4_client(struct nfs4_client *clp); +extern void nfs4_free_stateowner(struct kref *kref); +extern int set_callback_cred(void); +extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_cb_recall(struct nfs4_delegation *dp); +extern void nfs4_put_delegation(struct nfs4_delegation *dp); +extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); +extern void nfsd4_init_recdir(char *recdir_name); +extern int nfsd4_recdir_load(void); +extern void nfsd4_shutdown_recdir(void); +extern int nfs4_client_to_reclaim(const char *name); +extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); +extern void nfsd4_recdir_purge_old(void); +extern int nfsd4_create_clid_dir(struct nfs4_client *clp); +extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + +static inline void +nfs4_put_stateowner(struct nfs4_stateowner *so) +{ + kref_put(&so->so_ref, nfs4_free_stateowner); +} + +static inline void +nfs4_get_stateowner(struct nfs4_stateowner *so) +{ + kref_get(&so->so_ref); +} + +#endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 71944cddf680..5232d3e8fb2f 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/stats.c - * * procfs-based user access to knfsd statistics * * /proc/net/rpc/nfsd @@ -23,18 +21,13 @@ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/stat.h> #include <linux/module.h> - -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/stats.h> -#include <linux/nfsd/nfsd.h> #include <linux/nfsd/stats.h> +#include "nfsd.h" + struct nfsd_stats nfsdstats; struct svc_stat nfsd_svcstats = { .program = &nfsd_program, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a293f0273263..c194793b642b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1,7 +1,5 @@ #define MSNFS /* HACK HACK */ /* - * linux/fs/nfsd/vfs.c - * * File operations used by nfsd. Some of these have been ripped from * other parts of the kernel because they weren't exported, others * are partial duplicates with added or changed functionality. @@ -16,48 +14,31 @@ * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> */ -#include <linux/string.h> -#include <linux/time.h> -#include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> -#include <linux/mount.h> -#include <linux/major.h> #include <linux/splice.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> #include <linux/fcntl.h> -#include <linux/net.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/in.h> -#include <linux/module.h> #include <linux/namei.h> -#include <linux/vfs.h> #include <linux/delay.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#ifdef CONFIG_NFSD_V3 -#include <linux/nfs3.h> -#include <linux/nfsd/xdr3.h> -#endif /* CONFIG_NFSD_V3 */ -#include <linux/nfsd/nfsfh.h> #include <linux/quotaops.h> #include <linux/fsnotify.h> -#include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> +#include <linux/jhash.h> +#include <linux/ima.h> +#include <asm/uaccess.h> + +#ifdef CONFIG_NFSD_V3 +#include "xdr3.h" +#endif /* CONFIG_NFSD_V3 */ + #ifdef CONFIG_NFSD_V4 -#include <linux/nfs4.h> #include <linux/nfs4_acl.h> #include <linux/nfsd_idmap.h> -#include <linux/security.h> #endif /* CONFIG_NFSD_V4 */ -#include <linux/jhash.h> -#include <linux/ima.h> -#include <asm/uaccess.h> +#include "nfsd.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP @@ -89,12 +70,6 @@ struct raparm_hbucket { #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; -static inline int -nfsd_v4client(struct svc_rqst *rq) -{ - return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; -} - /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. @@ -116,8 +91,16 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { - if (PTR_ERR(exp2) != -ENOENT) - err = PTR_ERR(exp2); + err = PTR_ERR(exp2); + /* + * We normally allow NFS clients to continue + * "underneath" a mountpoint that is not exported. + * The exception is V4ROOT, where no traversal is ever + * allowed without an explicit export of the new + * directory. + */ + if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) + err = 0; path_put(&path); goto out; } @@ -141,6 +124,53 @@ out: return err; } +static void follow_to_parent(struct path *path) +{ + struct dentry *dp; + + while (path->dentry == path->mnt->mnt_root && follow_up(path)) + ; + dp = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = dp; +} + +static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp) +{ + struct svc_export *exp2; + struct path path = {.mnt = mntget((*exp)->ex_path.mnt), + .dentry = dget(dparent)}; + + follow_to_parent(&path); + + exp2 = rqst_exp_parent(rqstp, &path); + if (PTR_ERR(exp2) == -ENOENT) { + *dentryp = dget(dparent); + } else if (IS_ERR(exp2)) { + path_put(&path); + return PTR_ERR(exp2); + } else { + *dentryp = dget(path.dentry); + exp_put(*exp); + *exp = exp2; + } + path_put(&path); + return 0; +} + +/* + * For nfsd purposes, we treat V4ROOT exports as though there was an + * export at *every* directory. + */ +int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) +{ + if (d_mountpoint(dentry)) + return 1; + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return 0; + return dentry->d_inode != NULL; +} + __be32 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, unsigned int len, @@ -169,35 +199,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = dget(dparent); else if (dparent != exp->ex_path.dentry) dentry = dget_parent(dparent); - else if (!EX_NOHIDE(exp)) + else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp)) dentry = dget(dparent); /* .. == . just like at / */ else { /* checking mountpoint crossing is very different when stepping up */ - struct svc_export *exp2 = NULL; - struct dentry *dp; - struct path path = {.mnt = mntget(exp->ex_path.mnt), - .dentry = dget(dparent)}; - - while (path.dentry == path.mnt->mnt_root && - follow_up(&path)) - ; - dp = dget_parent(path.dentry); - dput(path.dentry); - path.dentry = dp; - - exp2 = rqst_exp_parent(rqstp, &path); - if (PTR_ERR(exp2) == -ENOENT) { - dentry = dget(dparent); - } else if (IS_ERR(exp2)) { - host_err = PTR_ERR(exp2); - path_put(&path); + host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry); + if (host_err) goto out_nfserr; - } else { - dentry = dget(path.dentry); - exp_put(exp); - exp = exp2; - } - path_put(&path); } } else { fh_lock(fhp); @@ -208,7 +216,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, /* * check if we have crossed a mount point ... */ - if (d_mountpoint(dentry)) { + if (nfsd_mountpoint(dentry, exp)) { if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { dput(dentry); goto out_nfserr; @@ -744,8 +752,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags, current_cred()); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); - else - ima_counts_get(*filp); out_nfserr: err = nfserrno(host_err); out: @@ -774,12 +780,9 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp, int (*fsync) (struct file *, struct dentry *, int); int err; - err = filemap_fdatawrite(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); if (err == 0 && fop && (fsync = fop->fsync)) err = fsync(filp, dp, 0); - if (err == 0) - err = filemap_fdatawait(inode->i_mapping); - return err; } @@ -2124,8 +2127,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, */ path.mnt = exp->ex_path.mnt; path.dentry = dentry; - err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC), - IMA_COUNT_LEAVE); + err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC)); nfsd_out: return err? nfserrno(err) : 0; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h new file mode 100644 index 000000000000..4b1de0a9ea75 --- /dev/null +++ b/fs/nfsd/vfs.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef LINUX_NFSD_VFS_H +#define LINUX_NFSD_VFS_H + +#include "nfsfh.h" + +/* + * Flags for nfsd_permission + */ +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ +#define NFSD_MAY_READ 4 /* == MAY_READ */ +#define NFSD_MAY_SATTR 8 +#define NFSD_MAY_TRUNC 16 +#define NFSD_MAY_LOCK 32 +#define NFSD_MAY_OWNER_OVERRIDE 64 +#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 + +#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) +#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) + +/* + * Callback function for readdir + */ +typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); + +/* nfsd/vfs.c */ +int fh_lock_parent(struct svc_fh *, struct dentry *); +int nfsd_racache_init(int); +void nfsd_racache_shutdown(void); +int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, + struct svc_export **expp); +__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, struct svc_fh *); +__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, + struct svc_export **, struct dentry **); +__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time_t); +int nfsd_mountpoint(struct dentry *, struct svc_export *); +#ifdef CONFIG_NFSD_V4 +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, + struct nfs4_acl *); +int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); +#endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +#ifdef CONFIG_NFSD_V3 +__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); +__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + struct svc_fh *res, int createmode, + u32 *verifier, int *truncp, int *created); +__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, + loff_t, unsigned long); +#endif /* CONFIG_NFSD_V3 */ +__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, + int, struct file **); +void nfsd_close(struct file *); +__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, + loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); +__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, + char *name, int len, char *path, int plen, + struct svc_fh *res, struct iattr *); +__be32 nfsd_link(struct svc_rqst *, struct svc_fh *, + char *, int, struct svc_fh *); +__be32 nfsd_rename(struct svc_rqst *, + struct svc_fh *, char *, int, + struct svc_fh *, char *, int); +__be32 nfsd_remove(struct svc_rqst *, + struct svc_fh *, char *, int); +__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, + char *name, int len); +int nfsd_truncate(struct svc_rqst *, struct svc_fh *, + unsigned long size); +__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, + loff_t *, struct readdir_cd *, filldir_t); +__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, + struct kstatfs *, int access); + +int nfsd_notify_change(struct inode *, struct iattr *); +__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, + struct dentry *, int); +int nfsd_sync_dir(struct dentry *dp); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); +int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); +#endif + +#endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h new file mode 100644 index 000000000000..53b1863dd8f6 --- /dev/null +++ b/fs/nfsd/xdr.h @@ -0,0 +1,173 @@ +/* XDR types for nfsd. This is mainly a typing exercise. */ + +#ifndef LINUX_NFSD_H +#define LINUX_NFSD_H + +#include <linux/vfs.h> +#include "nfsd.h" +#include "nfsfh.h" + +struct nfsd_fhandle { + struct svc_fh fh; +}; + +struct nfsd_sattrargs { + struct svc_fh fh; + struct iattr attrs; +}; + +struct nfsd_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd_readargs { + struct svc_fh fh; + __u32 offset; + __u32 count; + int vlen; +}; + +struct nfsd_writeargs { + svc_fh fh; + __u32 offset; + int len; + int vlen; +}; + +struct nfsd_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + struct iattr attrs; +}; + +struct nfsd_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd_readdirargs { + struct svc_fh fh; + __u32 cookie; + __u32 count; + __be32 * buffer; +}; + +struct nfsd_attrstat { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_diropres { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_readlinkres { + int len; +}; + +struct nfsd_readres { + struct svc_fh fh; + unsigned long count; + struct kstat stat; +}; + +struct nfsd_readdirres { + int count; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd_statfsres { + struct kstatfs stats; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd_xdrstore { + struct nfsd_sattrargs sattr; + struct nfsd_diropargs dirop; + struct nfsd_readargs read; + struct nfsd_writeargs write; + struct nfsd_createargs create; + struct nfsd_renameargs rename; + struct nfsd_linkargs link; + struct nfsd_symlinkargs symlink; + struct nfsd_readdirargs readdir; +}; + +#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) + + +int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd_sattrargs *); +int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd_diropargs *); +int nfssvc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd_readargs *); +int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd_writeargs *); +int nfssvc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd_createargs *); +int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd_renameargs *); +int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_readlinkargs *); +int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd_linkargs *); +int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_symlinkargs *); +int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd_readdirargs *); +int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *); +int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *); +int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *); +int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *); +int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *); +int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *); + +int nfssvc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int); + +int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); + +/* Helper functions for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); + +#endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h new file mode 100644 index 000000000000..7df980eb0562 --- /dev/null +++ b/fs/nfsd/xdr3.h @@ -0,0 +1,344 @@ +/* + * XDR types for NFSv3 in nfsd. + * + * Copyright (C) 1996-1998, Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef _LINUX_NFSD_XDR3_H +#define _LINUX_NFSD_XDR3_H + +#include "xdr.h" + +struct nfsd3_sattrargs { + struct svc_fh fh; + struct iattr attrs; + int check_guard; + time_t guardtime; +}; + +struct nfsd3_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd3_accessargs { + struct svc_fh fh; + unsigned int access; +}; + +struct nfsd3_readargs { + struct svc_fh fh; + __u64 offset; + __u32 count; + int vlen; +}; + +struct nfsd3_writeargs { + svc_fh fh; + __u64 offset; + __u32 count; + int stable; + __u32 len; + int vlen; +}; + +struct nfsd3_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + int createmode; + struct iattr attrs; + __be32 * verf; +}; + +struct nfsd3_mknodargs { + struct svc_fh fh; + char * name; + unsigned int len; + __u32 ftype; + __u32 major, minor; + struct iattr attrs; +}; + +struct nfsd3_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd3_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd3_readdirargs { + struct svc_fh fh; + __u64 cookie; + __u32 dircount; + __u32 count; + __be32 * verf; + __be32 * buffer; +}; + +struct nfsd3_commitargs { + struct svc_fh fh; + __u64 offset; + __u32 count; +}; + +struct nfsd3_getaclargs { + struct svc_fh fh; + int mask; +}; + +struct posix_acl; +struct nfsd3_setaclargs { + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +struct nfsd3_attrstat { + __be32 status; + struct svc_fh fh; + struct kstat stat; +}; + +/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ +struct nfsd3_diropres { + __be32 status; + struct svc_fh dirfh; + struct svc_fh fh; +}; + +struct nfsd3_accessres { + __be32 status; + struct svc_fh fh; + __u32 access; +}; + +struct nfsd3_readlinkres { + __be32 status; + struct svc_fh fh; + __u32 len; +}; + +struct nfsd3_readres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int eof; +}; + +struct nfsd3_writeres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int committed; +}; + +struct nfsd3_renameres { + __be32 status; + struct svc_fh ffh; + struct svc_fh tfh; +}; + +struct nfsd3_linkres { + __be32 status; + struct svc_fh tfh; + struct svc_fh fh; +}; + +struct nfsd3_readdirres { + __be32 status; + struct svc_fh fh; + int count; + __be32 verf[2]; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; + __be32 * offset1; + struct svc_rqst * rqstp; + +}; + +struct nfsd3_fsstatres { + __be32 status; + struct kstatfs stats; + __u32 invarsec; +}; + +struct nfsd3_fsinfores { + __be32 status; + __u32 f_rtmax; + __u32 f_rtpref; + __u32 f_rtmult; + __u32 f_wtmax; + __u32 f_wtpref; + __u32 f_wtmult; + __u32 f_dtpref; + __u64 f_maxfilesize; + __u32 f_properties; +}; + +struct nfsd3_pathconfres { + __be32 status; + __u32 p_link_max; + __u32 p_name_max; + __u32 p_no_trunc; + __u32 p_chown_restricted; + __u32 p_case_insensitive; + __u32 p_case_preserving; +}; + +struct nfsd3_commitres { + __be32 status; + struct svc_fh fh; +}; + +struct nfsd3_getaclres { + __be32 status; + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +/* dummy type for release */ +struct nfsd3_fhandle_pair { + __u32 dummy; + struct svc_fh fh1; + struct svc_fh fh2; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd3_xdrstore { + struct nfsd3_sattrargs sattrargs; + struct nfsd3_diropargs diropargs; + struct nfsd3_readargs readargs; + struct nfsd3_writeargs writeargs; + struct nfsd3_createargs createargs; + struct nfsd3_renameargs renameargs; + struct nfsd3_linkargs linkargs; + struct nfsd3_symlinkargs symlinkargs; + struct nfsd3_readdirargs readdirargs; + struct nfsd3_diropres diropres; + struct nfsd3_accessres accessres; + struct nfsd3_readlinkres readlinkres; + struct nfsd3_readres readres; + struct nfsd3_writeres writeres; + struct nfsd3_renameres renameres; + struct nfsd3_linkres linkres; + struct nfsd3_readdirres readdirres; + struct nfsd3_fsstatres fsstatres; + struct nfsd3_fsinfores fsinfores; + struct nfsd3_pathconfres pathconfres; + struct nfsd3_commitres commitres; + struct nfsd3_getaclres getaclres; +}; + +#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) + +int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd3_sattrargs *); +int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd3_diropargs *); +int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *, + struct nfsd3_accessargs *); +int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd3_readargs *); +int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd3_writeargs *); +int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *, + struct nfsd3_mknodargs *); +int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd3_renameargs *); +int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkargs *); +int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd3_linkargs *); +int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_symlinkargs *); +int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *, + struct nfsd3_commitargs *); +int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *, + struct nfsd3_accessres *); +int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkres *); +int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *); +int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *); +int nfs3svc_encode_createres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *, + struct nfsd3_renameres *); +int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *, + struct nfsd3_linkres *); +int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *, + struct nfsd3_readdirres *); +int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *, + struct nfsd3_fsstatres *); +int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *, + struct nfsd3_fsinfores *); +int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *, + struct nfsd3_pathconfres *); +int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *, + struct nfsd3_commitres *); + +int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *, + struct nfsd3_fhandle_pair *); +int nfs3svc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +int nfs3svc_encode_entry_plus(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +/* Helper functions for NFSv3 ACL code */ +__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, + struct svc_fh *fhp); +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); + + +#endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h new file mode 100644 index 000000000000..efa337739534 --- /dev/null +++ b/fs/nfsd/xdr4.h @@ -0,0 +1,562 @@ +/* + * Server-side types for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith <kmsmith@umich.edu> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _LINUX_NFSD_XDR4_H +#define _LINUX_NFSD_XDR4_H + +#include "state.h" +#include "nfsd.h" + +#define NFSD4_MAX_TAGLEN 128 +#define XDR_LEN(n) (((n) + 3) & ~3) + +struct nfsd4_compound_state { + struct svc_fh current_fh; + struct svc_fh save_fh; + struct nfs4_stateowner *replay_owner; + /* For sessions DRC */ + struct nfsd4_session *session; + struct nfsd4_slot *slot; + __be32 *datap; + size_t iovlen; + u32 minorversion; + u32 status; +}; + +static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) +{ + return cs->slot != NULL; +} + +struct nfsd4_change_info { + u32 atomic; + bool change_supported; + u32 before_ctime_sec; + u32 before_ctime_nsec; + u64 before_change; + u32 after_ctime_sec; + u32 after_ctime_nsec; + u64 after_change; +}; + +struct nfsd4_access { + u32 ac_req_access; /* request */ + u32 ac_supported; /* response */ + u32 ac_resp_access; /* response */ +}; + +struct nfsd4_close { + u32 cl_seqid; /* request */ + stateid_t cl_stateid; /* request+response */ + struct nfs4_stateowner * cl_stateowner; /* response */ +}; + +struct nfsd4_commit { + u64 co_offset; /* request */ + u32 co_count; /* request */ + nfs4_verifier co_verf; /* response */ +}; + +struct nfsd4_create { + u32 cr_namelen; /* request */ + char * cr_name; /* request */ + u32 cr_type; /* request */ + union { /* request */ + struct { + u32 namelen; + char *name; + } link; /* NF4LNK */ + struct { + u32 specdata1; + u32 specdata2; + } dev; /* NF4BLK, NF4CHR */ + } u; + u32 cr_bmval[3]; /* request */ + struct iattr cr_iattr; /* request */ + struct nfsd4_change_info cr_cinfo; /* response */ + struct nfs4_acl *cr_acl; +}; +#define cr_linklen u.link.namelen +#define cr_linkname u.link.name +#define cr_specdata1 u.dev.specdata1 +#define cr_specdata2 u.dev.specdata2 + +struct nfsd4_delegreturn { + stateid_t dr_stateid; +}; + +struct nfsd4_getattr { + u32 ga_bmval[3]; /* request */ + struct svc_fh *ga_fhp; /* response */ +}; + +struct nfsd4_link { + u32 li_namelen; /* request */ + char * li_name; /* request */ + struct nfsd4_change_info li_cinfo; /* response */ +}; + +struct nfsd4_lock_denied { + clientid_t ld_clientid; + struct nfs4_stateowner *ld_sop; + u64 ld_start; + u64 ld_length; + u32 ld_type; +}; + +struct nfsd4_lock { + /* request */ + u32 lk_type; + u32 lk_reclaim; /* boolean */ + u64 lk_offset; + u64 lk_length; + u32 lk_is_new; + union { + struct { + u32 open_seqid; + stateid_t open_stateid; + u32 lock_seqid; + clientid_t clientid; + struct xdr_netobj owner; + } new; + struct { + stateid_t lock_stateid; + u32 lock_seqid; + } old; + } v; + + /* response */ + union { + struct { + stateid_t stateid; + } ok; + struct nfsd4_lock_denied denied; + } u; + /* The lk_replay_owner is the open owner in the open_to_lock_owner + * case and the lock owner otherwise: */ + struct nfs4_stateowner *lk_replay_owner; +}; +#define lk_new_open_seqid v.new.open_seqid +#define lk_new_open_stateid v.new.open_stateid +#define lk_new_lock_seqid v.new.lock_seqid +#define lk_new_clientid v.new.clientid +#define lk_new_owner v.new.owner +#define lk_old_lock_stateid v.old.lock_stateid +#define lk_old_lock_seqid v.old.lock_seqid + +#define lk_rflags u.ok.rflags +#define lk_resp_stateid u.ok.stateid +#define lk_denied u.denied + + +struct nfsd4_lockt { + u32 lt_type; + clientid_t lt_clientid; + struct xdr_netobj lt_owner; + u64 lt_offset; + u64 lt_length; + struct nfs4_stateowner * lt_stateowner; + struct nfsd4_lock_denied lt_denied; +}; + + +struct nfsd4_locku { + u32 lu_type; + u32 lu_seqid; + stateid_t lu_stateid; + u64 lu_offset; + u64 lu_length; + struct nfs4_stateowner *lu_stateowner; +}; + + +struct nfsd4_lookup { + u32 lo_len; /* request */ + char * lo_name; /* request */ +}; + +struct nfsd4_putfh { + u32 pf_fhlen; /* request */ + char *pf_fhval; /* request */ +}; + +struct nfsd4_open { + u32 op_claim_type; /* request */ + struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_delegate_type; /* request - CLAIM_PREV only */ + stateid_t op_delegate_stateid; /* request - response */ + u32 op_create; /* request */ + u32 op_createmode; /* request */ + u32 op_bmval[3]; /* request */ + struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + nfs4_verifier verf; /* EXCLUSIVE4 */ + clientid_t op_clientid; /* request */ + struct xdr_netobj op_owner; /* request */ + u32 op_seqid; /* request */ + u32 op_share_access; /* request */ + u32 op_share_deny; /* request */ + stateid_t op_stateid; /* response */ + u32 op_recall; /* recall */ + struct nfsd4_change_info op_cinfo; /* response */ + u32 op_rflags; /* response */ + int op_truncate; /* used during processing */ + struct nfs4_stateowner *op_stateowner; /* used during processing */ + struct nfs4_acl *op_acl; +}; +#define op_iattr iattr +#define op_verf verf + +struct nfsd4_open_confirm { + stateid_t oc_req_stateid /* request */; + u32 oc_seqid /* request */; + stateid_t oc_resp_stateid /* response */; + struct nfs4_stateowner * oc_stateowner; /* response */ +}; + +struct nfsd4_open_downgrade { + stateid_t od_stateid; + u32 od_seqid; + u32 od_share_access; + u32 od_share_deny; + struct nfs4_stateowner *od_stateowner; +}; + + +struct nfsd4_read { + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct file *rd_filp; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ +}; + +struct nfsd4_readdir { + u64 rd_cookie; /* request */ + nfs4_verifier rd_verf; /* request */ + u32 rd_dircount; /* request */ + u32 rd_maxcount; /* request */ + u32 rd_bmval[3]; /* request */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd4_release_lockowner { + clientid_t rl_clientid; + struct xdr_netobj rl_owner; +}; +struct nfsd4_readlink { + struct svc_rqst *rl_rqstp; /* request */ + struct svc_fh * rl_fhp; /* request */ +}; + +struct nfsd4_remove { + u32 rm_namelen; /* request */ + char * rm_name; /* request */ + struct nfsd4_change_info rm_cinfo; /* response */ +}; + +struct nfsd4_rename { + u32 rn_snamelen; /* request */ + char * rn_sname; /* request */ + u32 rn_tnamelen; /* request */ + char * rn_tname; /* request */ + struct nfsd4_change_info rn_sinfo; /* response */ + struct nfsd4_change_info rn_tinfo; /* response */ +}; + +struct nfsd4_secinfo { + u32 si_namelen; /* request */ + char *si_name; /* request */ + struct svc_export *si_exp; /* response */ +}; + +struct nfsd4_setattr { + stateid_t sa_stateid; /* request */ + u32 sa_bmval[3]; /* request */ + struct iattr sa_iattr; /* request */ + struct nfs4_acl *sa_acl; +}; + +struct nfsd4_setclientid { + nfs4_verifier se_verf; /* request */ + u32 se_namelen; /* request */ + char * se_name; /* request */ + u32 se_callback_prog; /* request */ + u32 se_callback_netid_len; /* request */ + char * se_callback_netid_val; /* request */ + u32 se_callback_addr_len; /* request */ + char * se_callback_addr_val; /* request */ + u32 se_callback_ident; /* request */ + clientid_t se_clientid; /* response */ + nfs4_verifier se_confirm; /* response */ +}; + +struct nfsd4_setclientid_confirm { + clientid_t sc_clientid; + nfs4_verifier sc_confirm; +}; + +/* also used for NVERIFY */ +struct nfsd4_verify { + u32 ve_bmval[3]; /* request */ + u32 ve_attrlen; /* request */ + char * ve_attrval; /* request */ +}; + +struct nfsd4_write { + stateid_t wr_stateid; /* request */ + u64 wr_offset; /* request */ + u32 wr_stable_how; /* request */ + u32 wr_buflen; /* request */ + int wr_vlen; + + u32 wr_bytes_written; /* response */ + u32 wr_how_written; /* response */ + nfs4_verifier wr_verifier; /* response */ +}; + +struct nfsd4_exchange_id { + nfs4_verifier verifier; + struct xdr_netobj clname; + u32 flags; + clientid_t clientid; + u32 seqid; + int spa_how; +}; + +struct nfsd4_sequence { + struct nfs4_sessionid sessionid; /* request/response */ + u32 seqid; /* request/response */ + u32 slotid; /* request/response */ + u32 maxslots; /* request/response */ + u32 cachethis; /* request */ +#if 0 + u32 target_maxslots; /* response */ + u32 status_flags; /* response */ +#endif /* not yet */ +}; + +struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; +}; + +struct nfsd4_op { + int opnum; + __be32 status; + union { + struct nfsd4_access access; + struct nfsd4_close close; + struct nfsd4_commit commit; + struct nfsd4_create create; + struct nfsd4_delegreturn delegreturn; + struct nfsd4_getattr getattr; + struct svc_fh * getfh; + struct nfsd4_link link; + struct nfsd4_lock lock; + struct nfsd4_lockt lockt; + struct nfsd4_locku locku; + struct nfsd4_lookup lookup; + struct nfsd4_verify nverify; + struct nfsd4_open open; + struct nfsd4_open_confirm open_confirm; + struct nfsd4_open_downgrade open_downgrade; + struct nfsd4_putfh putfh; + struct nfsd4_read read; + struct nfsd4_readdir readdir; + struct nfsd4_readlink readlink; + struct nfsd4_remove remove; + struct nfsd4_rename rename; + clientid_t renew; + struct nfsd4_secinfo secinfo; + struct nfsd4_setattr setattr; + struct nfsd4_setclientid setclientid; + struct nfsd4_setclientid_confirm setclientid_confirm; + struct nfsd4_verify verify; + struct nfsd4_write write; + struct nfsd4_release_lockowner release_lockowner; + + /* NFSv4.1 */ + struct nfsd4_exchange_id exchange_id; + struct nfsd4_create_session create_session; + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + } u; + struct nfs4_replay * replay; +}; + +struct nfsd4_compoundargs { + /* scratch variables for XDR decode */ + __be32 * p; + __be32 * end; + struct page ** pagelist; + int pagelen; + __be32 tmp[8]; + __be32 * tmpp; + struct tmpbuf { + struct tmpbuf *next; + void (*release)(const void *); + void *buf; + } *to_free; + + struct svc_rqst *rqstp; + + u32 taglen; + char * tag; + u32 minorversion; + u32 opcnt; + struct nfsd4_op *ops; + struct nfsd4_op iops[8]; +}; + +struct nfsd4_compoundres { + /* scratch variables for XDR encode */ + __be32 * p; + __be32 * end; + struct xdr_buf * xbuf; + struct svc_rqst * rqstp; + + u32 taglen; + char * tag; + u32 opcnt; + __be32 * tagp; /* tag, opcount encode location */ + struct nfsd4_compound_state cstate; +}; + +static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; +} + +static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +{ + return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp); +} + +#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) + +static inline void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); + cinfo->atomic = 1; + cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); + if (cinfo->change_supported) { + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + } else { + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + } +} + +int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, + struct nfsd4_compoundargs *); +int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, + struct nfsd4_compoundres *); +void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); +void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, __be32 *buffer, int *countp, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid *setclid); +extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid_confirm *setclientid_confirm); +extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); +extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq); +extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, +struct nfsd4_exchange_id *); + extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); +extern __be32 nfsd4_sequence(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_sequence *); +extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); +extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open); +extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, + struct svc_fh *current_fh, struct nfsd4_open *open); +extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); +extern __be32 nfsd4_close(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_close *close); +extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_open_downgrade *od); +extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + struct nfsd4_lock *lock); +extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_lockt *lockt); +extern __be32 nfsd4_locku(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_locku *locku); +extern __be32 +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_release_lockowner *rlockowner); +extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); +extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr); +extern __be32 nfsd4_renew(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, clientid_t *clid); +#endif + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index d69e6ae59251..3f959f1879d8 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -142,29 +142,75 @@ static void nilfs_palloc_desc_block_init(struct inode *inode, } } +static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, + int create, + void (*init_block)(struct inode *, + struct buffer_head *, + void *), + struct buffer_head **bhp, + struct nilfs_bh_assoc *prev, + spinlock_t *lock) +{ + int ret; + + spin_lock(lock); + if (prev->bh && blkoff == prev->blkoff) { + get_bh(prev->bh); + *bhp = prev->bh; + spin_unlock(lock); + return 0; + } + spin_unlock(lock); + + ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp); + if (!ret) { + spin_lock(lock); + /* + * The following code must be safe for change of the + * cache contents during the get block call. + */ + brelse(prev->bh); + get_bh(*bhp); + prev->bh = *bhp; + prev->blkoff = blkoff; + spin_unlock(lock); + } + return ret; +} + static int nilfs_palloc_get_desc_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { - return nilfs_mdt_get_block(inode, - nilfs_palloc_desc_blkoff(inode, group), - create, nilfs_palloc_desc_block_init, bhp); + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_desc_blkoff(inode, group), + create, nilfs_palloc_desc_block_init, + bhp, &cache->prev_desc, &cache->lock); } static int nilfs_palloc_get_bitmap_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { - return nilfs_mdt_get_block(inode, - nilfs_palloc_bitmap_blkoff(inode, group), - create, NULL, bhp); + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_bitmap_blkoff(inode, group), + create, NULL, bhp, + &cache->prev_bitmap, &cache->lock); } int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, int create, struct buffer_head **bhp) { - return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr), - create, NULL, bhp); + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_get_block(inode, + nilfs_palloc_entry_blkoff(inode, nr), + create, NULL, bhp, + &cache->prev_entry, &cache->lock); } static struct nilfs_palloc_group_desc * @@ -176,13 +222,6 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode, group % nilfs_palloc_groups_per_desc_block(inode); } -static unsigned char * -nilfs_palloc_block_get_bitmap(const struct inode *inode, - const struct buffer_head *bh, void *kaddr) -{ - return (unsigned char *)(kaddr + bh_offset(bh)); -} - void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, const struct buffer_head *bh, void *kaddr) { @@ -289,8 +328,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, if (ret < 0) goto out_desc; bitmap_kaddr = kmap(bitmap_bh->b_page); - bitmap = nilfs_palloc_block_get_bitmap( - inode, bitmap_bh, bitmap_kaddr); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); pos = nilfs_palloc_find_available_slot( inode, group, group_offset, bitmap, entries_per_group); @@ -351,8 +389,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); - bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, - bitmap_kaddr); + bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), group_offset, bitmap)) @@ -385,8 +422,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); - bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, - bitmap_kaddr); + bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), group_offset, bitmap)) printk(KERN_WARNING "%s: entry numer %llu already freed\n", @@ -472,8 +508,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); bitmap_kaddr = kmap(bitmap_bh->b_page); - bitmap = nilfs_palloc_block_get_bitmap( - inode, bitmap_bh, bitmap_kaddr); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); for (j = i, n = 0; (j < nitems) && nilfs_palloc_group_is_in(inode, group, entry_nrs[j]); @@ -502,3 +537,30 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) } return 0; } + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache) +{ + NILFS_MDT(inode)->mi_palloc_cache = cache; + spin_lock_init(&cache->lock); +} + +void nilfs_palloc_clear_cache(struct inode *inode) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + spin_lock(&cache->lock); + brelse(cache->prev_desc.bh); + brelse(cache->prev_bitmap.bh); + brelse(cache->prev_entry.bh); + cache->prev_desc.bh = NULL; + cache->prev_bitmap.bh = NULL; + cache->prev_entry.bh = NULL; + spin_unlock(&cache->lock); +} + +void nilfs_palloc_destroy_cache(struct inode *inode) +{ + nilfs_palloc_clear_cache(inode); + NILFS_MDT(inode)->mi_palloc_cache = NULL; +} diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 4ace5475c2c7..f4543ac4f560 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -69,4 +69,25 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t); #define nilfs_clear_bit_atomic ext2_clear_bit_atomic #define nilfs_find_next_zero_bit ext2_find_next_zero_bit +/* + * persistent object allocator cache + */ + +struct nilfs_bh_assoc { + unsigned long blkoff; + struct buffer_head *bh; +}; + +struct nilfs_palloc_cache { + spinlock_t lock; + struct nilfs_bh_assoc prev_desc; + struct nilfs_bh_assoc prev_bitmap; + struct nilfs_bh_assoc prev_entry; +}; + +void nilfs_palloc_setup_cache(struct inode *inode, + struct nilfs_palloc_cache *cache); +void nilfs_palloc_clear_cache(struct inode *inode); +void nilfs_palloc_destroy_cache(struct inode *inode); + #endif /* _NILFS_ALLOC_H */ diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 08834df6ec68..effdbdbe6c11 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -402,19 +402,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n) { inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); - if (NILFS_MDT(bmap->b_inode)) - nilfs_mdt_mark_dirty(bmap->b_inode); - else - mark_inode_dirty(bmap->b_inode); } void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n) { inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); - if (NILFS_MDT(bmap->b_inode)) - nilfs_mdt_mark_dirty(bmap->b_inode); - else - mark_inode_dirty(bmap->b_inode); } __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, @@ -425,8 +417,8 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - bmap->b_inode->i_blkbits); - for (pbh = page_buffers(bh->b_page); pbh != bh; - pbh = pbh->b_this_page, key++); + for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page) + key++; return key; } diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 84c25382f8e3..471e269536ae 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -68,9 +68,34 @@ void nilfs_btnode_cache_clear(struct address_space *btnc) truncate_inode_pages(btnc, 0); } +struct buffer_head * +nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) +{ + struct inode *inode = NILFS_BTNC_I(btnc); + struct buffer_head *bh; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return NULL; + + if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || + buffer_dirty(bh))) { + brelse(bh); + BUG(); + } + memset(bh->b_data, 0, 1 << inode->i_blkbits); + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_blocknr = blocknr; + set_buffer_mapped(bh); + set_buffer_uptodate(bh); + + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return bh; +} + int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, struct buffer_head **pbh, - int newblk) + sector_t pblocknr, struct buffer_head **pbh) { struct buffer_head *bh; struct inode *inode = NILFS_BTNC_I(btnc); @@ -81,19 +106,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, return -ENOMEM; err = -EEXIST; /* internal code */ - if (newblk) { - if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || - buffer_dirty(bh))) { - brelse(bh); - BUG(); - } - memset(bh->b_data, 0, 1 << inode->i_blkbits); - bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; - bh->b_blocknr = blocknr; - set_buffer_mapped(bh); - set_buffer_uptodate(bh); - goto found; - } if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; @@ -135,27 +147,6 @@ out_locked: return err; } -int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, struct buffer_head **pbh, int newblk) -{ - struct buffer_head *bh; - int err; - - err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk); - if (err == -EEXIST) /* internal code (cache hit) */ - return 0; - if (unlikely(err)) - return err; - - bh = *pbh; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - brelse(bh); - return -EIO; - } - return 0; -} - /** * nilfs_btnode_delete - delete B-tree node buffer * @bh: buffer to be deleted @@ -244,12 +235,13 @@ retry: unlock_page(obh->b_page); } - err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1); - if (likely(!err)) { - BUG_ON(nbh == obh); - ctxt->newbh = nbh; - } - return err; + nbh = nilfs_btnode_create_block(btnc, newkey); + if (!nbh) + return -ENOMEM; + + BUG_ON(nbh == obh); + ctxt->newbh = nbh; + return 0; failed_unlock: unlock_page(obh->b_page); diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 3e2275172ed6..07da83f07712 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -40,10 +40,10 @@ struct nilfs_btnode_chkey_ctxt { void nilfs_btnode_cache_init_once(struct address_space *); void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); +struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, + __u64 blocknr); int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, - struct buffer_head **, int); -int nilfs_btnode_get(struct address_space *, __u64, sector_t, - struct buffer_head **, int); + struct buffer_head **); void nilfs_btnode_delete(struct buffer_head *); int nilfs_btnode_prepare_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index e25b507a474f..7cdd98b8d514 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -114,7 +114,18 @@ static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr, { struct address_space *btnc = &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; - return nilfs_btnode_get(btnc, ptr, 0, bhp, 0); + int err; + + err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp); + if (err) + return err == -EEXIST ? 0 : err; + + wait_on_buffer(*bhp); + if (!buffer_uptodate(*bhp)) { + brelse(*bhp); + return -EIO; + } + return 0; } static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, @@ -122,12 +133,15 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, { struct address_space *btnc = &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; - int ret; + struct buffer_head *bh; - ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1); - if (!ret) - set_buffer_nilfs_volatile(*bhp); - return ret; + bh = nilfs_btnode_create_block(btnc, ptr); + if (!bh) + return -ENOMEM; + + set_buffer_nilfs_volatile(bh); + *bhp = bh; + return 0; } static inline int @@ -444,6 +458,18 @@ nilfs_btree_get_node(const struct nilfs_btree *btree, nilfs_btree_get_nonroot_node(path, level); } +static inline int +nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) +{ + if (unlikely(nilfs_btree_node_get_level(node) != level)) { + dump_stack(); + printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n", + nilfs_btree_node_get_level(node), level); + return 1; + } + return 0; +} + static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, struct nilfs_btree_path *path, __u64 key, __u64 *ptrp, int minlevel) @@ -467,7 +493,8 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(path, level); - BUG_ON(level != nilfs_btree_node_get_level(node)); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; if (!found) found = nilfs_btree_node_lookup(node, key, &index); else @@ -512,7 +539,8 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(path, level); - BUG_ON(level != nilfs_btree_node_get_level(node)); + if (nilfs_btree_bad_node(node, level)) + return -EINVAL; index = nilfs_btree_node_get_nchildren(node) - 1; ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_index = index; @@ -638,13 +666,11 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, { if (level < nilfs_btree_height(btree) - 1) { do { - lock_buffer(path[level].bp_bh); nilfs_btree_node_set_key( nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); - unlock_buffer(path[level].bp_bh); } while ((path[level].bp_index == 0) && (++level < nilfs_btree_height(btree) - 1)); } @@ -663,13 +689,11 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, struct nilfs_btree_node *node; if (level < nilfs_btree_height(btree) - 1) { - lock_buffer(path[level].bp_bh); node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); - unlock_buffer(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, @@ -689,9 +713,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, struct nilfs_btree_node *node, *left; int nchildren, lnchildren, n, move; - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -712,9 +733,6 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -740,9 +758,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, struct nilfs_btree_node *node, *right; int nchildren, rnchildren, n, move; - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -763,9 +778,6 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(right, 0)); @@ -794,9 +806,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree, __u64 newptr; int nchildren, n, move; - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -815,9 +824,6 @@ static void nilfs_btree_split(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - newkey = nilfs_btree_node_get_key(right, 0); newptr = path[level].bp_newreq.bpr_ptr; @@ -852,8 +858,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, struct nilfs_btree_node *root, *child; int n; - lock_buffer(path[level].bp_sib_bh); - root = nilfs_btree_get_root(btree); child = nilfs_btree_get_sib_node(path, level); @@ -865,8 +869,6 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_sib_bh); - path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; @@ -1023,11 +1025,9 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, stats->bs_nblocks++; - lock_buffer(bh); nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, 0, level, 0, NULL, NULL); - unlock_buffer(bh); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_split; } @@ -1052,10 +1052,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; - lock_buffer(bh); nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, 0, level, 0, NULL, NULL); - unlock_buffer(bh); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_grow; @@ -1154,13 +1152,11 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, struct nilfs_btree_node *node; if (level < nilfs_btree_height(btree) - 1) { - lock_buffer(path[level].bp_bh); node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, node, keyp, ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); - unlock_buffer(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -1180,9 +1176,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -1197,9 +1190,6 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -1217,9 +1207,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); @@ -1234,9 +1221,6 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(right, 0)); @@ -1255,9 +1239,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); @@ -1268,9 +1249,6 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; @@ -1286,9 +1264,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); - lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); @@ -1299,9 +1274,6 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); - unlock_buffer(path[level].bp_bh); - unlock_buffer(path[level].bp_sib_bh); - nilfs_btnode_delete(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; path[level + 1].bp_index++; @@ -1316,7 +1288,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, nilfs_btree_do_delete(btree, path, level, keyp, ptrp); - lock_buffer(path[level].bp_bh); root = nilfs_btree_get_root(btree); child = nilfs_btree_get_nonroot_node(path, level); @@ -1324,7 +1295,6 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, nilfs_btree_node_set_level(root, level); n = nilfs_btree_node_get_nchildren(child); nilfs_btree_node_move_left(btree, root, child, n); - unlock_buffer(path[level].bp_bh); nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = NULL; @@ -1699,7 +1669,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); /* create child node at level 1 */ - lock_buffer(bh); node = (struct nilfs_btree_node *)bh->b_data; nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); nilfs_btree_node_insert(btree, node, @@ -1709,7 +1678,6 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, if (!nilfs_bmap_dirty(bmap)) nilfs_bmap_set_dirty(bmap); - unlock_buffer(bh); brelse(bh); /* create root node at level 2 */ @@ -2050,7 +2018,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < NILFS_BTREE_LEVEL_MAX; level++) - list_splice(&lists[level], listp->prev); + list_splice_tail(&lists[level], listp); } static int nilfs_btree_assign_p(struct nilfs_btree *btree, diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 0e72bbbc6b64..4b82d84ade75 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -34,28 +34,6 @@ struct nilfs_btree; struct nilfs_btree_path; /** - * struct nilfs_btree_node - B-tree node - * @bn_flags: flags - * @bn_level: level - * @bn_nchildren: number of children - * @bn_pad: padding - */ -struct nilfs_btree_node { - __u8 bn_flags; - __u8 bn_level; - __le16 bn_nchildren; - __le32 bn_pad; -}; - -/* flags */ -#define NILFS_BTREE_NODE_ROOT 0x01 - -/* level */ -#define NILFS_BTREE_LEVEL_DATA 0 -#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) -#define NILFS_BTREE_LEVEL_MAX 14 - -/** * struct nilfs_btree - B-tree structure * @bt_bmap: bmap base structure */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 3f5d5d06f53c..18737818db63 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -328,19 +328,24 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, tnicps += nicps; nilfs_mdt_mark_buffer_dirty(cp_bh); nilfs_mdt_mark_dirty(cpfile); - if (!nilfs_cpfile_is_in_first(cpfile, cno) && - (count = nilfs_cpfile_block_sub_valid_checkpoints( - cpfile, cp_bh, kaddr, nicps)) == 0) { - /* make hole */ - kunmap_atomic(kaddr, KM_USER0); - brelse(cp_bh); - ret = nilfs_cpfile_delete_checkpoint_block( - cpfile, cno); - if (ret == 0) - continue; - printk(KERN_ERR "%s: cannot delete block\n", - __func__); - break; + if (!nilfs_cpfile_is_in_first(cpfile, cno)) { + count = + nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps); + if (count == 0) { + /* make hole */ + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + ret = + nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR + "%s: cannot delete block\n", + __func__); + break; + } } } @@ -926,3 +931,29 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } + +/** + * nilfs_cpfile_read - read cpfile inode + * @cpfile: cpfile inode + * @raw_inode: on-disk cpfile inode + */ +int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode) +{ + return nilfs_read_inode_common(cpfile, raw_inode); +} + +/** + * nilfs_cpfile_new - create cpfile + * @nilfs: nilfs object + * @cpsize: size of a checkpoint entry + */ +struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize) +{ + struct inode *cpfile; + + cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0); + if (cpfile) + nilfs_mdt_set_entry_size(cpfile, cpsize, + sizeof(struct nilfs_cpfile_header)); + return cpfile; +} diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index debea896e701..bc0809e0ab43 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -40,4 +40,7 @@ int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, size_t); +int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode); +struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize); + #endif /* _NILFS_CPFILE_H */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 1ff8e15bd36b..187dd07ba86c 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -33,6 +33,16 @@ #define NILFS_CNO_MIN ((__u64)1) #define NILFS_CNO_MAX (~(__u64)0) +struct nilfs_dat_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; +}; + +static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) +{ + return (struct nilfs_dat_info *)NILFS_MDT(dat); +} + static int nilfs_dat_prepare_entry(struct inode *dat, struct nilfs_palloc_req *req, int create) { @@ -425,3 +435,40 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz, return nvi; } + +/** + * nilfs_dat_read - read dat inode + * @dat: dat inode + * @raw_inode: on-disk dat inode + */ +int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode) +{ + return nilfs_read_inode_common(dat, raw_inode); +} + +/** + * nilfs_dat_new - create dat file + * @nilfs: nilfs object + * @entry_size: size of a dat entry + */ +struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size) +{ + static struct lock_class_key dat_lock_key; + struct inode *dat; + struct nilfs_dat_info *di; + int err; + + dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di)); + if (dat) { + err = nilfs_palloc_init_blockgroup(dat, entry_size); + if (unlikely(err)) { + nilfs_mdt_destroy(dat); + return NULL; + } + + di = NILFS_DAT_I(dat); + lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); + nilfs_palloc_setup_cache(dat, &di->palloc_cache); + } + return dat; +} diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index 406070d3ff49..d31c3aab0efe 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -53,4 +53,7 @@ int nilfs_dat_freev(struct inode *, __u64 *, size_t); int nilfs_dat_move(struct inode *, __u64, sector_t); ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); +int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode); +struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size); + #endif /* _NILFS_DAT_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index e097099bfc8f..76d803e060a9 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -99,9 +99,9 @@ static int nilfs_prepare_chunk(struct page *page, NULL, nilfs_get_block); } -static int nilfs_commit_chunk(struct page *page, - struct address_space *mapping, - unsigned from, unsigned to) +static void nilfs_commit_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) { struct inode *dir = mapping->host; struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb); @@ -112,15 +112,13 @@ static int nilfs_commit_chunk(struct page *page, nr_dirty = nilfs_page_count_clean_buffers(page, from, to); copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); - if (pos + copied > dir->i_size) { + if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); - mark_inode_dirty(dir); - } if (IS_DIRSYNC(dir)) nilfs_set_transaction_flag(NILFS_TI_SYNC); err = nilfs_set_file_dirty(sbi, dir, nr_dirty); + WARN_ON(err); /* do not happen */ unlock_page(page); - return err; } static void nilfs_check_page(struct page *page) @@ -455,11 +453,10 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, BUG_ON(err); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - err = nilfs_commit_chunk(page, mapping, from, to); + nilfs_commit_chunk(page, mapping, from, to); nilfs_put_page(page); dir->i_mtime = dir->i_ctime = CURRENT_TIME; /* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ - mark_inode_dirty(dir); } /* @@ -548,10 +545,10 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); - err = nilfs_commit_chunk(page, page->mapping, from, to); + nilfs_commit_chunk(page, page->mapping, from, to); dir->i_mtime = dir->i_ctime = CURRENT_TIME; /* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ - mark_inode_dirty(dir); + nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: nilfs_put_page(page); @@ -595,10 +592,9 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) if (pde) pde->rec_len = cpu_to_le16(to - from); dir->inode = 0; - err = nilfs_commit_chunk(page, mapping, from, to); + nilfs_commit_chunk(page, mapping, from, to); inode->i_ctime = inode->i_mtime = CURRENT_TIME; /* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */ - mark_inode_dirty(inode); out: nilfs_put_page(page); return err; @@ -640,7 +636,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) memcpy(de->name, "..\0", 4); nilfs_set_de_type(de, inode); kunmap_atomic(kaddr, KM_USER0); - err = nilfs_commit_chunk(page, mapping, 0, chunk_size); + nilfs_commit_chunk(page, mapping, 0, chunk_size); fail: page_cache_release(page); return err; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index d369ac718277..236753df5cdf 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -51,11 +51,11 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, struct nilfs_direct *direct; __u64 ptr; - direct = (struct nilfs_direct *)bmap; - if ((key > NILFS_DIRECT_KEY_MAX) || - (level != 1) || /* XXX: use macro for level 1 */ - ((ptr = nilfs_direct_get_ptr(direct, key)) == - NILFS_BMAP_INVALID_PTR)) + direct = (struct nilfs_direct *)bmap; /* XXX: use macro for level 1 */ + if (key > NILFS_DIRECT_KEY_MAX || level != 1) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; if (ptrp != NULL) @@ -73,9 +73,10 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, sector_t blocknr; int ret, cnt; - if (key > NILFS_DIRECT_KEY_MAX || - (ptr = nilfs_direct_get_ptr(direct, key)) == - NILFS_BMAP_INVALID_PTR) + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; if (NILFS_BMAP_USE_VBN(bmap)) { diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c index 93383c5cee90..dd5f7e0a95f6 100644 --- a/fs/nilfs2/gcdat.c +++ b/fs/nilfs2/gcdat.c @@ -61,6 +61,8 @@ void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs) nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap); + nilfs_palloc_clear_cache(dat); + nilfs_palloc_clear_cache(gcdat); nilfs_clear_dirty_pages(mapping); nilfs_copy_back_pages(mapping, gmapping); /* note: mdt dirty flags should be cleared by segctor. */ @@ -79,6 +81,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs) gcdat->i_state = I_CLEAR; gii->i_flags = 0; + nilfs_palloc_clear_cache(gcdat); truncate_inode_pages(gcdat->i_mapping, 0); truncate_inode_pages(&gii->i_btnode_cache, 0); } diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index e6de0a27ab5d..e16a6664dfa2 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -149,7 +149,7 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) { int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, - vbn ? : pbn, pbn, out_bh, 0); + vbn ? : pbn, pbn, out_bh); if (ret == -EEXIST) /* internal code (cache hit) */ ret = 0; return ret; @@ -212,9 +212,10 @@ void nilfs_destroy_gccache(struct the_nilfs *nilfs) static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino, __u64 cno) { - struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS); + struct inode *inode; struct nilfs_inode_info *ii; + inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0); if (!inode) return NULL; @@ -265,7 +266,6 @@ struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno) */ void nilfs_clear_gcinode(struct inode *inode) { - nilfs_mdt_clear(inode); nilfs_mdt_destroy(inode); } diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index de86401f209f..922d9dd42c8f 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -29,6 +29,17 @@ #include "alloc.h" #include "ifile.h" + +struct nilfs_ifile_info { + struct nilfs_mdt_info mi; + struct nilfs_palloc_cache palloc_cache; +}; + +static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile) +{ + return (struct nilfs_ifile_info *)NILFS_MDT(ifile); +} + /** * nilfs_ifile_create_inode - create a new disk inode * @ifile: ifile inode @@ -148,3 +159,27 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, } return err; } + +/** + * nilfs_ifile_new - create inode file + * @sbi: nilfs_sb_info struct + * @inode_size: size of an inode + */ +struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size) +{ + struct inode *ifile; + int err; + + ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO, + sizeof(struct nilfs_ifile_info)); + if (ifile) { + err = nilfs_palloc_init_blockgroup(ifile, inode_size); + if (unlikely(err)) { + nilfs_mdt_destroy(ifile); + return NULL; + } + nilfs_palloc_setup_cache(ifile, + &NILFS_IFILE_I(ifile)->palloc_cache); + } + return ifile; +} diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index ecc3ba76db47..cbca32e498f2 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -49,4 +49,6 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); int nilfs_ifile_delete_inode(struct inode *, ino_t); int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); +struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size); + #endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 2a0a5a3ac134..7868cc122ac7 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -97,6 +97,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, nilfs_transaction_abort(inode->i_sb); goto out; } + nilfs_mark_inode_dirty(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ /* Error handling should be detailed */ set_buffer_new(bh_result); @@ -322,7 +323,6 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) nilfs_init_acl(), proper cancellation of above jobs should be considered */ - mark_inode_dirty(inode); return inode; failed_acl: @@ -525,7 +525,6 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); - /* The buffer is guarded with lock_buffer() by the caller */ if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); @@ -599,6 +598,7 @@ void nilfs_truncate(struct inode *inode) if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); + nilfs_mark_inode_dirty(inode); nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); nilfs_transaction_commit(sb); /* May construct a logical segment and may fail in sync mode. @@ -623,6 +623,7 @@ void nilfs_delete_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); nilfs_truncate_bmap(ii, 0); + nilfs_mark_inode_dirty(inode); nilfs_free_inode(inode); /* nilfs_free_inode() marks inode buffer dirty */ if (IS_SYNC(inode)) @@ -745,9 +746,7 @@ int nilfs_mark_inode_dirty(struct inode *inode) "failed to reget inode block.\n"); return err; } - lock_buffer(ibh); nilfs_update_inode(inode, ibh); - unlock_buffer(ibh); nilfs_mdt_mark_buffer_dirty(ibh); nilfs_mdt_mark_dirty(sbi->s_ifile); brelse(ibh); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index f6af76042d80..d6b2b83de363 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -480,7 +480,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { struct nilfs_argv argv[5]; - const static size_t argsz[5] = { + static const size_t argsz[5] = { sizeof(struct nilfs_vdesc), sizeof(struct nilfs_period), sizeof(__u64), diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index f6326112d647..06713ffcc7f2 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -186,7 +186,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, } static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, - struct buffer_head **out_bh) + int readahead, struct buffer_head **out_bh) { struct buffer_head *first_bh, *bh; unsigned long blkoff; @@ -200,16 +200,18 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, if (unlikely(err)) goto failed; - blkoff = block + 1; - for (i = 0; i < nr_ra_blocks; i++, blkoff++) { - err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); - if (likely(!err || err == -EEXIST)) - brelse(bh); - else if (err != -EBUSY) - break; /* abort readahead if bmap lookup failed */ - - if (!buffer_locked(first_bh)) - goto out_no_wait; + if (readahead) { + blkoff = block + 1; + for (i = 0; i < nr_ra_blocks; i++, blkoff++) { + err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); + if (likely(!err || err == -EEXIST)) + brelse(bh); + else if (err != -EBUSY) + break; + /* abort readahead if bmap lookup failed */ + if (!buffer_locked(first_bh)) + goto out_no_wait; + } } wait_on_buffer(first_bh); @@ -263,7 +265,7 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, /* Should be rewritten with merging nilfs_mdt_read_block() */ retry: - ret = nilfs_mdt_read_block(inode, blkoff, out_bh); + ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh); if (!create || ret != -ENOENT) return ret; @@ -371,7 +373,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) struct buffer_head *bh; int err; - err = nilfs_mdt_read_block(inode, block, &bh); + err = nilfs_mdt_read_block(inode, block, 0, &bh); if (unlikely(err)) return err; nilfs_mark_buffer_dirty(bh); @@ -445,9 +447,17 @@ static const struct file_operations def_mdt_fops; * longer than those of the super block structs; they may continue for * several consecutive mounts/umounts. This would need discussions. */ +/** + * nilfs_mdt_new_common - allocate a pseudo inode for metadata file + * @nilfs: nilfs object + * @sb: super block instance the metadata file belongs to + * @ino: inode number + * @gfp_mask: gfp mask for data pages + * @objsz: size of the private object attached to inode->i_private + */ struct inode * nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, - ino_t ino, gfp_t gfp_mask) + ino_t ino, gfp_t gfp_mask, size_t objsz) { struct inode *inode = nilfs_alloc_inode_common(nilfs); @@ -455,8 +465,9 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, return NULL; else { struct address_space * const mapping = &inode->i_data; - struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS); + struct nilfs_mdt_info *mi; + mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS); if (!mi) { nilfs_destroy_inode(inode); return NULL; @@ -513,11 +524,11 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, } struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, - ino_t ino) + ino_t ino, size_t objsz) { - struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, - NILFS_MDT_GFP); + struct inode *inode; + inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz); if (!inode) return NULL; @@ -544,14 +555,15 @@ void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) &NILFS_I(orig)->i_btnode_cache; } -void nilfs_mdt_clear(struct inode *inode) +static void nilfs_mdt_clear(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); invalidate_mapping_pages(inode->i_mapping, 0, -1); truncate_inode_pages(inode->i_mapping, 0); - nilfs_bmap_clear(ii->i_bmap); + if (test_bit(NILFS_I_BMAP, &ii->i_state)) + nilfs_bmap_clear(ii->i_bmap); nilfs_btnode_cache_clear(&ii->i_btnode_cache); } @@ -559,6 +571,10 @@ void nilfs_mdt_destroy(struct inode *inode) { struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + if (mdi->mi_palloc_cache) + nilfs_palloc_destroy_cache(inode); + nilfs_mdt_clear(inode); + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ kfree(mdi); nilfs_destroy_inode(inode); diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index 431599733c9b..6c4bbb0470fc 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -36,6 +36,7 @@ * @mi_entry_size: size of an entry * @mi_first_entry_offset: offset to the first entry * @mi_entries_per_block: number of entries in a block + * @mi_palloc_cache: persistent object allocator cache * @mi_blocks_per_group: number of blocks in a group * @mi_blocks_per_desc_block: number of blocks per descriptor block */ @@ -46,6 +47,7 @@ struct nilfs_mdt_info { unsigned mi_entry_size; unsigned mi_first_entry_offset; unsigned long mi_entries_per_block; + struct nilfs_palloc_cache *mi_palloc_cache; unsigned long mi_blocks_per_group; unsigned long mi_blocks_per_desc_block; }; @@ -74,11 +76,11 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long); int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); int nilfs_mdt_fetch_dirty(struct inode *); -struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t); +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, + size_t); struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, - ino_t, gfp_t); + ino_t, gfp_t, size_t); void nilfs_mdt_destroy(struct inode *); -void nilfs_mdt_clear(struct inode *); void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); void nilfs_mdt_set_shadow(struct inode *, struct inode *); @@ -104,21 +106,4 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode) #define nilfs_mdt_bgl_lock(inode, bg) \ (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock) - -static inline int -nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh, - unsigned n) -{ - return nilfs_read_inode_common( - inode, (struct nilfs_inode *)(bh->b_data + n)); -} - -static inline void -nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh, - unsigned n) -{ - nilfs_write_inode_common( - inode, (struct nilfs_inode *)(bh->b_data + n), 1); -} - #endif /* _NILFS_MDT_H */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index ed02e886fa79..07ba838ef089 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -120,7 +120,7 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_op = &nilfs_file_inode_operations; inode->i_fop = &nilfs_file_operations; inode->i_mapping->a_ops = &nilfs_aops; - mark_inode_dirty(inode); + nilfs_mark_inode_dirty(inode); err = nilfs_add_nondir(dentry, inode); } if (!err) @@ -148,7 +148,7 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); - mark_inode_dirty(inode); + nilfs_mark_inode_dirty(inode); err = nilfs_add_nondir(dentry, inode); } if (!err) @@ -188,7 +188,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, goto out_fail; /* mark_inode_dirty(inode); */ - /* nilfs_new_inode() and page_symlink() do this */ + /* page_symlink() do this */ err = nilfs_add_nondir(dentry, inode); out: @@ -200,7 +200,8 @@ out: return err; out_fail: - inode_dec_link_count(inode); + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); iput(inode); goto out; } @@ -245,7 +246,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) return err; - inode_inc_link_count(dir); + inc_nlink(dir); inode = nilfs_new_inode(dir, S_IFDIR | mode); err = PTR_ERR(inode); @@ -256,7 +257,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode->i_fop = &nilfs_dir_operations; inode->i_mapping->a_ops = &nilfs_aops; - inode_inc_link_count(inode); + inc_nlink(inode); err = nilfs_make_empty(inode, dir); if (err) @@ -266,6 +267,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) goto out_fail; + nilfs_mark_inode_dirty(inode); d_instantiate(dentry, inode); out: if (!err) @@ -276,26 +278,23 @@ out: return err; out_fail: - inode_dec_link_count(inode); - inode_dec_link_count(inode); + drop_nlink(inode); + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); iput(inode); out_dir: - inode_dec_link_count(dir); + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); goto out; } -static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode; struct nilfs_dir_entry *de; struct page *page; - struct nilfs_transaction_info ti; int err; - err = nilfs_transaction_begin(dir->i_sb, &ti, 0); - if (err) - return err; - err = -ENOENT; de = nilfs_find_entry(dir, dentry, &page); if (!de) @@ -317,12 +316,28 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry) goto out; inode->i_ctime = dir->i_ctime; - inode_dec_link_count(inode); + drop_nlink(inode); err = 0; out: - if (!err) + return err; +} + +static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = nilfs_do_unlink(dir, dentry); + + if (!err) { + nilfs_mark_inode_dirty(dir); + nilfs_mark_inode_dirty(dentry->d_inode); err = nilfs_transaction_commit(dir->i_sb); - else + } else nilfs_transaction_abort(dir->i_sb); return err; @@ -340,11 +355,13 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) err = -ENOTEMPTY; if (nilfs_empty_dir(inode)) { - err = nilfs_unlink(dir, dentry); + err = nilfs_do_unlink(dir, dentry); if (!err) { inode->i_size = 0; - inode_dec_link_count(inode); - inode_dec_link_count(dir); + drop_nlink(inode); + nilfs_mark_inode_dirty(inode); + drop_nlink(dir); + nilfs_mark_inode_dirty(dir); } } if (!err) @@ -395,42 +412,48 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); + inc_nlink(old_inode); nilfs_set_link(new_dir, new_de, new_page, old_inode); + nilfs_mark_inode_dirty(new_dir); new_inode->i_ctime = CURRENT_TIME; if (dir_de) drop_nlink(new_inode); - inode_dec_link_count(new_inode); + drop_nlink(new_inode); + nilfs_mark_inode_dirty(new_inode); } else { if (dir_de) { err = -EMLINK; if (new_dir->i_nlink >= NILFS_LINK_MAX) goto out_dir; } - inode_inc_link_count(old_inode); + inc_nlink(old_inode); err = nilfs_add_link(new_dentry, old_inode); if (err) { - inode_dec_link_count(old_inode); + drop_nlink(old_inode); + nilfs_mark_inode_dirty(old_inode); goto out_dir; } - if (dir_de) - inode_inc_link_count(new_dir); + if (dir_de) { + inc_nlink(new_dir); + nilfs_mark_inode_dirty(new_dir); + } } /* * Like most other Unix systems, set the ctime for inodes on a * rename. - * inode_dec_link_count() will mark the inode dirty. */ old_inode->i_ctime = CURRENT_TIME; nilfs_delete_entry(old_de, old_page); - inode_dec_link_count(old_inode); + drop_nlink(old_inode); if (dir_de) { nilfs_set_link(old_inode, dir_de, dir_page, new_dir); - inode_dec_link_count(old_dir); + drop_nlink(old_dir); } + nilfs_mark_inode_dirty(old_dir); + nilfs_mark_inode_dirty(old_inode); err = nilfs_transaction_commit(old_dir->i_sb); return err; diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 6dc83591d118..c9c96c7825dc 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -770,14 +770,8 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs, nilfs_finish_roll_forward(nilfs, sbi, ri); } - nilfs_detach_checkpoint(sbi); - return 0; - failed: nilfs_detach_checkpoint(sbi); - nilfs_mdt_clear(nilfs->ns_cpfile); - nilfs_mdt_clear(nilfs->ns_sufile); - nilfs_mdt_clear(nilfs->ns_dat); return err; } @@ -804,6 +798,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, struct nilfs_segsum_info ssi; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ + sector_t b, end; u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; @@ -819,6 +814,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, /* Calculate range of segment */ nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + /* Read ahead segment */ + b = seg_start; + while (b <= seg_end) + sb_breadahead(sbi->s_super, b++); + for (;;) { /* Load segment summary */ ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); @@ -841,14 +841,20 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, ri->ri_nextnum = nextnum; empty_seg = 0; + if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) { + /* This will never happen because a superblock + (last_segment) always points to a pseg + having a super root. */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + + if (pseg_start == seg_start) { + nilfs_get_segment_range(nilfs, nextnum, &b, &end); + while (b <= end) + sb_breadahead(sbi->s_super, b++); + } if (!NILFS_SEG_HAS_SR(&ssi)) { - if (!scan_newer) { - /* This will never happen because a superblock - (last_segment) always points to a pseg - having a super root. */ - ret = NILFS_SEG_FAIL_CONSISTENCY; - goto failed; - } if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { ri->ri_lsegs_start = pseg_start; ri->ri_lsegs_start_seq = seg_seq; @@ -919,7 +925,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, super_root_found: /* Updating pointers relating to the latest checkpoint */ - list_splice(&segments, ri->ri_used_segments.prev); + list_splice_tail(&segments, &ri->ri_used_segments); nilfs->ns_last_pseg = sr_pseg_start; nilfs->ns_last_seq = nilfs->ns_seg_seq; nilfs->ns_last_cno = ri->ri_cno; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index e6d9e37fa241..645c78656aa0 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -24,10 +24,22 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/crc32.h> +#include <linux/backing-dev.h> #include "page.h" #include "segbuf.h" +struct nilfs_write_info { + struct the_nilfs *nilfs; + struct bio *bio; + int start, end; /* The region to be submitted */ + int rest_blocks; + int max_pages; + int nr_vecs; + sector_t blocknr; +}; + + static struct kmem_cache *nilfs_segbuf_cachep; static void nilfs_segbuf_init_once(void *obj) @@ -63,6 +75,11 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) INIT_LIST_HEAD(&segbuf->sb_list); INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + + init_completion(&segbuf->sb_bio_event); + atomic_set(&segbuf->sb_err, 0); + segbuf->sb_nbio = 0; + return segbuf; } @@ -83,6 +100,22 @@ void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; } +/** + * nilfs_segbuf_map_cont - map a new log behind a given log + * @segbuf: new segment buffer + * @prev: segment buffer containing a log to be continued + */ +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, + struct nilfs_segment_buffer *prev) +{ + segbuf->sb_segnum = prev->sb_segnum; + segbuf->sb_fseg_start = prev->sb_fseg_start; + segbuf->sb_fseg_end = prev->sb_fseg_end; + segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, __u64 nextnum, struct the_nilfs *nilfs) { @@ -132,8 +165,6 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; segbuf->sb_sum.ctime = ctime; - - segbuf->sb_io_error = 0; return 0; } @@ -219,7 +250,7 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, raw_sum->ss_datasum = cpu_to_le32(crc); } -void nilfs_release_buffers(struct list_head *list) +static void nilfs_release_buffers(struct list_head *list) { struct buffer_head *bh, *n; @@ -241,13 +272,56 @@ void nilfs_release_buffers(struct list_head *list) } } +static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +{ + nilfs_release_buffers(&segbuf->sb_segsum_buffers); + nilfs_release_buffers(&segbuf->sb_payload_buffers); +} + +/* + * Iterators for segment buffers + */ +void nilfs_clear_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) + nilfs_segbuf_clear(segbuf); +} + +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last) +{ + struct nilfs_segment_buffer *n, *segbuf; + + segbuf = list_prepare_entry(last, logs, sb_list); + list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) { + list_del_init(&segbuf->sb_list); + nilfs_segbuf_clear(segbuf); + nilfs_segbuf_free(segbuf); + } +} + +int nilfs_wait_on_logs(struct list_head *logs) +{ + struct nilfs_segment_buffer *segbuf; + int err; + + list_for_each_entry(segbuf, logs, sb_list) { + err = nilfs_segbuf_wait(segbuf); + if (err) + return err; + } + return 0; +} + /* * BIO operations */ static void nilfs_end_bio_write(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct nilfs_write_info *wi = bio->bi_private; + struct nilfs_segment_buffer *segbuf = bio->bi_private; if (err == -EOPNOTSUPP) { set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); @@ -256,21 +330,22 @@ static void nilfs_end_bio_write(struct bio *bio, int err) } if (!uptodate) - atomic_inc(&wi->err); + atomic_inc(&segbuf->sb_err); bio_put(bio); - complete(&wi->bio_event); + complete(&segbuf->sb_bio_event); } -static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) +static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, int mode) { struct bio *bio = wi->bio; int err; - if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { - wait_for_completion(&wi->bio_event); - wi->nbio--; - if (unlikely(atomic_read(&wi->err))) { + if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) { + wait_for_completion(&segbuf->sb_bio_event); + segbuf->sb_nbio--; + if (unlikely(atomic_read(&segbuf->sb_err))) { bio_put(bio); err = -EIO; goto failed; @@ -278,7 +353,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) } bio->bi_end_io = nilfs_end_bio_write; - bio->bi_private = wi; + bio->bi_private = segbuf; bio_get(bio); submit_bio(mode, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) { @@ -286,7 +361,7 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) err = -EOPNOTSUPP; goto failed; } - wi->nbio++; + segbuf->sb_nbio++; bio_put(bio); wi->bio = NULL; @@ -301,17 +376,15 @@ static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) } /** - * nilfs_alloc_seg_bio - allocate a bio for writing segment. - * @sb: super block - * @start: beginning disk block number of this BIO. + * nilfs_alloc_seg_bio - allocate a new bio for writing log + * @nilfs: nilfs object + * @start: start block number of the bio * @nr_vecs: request size of page vector. * - * alloc_seg_bio() allocates a new BIO structure and initialize it. - * * Return Value: On success, pointer to the struct bio is returned. * On error, NULL is returned. */ -static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, +static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, int nr_vecs) { struct bio *bio; @@ -322,36 +395,33 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { - bio->bi_bdev = sb->s_bdev; - bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9); + bio->bi_bdev = nilfs->ns_bdev; + bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); } return bio; } -void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, - struct nilfs_write_info *wi) +static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) { wi->bio = NULL; wi->rest_blocks = segbuf->sb_sum.nblocks; - wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev); + wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev); wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end = 0; - wi->nbio = 0; wi->blocknr = segbuf->sb_pseg_start; - - atomic_set(&wi->err, 0); - init_completion(&wi->bio_event); } -static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, - int mode) +static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi, + struct buffer_head *bh, int mode) { int len, err; BUG_ON(wi->nr_vecs <= 0); repeat: if (!wi->bio) { - wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end, + wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end, wi->nr_vecs); if (unlikely(!wi->bio)) return -ENOMEM; @@ -363,76 +433,83 @@ static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, return 0; } /* bio is FULL */ - err = nilfs_submit_seg_bio(wi, mode); + err = nilfs_segbuf_submit_bio(segbuf, wi, mode); /* never submit current bh */ if (likely(!err)) goto repeat; return err; } +/** + * nilfs_segbuf_write - submit write requests of a log + * @segbuf: buffer storing a log to be written + * @nilfs: nilfs object + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. + */ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, - struct nilfs_write_info *wi) + struct the_nilfs *nilfs) { + struct nilfs_write_info wi; struct buffer_head *bh; - int res, rw = WRITE; + int res = 0, rw = WRITE; + + wi.nilfs = nilfs; + nilfs_segbuf_prepare_write(segbuf, &wi); list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { - res = nilfs_submit_bh(wi, bh, rw); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); if (unlikely(res)) goto failed_bio; } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - res = nilfs_submit_bh(wi, bh, rw); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); if (unlikely(res)) goto failed_bio; } - if (wi->bio) { + if (wi.bio) { /* * Last BIO is always sent through the following * submission. */ rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); - res = nilfs_submit_seg_bio(wi, rw); - if (unlikely(res)) - goto failed_bio; + res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); } - res = 0; - out: - return res; - failed_bio: - atomic_inc(&wi->err); - goto out; + return res; } /** * nilfs_segbuf_wait - wait for completion of requested BIOs - * @wi: nilfs_write_info + * @segbuf: segment buffer * * Return Value: On Success, 0 is returned. On Error, one of the following * negative error code is returned. * * %-EIO - I/O error */ -int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf, - struct nilfs_write_info *wi) +int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) { int err = 0; - if (!wi->nbio) + if (!segbuf->sb_nbio) return 0; do { - wait_for_completion(&wi->bio_event); - } while (--wi->nbio > 0); + wait_for_completion(&segbuf->sb_bio_event); + } while (--segbuf->sb_nbio > 0); - if (unlikely(atomic_read(&wi->err) > 0)) { + if (unlikely(atomic_read(&segbuf->sb_err) > 0)) { printk(KERN_ERR "NILFS: IO error writing segment\n"); err = -EIO; - segbuf->sb_io_error = 1; } return err; } diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 0c3076f4e592..6af1630fb401 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -27,7 +27,6 @@ #include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/completion.h> -#include <linux/backing-dev.h> /** * struct nilfs_segsum_info - On-memory segment summary @@ -77,7 +76,9 @@ struct nilfs_segsum_info { * @sb_rest_blocks: Number of residual blocks in the current segment * @sb_segsum_buffers: List of buffers for segment summaries * @sb_payload_buffers: List of buffers for segment payload - * @sb_io_error: I/O error status + * @sb_nbio: Number of flying bio requests + * @sb_err: I/O error status + * @sb_bio_event: Completion event of log writing */ struct nilfs_segment_buffer { struct super_block *sb_super; @@ -96,7 +97,9 @@ struct nilfs_segment_buffer { struct list_head sb_payload_buffers; /* including super root */ /* io status */ - int sb_io_error; + int sb_nbio; + atomic_t sb_err; + struct completion sb_bio_event; }; #define NILFS_LIST_SEGBUF(head) \ @@ -125,6 +128,8 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); void nilfs_segbuf_free(struct nilfs_segment_buffer *); void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, struct the_nilfs *); +void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, + struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); @@ -161,41 +166,18 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, segbuf->sb_sum.nfileblk++; } -void nilfs_release_buffers(struct list_head *); +int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs); +int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); -static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +void nilfs_clear_logs(struct list_head *logs); +void nilfs_truncate_logs(struct list_head *logs, + struct nilfs_segment_buffer *last); +int nilfs_wait_on_logs(struct list_head *logs); + +static inline void nilfs_destroy_logs(struct list_head *logs) { - nilfs_release_buffers(&segbuf->sb_segsum_buffers); - nilfs_release_buffers(&segbuf->sb_payload_buffers); + nilfs_truncate_logs(logs, NULL); } -struct nilfs_write_info { - struct bio *bio; - int start, end; /* The region to be submitted */ - int rest_blocks; - int max_pages; - int nr_vecs; - sector_t blocknr; - - int nbio; - atomic_t err; - struct completion bio_event; - /* completion event of segment write */ - - /* - * The following fields must be set explicitly - */ - struct super_block *sb; - struct backing_dev_info *bdi; /* backing dev info */ - struct buffer_head *bh_sr; -}; - - -void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *, - struct nilfs_write_info *); -int nilfs_segbuf_write(struct nilfs_segment_buffer *, - struct nilfs_write_info *); -int nilfs_segbuf_wait(struct nilfs_segment_buffer *, - struct nilfs_write_info *); - #endif /* _NILFS_SEGBUF_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 6eff66a070d5..17584c524486 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -974,12 +974,12 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs->ns_nongc_ctime : sci->sc_seg_ctime); raw_sr->sr_flags = 0; - nilfs_mdt_write_inode_direct( - nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz)); - nilfs_mdt_write_inode_direct( - nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz)); - nilfs_mdt_write_inode_direct( - nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz)); + nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr + + NILFS_SR_DAT_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + + NILFS_SR_CPFILE_OFFSET(isz), 1); + nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + + NILFS_SR_SUFILE_OFFSET(isz), 1); } static void nilfs_redirty_inodes(struct list_head *head) @@ -1273,73 +1273,75 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) return err; } -static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum) -{ - struct buffer_head *bh_su; - struct nilfs_segment_usage *raw_su; - int err; - - err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su); - if (unlikely(err)) - return err; - nilfs_mdt_mark_buffer_dirty(bh_su); - nilfs_mdt_mark_dirty(sufile); - nilfs_sufile_put_segment_usage(sufile, segnum, bh_su); - return 0; -} - +/** + * nilfs_segctor_begin_construction - setup segment buffer to make a new log + * @sci: nilfs_sc_info + * @nilfs: nilfs object + */ static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { - struct nilfs_segment_buffer *segbuf, *n; + struct nilfs_segment_buffer *segbuf, *prev; __u64 nextnum; - int err; + int err, alloc = 0; - if (list_empty(&sci->sc_segbufs)) { - segbuf = nilfs_segbuf_new(sci->sc_super); - if (unlikely(!segbuf)) - return -ENOMEM; - list_add(&segbuf->sb_list, &sci->sc_segbufs); - } else - segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + segbuf = nilfs_segbuf_new(sci->sc_super); + if (unlikely(!segbuf)) + return -ENOMEM; + + if (list_empty(&sci->sc_write_logs)) { + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, + nilfs->ns_pseg_offset, nilfs); + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_shift_to_next_segment(nilfs); + nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + } - nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset, - nilfs); + segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + nextnum = nilfs->ns_nextnum; - if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { - nilfs_shift_to_next_segment(nilfs); - nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs); + if (nilfs->ns_segnum == nilfs->ns_nextnum) + /* Start from the head of a new full segment */ + alloc++; + } else { + /* Continue logs */ + prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs); + nilfs_segbuf_map_cont(segbuf, prev); + segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq; + nextnum = prev->sb_nextnum; + + if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) { + nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs); + segbuf->sb_sum.seg_seq++; + alloc++; + } } - sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; - err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum); - if (unlikely(err)) - return err; + err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum); + if (err) + goto failed; - if (nilfs->ns_segnum == nilfs->ns_nextnum) { - /* Start from the head of a new full segment */ + if (alloc) { err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum); - if (unlikely(err)) - return err; - } else - nextnum = nilfs->ns_nextnum; - - segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq; + if (err) + goto failed; + } nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs); - /* truncating segment buffers */ - list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, - sb_list) { - list_del_init(&segbuf->sb_list); - nilfs_segbuf_free(segbuf); - } + BUG_ON(!list_empty(&sci->sc_segbufs)); + list_add_tail(&segbuf->sb_list, &sci->sc_segbufs); + sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks; return 0; + + failed: + nilfs_segbuf_free(segbuf); + return err; } static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, struct the_nilfs *nilfs, int nadd) { - struct nilfs_segment_buffer *segbuf, *prev, *n; + struct nilfs_segment_buffer *segbuf, *prev; struct inode *sufile = nilfs->ns_sufile; __u64 nextnextnum; LIST_HEAD(list); @@ -1352,7 +1354,7 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, * not be dirty. The following call ensures that the buffer is dirty * and will pin the buffer on memory until the sufile is written. */ - err = nilfs_touch_segusage(sufile, prev->sb_nextnum); + err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum); if (unlikely(err)) return err; @@ -1378,33 +1380,33 @@ static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci, list_add_tail(&segbuf->sb_list, &list); prev = segbuf; } - list_splice(&list, sci->sc_segbufs.prev); + list_splice_tail(&list, &sci->sc_segbufs); return 0; failed_segbuf: nilfs_segbuf_free(segbuf); failed: - list_for_each_entry_safe(segbuf, n, &list, sb_list) { + list_for_each_entry(segbuf, &list, sb_list) { ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ - list_del_init(&segbuf->sb_list); - nilfs_segbuf_free(segbuf); } + nilfs_destroy_logs(&list); return err; } -static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci, - struct the_nilfs *nilfs) +static void nilfs_free_incomplete_logs(struct list_head *logs, + struct the_nilfs *nilfs) { - struct nilfs_segment_buffer *segbuf; - int ret, done = 0; + struct nilfs_segment_buffer *segbuf, *prev; + struct inode *sufile = nilfs->ns_sufile; + int ret; - segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); + segbuf = NILFS_FIRST_SEGBUF(logs); if (nilfs->ns_nextnum != segbuf->sb_nextnum) { - ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); /* never fails */ } - if (segbuf->sb_io_error) { + if (atomic_read(&segbuf->sb_err)) { /* Case 1: The first segment failed */ if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) /* Case 1a: Partial segment appended into an existing @@ -1413,106 +1415,54 @@ static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci, segbuf->sb_fseg_end); else /* Case 1b: New full segment */ set_nilfs_discontinued(nilfs); - done++; } - list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { - ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum); - WARN_ON(ret); /* never fails */ - if (!done && segbuf->sb_io_error) { - if (segbuf->sb_segnum != nilfs->ns_nextnum) - /* Case 2: extended segment (!= next) failed */ - nilfs_sufile_set_error(nilfs->ns_sufile, - segbuf->sb_segnum); - done++; - } - } -} - -static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci) -{ - struct nilfs_segment_buffer *segbuf; - - list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) - nilfs_segbuf_clear(segbuf); - sci->sc_super_root = NULL; -} - -static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci) -{ - struct nilfs_segment_buffer *segbuf; - - while (!list_empty(&sci->sc_segbufs)) { - segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); - list_del_init(&segbuf->sb_list); - nilfs_segbuf_free(segbuf); - } - /* sci->sc_curseg = NULL; */ -} - -static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci, - struct the_nilfs *nilfs, int err) -{ - if (unlikely(err)) { - nilfs_segctor_free_incomplete_segments(sci, nilfs); - if (sci->sc_stage.flags & NILFS_CF_SUFREED) { - int ret; - - ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, - sci->sc_freesegs, - sci->sc_nfreesegs, - NULL); - WARN_ON(ret); /* do not happen */ + prev = segbuf; + list_for_each_entry_continue(segbuf, logs, sb_list) { + if (prev->sb_nextnum != segbuf->sb_nextnum) { + ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); + WARN_ON(ret); /* never fails */ } + if (atomic_read(&segbuf->sb_err) && + segbuf->sb_segnum != nilfs->ns_nextnum) + /* Case 2: extended segment (!= next) failed */ + nilfs_sufile_set_error(sufile, segbuf->sb_segnum); + prev = segbuf; } - nilfs_segctor_clear_segment_buffers(sci); } static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, struct inode *sufile) { struct nilfs_segment_buffer *segbuf; - struct buffer_head *bh_su; - struct nilfs_segment_usage *raw_su; unsigned long live_blocks; int ret; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { - ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, - &raw_su, &bh_su); - WARN_ON(ret); /* always succeed because bh_su is dirty */ live_blocks = segbuf->sb_sum.nblocks + (segbuf->sb_pseg_start - segbuf->sb_fseg_start); - raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime); - raw_su->su_nblocks = cpu_to_le32(live_blocks); - nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, - bh_su); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + live_blocks, + sci->sc_seg_ctime); + WARN_ON(ret); /* always succeed because the segusage is dirty */ } } -static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci, - struct inode *sufile) +static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile) { struct nilfs_segment_buffer *segbuf; - struct buffer_head *bh_su; - struct nilfs_segment_usage *raw_su; int ret; - segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); - ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, - &raw_su, &bh_su); - WARN_ON(ret); /* always succeed because bh_su is dirty */ - raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start - - segbuf->sb_fseg_start); - nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su); + segbuf = NILFS_FIRST_SEGBUF(logs); + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + segbuf->sb_pseg_start - + segbuf->sb_fseg_start, 0); + WARN_ON(ret); /* always succeed because the segusage is dirty */ - list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { - ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum, - &raw_su, &bh_su); + list_for_each_entry_continue(segbuf, logs, sb_list) { + ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, + 0, 0); WARN_ON(ret); /* always succeed */ - raw_su->su_nblocks = 0; - nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, - bh_su); } } @@ -1520,17 +1470,15 @@ static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci, struct nilfs_segment_buffer *last, struct inode *sufile) { - struct nilfs_segment_buffer *segbuf = last, *n; + struct nilfs_segment_buffer *segbuf = last; int ret; - list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs, - sb_list) { - list_del_init(&segbuf->sb_list); + list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) { sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks; ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum); WARN_ON(ret); - nilfs_segbuf_free(segbuf); } + nilfs_truncate_logs(&sci->sc_segbufs, last); } @@ -1569,7 +1517,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, NULL); WARN_ON(err); /* do not happen */ } - nilfs_segctor_clear_segment_buffers(sci); + nilfs_clear_logs(&sci->sc_segbufs); err = nilfs_segctor_extend_segments(sci, nilfs, nadd); if (unlikely(err)) @@ -1814,26 +1762,18 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, } static int nilfs_segctor_write(struct nilfs_sc_info *sci, - struct backing_dev_info *bdi) + struct the_nilfs *nilfs) { struct nilfs_segment_buffer *segbuf; - struct nilfs_write_info wi; - int err, res; - - wi.sb = sci->sc_super; - wi.bh_sr = sci->sc_super_root; - wi.bdi = bdi; + int ret = 0; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { - nilfs_segbuf_prepare_write(segbuf, &wi); - err = nilfs_segbuf_write(segbuf, &wi); - - res = nilfs_segbuf_wait(segbuf, &wi); - err = err ? : res; - if (err) - return err; + ret = nilfs_segbuf_write(segbuf, nilfs); + if (ret) + break; } - return 0; + list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); + return ret; } static void __nilfs_end_page_io(struct page *page, int err) @@ -1911,15 +1851,17 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err) } } -static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, - struct page *failed_page, int err) +static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, + struct buffer_head *bh_sr, int err) { struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; + struct buffer_head *bh; - list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { - struct buffer_head *bh; + if (list_empty(logs)) + return; + list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { if (bh->b_page != bd_page) { @@ -1931,7 +1873,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - if (bh == sci->sc_super_root) { + if (bh == bh_sr) { if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; @@ -1941,7 +1883,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, if (bh->b_page != fs_page) { nilfs_end_page_io(fs_page, err); if (fs_page && fs_page == failed_page) - goto done; + return; fs_page = bh->b_page; } } @@ -1950,8 +1892,34 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, end_page_writeback(bd_page); nilfs_end_page_io(fs_page, err); - done: +} + +static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs, int err) +{ + LIST_HEAD(logs); + int ret; + + list_splice_tail_init(&sci->sc_write_logs, &logs); + ret = nilfs_wait_on_logs(&logs); + if (ret) + nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret); + + list_splice_tail_init(&sci->sc_segbufs, &logs); + nilfs_cancel_segusage(&logs, nilfs->ns_sufile); + nilfs_free_incomplete_logs(&logs, nilfs); nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err); + + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(ret); /* do not happen */ + } + + nilfs_destroy_logs(&logs); + sci->sc_super_root = NULL; } static void nilfs_set_next_segment(struct the_nilfs *nilfs, @@ -1973,7 +1941,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) struct the_nilfs *nilfs = sbi->s_nilfs; int update_sr = (sci->sc_super_root != NULL); - list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { struct buffer_head *bh; list_for_each_entry(bh, &segbuf->sb_segsum_buffers, @@ -2046,7 +2014,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) sci->sc_nblk_inc += sci->sc_nblk_this_inc; - segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs); + segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs); nilfs_set_next_segment(nilfs, segbuf); if (update_sr) { @@ -2057,10 +2025,23 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); + nilfs_segctor_clear_metadata_dirty(sci); } else clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags); } +static int nilfs_segctor_wait(struct nilfs_sc_info *sci) +{ + int ret; + + ret = nilfs_wait_on_logs(&sci->sc_write_logs); + if (!ret) { + nilfs_segctor_complete_write(sci); + nilfs_destroy_logs(&sci->sc_write_logs); + } + return ret; +} + static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, struct nilfs_sb_info *sbi) { @@ -2173,7 +2154,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) /* Avoid empty segment */ if (sci->sc_stage.scnt == NILFS_ST_DONE && NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { - nilfs_segctor_end_construction(sci, nilfs, 1); + nilfs_segctor_abort_construction(sci, nilfs, 1); goto out; } @@ -2187,7 +2168,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (has_sr) { err = nilfs_segctor_fill_in_checkpoint(sci); if (unlikely(err)) - goto failed_to_make_up; + goto failed_to_write; nilfs_segctor_fill_in_super_root(sci, nilfs); } @@ -2195,42 +2176,46 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) /* Write partial segments */ err = nilfs_segctor_prepare_write(sci, &failed_page); - if (unlikely(err)) + if (err) { + nilfs_abort_logs(&sci->sc_segbufs, failed_page, + sci->sc_super_root, err); goto failed_to_write; - + } nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); - err = nilfs_segctor_write(sci, nilfs->ns_bdi); + err = nilfs_segctor_write(sci, nilfs); if (unlikely(err)) goto failed_to_write; - nilfs_segctor_complete_write(sci); - - /* Commit segments */ - if (has_sr) - nilfs_segctor_clear_metadata_dirty(sci); - - nilfs_segctor_end_construction(sci, nilfs, 0); - + if (sci->sc_stage.scnt == NILFS_ST_DONE || + nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) { + /* + * At this point, we avoid double buffering + * for blocksize < pagesize because page dirty + * flag is turned off during write and dirty + * buffers are not properly collected for + * pages crossing over segments. + */ + err = nilfs_segctor_wait(sci); + if (err) + goto failed_to_write; + } } while (sci->sc_stage.scnt != NILFS_ST_DONE); + sci->sc_super_root = NULL; + out: - nilfs_segctor_destroy_segment_buffers(sci); nilfs_segctor_check_out_files(sci, sbi); return err; failed_to_write: - nilfs_segctor_abort_write(sci, failed_page, err); - nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile); - - failed_to_make_up: if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) nilfs_redirty_inodes(&sci->sc_dirty_files); failed: if (nilfs_doing_gc()) nilfs_redirty_inodes(&sci->sc_gc_inodes); - nilfs_segctor_end_construction(sci, nilfs, err); + nilfs_segctor_abort_construction(sci, nilfs, err); goto out; } @@ -2559,7 +2544,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, sci->sc_freesegs = kbufs[4]; sci->sc_nfreesegs = argv[4].v_nmembs; - list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev); + list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); for (;;) { nilfs_segctor_accept(sci, &req); @@ -2788,6 +2773,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) spin_lock_init(&sci->sc_state_lock); INIT_LIST_HEAD(&sci->sc_dirty_files); INIT_LIST_HEAD(&sci->sc_segbufs); + INIT_LIST_HEAD(&sci->sc_write_logs); INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_copied_buffers); @@ -2855,6 +2841,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) } WARN_ON(!list_empty(&sci->sc_segbufs)); + WARN_ON(!list_empty(&sci->sc_write_logs)); down_write(&sbi->s_nilfs->ns_segctor_sem); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 0d2a475a741b..3d3ab2f9864c 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -97,6 +97,7 @@ struct nilfs_segsum_pointer { * @sc_dsync_start: start byte offset of data pages * @sc_dsync_end: end byte offset of data pages (inclusive) * @sc_segbufs: List of segment buffers + * @sc_write_logs: List of segment buffers to hold logs under writing * @sc_segbuf_nblocks: Number of available blocks in segment buffers. * @sc_curseg: Current segment buffer * @sc_super_root: Pointer to the super root buffer @@ -143,6 +144,7 @@ struct nilfs_sc_info { /* Segment buffers */ struct list_head sc_segbufs; + struct list_head sc_write_logs; unsigned long sc_segbuf_nblocks; struct nilfs_segment_buffer *sc_curseg; struct buffer_head *sc_super_root; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 37994d4a59cc..b6c36d0cc331 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -31,6 +31,16 @@ #include "sufile.h" +struct nilfs_sufile_info { + struct nilfs_mdt_info mi; + unsigned long ncleansegs; +}; + +static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) +{ + return (struct nilfs_sufile_info *)NILFS_MDT(sufile); +} + static inline unsigned long nilfs_sufile_segment_usages_per_block(const struct inode *sufile) { @@ -62,14 +72,6 @@ nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, max - curr + 1); } -static inline struct nilfs_sufile_header * -nilfs_sufile_block_get_header(const struct inode *sufile, - struct buffer_head *bh, - void *kaddr) -{ - return kaddr + bh_offset(bh); -} - static struct nilfs_segment_usage * nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, struct buffer_head *bh, void *kaddr) @@ -110,6 +112,15 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, } /** + * nilfs_sufile_get_ncleansegs - return the number of clean segments + * @sufile: inode of segment usage file + */ +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile) +{ + return NILFS_SUI(sufile)->ncleansegs; +} + +/** * nilfs_sufile_updatev - modify multiple segment usages at a time * @sufile: inode of segment usage file * @segnumv: array of segment numbers @@ -270,7 +281,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) if (ret < 0) goto out_sem; kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + header = kaddr + bh_offset(header_bh); ncleansegs = le64_to_cpu(header->sh_ncleansegs); last_alloc = le64_to_cpu(header->sh_last_alloc); kunmap_atomic(kaddr, KM_USER0); @@ -302,13 +313,13 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) kunmap_atomic(kaddr, KM_USER0); kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header( - sufile, header_bh, kaddr); + header = kaddr + bh_offset(header_bh); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); kunmap_atomic(kaddr, KM_USER0); + NILFS_SUI(sufile)->ncleansegs--; nilfs_mdt_mark_buffer_dirty(header_bh); nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); @@ -351,6 +362,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, kunmap_atomic(kaddr, KM_USER0); nilfs_sufile_mod_counter(header_bh, -1, 1); + NILFS_SUI(sufile)->ncleansegs--; + nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -380,6 +393,8 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, kunmap_atomic(kaddr, KM_USER0); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); + NILFS_SUI(sufile)->ncleansegs -= clean; + nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -409,79 +424,65 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); + NILFS_SUI(sufile)->ncleansegs++; + nilfs_mdt_mark_dirty(sufile); } /** - * nilfs_sufile_get_segment_usage - get a segment usage + * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty * @sufile: inode of segment usage file * @segnum: segment number - * @sup: pointer to segment usage - * @bhp: pointer to buffer head - * - * Description: nilfs_sufile_get_segment_usage() acquires the segment usage - * specified by @segnum. - * - * Return Value: On success, 0 is returned, and the segment usage and the - * buffer head of the buffer on which the segment usage is located are stored - * in the place pointed by @sup and @bhp, respectively. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid segment usage number. */ -int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum, - struct nilfs_segment_usage **sup, - struct buffer_head **bhp) +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; - struct nilfs_segment_usage *su; - void *kaddr; int ret; - /* segnum is 0 origin */ - if (segnum >= nilfs_sufile_get_nsegments(sufile)) - return -EINVAL; - down_write(&NILFS_MDT(sufile)->mi_sem); - ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh); - if (ret < 0) - goto out_sem; - kaddr = kmap(bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); - if (nilfs_segment_usage_error(su)) { - kunmap(bh->b_page); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (!ret) { + nilfs_mdt_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); brelse(bh); - ret = -EINVAL; - goto out_sem; } - - if (sup != NULL) - *sup = su; - *bhp = bh; - - out_sem: - up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } /** - * nilfs_sufile_put_segment_usage - put a segment usage + * nilfs_sufile_set_segment_usage - set usage of a segment * @sufile: inode of segment usage file * @segnum: segment number - * @bh: buffer head - * - * Description: nilfs_sufile_put_segment_usage() releases the segment usage - * specified by @segnum. @bh must be the buffer head which have been returned - * by a previous call to nilfs_sufile_get_segment_usage() with @segnum. + * @nblocks: number of live blocks in the segment + * @modtime: modification time (option) */ -void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum, - struct buffer_head *bh) +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime) { - kunmap(bh->b_page); + struct buffer_head *bh; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + WARN_ON(nilfs_segment_usage_error(su)); + if (modtime) + su->su_lastmod = cpu_to_le64(modtime); + su->su_nblocks = cpu_to_le32(nblocks); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); brelse(bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; } /** @@ -515,7 +516,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) goto out_sem; kaddr = kmap_atomic(header_bh->b_page, KM_USER0); - header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + header = kaddr + bh_offset(header_bh); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); @@ -532,33 +533,6 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) return ret; } -/** - * nilfs_sufile_get_ncleansegs - get the number of clean segments - * @sufile: inode of segment usage file - * @nsegsp: pointer to the number of clean segments - * - * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean - * segments. - * - * Return Value: On success, 0 is returned and the number of clean segments is - * stored in the place pointed by @nsegsp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - */ -int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp) -{ - struct nilfs_sustat sustat; - int ret; - - ret = nilfs_sufile_get_stat(sufile, &sustat); - if (ret == 0) - *nsegsp = sustat.ss_ncleansegs; - return ret; -} - void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, struct buffer_head *header_bh, struct buffer_head *su_bh) @@ -577,8 +551,10 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, nilfs_segment_usage_set_error(su); kunmap_atomic(kaddr, KM_USER0); - if (suclean) + if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); + NILFS_SUI(sufile)->ncleansegs--; + } nilfs_mdt_mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -657,3 +633,48 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, up_read(&NILFS_MDT(sufile)->mi_sem); return ret; } + +/** + * nilfs_sufile_read - read sufile inode + * @sufile: sufile inode + * @raw_inode: on-disk sufile inode + */ +int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode) +{ + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + void *kaddr; + int ret; + + ret = nilfs_read_inode_common(sufile, raw_inode); + if (ret < 0) + return ret; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (!ret) { + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + } + return ret; +} + +/** + * nilfs_sufile_new - create sufile + * @nilfs: nilfs object + * @susize: size of a segment usage entry + */ +struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize) +{ + struct inode *sufile; + + sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO, + sizeof(struct nilfs_sufile_info)); + if (sufile) + nilfs_mdt_set_entry_size(sufile, susize, + sizeof(struct nilfs_sufile_header)); + return sufile; +} diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 0e99e5c0bd0f..15163b8aff7d 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -34,14 +34,13 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; } +unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); + int nilfs_sufile_alloc(struct inode *, __u64 *); -int nilfs_sufile_get_segment_usage(struct inode *, __u64, - struct nilfs_segment_usage **, - struct buffer_head **); -void nilfs_sufile_put_segment_usage(struct inode *, __u64, - struct buffer_head *); +int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); +int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, + unsigned long nblocks, time_t modtime); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); -int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, size_t); @@ -62,6 +61,9 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, struct buffer_head *); +int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode); +struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize); + /** * nilfs_sufile_scrap - make a segment garbage * @sufile: inode of segment usage file diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 644e66727dd0..8173faee31e6 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -363,14 +363,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) list_add(&sbi->s_list, &nilfs->ns_supers); up_write(&nilfs->ns_super_sem); - sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO); + sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); if (!sbi->s_ifile) return -ENOMEM; - err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size); - if (unlikely(err)) - goto failed; - down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, &bh_cp); @@ -411,7 +407,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) { struct the_nilfs *nilfs = sbi->s_nilfs; - nilfs_mdt_clear(sbi->s_ifile); nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; down_write(&nilfs->ns_super_sem); @@ -419,22 +414,6 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) up_write(&nilfs->ns_super_sem); } -static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) -{ - struct the_nilfs *nilfs = sbi->s_nilfs; - int err = 0; - - down_write(&nilfs->ns_sem); - if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { - nilfs->ns_mount_state |= NILFS_VALID_FS; - err = nilfs_commit_super(sbi, 1); - if (likely(!err)) - printk(KERN_INFO "NILFS: recovery complete.\n"); - } - up_write(&nilfs->ns_sem); - return err; -} - static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -490,7 +469,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) struct nilfs_sb_info *sbi = NILFS_SB(sb); if (!nilfs_test_opt(sbi, BARRIER)) - seq_printf(seq, ",barrier=off"); + seq_printf(seq, ",nobarrier"); if (nilfs_test_opt(sbi, SNAPSHOT)) seq_printf(seq, ",cp=%llu", (unsigned long long int)sbi->s_snapshot_cno); @@ -500,6 +479,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",errors=panic"); if (nilfs_test_opt(sbi, STRICT_ORDER)) seq_printf(seq, ",order=strict"); + if (nilfs_test_opt(sbi, NORECOVERY)) + seq_printf(seq, ",norecovery"); return 0; } @@ -568,7 +549,7 @@ static const struct export_operations nilfs_export_ops = { enum { Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_barrier, Opt_snapshot, Opt_order, + Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, Opt_err, }; @@ -576,25 +557,13 @@ static match_table_t tokens = { {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, - {Opt_barrier, "barrier=%s"}, + {Opt_nobarrier, "nobarrier"}, {Opt_snapshot, "cp=%u"}, {Opt_order, "order=%s"}, + {Opt_norecovery, "norecovery"}, {Opt_err, NULL} }; -static int match_bool(substring_t *s, int *result) -{ - int len = s->to - s->from; - - if (strncmp(s->from, "on", len) == 0) - *result = 1; - else if (strncmp(s->from, "off", len) == 0) - *result = 0; - else - return 1; - return 0; -} - static int parse_options(char *options, struct super_block *sb) { struct nilfs_sb_info *sbi = NILFS_SB(sb); @@ -612,13 +581,8 @@ static int parse_options(char *options, struct super_block *sb) token = match_token(p, tokens, args); switch (token) { - case Opt_barrier: - if (match_bool(&args[0], &option)) - return 0; - if (option) - nilfs_set_opt(sbi, BARRIER); - else - nilfs_clear_opt(sbi, BARRIER); + case Opt_nobarrier: + nilfs_clear_opt(sbi, BARRIER); break; case Opt_order: if (strcmp(args[0].from, "relaxed") == 0) @@ -647,6 +611,9 @@ static int parse_options(char *options, struct super_block *sb) sbi->s_snapshot_cno = option; nilfs_set_opt(sbi, SNAPSHOT); break; + case Opt_norecovery: + nilfs_set_opt(sbi, NORECOVERY); + break; default: printk(KERN_ERR "NILFS: Unrecognized mount option \"%s\"\n", p); @@ -672,9 +639,7 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi) int mnt_count = le16_to_cpu(sbp->s_mnt_count); /* nilfs->sem must be locked by the caller. */ - if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { - printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); - } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) { + if (nilfs->ns_mount_state & NILFS_ERROR_FS) { printk(KERN_WARNING "NILFS warning: mounting fs with errors\n"); #if 0 @@ -782,11 +747,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, sb->s_root = NULL; sb->s_time_gran = 1; - if (!nilfs_loaded(nilfs)) { - err = load_nilfs(nilfs, sbi); - if (err) - goto failed_sbi; - } + err = load_nilfs(nilfs, sbi); + if (err) + goto failed_sbi; + cno = nilfs_last_cno(nilfs); if (sb->s_flags & MS_RDONLY) { @@ -854,12 +818,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, up_write(&nilfs->ns_sem); } - err = nilfs_mark_recovery_complete(sbi); - if (unlikely(err)) { - printk(KERN_ERR "NILFS: recovery failed.\n"); - goto failed_root; - } - down_write(&nilfs->ns_super_sem); if (!nilfs_test_opt(sbi, SNAPSHOT)) nilfs->ns_current = sbi; @@ -867,10 +825,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, return 0; - failed_root: - dput(sb->s_root); - sb->s_root = NULL; - failed_segctor: nilfs_detach_segment_constructor(sbi); @@ -915,6 +869,14 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (!nilfs_valid_fs(nilfs)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because the filesystem is in an " + "incomplete recovery state.\n", sb->s_id); + err = -EINVAL; + goto restore_opts; + } + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) goto out; if (*flags & MS_RDONLY) { @@ -1156,8 +1118,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, /* Abandoning the newly allocated superblock */ mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); /* * deactivate_super() invokes close_bdev_exclusive(). * We must finish all post-cleaning before this call; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index ad391a8c3e7e..6241e1722efc 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -146,13 +146,9 @@ void put_nilfs(struct the_nilfs *nilfs) might_sleep(); if (nilfs_loaded(nilfs)) { - nilfs_mdt_clear(nilfs->ns_sufile); nilfs_mdt_destroy(nilfs->ns_sufile); - nilfs_mdt_clear(nilfs->ns_cpfile); nilfs_mdt_destroy(nilfs->ns_cpfile); - nilfs_mdt_clear(nilfs->ns_dat); nilfs_mdt_destroy(nilfs->ns_dat); - /* XXX: how and when to clear nilfs->ns_gc_dat? */ nilfs_mdt_destroy(nilfs->ns_gc_dat); } if (nilfs_init(nilfs)) { @@ -166,7 +162,6 @@ void put_nilfs(struct the_nilfs *nilfs) static int nilfs_load_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, sector_t sr_block) { - static struct lock_class_key dat_lock_key; struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; struct nilfs_super_block **sbp = nilfs->ns_sbp; @@ -187,51 +182,36 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, inode_size = nilfs->ns_inode_size; err = -ENOMEM; - nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); + nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size); if (unlikely(!nilfs->ns_dat)) goto failed; - nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); + nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size); if (unlikely(!nilfs->ns_gc_dat)) goto failed_dat; - nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO); + nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size); if (unlikely(!nilfs->ns_cpfile)) goto failed_gc_dat; - nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO); + nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size); if (unlikely(!nilfs->ns_sufile)) goto failed_cpfile; - err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size); - if (unlikely(err)) - goto failed_sufile; - - err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size); - if (unlikely(err)) - goto failed_sufile; - - lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key); - lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key); - nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); - nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size, - sizeof(struct nilfs_cpfile_header)); - nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size, - sizeof(struct nilfs_sufile_header)); - err = nilfs_mdt_read_inode_direct( - nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size)); + err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data + + NILFS_SR_DAT_OFFSET(inode_size)); if (unlikely(err)) goto failed_sufile; - err = nilfs_mdt_read_inode_direct( - nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size)); + err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data + + NILFS_SR_CPFILE_OFFSET(inode_size)); if (unlikely(err)) goto failed_sufile; - err = nilfs_mdt_read_inode_direct( - nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size)); + err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data + + NILFS_SR_SUFILE_OFFSET(inode_size)); if (unlikely(err)) goto failed_sufile; @@ -281,29 +261,30 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) struct nilfs_recovery_info ri; unsigned int s_flags = sbi->s_super->s_flags; int really_read_only = bdev_read_only(nilfs->ns_bdev); - unsigned valid_fs; - int err = 0; - - nilfs_init_recovery_info(&ri); + int valid_fs = nilfs_valid_fs(nilfs); + int err; - down_write(&nilfs->ns_sem); - valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); - up_write(&nilfs->ns_sem); + if (nilfs_loaded(nilfs)) { + if (valid_fs || + ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY))) + return 0; + printk(KERN_ERR "NILFS: the filesystem is in an incomplete " + "recovery state.\n"); + return -EINVAL; + } - if (!valid_fs && (s_flags & MS_RDONLY)) { - printk(KERN_INFO "NILFS: INFO: recovery " - "required for readonly filesystem.\n"); - if (really_read_only) { - printk(KERN_ERR "NILFS: write access " - "unavailable, cannot proceed.\n"); - err = -EROFS; - goto failed; + if (!valid_fs) { + printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "NILFS: INFO: recovery " + "required for readonly filesystem.\n"); + printk(KERN_INFO "NILFS: write access will " + "be enabled during recovery.\n"); } - printk(KERN_INFO "NILFS: write access will " - "be enabled during recovery.\n"); - sbi->s_super->s_flags &= ~MS_RDONLY; } + nilfs_init_recovery_info(&ri); + err = nilfs_search_super_root(nilfs, sbi, &ri); if (unlikely(err)) { printk(KERN_ERR "NILFS: error searching super root.\n"); @@ -316,19 +297,56 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) goto failed; } - if (!valid_fs) { - err = nilfs_recover_logical_segments(nilfs, sbi, &ri); - if (unlikely(err)) { - nilfs_mdt_destroy(nilfs->ns_cpfile); - nilfs_mdt_destroy(nilfs->ns_sufile); - nilfs_mdt_destroy(nilfs->ns_dat); - goto failed; + if (valid_fs) + goto skip_recovery; + + if (s_flags & MS_RDONLY) { + if (nilfs_test_opt(sbi, NORECOVERY)) { + printk(KERN_INFO "NILFS: norecovery option specified. " + "skipping roll-forward recovery\n"); + goto skip_recovery; } - if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) - sbi->s_super->s_dirt = 1; + if (really_read_only) { + printk(KERN_ERR "NILFS: write access " + "unavailable, cannot proceed.\n"); + err = -EROFS; + goto failed_unload; + } + sbi->s_super->s_flags &= ~MS_RDONLY; + } else if (nilfs_test_opt(sbi, NORECOVERY)) { + printk(KERN_ERR "NILFS: recovery cancelled because norecovery " + "option was specified for a read/write mount\n"); + err = -EINVAL; + goto failed_unload; } + err = nilfs_recover_logical_segments(nilfs, sbi, &ri); + if (err) + goto failed_unload; + + down_write(&nilfs->ns_sem); + nilfs->ns_mount_state |= NILFS_VALID_FS; + nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + err = nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + + if (err) { + printk(KERN_ERR "NILFS: failed to update super block. " + "recovery unfinished.\n"); + goto failed_unload; + } + printk(KERN_INFO "NILFS: recovery complete.\n"); + + skip_recovery: set_nilfs_loaded(nilfs); + nilfs_clear_recovery_info(&ri); + sbi->s_super->s_flags = s_flags; + return 0; + + failed_unload: + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_dat); failed: nilfs_clear_recovery_info(&ri); @@ -632,30 +650,23 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { struct inode *dat = nilfs_dat_inode(nilfs); unsigned long ncleansegs; - int err; down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ - err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs); + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ - if (likely(!err)) - *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; - return err; + *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; + return 0; } int nilfs_near_disk_full(struct the_nilfs *nilfs) { - struct inode *sufile = nilfs->ns_sufile; unsigned long ncleansegs, nincsegs; - int ret; - ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs); - if (likely(!ret)) { - nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / - nilfs->ns_blocks_per_segment + 1; - if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs) - ret++; - } - return ret; + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + nincsegs = atomic_read(&nilfs->ns_ndirtyblks) / + nilfs->ns_blocks_per_segment + 1; + + return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs; } /** diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 20abd55881e0..589786e33464 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -258,6 +258,16 @@ static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) kfree(sbi); } +static inline int nilfs_valid_fs(struct the_nilfs *nilfs) +{ + unsigned valid_fs; + + down_read(&nilfs->ns_sem); + valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); + up_read(&nilfs->ns_sem); + return valid_fs; +} + static inline void nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, sector_t *seg_start, sector_t *seg_end) diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index c9ee67b442e1..1afb0a10229f 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -121,7 +121,7 @@ static int idr_callback(int id, void *p, void *data) if (warned) return 0; - warned = false; + warned = true; entry = p; ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5ef5f365a5c8..a94e8bd8eb1f 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -552,7 +552,7 @@ retry: spin_lock(&group->inotify_data.idr_lock); ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, - group->inotify_data.last_wd, + group->inotify_data.last_wd+1, &tmp_ientry->wd); spin_unlock(&group->inotify_data.idr_lock); if (ret) { @@ -632,7 +632,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 1; + group->inotify_data.last_wd = 0; group->inotify_data.user = user; group->inotify_data.fa = NULL; @@ -646,6 +646,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) struct fsnotify_group *group; struct user_struct *user; struct file *filp; + struct path path; int fd, ret; /* Check the IN_* constants for consistency. */ @@ -659,12 +660,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) if (fd < 0) return fd; - filp = get_empty_filp(); - if (!filp) { - ret = -ENFILE; - goto out_put_fd; - } - user = get_current_user(); if (unlikely(atomic_read(&user->inotify_devs) >= inotify_max_user_instances)) { @@ -679,24 +674,28 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) goto out_free_uid; } - filp->f_op = &inotify_fops; - filp->f_path.mnt = mntget(inotify_mnt); - filp->f_path.dentry = dget(inotify_mnt->mnt_root); - filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; - filp->f_mode = FMODE_READ; + atomic_inc(&user->inotify_devs); + + path.mnt = inotify_mnt; + path.dentry = inotify_mnt->mnt_root; + path_get(&path); + filp = alloc_file(&path, FMODE_READ, &inotify_fops); + if (!filp) + goto Enfile; + filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); filp->private_data = group; - atomic_inc(&user->inotify_devs); - fd_install(fd, filp); return fd; +Enfile: + ret = -ENFILE; + path_put(&path); + atomic_dec(&user->inotify_devs); out_free_uid: free_uid(user); - put_filp(filp); -out_put_fd: put_unused_fd(fd); return ret; } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9938034762cc..dc2505abb6d7 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -530,7 +530,7 @@ err_corrupt_attr: * the ntfs inode. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * i_flags is set to 0 and we have no business touching it. Only an ioctl() * is allowed to write to them. We should of course be honouring them but @@ -1207,7 +1207,7 @@ err_out: * necessary fields in @vi as well as initializing the ntfs inode. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * * Return 0 on success and -errno on error. In the error case, the inode will @@ -1474,7 +1474,7 @@ err_out: * normal directory inodes. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * * Return 0 on success and -errno on error. In the error case, the inode will diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 701b7a3a872e..0d840669698e 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -6,6 +6,7 @@ config OCFS2_FS select CRC32 select QUOTA select QUOTA_TREE + select FS_POSIX_ACL help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -74,12 +75,3 @@ config OCFS2_DEBUG_FS This option will enable expensive consistency checks. Enable this option for debugging only as it is likely to decrease performance of the filesystem. - -config OCFS2_FS_POSIX_ACL - bool "OCFS2 POSIX Access Control Lists" - depends on OCFS2_FS - select FS_POSIX_ACL - default n - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 31f25ce32c97..600d2d2ade11 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -39,11 +39,8 @@ ocfs2-objs := \ ver.o \ quota_local.o \ quota_global.o \ - xattr.o - -ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y) -ocfs2-objs += acl.o -endif + xattr.o \ + acl.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index fbeaec762103..0501974bedd0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -98,15 +98,11 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, int type, struct buffer_head *di_bh) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int name_index; char *value = NULL; struct posix_acl *acl; int retval; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return NULL; - switch (type) { case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -331,13 +327,14 @@ cleanup: return ret; } -static size_t ocfs2_xattr_list_acl_access(struct inode *inode, +static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, const char *name, - size_t name_len) + size_t name_len, + int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) @@ -348,13 +345,14 @@ static size_t ocfs2_xattr_list_acl_access(struct inode *inode, return size; } -static size_t ocfs2_xattr_list_acl_default(struct inode *inode, +static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, const char *name, - size_t name_len) + size_t name_len, + int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) @@ -365,19 +363,19 @@ static size_t ocfs2_xattr_list_acl_default(struct inode *inode, return size; } -static int ocfs2_xattr_get_acl(struct inode *inode, - int type, - void *buffer, - size_t size) +static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); struct posix_acl *acl; int ret; + if (strcmp(name, "") != 0) + return -EINVAL; if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return -EOPNOTSUPP; - acl = ocfs2_get_acl(inode, type); + acl = ocfs2_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -388,35 +386,16 @@ static int ocfs2_xattr_get_acl(struct inode *inode, return ret; } -static int ocfs2_xattr_get_acl_access(struct inode *inode, - const char *name, - void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int ocfs2_xattr_get_acl_default(struct inode *inode, - const char *name, - void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int ocfs2_xattr_set_acl(struct inode *inode, - int type, - const void *value, - size_t size) +static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl; int ret = 0; + if (strcmp(name, "") != 0) + return -EINVAL; if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return -EOPNOTSUPP; @@ -442,38 +421,18 @@ cleanup: return ret; } -static int ocfs2_xattr_set_acl_access(struct inode *inode, - const char *name, - const void *value, - size_t size, - int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int ocfs2_xattr_set_acl_default(struct inode *inode, - const char *name, - const void *value, - size_t size, - int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ocfs2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ocfs2_xattr_list_acl_access, - .get = ocfs2_xattr_get_acl_access, - .set = ocfs2_xattr_set_acl_access, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, }; struct xattr_handler ocfs2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ocfs2_xattr_list_acl_default, - .get = ocfs2_xattr_get_acl_default, - .set = ocfs2_xattr_set_acl_default, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, }; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 8f6389ed4da5..5c5d31f05853 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,8 +26,6 @@ struct ocfs2_acl_entry { __le32 e_id; }; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL - extern int ocfs2_check_acl(struct inode *, int); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, @@ -35,24 +33,4 @@ extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct ocfs2_alloc_context *, struct ocfs2_alloc_context *); -#else /* CONFIG_OCFS2_FS_POSIX_ACL*/ - -#define ocfs2_check_acl NULL -static inline int ocfs2_acl_chmod(struct inode *inode) -{ - return 0; -} -static inline int ocfs2_init_acl(handle_t *handle, - struct inode *inode, - struct inode *dir, - struct buffer_head *di_bh, - struct buffer_head *dir_bh, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac) -{ - return 0; -} - -#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/ - #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 7c7198a5bc90..d17bdc718f74 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1765,9 +1765,9 @@ set_and_inc: * * The array index of the subtree root is passed back. */ -static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, - struct ocfs2_path *left, - struct ocfs2_path *right) +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right) { int i = 0; @@ -2872,8 +2872,8 @@ out: * This looks similar, but is subtly different to * ocfs2_find_cpos_for_left_leaf(). */ -static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, - struct ocfs2_path *path, u32 *cpos) +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) { int i, j, ret = 0; u64 blkno; @@ -7190,8 +7190,8 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, * wait on them - the truncate_inode_pages() call later will * do that for us. */ - ret = do_sync_mapping_range(inode->i_mapping, range_start, - range_end - 1, SYNC_FILE_RANGE_WRITE); + ret = filemap_fdatawrite_range(inode->i_mapping, range_start, + range_end - 1); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 9c122d574464..1db4359ccb90 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle, int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, handle_t *handle, struct ocfs2_path *path); +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right); #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index deb2b132ae5e..3dae4a13f6e4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -547,6 +547,9 @@ bail: * * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); + * + * Note that we never bother to allocate blocks here, and thus ignore the + * create argument. */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - /* - * Any write past EOF is not allowed because we'd be extending. - */ - if (create && (iblock + max_blocks) > inode_blocks) { - ret = -EIO; - goto bail; - } - /* This figures out the size of the next contiguous block, and * our logical offset */ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, @@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { - ocfs2_error(inode->i_sb, - "Inode %llu has a hole at block %llu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)iblock); - ret = -EROFS; - goto bail; - } - /* We should already CoW the refcounted extent. */ BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); /* @@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, */ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) map_bh(bh_result, inode->i_sb, p_blkno); - else { - /* - * ocfs2_prepare_inode_for_write() should have caught - * the case where we'd be filling a hole and triggered - * a buffered write instead. - */ - if (create) { - ret = -EIO; - mlog_errno(ret); - goto bail; - } - + else clear_buffer_mapped(bh_result); - } /* make sure we don't map more than max_blocks blocks here as that's all the kernel will handle at this point. */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index c452d116b892..eda5b8bcddd5 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -176,7 +176,8 @@ static void o2hb_write_timeout(struct work_struct *work) static void o2hb_arm_write_timeout(struct o2hb_region *reg) { - mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", + O2HB_MAX_WRITE_TIMEOUT_MS); cancel_delayed_work(®->hr_write_timeout_work); reg->hr_last_timeout_start = jiffies; @@ -874,7 +875,8 @@ static int o2hb_thread(void *data) do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); - mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + mlog(ML_HEARTBEAT, + "start = %lu.%lu, end = %lu.%lu, msec = %u\n", before_hb.tv_sec, (unsigned long) before_hb.tv_usec, after_hb.tv_sec, (unsigned long) after_hb.tv_usec, elapsed_msec); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 7ee6188bc79a..c81142e3ef84 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -35,6 +35,10 @@ * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; +char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ +}; struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { @@ -579,6 +583,43 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( return o2nm_cluster_attr_write(page, count, &cluster->cl_reconnect_delay_ms); } + +static ssize_t o2nm_cluster_attr_fence_method_read( + struct o2nm_cluster *cluster, char *page) +{ + ssize_t ret = 0; + + if (cluster) + ret = sprintf(page, "%s\n", + o2nm_fence_method_desc[cluster->cl_fence_method]); + return ret; +} + +static ssize_t o2nm_cluster_attr_fence_method_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + unsigned int i; + + if (page[count - 1] != '\n') + goto bail; + + for (i = 0; i < O2NM_FENCE_METHODS; ++i) { + if (count != strlen(o2nm_fence_method_desc[i]) + 1) + continue; + if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) + continue; + if (cluster->cl_fence_method != i) { + printk(KERN_INFO "ocfs2: Changing fence method to %s\n", + o2nm_fence_method_desc[i]); + cluster->cl_fence_method = i; + } + return count; + } + +bail: + return -EINVAL; +} + static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "idle_timeout_ms", @@ -603,10 +644,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { .store = o2nm_cluster_attr_reconnect_delay_ms_write, }; +static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "fence_method", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_fence_method_read, + .store = o2nm_cluster_attr_fence_method_write, +}; + static struct configfs_attribute *o2nm_cluster_attrs[] = { &o2nm_cluster_attr_idle_timeout_ms.attr, &o2nm_cluster_attr_keepalive_delay_ms.attr, &o2nm_cluster_attr_reconnect_delay_ms.attr, + &o2nm_cluster_attr_fence_method.attr, NULL, }; static ssize_t o2nm_cluster_show(struct config_item *item, @@ -778,6 +828,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; + cluster->cl_fence_method = O2NM_FENCE_RESET; ret = &cluster->cl_group; o2nm_single_cluster = cluster; diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index c992ea0da4ad..09ea2d388bbb 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -33,6 +33,12 @@ #include <linux/configfs.h> #include <linux/rbtree.h> +enum o2nm_fence_method { + O2NM_FENCE_RESET = 0, + O2NM_FENCE_PANIC, + O2NM_FENCE_METHODS, /* Number of fence methods */ +}; + struct o2nm_node { spinlock_t nd_lock; struct config_item nd_item; @@ -58,6 +64,7 @@ struct o2nm_cluster { unsigned int cl_idle_timeout_ms; unsigned int cl_keepalive_delay_ms; unsigned int cl_reconnect_delay_ms; + enum o2nm_fence_method cl_fence_method; /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index bbacf7da48a4..639024033fce 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -74,8 +74,20 @@ static void o2quo_fence_self(void) * threads can still schedule, etc, etc */ o2hb_stop_all_regions(); - printk("ocfs2 is very sorry to be fencing this system by restarting\n"); - emergency_restart(); + switch (o2nm_single_cluster->cl_fence_method) { + case O2NM_FENCE_PANIC: + panic("*** ocfs2 is very sorry to be fencing this system by " + "panicing ***\n"); + break; + default: + WARN_ON(o2nm_single_cluster->cl_fence_method >= + O2NM_FENCE_METHODS); + case O2NM_FENCE_RESET: + printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " + "system by restarting ***\n"); + emergency_restart(); + break; + }; } /* Indicate that a timeout occured on a hearbeat region write. The diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index d9fa3d22e17c..2f9e4e19a4f2 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2589,6 +2589,14 @@ retry: "begin reco msg (%d)\n", dlm->name, nodenum, ret); ret = 0; } + if (ret == -EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + msleep(100); + goto retry; + } if (ret < 0) { struct dlm_lock_resource *res; /* this is now a serious problem, possibly ENOMEM @@ -2608,14 +2616,6 @@ retry: * another ENOMEM */ msleep(100); goto retry; - } else if (ret == EAGAIN) { - mlog(0, "%s: trying to start recovery of node " - "%u, but node %u is waiting for last recovery " - "to complete, backoff for a bit\n", dlm->name, - dead_node, nodenum); - /* TODO Look into replacing msleep with cond_resched() */ - msleep(100); - goto retry; } } @@ -2639,7 +2639,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, dlm->name, br->node_idx, br->dead_node, dlm->reco.dead_node, dlm->reco.new_master); spin_unlock(&dlm->spinlock); - return EAGAIN; + return -EAGAIN; } spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 843db64e9d4a..d35a27f4523e 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -37,6 +37,7 @@ #include "extent_map.h" #include "inode.h" #include "super.h" +#include "symlink.h" #include "buffer_head_io.h" @@ -703,6 +704,12 @@ out: return ret; } +/* + * The ocfs2_fiemap_inline() may be a little bit misleading, since + * it not only handles the fiemap for inlined files, but also deals + * with the fast symlink, cause they have no difference for extent + * mapping per se. + */ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, u64 map_start) @@ -715,11 +722,18 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct ocfs2_inode_info *oi = OCFS2_I(inode); di = (struct ocfs2_dinode *)di_bh->b_data; - id_count = le16_to_cpu(di->id2.i_data.id_count); + if (ocfs2_inode_is_fast_symlink(inode)) + id_count = ocfs2_fast_symlink_chars(inode->i_sb); + else + id_count = le16_to_cpu(di->id2.i_data.id_count); if (map_start < id_count) { phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; - phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + if (ocfs2_inode_is_fast_symlink(inode)) + phys += offsetof(struct ocfs2_dinode, id2.i_symlink); + else + phys += offsetof(struct ocfs2_dinode, + id2.i_data.id_data); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); @@ -756,9 +770,10 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, down_read(&OCFS2_I(inode)->ip_alloc_sem); /* - * Handle inline-data separately. + * Handle inline-data and fast symlink separately. */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + ocfs2_inode_is_fast_symlink(inode)) { ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); goto out_unlock; } @@ -786,6 +801,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, fe_flags = 0; if (rec.e_flags & OCFS2_EXT_UNWRITTEN) fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + fe_flags |= FIEMAP_EXTENT_SHARED; if (is_last) fe_flags |= FIEMAP_EXTENT_LAST; len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index de059f490586..06ccf6a86d35 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1772,7 +1772,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, int appending, - int *direct_io) + int *direct_io, + int *has_refcount) { int ret = 0, meta_level = 0; struct inode *inode = dentry->d_inode; @@ -1833,6 +1834,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, saved_pos, count, &meta_level); + if (has_refcount) + *has_refcount = 1; } if (ret < 0) { @@ -1856,6 +1859,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, break; } + if (has_refcount && *has_refcount == 1) { + *direct_io = 0; + break; + } /* * Allowing concurrent direct writes means * i_size changes wouldn't be synchronized, so @@ -1899,7 +1906,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, loff_t pos) { int ret, direct_io, appending, rw_level, have_alloc_sem = 0; - int can_do_direct; + int can_do_direct, has_refcount = 0; ssize_t written = 0; size_t ocount; /* original count */ size_t count; /* after file limit checks */ @@ -1942,7 +1949,7 @@ relock: can_do_direct = direct_io; ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, iocb->ki_left, appending, - &can_do_direct); + &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); goto out; @@ -2006,14 +2013,16 @@ out_dio: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); - if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { + if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode) || + (file->f_flags & O_DIRECT && has_refcount)) { ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1); if (ret < 0) written = ret; if (!ret && (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters)) { + old_clusters != OCFS2_I(inode)->ip_clusters || + has_refcount)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; @@ -2062,7 +2071,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, int ret; ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, - sd->total_len, 0, NULL); + sd->total_len, 0, NULL, NULL); if (ret < 0) { mlog_errno(ret); return ret; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f010b22b1c44..50fb26a6a5f5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2108,6 +2108,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, } did_quota_inode = 1; + inode->i_nlink = 0; /* do the real work now. */ status = ocfs2_mknod_locked(osb, dir, inode, 0, &new_di_bh, parent_di_bh, handle, @@ -2136,6 +2137,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, if (status < 0) mlog_errno(status); + insert_inode_hash(inode); leave: if (status < 0 && did_quota_inode) vfs_dq_free_inode(inode); @@ -2267,6 +2269,8 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di = (struct ocfs2_dinode *)di_bh->b_data; le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; + inode->i_nlink = 1; + ocfs2_set_links_count(di, inode->i_nlink); ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, @@ -2284,7 +2288,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto out_commit; } - insert_inode_hash(inode); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); status = 0; @@ -2326,4 +2329,5 @@ const struct inode_operations ocfs2_dir_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d963d8638709..9362eea7424b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -245,9 +245,11 @@ enum ocfs2_mount_options OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ - OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ - OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ - OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ + OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ + OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access + control lists */ + OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ + OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e9431e4a5e7c..1a1a679e51b5 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1202,7 +1202,7 @@ struct ocfs2_local_disk_dqinfo { /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ - u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index e5df9d170b0c..123bc520a2c0 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -17,10 +17,6 @@ #include "ocfs2.h" -/* Common stuff */ -/* id number of quota format */ -#define QFMT_OCFS2 3 - /* * In-memory structures */ diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 1a2c50a759fa..21f9e71223ca 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1325,7 +1325,7 @@ out: return status; } -static struct quota_format_ops ocfs2_format_ops = { +static const struct quota_format_ops ocfs2_format_ops = { .check_quota_file = ocfs2_local_check_quota_file, .read_file_info = ocfs2_local_read_info, .write_file_info = ocfs2_global_write_info, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 30967e3f5e43..74db2be75dd6 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -276,7 +276,7 @@ static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } -void ocfs2_kref_remove_refcount_tree(struct kref *kref) +static void ocfs2_kref_remove_refcount_tree(struct kref *kref) { struct ocfs2_refcount_tree *tree = container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); @@ -524,23 +524,6 @@ out: return ret; } -int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, - struct ocfs2_refcount_tree **ret_tree, - struct buffer_head **ref_bh) -{ - int ret; - u64 ref_blkno; - - ret = ocfs2_get_refcount_block(inode, &ref_blkno); - if (ret) { - mlog_errno(ret); - return ret; - } - - return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, - rw, ret_tree, ref_bh); -} - void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree, int rw) { @@ -969,6 +952,103 @@ out: } /* + * Find the end range for a leaf refcount block indicated by + * el->l_recs[index].e_blkno. + */ +static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct ocfs2_extent_block *eb, + struct ocfs2_extent_list *el, + int index, u32 *cpos_end) +{ + int ret, i, subtree_root; + u32 cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_extent_list *tmp_el; + + if (index < le16_to_cpu(el->l_next_free_rec) - 1) { + /* + * We have a extent rec after index, so just use the e_cpos + * of the next extent rec. + */ + *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); + return 0; + } + + if (!eb || (eb && !eb->h_next_leaf_blk)) { + /* + * We are the last extent rec, so any high cpos should + * be stored in this leaf refcount block. + */ + *cpos_end = UINT_MAX; + return 0; + } + + /* + * If the extent block isn't the last one, we have to find + * the subtree root between this extent block and the next + * leaf extent block and get the corresponding e_cpos from + * the subroot. Otherwise we may corrupt the b-tree. + */ + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + left_path = ocfs2_new_path_from_et(&et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); + ret = ocfs2_find_path(ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(&et, left_path, + right_path); + + tmp_el = left_path->p_node[subtree_root].el; + blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; + for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { + if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { + *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); + break; + } + } + + BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + return ret; +} + +/* * Given a cpos and len, try to find the refcount record which contains cpos. * 1. If cpos can be found in one refcount record, return the record. * 2. If cpos can't be found, return a fake record which start from cpos @@ -983,10 +1063,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, struct buffer_head **ret_bh) { int ret = 0, i, found; - u32 low_cpos; + u32 low_cpos, uninitialized_var(cpos_end); struct ocfs2_extent_list *el; - struct ocfs2_extent_rec *tmp, *rec = NULL; - struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_block *eb = NULL; struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *rb = @@ -1034,12 +1114,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, } } - /* adjust len when we have ocfs2_extent_rec after it. */ - if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { - tmp = &el->l_recs[i+1]; + if (found) { + ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, + eb, el, i, &cpos_end); + if (ret) { + mlog_errno(ret); + goto out; + } - if (le32_to_cpu(tmp->e_cpos) < cpos + len) - len = le32_to_cpu(tmp->e_cpos) - cpos; + if (cpos_end < low_cpos + len) + len = cpos_end - low_cpos; } ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), @@ -1418,7 +1502,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, /* change old and new rl_used accordingly. */ le16_add_cpu(&rl->rl_used, -num_moved); - new_rl->rl_used = cpu_to_le32(num_moved); + new_rl->rl_used = cpu_to_le16(num_moved); sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), @@ -1797,7 +1881,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle, recs_need++; /* If the leaf block don't have enough record, expand it. */ - if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { + if (le16_to_cpu(rf_list->rl_used) + recs_need > + le16_to_cpu(rf_list->rl_count)) { struct ocfs2_refcount_rec tmp_rec; u64 cpos = le64_to_cpu(orig_rec->r_cpos); len = le32_to_cpu(orig_rec->r_clusters); @@ -1859,7 +1944,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle, memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); le64_add_cpu(&tail_rec->r_cpos, le32_to_cpu(tail_rec->r_clusters) - len); - tail_rec->r_clusters = le32_to_cpu(len); + tail_rec->r_clusters = cpu_to_le32(len); } /* @@ -3840,8 +3925,7 @@ static int ocfs2_add_refcounted_extent(struct inode *inode, } ret = ocfs2_insert_extent(handle, et, cpos, - cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, - p_cluster)), + ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), num_clusters, ext_flags, meta_ac); if (ret) { mlog_errno(ret); @@ -4253,8 +4337,8 @@ static int ocfs2_user_path_parent(const char __user *path, * @new_dentry: target dentry * @preserve: if true, preserve all file attributes */ -int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry, bool preserve) +static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) { struct inode *inode = old_dentry->d_inode; int error; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index ff4c798a5635..da78a2a334fd 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -814,7 +814,7 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *control; + struct ocfs2_live_connection *uninitialized_var(control); int rc = 0; BUG_ON(conn == NULL); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 14f47d2bfe02..26069917a9f5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -100,6 +100,8 @@ struct mount_options static int ocfs2_parse_options(struct super_block *sb, char *options, struct mount_options *mopt, int is_remount); +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options); static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); @@ -600,7 +602,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) lock_kernel(); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || + !ocfs2_check_set_options(sb, &parsed_options)) { ret = -EINVAL; goto out; } @@ -691,8 +694,6 @@ unlock_osb: if (!ret) { /* Only save off the new mount options in case of a successful * remount. */ - if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) - parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; @@ -701,6 +702,10 @@ unlock_osb: if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); } out: unlock_kernel(); @@ -1011,31 +1016,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); bh = NULL; - if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) - parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; - + if (!ocfs2_check_set_options(sb, &parsed_options)) { + status = -EINVAL; + goto read_super_error; + } osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); osb->local_alloc_bits = osb->local_alloc_default_bits; - if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA && - !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - status = -EINVAL; - mlog(ML_ERROR, "User quotas were requested, but this " - "filesystem does not have the feature enabled.\n"); - goto read_super_error; - } - if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA && - !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - status = -EINVAL; - mlog(ML_ERROR, "Group quotas were requested, but this " - "filesystem does not have the feature enabled.\n"); - goto read_super_error; - } status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1245,6 +1235,40 @@ static struct file_system_type ocfs2_fs_type = { .next = NULL }; +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options) +{ + if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + mlog(ML_ERROR, "User quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + mlog(ML_ERROR, "Group quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && + !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { + mlog(ML_ERROR, "ACL support requested but extended attributes " + "feature is not enabled\n"); + return 0; + } + /* No ACL setting specified? Use XATTR feature... */ + if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | + OCFS2_MOUNT_NO_POSIX_ACL))) { + if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) + options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + else + options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + } + return 1; +} + static int ocfs2_parse_options(struct super_block *sb, char *options, struct mount_options *mopt, @@ -1392,40 +1416,19 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_INODE64; break; case Opt_usrquota: - /* We check only on remount, otherwise features - * aren't yet initialized. */ - if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - mlog(ML_ERROR, "User quota requested but " - "filesystem feature is not set\n"); - status = 0; - goto bail; - } mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; break; case Opt_grpquota: - if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - mlog(ML_ERROR, "Group quota requested but " - "filesystem feature is not set\n"); - status = 0; - goto bail; - } mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; break; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL case Opt_acl: mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; break; case Opt_noacl: + mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; break; -#else - case Opt_acl: - case Opt_noacl: - printk(KERN_INFO "ocfs2 (no)acl options not supported\n"); - break; -#endif default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1502,12 +1505,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_INODE64) seq_printf(s, ",inode64"); -#ifdef CONFIG_OCFS2_FS_POSIX_ACL if (opts & OCFS2_MOUNT_POSIX_ACL) seq_printf(s, ",acl"); else seq_printf(s, ",noacl"); -#endif return 0; } diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index e3421030a69f..49b133ccbf11 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -163,6 +163,7 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, @@ -174,4 +175,5 @@ const struct inode_operations ocfs2_fast_symlink_inode_operations = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index fe3419068df2..8fc6fb071c6d 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -98,10 +98,8 @@ static struct ocfs2_xattr_def_value_root def_xv = { struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, -#ifdef CONFIG_OCFS2_FS_POSIX_ACL &ocfs2_xattr_acl_access_handler, &ocfs2_xattr_acl_default_handler, -#endif &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL @@ -109,12 +107,10 @@ struct xattr_handler *ocfs2_xattr_handlers[] = { static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, -#ifdef CONFIG_OCFS2_FS_POSIX_ACL [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ocfs2_xattr_acl_access_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ocfs2_xattr_acl_default_handler, -#endif [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; @@ -205,8 +201,6 @@ static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, int offset, struct ocfs2_xattr_value_root **xv, struct buffer_head **bh); -static int ocfs2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags); static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) { @@ -6066,7 +6060,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, * to the extent block, so just calculate a maximum record num. */ if (!xv->xr_list.l_tree_depth) - *num_recs += xv->xr_list.l_next_free_rec; + *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec); else *num_recs += ocfs2_clusters_for_bytes(sb, XATTR_SIZE_MAX); @@ -6978,9 +6972,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, ret = ocfs2_init_security_get(inode, dir, &si); if (!ret) { - ret = ocfs2_xattr_security_set(inode, si.name, - si.value, si.value_len, - XATTR_CREATE); + ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + si.name, si.value, si.value_len, + XATTR_CREATE); if (ret) { mlog_errno(ret); goto leave; @@ -7008,9 +7002,9 @@ leave: /* * 'security' attributes support */ -static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, +static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -7023,23 +7017,23 @@ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, return total_len; } -static int ocfs2_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, - buffer, size); + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, buffer, size); } -static int ocfs2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, - size, flags); + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, value, size, flags); } int ocfs2_init_security_get(struct inode *inode, @@ -7076,9 +7070,9 @@ struct xattr_handler ocfs2_xattr_security_handler = { /* * 'trusted' attributes support */ -static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, +static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -7091,23 +7085,23 @@ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, return total_len; } -static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, buffer, size); } -static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, - size, flags); + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, value, size, flags); } struct xattr_handler ocfs2_xattr_trusted_handler = { @@ -7120,13 +7114,13 @@ struct xattr_handler ocfs2_xattr_trusted_handler = { /* * 'user' attributes support */ -static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, +static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return 0; @@ -7139,31 +7133,31 @@ static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, return total_len; } -static int ocfs2_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); if (strcmp(name, "") == 0) return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name, buffer, size); } -static int ocfs2_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); if (strcmp(name, "") == 0) return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, - size, flags); + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ocfs2_xattr_user_handler = { diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 08e36389f56d..abd72a47f520 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -40,10 +40,8 @@ struct ocfs2_security_xattr_info { extern struct xattr_handler ocfs2_xattr_user_handler; extern struct xattr_handler ocfs2_xattr_trusted_handler; extern struct xattr_handler ocfs2_xattr_security_handler; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL extern struct xattr_handler ocfs2_xattr_acl_access_handler; extern struct xattr_handler ocfs2_xattr_acl_default_handler; -#endif extern struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); diff --git a/fs/open.c b/fs/open.c index b4b31d277f3a..040cef72bc00 100644 --- a/fs/open.c +++ b/fs/open.c @@ -30,6 +30,9 @@ #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> +#include <linux/ima.h> + +#include "internal.h" int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -818,15 +821,14 @@ static inline int __get_file_write_access(struct inode *inode, } static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, - int flags, struct file *f, + struct file *f, int (*open)(struct inode *, struct file *), const struct cred *cred) { struct inode *inode; int error; - f->f_flags = flags; - f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK | + f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { @@ -855,6 +857,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (error) goto cleanup_all; } + ima_counts_get(f); f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -926,7 +929,6 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry if (IS_ERR(dentry)) goto out_err; nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), - nd->intent.open.flags - 1, nd->intent.open.file, open, cred); out: @@ -945,7 +947,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp); * * Note that this function destroys the original nameidata */ -struct file *nameidata_to_filp(struct nameidata *nd, int flags) +struct file *nameidata_to_filp(struct nameidata *nd) { const struct cred *cred = current_cred(); struct file *filp; @@ -954,7 +956,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) filp = nd->intent.open.file; /* Has the filesystem initialised the file for us? */ if (filp->f_path.dentry == NULL) - filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp, + filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, NULL, cred); else path_put(&nd->path); @@ -993,7 +995,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, return ERR_PTR(error); } - return __dentry_open(dentry, mnt, flags, f, NULL, cred); + f->f_flags = flags; + return __dentry_open(dentry, mnt, f, NULL, cred); } EXPORT_SYMBOL(dentry_open); diff --git a/fs/pipe.c b/fs/pipe.c index ae17d026aaa3..37ba29ff3158 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -906,17 +906,6 @@ void free_pipe_info(struct inode *inode) } static struct vfsmount *pipe_mnt __read_mostly; -static int pipefs_delete_dentry(struct dentry *dentry) -{ - /* - * At creation time, we pretended this dentry was hashed - * (by clearing DCACHE_UNHASHED bit in d_flags) - * At delete time, we restore the truth : not hashed. - * (so that dput() can proceed correctly) - */ - dentry->d_flags |= DCACHE_UNHASHED; - return 0; -} /* * pipefs_dname() is called from d_path(). @@ -928,7 +917,6 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) } static const struct dentry_operations pipefs_dentry_operations = { - .d_delete = pipefs_delete_dentry, .d_dname = pipefs_dname, }; @@ -974,7 +962,7 @@ struct file *create_write_pipe(int flags) int err; struct inode *inode; struct file *f; - struct dentry *dentry; + struct path path; struct qstr name = { .name = "" }; err = -ENFILE; @@ -983,21 +971,16 @@ struct file *create_write_pipe(int flags) goto err; err = -ENOMEM; - dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); - if (!dentry) + path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); + if (!path.dentry) goto err_inode; + path.mnt = mntget(pipe_mnt); - dentry->d_op = &pipefs_dentry_operations; - /* - * We dont want to publish this dentry into global dentry hash table. - * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED - * This permits a working /proc/$pid/fd/XXX on pipes - */ - dentry->d_flags &= ~DCACHE_UNHASHED; - d_instantiate(dentry, inode); + path.dentry->d_op = &pipefs_dentry_operations; + d_instantiate(path.dentry, inode); err = -ENFILE; - f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops); + f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); if (!f) goto err_dentry; f->f_mapping = inode->i_mapping; @@ -1009,7 +992,7 @@ struct file *create_write_pipe(int flags) err_dentry: free_pipe_info(inode); - dput(dentry); + path_put(&path); return ERR_PTR(err); err_inode: @@ -1028,20 +1011,14 @@ void free_write_pipe(struct file *f) struct file *create_read_pipe(struct file *wrf, int flags) { - struct file *f = get_empty_filp(); + /* Grab pipe from the writer */ + struct file *f = alloc_file(&wrf->f_path, FMODE_READ, + &read_pipefifo_fops); if (!f) return ERR_PTR(-ENFILE); - /* Grab pipe from the writer */ - f->f_path = wrf->f_path; path_get(&wrf->f_path); - f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; - - f->f_pos = 0; f->f_flags = O_RDONLY | (flags & O_NONBLOCK); - f->f_op = &read_pipefifo_fops; - f->f_mode = FMODE_READ; - f->f_version = 0; return f; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 4badde179b18..13b5d0708175 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -134,13 +134,16 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. */ static const char *task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "Z (zombie)", /* 16 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ + "K (wakekill)", /* 128 */ + "W (waking)", /* 256 */ }; static inline const char *get_task_state(struct task_struct *tsk) @@ -148,6 +151,8 @@ static inline const char *get_task_state(struct task_struct *tsk) unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; const char **p = &task_state_array[0]; + BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + while (state) { p++; state >>= 1; @@ -322,94 +327,6 @@ static inline void task_context_switch_counts(struct seq_file *m, p->nivcsw); } -#ifdef CONFIG_MMU - -struct stack_stats { - struct vm_area_struct *vma; - unsigned long startpage; - unsigned long usage; -}; - -static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct stack_stats *ss = walk->private; - struct vm_area_struct *vma = ss->vma; - pte_t *pte, ptent; - spinlock_t *ptl; - int ret = 0; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; - -#ifdef CONFIG_STACK_GROWSUP - if (pte_present(ptent) || is_swap_pte(ptent)) - ss->usage = addr - ss->startpage + PAGE_SIZE; -#else - if (pte_present(ptent) || is_swap_pte(ptent)) { - ss->usage = ss->startpage - addr + PAGE_SIZE; - pte++; - ret = 1; - break; - } -#endif - } - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - return ret; -} - -static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma, - struct task_struct *task) -{ - struct stack_stats ss; - struct mm_walk stack_walk = { - .pmd_entry = stack_usage_pte_range, - .mm = vma->vm_mm, - .private = &ss, - }; - - if (!vma->vm_mm || is_vm_hugetlb_page(vma)) - return 0; - - ss.vma = vma; - ss.startpage = task->stack_start & PAGE_MASK; - ss.usage = 0; - -#ifdef CONFIG_STACK_GROWSUP - walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end, - &stack_walk); -#else - walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE, - &stack_walk); -#endif - return ss.usage; -} - -static inline void task_show_stack_usage(struct seq_file *m, - struct task_struct *task) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = get_task_mm(task); - - if (mm) { - down_read(&mm->mmap_sem); - vma = find_vma(mm, task->stack_start); - if (vma) - seq_printf(m, "Stack usage:\t%lu kB\n", - get_stack_usage_in_bytes(vma, task) >> 10); - - up_read(&mm->mmap_sem); - mmput(mm); - } -} -#else -static void task_show_stack_usage(struct seq_file *m, struct task_struct *task) -{ -} -#endif /* CONFIG_MMU */ - static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t"); @@ -440,7 +357,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_show_regs(m, task); #endif task_context_switch_counts(m, task); - task_show_stack_usage(m, task); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index af643b5aefe8..e42bbd843ed1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1265,6 +1265,72 @@ static const struct file_operations proc_pid_sched_operations = { #endif +static ssize_t comm_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (same_thread_group(current, p)) + set_task_comm(p, buffer); + else + count = -EINVAL; + + put_task_struct(p); + + return count; +} + +static int comm_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + task_lock(p); + seq_printf(m, "%s\n", p->comm); + task_unlock(p); + + put_task_struct(p); + + return 0; +} + +static int comm_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, comm_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_set_comm_operations = { + .open = comm_open, + .read = seq_read, + .write = comm_write, + .llseek = seq_lseek, + .release = single_release, +}; + /* * We added or removed a vma mapping the executable. The vmas are only mapped * during exec and are not mapped with the mmap system call. @@ -1353,7 +1419,6 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); - nd->last_type = LAST_BIND; out: return ERR_PTR(error); } @@ -2200,7 +2265,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = { #endif -#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) +#ifdef CONFIG_ELF_CORE static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -2504,6 +2569,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK INF("syscall", S_IRUSR, proc_pid_syscall), #endif @@ -2556,7 +2622,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif -#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) +#ifdef CONFIG_ELF_CORE REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING @@ -2838,6 +2904,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK INF("syscall", S_IRUSR, proc_pid_syscall), #endif diff --git a/fs/proc/generic.c b/fs/proc/generic.c index fa678abc9db1..480cb1065eec 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -429,7 +429,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, unsigned int ino; ino = de->low_ino; - de_get(de); + pde_get(de); spin_unlock(&proc_subdir_lock); error = -EINVAL; inode = proc_get_inode(dir->i_sb, ino, de); @@ -445,7 +445,7 @@ out_unlock: return NULL; } if (de) - de_put(de); + pde_put(de); return ERR_PTR(error); } @@ -509,17 +509,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, struct proc_dir_entry *next; /* filldir passes info to user space */ - de_get(de); + pde_get(de); spin_unlock(&proc_subdir_lock); if (filldir(dirent, de->name, de->namelen, filp->f_pos, de->low_ino, de->mode >> 12) < 0) { - de_put(de); + pde_put(de); goto out; } spin_lock(&proc_subdir_lock); filp->f_pos++; next = de->next; - de_put(de); + pde_put(de); de = next; } while (de); spin_unlock(&proc_subdir_lock); @@ -763,7 +763,7 @@ out: return NULL; } -void free_proc_entry(struct proc_dir_entry *de) +static void free_proc_entry(struct proc_dir_entry *de) { unsigned int ino = de->low_ino; @@ -777,6 +777,12 @@ void free_proc_entry(struct proc_dir_entry *de) kfree(de); } +void pde_put(struct proc_dir_entry *pde) +{ + if (atomic_dec_and_test(&pde->count)) + free_proc_entry(pde); +} + /* * Remove a /proc entry and free it if it's not currently in use. */ @@ -845,6 +851,5 @@ continue_removing: WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " "'%s/%s', leaking at least '%s'\n", __func__, de->parent->name, de->name, de->subdir->name); - if (atomic_dec_and_test(&de->count)) - free_proc_entry(de); + pde_put(de); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d78ade305541..445a02bcaab3 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -24,29 +24,6 @@ #include "internal.h" -struct proc_dir_entry *de_get(struct proc_dir_entry *de) -{ - atomic_inc(&de->count); - return de; -} - -/* - * Decrements the use count and checks for deferred deletion. - */ -void de_put(struct proc_dir_entry *de) -{ - if (!atomic_read(&de->count)) { - printk("de_put: entry %s already free!\n", de->name); - return; - } - - if (atomic_dec_and_test(&de->count)) - free_proc_entry(de); -} - -/* - * Decrement the use count of the proc_dir_entry. - */ static void proc_delete_inode(struct inode *inode) { struct proc_dir_entry *de; @@ -59,7 +36,7 @@ static void proc_delete_inode(struct inode *inode) /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; if (de) - de_put(de); + pde_put(de); if (PROC_I(inode)->sysctl) sysctl_head_put(PROC_I(inode)->sysctl); clear_inode(inode); @@ -480,7 +457,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, } unlock_new_inode(inode); } else - de_put(de); + pde_put(de); return inode; } @@ -495,7 +472,7 @@ int proc_fill_super(struct super_block *s) s->s_op = &proc_sops; s->s_time_gran = 1; - de_get(&proc_root); + pde_get(&proc_root); root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); if (!root_inode) goto out_no_root; @@ -509,6 +486,6 @@ int proc_fill_super(struct super_block *s) out_no_root: printk("proc_read_super: get root inode failed\n"); iput(root_inode); - de_put(&proc_root); + pde_put(&proc_root); return -ENOMEM; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 753ca37002c8..1f24a3eddd12 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,8 +61,6 @@ extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; -void free_proc_entry(struct proc_dir_entry *de); - void proc_init_inodecache(void); static inline struct pid *proc_pid(struct inode *inode) @@ -101,8 +99,12 @@ unsigned long task_vsize(struct mm_struct *); int task_statm(struct mm_struct *, int *, int *, int *, int *); void task_mem(struct seq_file *, struct mm_struct *); -struct proc_dir_entry *de_get(struct proc_dir_entry *de); -void de_put(struct proc_dir_entry *de); +static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) +{ + atomic_inc(&pde->count); + return pde; +} +void pde_put(struct proc_dir_entry *pde); extern struct vfsmount *proc_mnt; int proc_fill_super(struct super_block *); diff --git a/fs/proc/page.c b/fs/proc/page.c index 5033ce0d254b..180cf5a0bd67 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -8,6 +8,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> +#include <linux/kernel-page-flags.h> #include <asm/uaccess.h> #include "internal.h" @@ -71,52 +72,12 @@ static const struct file_operations proc_kpagecount_operations = { * physical page flags. */ -/* These macros are used to decouple internal flags from exported ones */ - -#define KPF_LOCKED 0 -#define KPF_ERROR 1 -#define KPF_REFERENCED 2 -#define KPF_UPTODATE 3 -#define KPF_DIRTY 4 -#define KPF_LRU 5 -#define KPF_ACTIVE 6 -#define KPF_SLAB 7 -#define KPF_WRITEBACK 8 -#define KPF_RECLAIM 9 -#define KPF_BUDDY 10 - -/* 11-20: new additions in 2.6.31 */ -#define KPF_MMAP 11 -#define KPF_ANON 12 -#define KPF_SWAPCACHE 13 -#define KPF_SWAPBACKED 14 -#define KPF_COMPOUND_HEAD 15 -#define KPF_COMPOUND_TAIL 16 -#define KPF_HUGE 17 -#define KPF_UNEVICTABLE 18 -#define KPF_HWPOISON 19 -#define KPF_NOPAGE 20 - -#define KPF_KSM 21 - -/* kernel hacking assistances - * WARNING: subject to change, never rely on them! - */ -#define KPF_RESERVED 32 -#define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 -#define KPF_PRIVATE 35 -#define KPF_PRIVATE_2 36 -#define KPF_OWNER_PRIVATE 37 -#define KPF_ARCH 38 -#define KPF_UNCACHED 39 - static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { return ((kflags >> kbit) & 1) << ubit; } -static u64 get_uflags(struct page *page) +u64 stable_page_flags(struct page *page) { u64 k; u64 u; @@ -219,7 +180,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, else ppage = NULL; - if (put_user(get_uflags(ppage), out)) { + if (put_user(stable_page_flags(ppage), out)) { ret = -EFAULT; break; } diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 7ba79a54948c..123257bb356b 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/time.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/stat.h> #include <linux/string.h> #include <asm/prom.h> @@ -25,26 +26,27 @@ static struct proc_dir_entry *proc_device_tree; /* * Supply data on a read from /proc/device-tree/node/property. */ -static int property_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int property_proc_show(struct seq_file *m, void *v) { - struct property *pp = data; - int n; + struct property *pp = m->private; - if (off >= pp->length) { - *eof = 1; - return 0; - } - n = pp->length - off; - if (n > count) - n = count; - else - *eof = 1; - memcpy(page, (char *)pp->value + off, n); - *start = page; - return n; + seq_write(m, pp->value, pp->length); + return 0; +} + +static int property_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, property_proc_show, PDE(inode)->data); } +static const struct file_operations property_proc_fops = { + .owner = THIS_MODULE, + .open = property_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /* * For a node with a name like "gc@10", we make symlinks called "gc" * and "@10" to it. @@ -63,10 +65,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, * Unfortunately proc_register puts each new entry * at the beginning of the list. So we rearrange them. */ - ent = create_proc_read_entry(name, - strncmp(name, "security-", 9) - ? S_IRUGO : S_IRUSR, de, - property_read_proc, pp); + ent = proc_create_data(name, + strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR, + de, &property_proc_fops, pp); if (ent == NULL) return NULL; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2a1bef9203c6..f277c4a111cb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -361,12 +361,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte_present(ptent)) continue; - mss->resident += PAGE_SIZE; - page = vm_normal_page(vma, addr, ptent); if (!page) continue; + mss->resident += PAGE_SIZE; /* Accumulate the size in pages that have been accessed. */ if (pte_young(ptent) || PageReferenced(page)) mss->referenced += PAGE_SIZE; @@ -650,6 +649,50 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } +static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) +{ + u64 pme = 0; + if (pte_present(pte)) + pme = PM_PFRAME(pte_pfn(pte) + offset) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; +} + +static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma; + struct pagemapread *pm = walk->private; + struct hstate *hs = NULL; + int err = 0; + + vma = find_vma(walk->mm, addr); + if (vma) + hs = hstate_vma(vma); + for (; addr != end; addr += PAGE_SIZE) { + u64 pfn = PM_NOT_PRESENT; + + if (vma && (addr >= vma->vm_end)) { + vma = find_vma(walk->mm, addr); + if (vma) + hs = hstate_vma(vma); + } + + if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) { + /* calculate pfn of the "raw" page in the hugepage. */ + int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT; + pfn = huge_pte_to_pagemap_entry(*pte, offset); + } + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} + /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * @@ -742,6 +785,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pagemap_walk.pmd_entry = pagemap_pte_range; pagemap_walk.pte_hole = pagemap_pte_hole; + pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; pagemap_walk.mm = mm; pagemap_walk.private = ± diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 8f5c05d3dbd3..5d9fd64ef81a 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -110,9 +110,13 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, } } - size += (*text = mm->end_code - mm->start_code); - size += (*data = mm->start_stack - mm->start_data); + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK)) + >> PAGE_SHIFT; up_read(&mm->mmap_sem); + size >>= PAGE_SHIFT; + size += *text + *data; *resident = size; return size; } diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c index 32f5d131a644..22e0d60e53ef 100644 --- a/fs/qnx4/bitmap.c +++ b/fs/qnx4/bitmap.c @@ -17,13 +17,6 @@ #include <linux/bitops.h> #include "qnx4.h" -#if 0 -int qnx4_new_block(struct super_block *sb) -{ - return 0; -} -#endif /* 0 */ - static void count_bits(register const char *bmPart, register int size, int *const tf) { @@ -35,22 +28,7 @@ static void count_bits(register const char *bmPart, register int size, } do { b = *bmPart++; - if ((b & 1) == 0) - tot++; - if ((b & 2) == 0) - tot++; - if ((b & 4) == 0) - tot++; - if ((b & 8) == 0) - tot++; - if ((b & 16) == 0) - tot++; - if ((b & 32) == 0) - tot++; - if ((b & 64) == 0) - tot++; - if ((b & 128) == 0) - tot++; + tot += 8 - hweight8(b); size--; } while (size != 0); *tf = tot; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 449f5a66dd34..ebf3440d28ca 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -64,25 +64,7 @@ static struct buffer_head *qnx4_getblk(struct inode *inode, int nr, result = sb_getblk(inode->i_sb, nr); return result; } - if (!create) { - return NULL; - } -#if 0 - tmp = qnx4_new_block(inode->i_sb); - if (!tmp) { - return NULL; - } - result = sb_getblk(inode->i_sb, tmp); - if (tst) { - qnx4_free_block(inode->i_sb, tmp); - brelse(result); - goto repeat; - } - tst = tmp; -#endif - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - return result; + return NULL; } struct buffer_head *qnx4_bread(struct inode *inode, int block, int create) @@ -113,8 +95,6 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h if ( phys ) { // logical block is before EOF map_bh(bh, inode->i_sb, phys); - } else if ( create ) { - // to be done. } return 0; } diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index 353e78a9ebee..efc02ebb8c70 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -46,12 +46,14 @@ config QFMT_V1 format say Y here. config QFMT_V2 - tristate "Quota format v2 support" + tristate "Quota format vfsv0 and vfsv1 support" depends on QUOTA select QUOTA_TREE help - This quota format allows using quotas with 32-bit UIDs/GIDs. If you - need this functionality say Y here. + This config option enables kernel support for vfsv0 and vfsv1 quota + formats. Both these formats support 32-bit UIDs/GIDs and vfsv1 format + also supports 64-bit inode and block quota limits. If you need this + functionality say Y here. config QUOTACTL bool diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index eb5a755718f6..3fc62b097bed 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -323,6 +323,30 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) } EXPORT_SYMBOL(dquot_mark_dquot_dirty); +/* Dirtify all the dquots - this can block when journalling */ +static inline int mark_all_dquot_dirty(struct dquot * const *dquot) +{ + int ret, err, cnt; + + ret = err = 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquot[cnt]) + /* Even in case of error we have to continue */ + ret = mark_dquot_dirty(dquot[cnt]); + if (!err) + err = ret; + } + return err; +} + +static inline void dqput_all(struct dquot **dquot) +{ + unsigned int cnt; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + dqput(dquot[cnt]); +} + /* This function needs dq_list_lock */ static inline int clear_dquot_dirty(struct dquot *dquot) { @@ -1268,8 +1292,7 @@ int dquot_initialize(struct inode *inode, int type) out_err: up_write(&sb_dqopt(sb)->dqptr_sem); /* Drop unused references */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - dqput(got[cnt]); + dqput_all(got); return ret; } EXPORT_SYMBOL(dquot_initialize); @@ -1288,9 +1311,7 @@ int dquot_drop(struct inode *inode) inode->i_dquot[cnt] = NULL; } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - dqput(put[cnt]); + dqput_all(put); return 0; } EXPORT_SYMBOL(dquot_drop); @@ -1319,6 +1340,70 @@ void vfs_dq_drop(struct inode *inode) EXPORT_SYMBOL(vfs_dq_drop); /* + * inode_reserved_space is managed internally by quota, and protected by + * i_lock similar to i_blocks+i_bytes. + */ +static qsize_t *inode_reserved_space(struct inode * inode) +{ + /* Filesystem must explicitly define it's own method in order to use + * quota reservation interface */ + BUG_ON(!inode->i_sb->dq_op->get_reserved_space); + return inode->i_sb->dq_op->get_reserved_space(inode); +} + +static void inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); +} + + +static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); +} + +static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); +} + +static qsize_t inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); + spin_unlock(&inode->i_lock); + return ret; +} + +static void inode_incr_space(struct inode *inode, qsize_t number, + int reserve) +{ + if (reserve) + inode_add_rsv_space(inode, number); + else + inode_add_bytes(inode, number); +} + +static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) +{ + if (reserve) + inode_sub_rsv_space(inode, number); + else + inode_sub_bytes(inode, number); +} + +/* * Following four functions update i_blocks+i_bytes fields and * quota information (together with appropriate checks) * NOTE: We absolutely rely on the fact that caller dirties @@ -1336,6 +1421,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int cnt, ret = QUOTA_OK; char warntype[MAXQUOTAS]; + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex + */ + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out_unlock; + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1346,7 +1446,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA) { ret = NO_QUOTA; - goto out_unlock; + spin_unlock(&dq_data_lock); + goto out_flush_warn; } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1357,64 +1458,29 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, else dquot_incr_space(inode->i_dquot[cnt], number); } - if (!reserve) - inode_add_bytes(inode, number); -out_unlock: + inode_incr_space(inode, number, reserve); spin_unlock(&dq_data_lock); + + if (reserve) + goto out_flush_warn; + mark_all_dquot_dirty(inode->i_dquot); +out_flush_warn: flush_warnings(inode->i_dquot, warntype); +out_unlock: + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: return ret; } int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) { - int cnt, ret = QUOTA_OK; - - /* - * First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex - */ - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out; - } - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out_unlock; - } - - ret = __dquot_alloc_space(inode, number, warn, 0); - if (ret == NO_QUOTA) - goto out_unlock; - - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return __dquot_alloc_space(inode, number, warn, 0); } EXPORT_SYMBOL(dquot_alloc_space); int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) { - int ret = QUOTA_OK; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - ret = __dquot_alloc_space(inode, number, warn, 1); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return __dquot_alloc_space(inode, number, warn, 1); } EXPORT_SYMBOL(dquot_reserve_space); @@ -1455,10 +1521,7 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) warn_put_all: spin_unlock(&dq_data_lock); if (ret == QUOTA_OK) - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return ret; @@ -1471,14 +1534,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number) int ret = QUOTA_OK; if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } @@ -1490,12 +1553,9 @@ int dquot_claim_space(struct inode *inode, qsize_t number) number); } /* Update inode bytes */ - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; @@ -1503,38 +1563,9 @@ out: EXPORT_SYMBOL(dquot_claim_space); /* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - int cnt; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - spin_lock(&dq_data_lock); - /* Release reserved dquots */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) - dquot_free_reserved_space(inode->i_dquot[cnt], number); - } - spin_unlock(&dq_data_lock); - -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return; -} -EXPORT_SYMBOL(dquot_release_reserved_space); - -/* * This operation can block, but only after everything is updated */ -int dquot_free_space(struct inode *inode, qsize_t number) +int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; @@ -1543,7 +1574,7 @@ int dquot_free_space(struct inode *inode, qsize_t number) * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) { out_sub: - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); return QUOTA_OK; } @@ -1558,21 +1589,40 @@ out_sub: if (!inode->i_dquot[cnt]) continue; warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); - dquot_decr_space(inode->i_dquot[cnt], number); + if (reserve) + dquot_free_reserved_space(inode->i_dquot[cnt], number); + else + dquot_decr_space(inode->i_dquot[cnt], number); } - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + + if (reserve) + goto out_unlock; + mark_all_dquot_dirty(inode->i_dquot); +out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } + +int dquot_free_space(struct inode *inode, qsize_t number) +{ + return __dquot_free_space(inode, number, 0); +} EXPORT_SYMBOL(dquot_free_space); /* + * Release reserved quota space + */ +void dquot_release_reserved_space(struct inode *inode, qsize_t number) +{ + __dquot_free_space(inode, number, 1); + +} +EXPORT_SYMBOL(dquot_release_reserved_space); + +/* * This operation can block, but only after everything is updated */ int dquot_free_inode(const struct inode *inode, qsize_t number) @@ -1599,10 +1649,7 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) dquot_decr_inodes(inode->i_dquot[cnt], number); } spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; @@ -1610,19 +1657,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) EXPORT_SYMBOL(dquot_free_inode); /* - * call back function, get reserved quota space from underlying fs - */ -qsize_t dquot_get_reserved_space(struct inode *inode) -{ - qsize_t reserved_space = 0; - - if (sb_any_quota_active(inode->i_sb) && - inode->i_sb->dq_op->get_reserved_space) - reserved_space = inode->i_sb->dq_op->get_reserved_space(inode); - return reserved_space; -} - -/* * Transfer the number of inode and blocks from one diskquota to an other. * * This operation can block, but only after everything is updated @@ -1665,7 +1699,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } spin_lock(&dq_data_lock); cur_space = inode_get_bytes(inode); - rsv_space = dquot_get_reserved_space(inode); + rsv_space = inode_get_rsv_space(inode); space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1709,25 +1743,18 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) spin_unlock(&dq_data_lock); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (transfer_from[cnt]) - mark_dquot_dirty(transfer_from[cnt]); - if (transfer_to[cnt]) { - mark_dquot_dirty(transfer_to[cnt]); - /* The reference we got is transferred to the inode */ - transfer_to[cnt] = NULL; - } - } + mark_all_dquot_dirty(transfer_from); + mark_all_dquot_dirty(transfer_to); + /* The reference we got is transferred to the inode */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + transfer_to[cnt] = NULL; warn_put_all: flush_warnings(transfer_to, warntype_to); flush_warnings(transfer_from, warntype_from_inodes); flush_warnings(transfer_from, warntype_from_space); put_all: - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - dqput(transfer_from[cnt]); - dqput(transfer_to[cnt]); - } + dqput_all(transfer_from); + dqput_all(transfer_to); return ret; over_quota: spin_unlock(&dq_data_lock); @@ -2164,7 +2191,9 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; + mutex_lock(&sb->s_root->d_inode->i_mutex); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); + mutex_unlock(&sb->s_root->d_inode->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 0edcf42b1778..2ae757e9c008 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -204,7 +204,7 @@ out: return ret; } -static struct quota_format_ops v1_format_ops = { +static const struct quota_format_ops v1_format_ops = { .check_quota_file = v1_check_quota_file, .read_file_info = v1_read_file_info, .write_file_info = v1_write_file_info, diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index a5475fb1ae44..e3da02f4986f 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -23,14 +23,23 @@ MODULE_LICENSE("GPL"); #define __QUOTA_V2_PARANOIA -static void v2_mem2diskdqb(void *dp, struct dquot *dquot); -static void v2_disk2memdqb(struct dquot *dquot, void *dp); -static int v2_is_id(void *dp, struct dquot *dquot); - -static struct qtree_fmt_operations v2_qtree_ops = { - .mem2disk_dqblk = v2_mem2diskdqb, - .disk2mem_dqblk = v2_disk2memdqb, - .is_id = v2_is_id, +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r0_is_id(void *dp, struct dquot *dquot); +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); +static int v2r1_is_id(void *dp, struct dquot *dquot); + +static struct qtree_fmt_operations v2r0_qtree_ops = { + .mem2disk_dqblk = v2r0_mem2diskdqb, + .disk2mem_dqblk = v2r0_disk2memdqb, + .is_id = v2r0_is_id, +}; + +static struct qtree_fmt_operations v2r1_qtree_ops = { + .mem2disk_dqblk = v2r1_mem2diskdqb, + .disk2mem_dqblk = v2r1_disk2memdqb, + .is_id = v2r1_is_id, }; #define QUOTABLOCK_BITS 10 @@ -46,23 +55,33 @@ static inline qsize_t v2_qbtos(qsize_t blocks) return blocks << QUOTABLOCK_BITS; } +static int v2_read_header(struct super_block *sb, int type, + struct v2_disk_dqheader *dqhead) +{ + ssize_t size; + + size = sb->s_op->quota_read(sb, type, (char *)dqhead, + sizeof(struct v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) { + printk(KERN_WARNING "quota_v2: Failed header read:" + " expected=%zd got=%zd\n", + sizeof(struct v2_disk_dqheader), size); + return 0; + } + return 1; +} + /* Check whether given file is really vfsv0 quotafile */ static int v2_check_quota_file(struct super_block *sb, int type) { struct v2_disk_dqheader dqhead; - ssize_t size; static const uint quota_magics[] = V2_INITQMAGICS; static const uint quota_versions[] = V2_INITQVERSIONS; - size = sb->s_op->quota_read(sb, type, (char *)&dqhead, - sizeof(struct v2_disk_dqheader), 0); - if (size != sizeof(struct v2_disk_dqheader)) { - printk("quota_v2: failed read expected=%zd got=%zd\n", - sizeof(struct v2_disk_dqheader), size); + if (!v2_read_header(sb, type, &dqhead)) return 0; - } if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || - le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) + le32_to_cpu(dqhead.dqh_version) > quota_versions[type]) return 0; return 1; } @@ -71,14 +90,23 @@ static int v2_check_quota_file(struct super_block *sb, int type) static int v2_read_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; + struct v2_disk_dqheader dqhead; struct mem_dqinfo *info = sb_dqinfo(sb, type); struct qtree_mem_dqinfo *qinfo; ssize_t size; + unsigned int version; + + if (!v2_read_header(sb, type, &dqhead)) + return -1; + version = le32_to_cpu(dqhead.dqh_version); + if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) || + (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) + return -1; size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - printk(KERN_WARNING "Can't read info structure on device %s.\n", + printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", sb->s_id); return -1; } @@ -89,9 +117,15 @@ static int v2_read_file_info(struct super_block *sb, int type) return -1; } qinfo = info->dqi_priv; - /* limits are stored as unsigned 32-bit data */ - info->dqi_maxblimit = 0xffffffff; - info->dqi_maxilimit = 0xffffffff; + if (version == 0) { + /* limits are stored as unsigned 32-bit data */ + info->dqi_maxblimit = 0xffffffff; + info->dqi_maxilimit = 0xffffffff; + } else { + /* used space is stored as unsigned 64-bit value */ + info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ + info->dqi_maxilimit = 0xffffffffffffffffULL; + } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); @@ -103,8 +137,13 @@ static int v2_read_file_info(struct super_block *sb, int type) qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; qinfo->dqi_qtree_depth = qtree_depth(qinfo); - qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk); - qinfo->dqi_ops = &v2_qtree_ops; + if (version == 0) { + qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk); + qinfo->dqi_ops = &v2r0_qtree_ops; + } else { + qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk); + qinfo->dqi_ops = &v2r1_qtree_ops; + } return 0; } @@ -135,9 +174,9 @@ static int v2_write_file_info(struct super_block *sb, int type) return 0; } -static void v2_disk2memdqb(struct dquot *dquot, void *dp) +static void v2r0_disk2memdqb(struct dquot *dquot, void *dp) { - struct v2_disk_dqblk *d = dp, empty; + struct v2r0_disk_dqblk *d = dp, empty; struct mem_dqblk *m = &dquot->dq_dqb; m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); @@ -149,15 +188,15 @@ static void v2_disk2memdqb(struct dquot *dquot, void *dp) m->dqb_curspace = le64_to_cpu(d->dqb_curspace); m->dqb_btime = le64_to_cpu(d->dqb_btime); /* We need to escape back all-zero structure */ - memset(&empty, 0, sizeof(struct v2_disk_dqblk)); + memset(&empty, 0, sizeof(struct v2r0_disk_dqblk)); empty.dqb_itime = cpu_to_le64(1); - if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk))) + if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk))) m->dqb_itime = 0; } -static void v2_mem2diskdqb(void *dp, struct dquot *dquot) +static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot) { - struct v2_disk_dqblk *d = dp; + struct v2r0_disk_dqblk *d = dp; struct mem_dqblk *m = &dquot->dq_dqb; struct qtree_mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; @@ -175,9 +214,60 @@ static void v2_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_itime = cpu_to_le64(1); } -static int v2_is_id(void *dp, struct dquot *dquot) +static int v2r0_is_id(void *dp, struct dquot *dquot) { - struct v2_disk_dqblk *d = dp; + struct v2r0_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(info, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +static void v2r1_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct v2r1_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + m->dqb_itime = le64_to_cpu(d->dqb_itime); + m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit)); + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2r1_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk))) + m->dqb_itime = 0; +} + +static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct v2r1_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit)); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_id = cpu_to_le32(dquot->dq_id); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); +} + +static int v2r1_is_id(void *dp, struct dquot *dquot) +{ + struct v2r1_disk_dqblk *d = dp; struct qtree_mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; @@ -207,7 +297,7 @@ static int v2_free_file_info(struct super_block *sb, int type) return 0; } -static struct quota_format_ops v2_format_ops = { +static const struct quota_format_ops v2_format_ops = { .check_quota_file = v2_check_quota_file, .read_file_info = v2_read_file_info, .write_file_info = v2_write_file_info, @@ -217,20 +307,32 @@ static struct quota_format_ops v2_format_ops = { .release_dqblk = v2_release_dquot, }; -static struct quota_format_type v2_quota_format = { +static struct quota_format_type v2r0_quota_format = { .qf_fmt_id = QFMT_VFS_V0, .qf_ops = &v2_format_ops, .qf_owner = THIS_MODULE }; +static struct quota_format_type v2r1_quota_format = { + .qf_fmt_id = QFMT_VFS_V1, + .qf_ops = &v2_format_ops, + .qf_owner = THIS_MODULE +}; + static int __init init_v2_quota_format(void) { - return register_quota_format(&v2_quota_format); + int ret; + + ret = register_quota_format(&v2r0_quota_format); + if (ret) + return ret; + return register_quota_format(&v2r1_quota_format); } static void __exit exit_v2_quota_format(void) { - unregister_quota_format(&v2_quota_format); + unregister_quota_format(&v2r0_quota_format); + unregister_quota_format(&v2r1_quota_format); } module_init(init_v2_quota_format); diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h index 530fe580685c..f1966b42c2fd 100644 --- a/fs/quota/quotaio_v2.h +++ b/fs/quota/quotaio_v2.h @@ -17,8 +17,8 @@ } #define V2_INITQVERSIONS {\ - 0, /* USRQUOTA */\ - 0 /* GRPQUOTA */\ + 1, /* USRQUOTA */\ + 1 /* GRPQUOTA */\ } /* First generic header */ @@ -32,7 +32,7 @@ struct v2_disk_dqheader { * (as it appears on disk) - the file is a radix tree whose leaves point * to blocks of these structures. */ -struct v2_disk_dqblk { +struct v2r0_disk_dqblk { __le32 dqb_id; /* id this quota applies to */ __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ __le32 dqb_isoftlimit; /* preferred inode limit */ @@ -44,6 +44,19 @@ struct v2_disk_dqblk { __le64 dqb_itime; /* time limit for excessive inode use */ }; +struct v2r1_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_pad; + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ + __le64 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + /* Header with type and version specific information */ struct v2_disk_dqinfo { __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 32fae4040ebf..1739a4aba25f 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -60,7 +60,7 @@ const struct inode_operations ramfs_file_inode_operations = { */ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { - unsigned long npages, xpages, loop, limit; + unsigned long npages, xpages, loop; struct page *pages; unsigned order; void *data; @@ -123,30 +123,6 @@ add_error: /*****************************************************************************/ /* - * check that file shrinkage doesn't leave any VMAs dangling in midair - */ -static int ramfs_nommu_check_mappings(struct inode *inode, - size_t newsize, size_t size) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - - /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - newsize >> PAGE_SHIFT, - (size + PAGE_SIZE - 1) >> PAGE_SHIFT - ) { - /* found one - only interested if it's shared out of the page - * cache */ - if (vma->vm_flags & VM_SHARED) - return -ETXTBSY; /* not quite true, but near enough */ - } - - return 0; -} - -/*****************************************************************************/ -/* * */ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) @@ -164,7 +140,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) /* check that a decrease in size doesn't cut off any shared mappings */ if (newsize < size) { - ret = ramfs_nommu_check_mappings(inode, newsize, size); + ret = nommu_shrink_inode_mappings(inode, size, newsize); if (ret < 0) return ret; } diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile index 6a9e30c041dd..792b3cb2cd18 100644 --- a/fs/reiserfs/Makefile +++ b/fs/reiserfs/Makefile @@ -7,7 +7,11 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ hashes.o tail_conversion.o journal.o resize.o \ - item_ops.o ioctl.o procfs.o xattr.o lock.o + item_ops.o ioctl.o xattr.o lock.o + +ifeq ($(CONFIG_REISERFS_PROC_INFO),y) +reiserfs-objs += procfs.o +endif ifeq ($(CONFIG_REISERFS_FS_XATTR),y) reiserfs-objs += xattr_user.o xattr_trusted.o diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 685495707181..65c872761177 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1277,7 +1277,10 @@ int reiserfs_init_bitmap_cache(struct super_block *sb) struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); + /* Avoid lock recursion in fault case */ + reiserfs_write_unlock(sb); bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); + reiserfs_write_lock(sb); if (bitmap == NULL) return -ENOMEM; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 3a28e7751b3c..9087b10209e6 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -31,11 +31,12 @@ void reiserfs_delete_inode(struct inode *inode) JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; + int depth; int err; truncate_inode_pages(&inode->i_data, 0); - reiserfs_write_lock(inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ @@ -74,7 +75,7 @@ void reiserfs_delete_inode(struct inode *inode) out: clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ inode->i_blocks = 0; - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); } static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, @@ -2538,6 +2539,12 @@ static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) return reiserfs_write_full_page(page, wbc); } +static void reiserfs_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + reiserfs_truncate_file(inode, 0); +} + static int reiserfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -2604,6 +2611,8 @@ static int reiserfs_write_begin(struct file *file, if (ret) { unlock_page(page); page_cache_release(page); + /* Truncate allocated blocks */ + reiserfs_truncate_failed_write(inode); } return ret; } @@ -2701,9 +2710,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, ** transaction tracking stuff when the size changes. So, we have ** to do the i_size updates here. */ - pos += copied; - - if (pos > inode->i_size) { + if (pos + copied > inode->i_size) { struct reiserfs_transaction_handle myth; lock_depth = reiserfs_write_lock_once(inode->i_sb); locked = true; @@ -2721,7 +2728,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, goto journal_error; reiserfs_update_inode_transaction(inode); - inode->i_size = pos; + inode->i_size = pos + copied; /* * this will just nest into our transaction. It's important * to use mark_inode_dirty so the inode gets pushed around on the @@ -2751,6 +2758,10 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, reiserfs_write_unlock_once(inode->i_sb, lock_depth); unlock_page(page); page_cache_release(page); + + if (pos + len > inode->i_size) + reiserfs_truncate_failed_write(inode); + return ret == 0 ? copied : ret; journal_error: @@ -3051,13 +3062,14 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - int error; unsigned int ia_valid; + int depth; + int error; /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - reiserfs_write_lock(inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate @@ -3138,8 +3150,17 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) journal_end(&th, inode->i_sb, jbegin_count); } } - if (!error) + if (!error) { + /* + * Relax the lock here, as it might truncate the + * inode pages and wait for inode pages locks. + * To release such page lock, the owner needs the + * reiserfs lock + */ + reiserfs_write_unlock_once(inode->i_sb, depth); error = inode_setattr(inode, attr); + depth = reiserfs_write_lock_once(inode->i_sb); + } } if (!error && reiserfs_posixacl(inode->i_sb)) { @@ -3148,7 +3169,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) } out: - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); + return error; } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index ace77451ceb1..f53505de0712 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -104,9 +104,10 @@ setflags_out: err = put_user(inode->i_generation, (int __user *)arg); break; case REISERFS_IOC_SETVERSION: - if (!is_owner_or_cap(inode)) + if (!is_owner_or_cap(inode)) { err = -EPERM; break; + } err = mnt_want_write(filp->f_path.mnt); if (err) break; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 2f8a7e7b8dab..ba98546fabbd 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2009,10 +2009,11 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, destroy_workqueue(commit_wq); commit_wq = NULL; } - reiserfs_write_lock(sb); free_journal_ram(sb); + reiserfs_write_lock(sb); + return 0; } @@ -2758,11 +2759,18 @@ int journal_init(struct super_block *sb, const char *j_dev_name, struct reiserfs_journal *journal; struct reiserfs_journal_list *jl; char b[BDEVNAME_SIZE]; + int ret; + /* + * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS + * dependency inversion warnings. + */ + reiserfs_write_unlock(sb); journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); + reiserfs_write_lock(sb); return 1; } memset(journal, 0, sizeof(struct reiserfs_journal)); @@ -2771,10 +2779,12 @@ int journal_init(struct super_block *sb, const char *j_dev_name, INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; - if (reiserfs_allocate_list_bitmaps(sb, - journal->j_list_bitmap, - reiserfs_bmap_count(sb))) + ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb)); + reiserfs_write_lock(sb); + if (ret) goto free_and_return; + allocate_bitmap_nodes(sb); /* reserved for journal area support */ @@ -2903,7 +2913,9 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_mount_id = 10; journal->j_state = 0; atomic_set(&(journal->j_jlock), 0); + reiserfs_write_unlock(sb); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); + reiserfs_write_lock(sb); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; journal->j_cnode_used = 0; diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index ee2cfc0fd8a7..b87aa2c1afc1 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -86,3 +86,12 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller) reiserfs_panic(sb, "%s called without kernel lock held %d", caller); } + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *sb) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n"); +} +#endif diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index e296ff72a6cc..9d4dcf0b07cb 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -921,6 +921,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) struct reiserfs_transaction_handle th; int jbegin_count; unsigned long savelink; + int depth; inode = dentry->d_inode; @@ -932,7 +933,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - reiserfs_write_lock(dir->i_sb); + depth = reiserfs_write_lock_once(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) goto out_unlink; @@ -993,7 +994,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) retval = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_check_path(&path); - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, depth); return retval; end_unlink: @@ -1003,7 +1004,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) if (err) retval = err; out_unlink: - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, depth); return retval; } diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 9229e5514a4e..7a9981196c1c 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -17,8 +17,6 @@ #include <linux/init.h> #include <linux/proc_fs.h> -#ifdef CONFIG_REISERFS_PROC_INFO - /* * LOCKING: * @@ -48,14 +46,6 @@ static int show_version(struct seq_file *m, struct super_block *sb) return 0; } -int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset, - int count, int *eof, void *data) -{ - *start = buffer; - *eof = 1; - return 0; -} - #define SF( x ) ( r -> x ) #define SFP( x ) SF( s_proc_info_data.x ) #define SFPL( x ) SFP( x[ level ] ) @@ -538,19 +528,6 @@ int reiserfs_proc_info_done(struct super_block *sb) return 0; } -struct proc_dir_entry *reiserfs_proc_register_global(char *name, - read_proc_t * func) -{ - return (proc_info_root) ? create_proc_read_entry(name, 0, - proc_info_root, - func, NULL) : NULL; -} - -void reiserfs_proc_unregister_global(const char *name) -{ - remove_proc_entry(name, proc_info_root); -} - int reiserfs_proc_info_global_init(void) { if (proc_info_root == NULL) { @@ -572,48 +549,6 @@ int reiserfs_proc_info_global_done(void) } return 0; } - -/* REISERFS_PROC_INFO */ -#else - -int reiserfs_proc_info_init(struct super_block *sb) -{ - return 0; -} -int reiserfs_proc_info_done(struct super_block *sb) -{ - return 0; -} - -struct proc_dir_entry *reiserfs_proc_register_global(char *name, - read_proc_t * func) -{ - return NULL; -} - -void reiserfs_proc_unregister_global(const char *name) -{; -} - -int reiserfs_proc_info_global_init(void) -{ - return 0; -} -int reiserfs_proc_info_global_done(void) -{ - return 0; -} - -int reiserfs_global_version_in_proc(char *buffer, char **start, - off_t offset, - int count, int *eof, void *data) -{ - return 0; -} - -/* REISERFS_PROC_INFO */ -#endif - /* * Revision 1.1.8.2 2001/07/15 17:08:42 god * . use get_super() in procfs.c diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 339b0baf2af6..b4a7dd03bdb9 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2222,8 +2222,6 @@ static int __init init_reiserfs_fs(void) } reiserfs_proc_info_global_init(); - reiserfs_proc_register_global("version", - reiserfs_global_version_in_proc); ret = register_filesystem(&reiserfs_fs_type); @@ -2231,7 +2229,6 @@ static int __init init_reiserfs_fs(void) return 0; } - reiserfs_proc_unregister_global("version"); reiserfs_proc_info_global_done(); destroy_inodecache(); @@ -2240,7 +2237,6 @@ static int __init init_reiserfs_fs(void) static void __exit exit_reiserfs_fs(void) { - reiserfs_proc_unregister_global("version"); reiserfs_proc_info_global_done(); unregister_filesystem(&reiserfs_fs_type); destroy_inodecache(); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 58aa8e75f7f5..81f09fab8ae4 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -48,6 +48,7 @@ #include <net/checksum.h> #include <linux/stat.h> #include <linux/quotaops.h> +#include <linux/security.h> #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" @@ -82,7 +83,8 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); vfs_dq_init(dir); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); error = dir->i_op->unlink(dir, dentry); mutex_unlock(&dentry->d_inode->i_mutex); @@ -97,7 +99,8 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); vfs_dq_init(dir); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); dentry_unhash(dentry); error = dir->i_op->rmdir(dir, dentry); if (!error) @@ -234,16 +237,22 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) return 0; + reiserfs_write_unlock(inode->i_sb); dir = open_xa_dir(inode, XATTR_REPLACE); if (IS_ERR(dir)) { err = PTR_ERR(dir); + reiserfs_write_lock(inode->i_sb); goto out; } else if (!dir->d_inode) { err = 0; + reiserfs_write_lock(inode->i_sb); goto out_dir; } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + + reiserfs_write_lock(inode->i_sb); + buf.xadir = dir; err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); while ((err == 0 || err == -ENOSPC) && buf.count) { @@ -282,8 +291,9 @@ static int reiserfs_for_each_xattr(struct inode *inode, err = journal_begin(&th, inode->i_sb, blocks); if (!err) { int jerror; - mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, - I_MUTEX_XATTR); + reiserfs_mutex_lock_nested_safe( + &dir->d_parent->d_inode->i_mutex, + I_MUTEX_XATTR, inode->i_sb); err = action(dir, data); jerror = journal_end(&th, inode->i_sb, blocks); mutex_unlock(&dir->d_parent->d_inode->i_mutex); @@ -442,7 +452,9 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) } if (dentry->d_inode) { + reiserfs_write_lock(inode->i_sb); err = xattr_unlink(xadir->d_inode, dentry); + reiserfs_write_unlock(inode->i_sb); update_ctime(inode); } @@ -476,15 +488,24 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - if (!buffer) - return lookup_and_delete_xattr(inode, name); + reiserfs_write_unlock(inode->i_sb); + + if (!buffer) { + err = lookup_and_delete_xattr(inode, name); + reiserfs_write_lock(inode->i_sb); + return err; + } dentry = xattr_lookup(inode, name, flags); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + reiserfs_write_lock(inode->i_sb); return PTR_ERR(dentry); + } down_write(&REISERFS_I(inode)->i_xattr_sem); + reiserfs_write_lock(inode->i_sb); + xahash = xattr_hash(buffer, buffer_size); while (buffer_pos < buffer_size || buffer_pos == 0) { size_t chunk; @@ -539,8 +560,12 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_size = buffer_size, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; + + reiserfs_write_unlock(inode->i_sb); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); down_write(&dentry->d_inode->i_alloc_sem); + reiserfs_write_lock(inode->i_sb); + err = reiserfs_setattr(dentry, &newattrs); up_write(&dentry->d_inode->i_alloc_sem); mutex_unlock(&dentry->d_inode->i_mutex); @@ -726,15 +751,14 @@ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, size_t size) { - struct inode *inode = dentry->d_inode; struct xattr_handler *handler; - handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->get(inode, name, buffer, size); + return handler->get(dentry, name, buffer, size, handler->flags); } /* @@ -746,15 +770,14 @@ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; struct xattr_handler *handler; - handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->set(inode, name, value, size, flags); + return handler->set(dentry, name, value, size, flags, handler->flags); } /* @@ -764,21 +787,20 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; struct xattr_handler *handler; - handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->set(inode, name, NULL, 0, XATTR_REPLACE); + return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); } struct listxattr_buf { size_t size; size_t pos; char *buf; - struct inode *inode; + struct dentry *dentry; }; static int listxattr_filler(void *buf, const char *name, int namelen, @@ -789,17 +811,19 @@ static int listxattr_filler(void *buf, const char *name, int namelen, if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { struct xattr_handler *handler; - handler = find_xattr_handler_prefix(b->inode->i_sb->s_xattr, + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); if (!handler) /* Unsupported xattr name */ return 0; if (b->buf) { - size = handler->list(b->inode, b->buf + b->pos, - b->size, name, namelen); + size = handler->list(b->dentry, b->buf + b->pos, + b->size, name, namelen, + handler->flags); if (size > b->size) return -ERANGE; } else { - size = handler->list(b->inode, NULL, 0, name, namelen); + size = handler->list(b->dentry, NULL, 0, name, + namelen, handler->flags); } b->pos += size; @@ -820,7 +844,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) int err = 0; loff_t pos = 0; struct listxattr_buf buf = { - .inode = dentry->d_inode, + .dentry = dentry, .buf = buffer, .size = buffer ? size : 0, }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 35d6e672a279..dd20a7883f0f 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -15,8 +15,10 @@ static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct posix_acl *acl); static int -xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) +posix_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl; int error, error2; struct reiserfs_transaction_handle th; @@ -60,15 +62,16 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) } static int -xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +posix_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!reiserfs_posixacl(inode->i_sb)) + if (!reiserfs_posixacl(dentry->d_sb)) return -EOPNOTSUPP; - acl = reiserfs_get_acl(inode, type); + acl = reiserfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -452,7 +455,9 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; } + reiserfs_write_unlock(inode->i_sb); acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_write_lock(inode->i_sb); if (!acl) return 0; if (IS_ERR(acl)) @@ -482,30 +487,12 @@ int reiserfs_acl_chmod(struct inode *inode) return error; } -static int -posix_acl_access_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) - return -EINVAL; - return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -posix_acl_access_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) - return -EINVAL; - return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static size_t posix_acl_access_list(struct inode *inode, char *list, +static size_t posix_acl_access_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!reiserfs_posixacl(inode->i_sb)) + if (!reiserfs_posixacl(dentry->d_sb)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -514,35 +501,18 @@ static size_t posix_acl_access_list(struct inode *inode, char *list, struct xattr_handler reiserfs_posix_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, - .get = posix_acl_access_get, - .set = posix_acl_access_set, + .flags = ACL_TYPE_ACCESS, + .get = posix_acl_get, + .set = posix_acl_set, .list = posix_acl_access_list, }; -static int -posix_acl_default_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) - return -EINVAL; - return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -posix_acl_default_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) - return -EINVAL; - return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - -static size_t posix_acl_default_list(struct inode *inode, char *list, +static size_t posix_acl_default_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!reiserfs_posixacl(inode->i_sb)) + if (!reiserfs_posixacl(dentry->d_sb)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -551,7 +521,8 @@ static size_t posix_acl_default_list(struct inode *inode, char *list, struct xattr_handler reiserfs_posix_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, - .get = posix_acl_default_get, - .set = posix_acl_default_set, + .flags = ACL_TYPE_DEFAULT, + .get = posix_acl_get, + .set = posix_acl_set, .list = posix_acl_default_list, }; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index a92c8792c0f6..d8b5bfcbdd30 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -8,36 +8,37 @@ #include <asm/uaccess.h> static int -security_get(struct inode *inode, const char *name, void *buffer, size_t size) +security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) { if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(inode)) + if (IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_get(inode, name, buffer, size); + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); } static int -security_set(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +security_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) { if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(inode)) + if (IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_set(inode, name, buffer, size, flags); + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); } -static size_t security_list(struct inode *inode, char *list, size_t list_len, - const char *name, size_t namelen) +static size_t security_list(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t namelen, int handler_flags) { const size_t len = namelen + 1; - if (IS_PRIVATE(inode)) + if (IS_PRIVATE(dentry->d_inode)) return 0; if (list && len <= list_len) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index a865042f75e2..5b08aaca3daf 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -8,36 +8,37 @@ #include <asm/uaccess.h> static int -trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) +trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) { if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_get(inode, name, buffer, size); + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); } static int -trusted_set(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +trusted_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) { if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_set(inode, name, buffer, size, flags); + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); } -static size_t trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) { const size_t len = name_len + 1; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) return 0; if (list && len <= list_size) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index e3238dc4f3db..75d59c49b911 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -7,34 +7,35 @@ #include <asm/uaccess.h> static int -user_get(struct inode *inode, const char *name, void *buffer, size_t size) +user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) { if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs_user(inode->i_sb)) + if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_get(inode, name, buffer, size); + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); } static int -user_set(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +user_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) { if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs_user(inode->i_sb)) + if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(inode, name, buffer, size, flags); + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); } -static size_t user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) { const size_t len = name_len + 1; - if (!reiserfs_xattrs_user(inode->i_sb)) + if (!reiserfs_xattrs_user(dentry->d_sb)) return 0; if (list && len <= list_size) { memcpy(list, name, name_len); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index c117fa80d1e9..42d213546894 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -544,6 +544,7 @@ error: error_rsb_inval: ret = -EINVAL; error_rsb: + kfree(rsb); return ret; } diff --git a/fs/signalfd.c b/fs/signalfd.c index b07565c94386..1dabe4ee02fe 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -236,7 +236,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, * anon_inode_getfd() will install the fd. */ ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK))); if (ufd < 0) kfree(ctx); } else { diff --git a/fs/stack.c b/fs/stack.c index 67716f6a1a4a..4a6f7f440658 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -7,18 +7,63 @@ * This function cannot be inlined since i_size_{read,write} is rather * heavy-weight on 32-bit systems */ -void fsstack_copy_inode_size(struct inode *dst, const struct inode *src) +void fsstack_copy_inode_size(struct inode *dst, struct inode *src) { - i_size_write(dst, i_size_read((struct inode *)src)); - dst->i_blocks = src->i_blocks; + loff_t i_size; + blkcnt_t i_blocks; + + /* + * i_size_read() includes its own seqlocking and protection from + * preemption (see include/linux/fs.h): we need nothing extra for + * that here, and prefer to avoid nesting locks than attempt to keep + * i_size and i_blocks in sync together. + */ + i_size = i_size_read(src); + + /* + * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to + * keep the two halves of i_blocks in sync despite SMP or PREEMPT - + * though stat's generic_fillattr() doesn't bother, and we won't be + * applying quotas (where i_blocks does become important) at the + * upper level. + * + * We don't actually know what locking is used at the lower level; + * but if it's a filesystem that supports quotas, it will be using + * i_lock as in inode_add_bytes(). tmpfs uses other locking, and + * its 32-bit is (just) able to exceed 2TB i_size with the aid of + * holes; but its i_blocks cannot carry into the upper long without + * almost 2TB swap - let's ignore that case. + */ + if (sizeof(i_blocks) > sizeof(long)) + spin_lock(&src->i_lock); + i_blocks = src->i_blocks; + if (sizeof(i_blocks) > sizeof(long)) + spin_unlock(&src->i_lock); + + /* + * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for + * fsstack_copy_inode_size() to hold some lock around + * i_size_write(), otherwise i_size_read() may spin forever (see + * include/linux/fs.h). We don't necessarily hold i_mutex when this + * is called, so take i_lock for that case. + * + * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the + * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock + * for that case too, and do both at once by combining the tests. + * + * There is none of this locking overhead in the 64-bit case. + */ + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_lock(&dst->i_lock); + i_size_write(dst, i_size); + dst->i_blocks = i_blocks; + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_unlock(&dst->i_lock); } EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); -/* copy all attributes; get_nlinks is optional way to override the i_nlink - * copying - */ -void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, - int (*get_nlinks)(struct inode *)) +/* copy all attributes */ +void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) { dest->i_mode = src->i_mode; dest->i_uid = src->i_uid; @@ -29,14 +74,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, dest->i_ctime = src->i_ctime; dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; - - /* - * Update the nlinks AFTER updating the above fields, because the - * get_links callback may depend on them. - */ - if (!get_nlinks) - dest->i_nlink = src->i_nlink; - else - dest->i_nlink = (*get_nlinks)(dest); + dest->i_nlink = src->i_nlink; } EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); diff --git a/fs/stat.c b/fs/stat.c index 075694e31d8b..c4ecd52c5737 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, } #endif /* __ARCH_WANT_STAT64 */ -void inode_add_bytes(struct inode *inode, loff_t bytes) +/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ +void __inode_add_bytes(struct inode *inode, loff_t bytes) { - spin_lock(&inode->i_lock); inode->i_blocks += bytes >> 9; bytes &= 511; inode->i_bytes += bytes; @@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_blocks++; inode->i_bytes -= 512; } +} + +void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, bytes); spin_unlock(&inode->i_lock); } diff --git a/fs/super.c b/fs/super.c index 19eb70b374bc..aff046b0fe78 100644 --- a/fs/super.c +++ b/fs/super.c @@ -901,8 +901,9 @@ int get_sb_single(struct file_system_type *fs_type, return error; } s->s_flags |= MS_ACTIVE; + } else { + do_remount_sb(s, flags, data, 0); } - do_remount_sb(s, flags, data, 0); simple_set_mnt(mnt, s); return 0; } diff --git a/fs/sync.c b/fs/sync.c index d104591b066b..418727a2a239 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -295,10 +295,11 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) */ int generic_write_sync(struct file *file, loff_t pos, loff_t count) { - if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host)) + if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) return 0; return vfs_fsync_range(file, file->f_path.dentry, pos, - pos + count - 1, 1); + pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); } EXPORT_SYMBOL(generic_write_sync); @@ -354,6 +355,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, { int ret; struct file *file; + struct address_space *mapping; loff_t endbyte; /* inclusive */ int fput_needed; umode_t i_mode; @@ -404,7 +406,28 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, !S_ISLNK(i_mode)) goto out_put; - ret = do_sync_mapping_range(file->f_mapping, offset, endbyte, flags); + mapping = file->f_mapping; + if (!mapping) { + ret = -EINVAL; + goto out_put; + } + + ret = 0; + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { + ret = filemap_fdatawait_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = filemap_fdatawrite_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WAIT_AFTER) + ret = filemap_fdatawait_range(mapping, offset, endbyte); + out_put: fput_light(file, fput_needed); out: @@ -436,42 +459,3 @@ asmlinkage long SyS_sync_file_range2(long fd, long flags, } SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); #endif - -/* - * `endbyte' is inclusive - */ -int do_sync_mapping_range(struct address_space *mapping, loff_t offset, - loff_t endbyte, unsigned int flags) -{ - int ret; - - if (!mapping) { - ret = -EINVAL; - goto out; - } - - ret = 0; - if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { - ret = wait_on_page_writeback_range(mapping, - offset >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); - if (ret < 0) - goto out; - } - - if (flags & SYNC_FILE_RANGE_WRITE) { - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_ALL); - if (ret < 0) - goto out; - } - - if (flags & SYNC_FILE_RANGE_WAIT_AFTER) { - ret = wait_on_page_writeback_range(mapping, - offset >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); - } -out: - return ret; -} -EXPORT_SYMBOL_GPL(do_sync_mapping_range); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 60c702bc10ae..a0a500af24a1 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -483,7 +483,8 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd) * @attr: attribute descriptor. */ -int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { BUG_ON(!kobj || !kobj->sd || !attr); @@ -497,7 +498,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) * @attr: attribute descriptor. */ -void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { sysfs_hash_and_remove(kobj->sd, attr->attr.name); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e0201837d244..699f371b9f12 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -25,7 +25,6 @@ #include "sysfs.h" DEFINE_MUTEX(sysfs_mutex); -DEFINE_MUTEX(sysfs_rename_mutex); DEFINE_SPINLOCK(sysfs_assoc_lock); static DEFINE_SPINLOCK(sysfs_ino_lock); @@ -85,46 +84,6 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) } /** - * sysfs_get_dentry - get dentry for the given sysfs_dirent - * @sd: sysfs_dirent of interest - * - * Get dentry for @sd. Dentry is looked up if currently not - * present. This function descends from the root looking up - * dentry for each step. - * - * LOCKING: - * mutex_lock(sysfs_rename_mutex) - * - * RETURNS: - * Pointer to found dentry on success, ERR_PTR() value on error. - */ -struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd) -{ - struct dentry *dentry = dget(sysfs_sb->s_root); - - while (dentry->d_fsdata != sd) { - struct sysfs_dirent *cur; - struct dentry *parent; - - /* find the first ancestor which hasn't been looked up */ - cur = sd; - while (cur->s_parent != dentry->d_fsdata) - cur = cur->s_parent; - - /* look it up */ - parent = dentry; - mutex_lock(&parent->d_inode->i_mutex); - dentry = lookup_one_noperm(cur->s_name, parent); - mutex_unlock(&parent->d_inode->i_mutex); - dput(parent); - - if (IS_ERR(dentry)) - break; - } - return dentry; -} - -/** * sysfs_get_active - get an active reference to sysfs_dirent * @sd: sysfs_dirent to get an active reference to * @@ -147,8 +106,10 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) return NULL; t = atomic_cmpxchg(&sd->s_active, v, v + 1); - if (likely(t == v)) + if (likely(t == v)) { + rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); return sd; + } if (t < 0) return NULL; @@ -171,6 +132,7 @@ static void sysfs_put_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return; + rwsem_release(&sd->dep_map, 1, _RET_IP_); v = atomic_dec_return(&sd->s_active); if (likely(v != SD_DEACTIVATED_BIAS)) return; @@ -235,15 +197,21 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); sd->s_sibling = (void *)&wait; + rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see * the updated sd->s_sibling. */ v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); - if (v != SD_DEACTIVATED_BIAS) + if (v != SD_DEACTIVATED_BIAS) { + lock_contended(&sd->dep_map, _RET_IP_); wait_for_completion(&wait); + } sd->s_sibling = NULL; + + lock_acquired(&sd->dep_map, _RET_IP_); + rwsem_release(&sd->dep_map, 1, _RET_IP_); } static int sysfs_alloc_ino(ino_t *pino) @@ -298,7 +266,61 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) goto repeat; } -static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) +static int sysfs_dentry_delete(struct dentry *dentry) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + return !!(sd->s_flags & SYSFS_FLAG_REMOVED); +} + +static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + int is_dir; + + mutex_lock(&sysfs_mutex); + + /* The sysfs dirent has been deleted */ + if (sd->s_flags & SYSFS_FLAG_REMOVED) + goto out_bad; + + /* The sysfs dirent has been moved? */ + if (dentry->d_parent->d_fsdata != sd->s_parent) + goto out_bad; + + /* The sysfs dirent has been renamed */ + if (strcmp(dentry->d_name.name, sd->s_name) != 0) + goto out_bad; + + mutex_unlock(&sysfs_mutex); +out_valid: + return 1; +out_bad: + /* Remove the dentry from the dcache hashes. + * If this is a deleted dentry we use d_drop instead of d_delete + * so sysfs doesn't need to cope with negative dentries. + * + * If this is a dentry that has simply been renamed we + * use d_drop to remove it from the dcache lookup on its + * old parent. If this dentry persists later when a lookup + * is performed at its new name the dentry will be readded + * to the dcache hashes. + */ + is_dir = (sysfs_type(sd) == SYSFS_DIR); + mutex_unlock(&sysfs_mutex); + if (is_dir) { + /* If we have submounts we must allow the vfs caches + * to lie about the state of the filesystem to prevent + * leaks and other nasty things. + */ + if (have_submounts(dentry)) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + return 0; +} + +static void sysfs_dentry_iput(struct dentry *dentry, struct inode *inode) { struct sysfs_dirent * sd = dentry->d_fsdata; @@ -307,7 +329,9 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) } static const struct dentry_operations sysfs_dentry_ops = { - .d_iput = sysfs_d_iput, + .d_revalidate = sysfs_dentry_revalidate, + .d_delete = sysfs_dentry_delete, + .d_iput = sysfs_dentry_iput, }; struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) @@ -330,6 +354,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) atomic_set(&sd->s_count, 1); atomic_set(&sd->s_active, 0); + sysfs_dirent_init_lockdep(sd); sd->s_name = name; sd->s_mode = mode; @@ -344,12 +369,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) return NULL; } -static int sysfs_ilookup_test(struct inode *inode, void *arg) -{ - struct sysfs_dirent *sd = arg; - return inode->i_ino == sd->s_ino; -} - /** * sysfs_addrm_start - prepare for sysfs_dirent add/remove * @acxt: pointer to sysfs_addrm_cxt to be used @@ -357,47 +376,20 @@ static int sysfs_ilookup_test(struct inode *inode, void *arg) * * This function is called when the caller is about to add or * remove sysfs_dirent under @parent_sd. This function acquires - * sysfs_mutex, grabs inode for @parent_sd if available and lock - * i_mutex of it. @acxt is used to keep and pass context to + * sysfs_mutex. @acxt is used to keep and pass context to * other addrm functions. * * LOCKING: * Kernel thread context (may sleep). sysfs_mutex is locked on - * return. i_mutex of parent inode is locked on return if - * available. + * return. */ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *parent_sd) { - struct inode *inode; - memset(acxt, 0, sizeof(*acxt)); acxt->parent_sd = parent_sd; - /* Lookup parent inode. inode initialization is protected by - * sysfs_mutex, so inode existence can be determined by - * looking up inode while holding sysfs_mutex. - */ mutex_lock(&sysfs_mutex); - - inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, - parent_sd); - if (inode) { - WARN_ON(inode->i_state & I_NEW); - - /* parent inode available */ - acxt->parent_inode = inode; - - /* sysfs_mutex is below i_mutex in lock hierarchy. - * First, trylock i_mutex. If fails, unlock - * sysfs_mutex and lock them in order. - */ - if (!mutex_trylock(&inode->i_mutex)) { - mutex_unlock(&sysfs_mutex); - mutex_lock(&inode->i_mutex); - mutex_lock(&sysfs_mutex); - } - } } /** @@ -422,18 +414,22 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, */ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { + struct sysfs_inode_attrs *ps_iattr; + if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) return -EEXIST; sd->s_parent = sysfs_get(acxt->parent_sd); - if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode) - inc_nlink(acxt->parent_inode); - - acxt->cnt++; - sysfs_link_sibling(sd); + /* Update timestamps on the parent */ + ps_iattr = acxt->parent_sd->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + return 0; } @@ -512,70 +508,22 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) */ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { + struct sysfs_inode_attrs *ps_iattr; + BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED); sysfs_unlink_sibling(sd); + /* Update timestamps on the parent */ + ps_iattr = acxt->parent_sd->s_iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + sd->s_flags |= SYSFS_FLAG_REMOVED; sd->s_sibling = acxt->removed; acxt->removed = sd; - - if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode) - drop_nlink(acxt->parent_inode); - - acxt->cnt++; -} - -/** - * sysfs_drop_dentry - drop dentry for the specified sysfs_dirent - * @sd: target sysfs_dirent - * - * Drop dentry for @sd. @sd must have been unlinked from its - * parent on entry to this function such that it can't be looked - * up anymore. - */ -static void sysfs_drop_dentry(struct sysfs_dirent *sd) -{ - struct inode *inode; - struct dentry *dentry; - - inode = ilookup(sysfs_sb, sd->s_ino); - if (!inode) - return; - - /* Drop any existing dentries associated with sd. - * - * For the dentry to be properly freed we need to grab a - * reference to the dentry under the dcache lock, unhash it, - * and then put it. The playing with the dentry count allows - * dput to immediately free the dentry if it is not in use. - */ -repeat: - spin_lock(&dcache_lock); - list_for_each_entry(dentry, &inode->i_dentry, d_alias) { - if (d_unhashed(dentry)) - continue; - dget_locked(dentry); - spin_lock(&dentry->d_lock); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - dput(dentry); - goto repeat; - } - spin_unlock(&dcache_lock); - - /* adjust nlink and update timestamp */ - mutex_lock(&inode->i_mutex); - - inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); - if (sysfs_type(sd) == SYSFS_DIR) - drop_nlink(inode); - - mutex_unlock(&inode->i_mutex); - - iput(inode); } /** @@ -584,25 +532,15 @@ repeat: * * Finish up sysfs_dirent add/remove. Resources acquired by * sysfs_addrm_start() are released and removed sysfs_dirents are - * cleaned up. Timestamps on the parent inode are updated. + * cleaned up. * * LOCKING: - * All mutexes acquired by sysfs_addrm_start() are released. + * sysfs_mutex is released. */ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) { /* release resources acquired by sysfs_addrm_start() */ mutex_unlock(&sysfs_mutex); - if (acxt->parent_inode) { - struct inode *inode = acxt->parent_inode; - - /* if added/removed, update timestamps on the parent */ - if (acxt->cnt) - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - - mutex_unlock(&inode->i_mutex); - iput(inode); - } /* kill removed sysfs_dirents */ while (acxt->removed) { @@ -611,7 +549,6 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) acxt->removed = sd->s_sibling; sd->s_sibling = NULL; - sysfs_drop_dentry(sd); sysfs_deactivate(sd); unmap_bin_file(sd); sysfs_put(sd); @@ -751,10 +688,15 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, } /* instantiate and hash dentry */ - dentry->d_op = &sysfs_dentry_ops; - dentry->d_fsdata = sysfs_get(sd); - d_instantiate(dentry, inode); - d_rehash(dentry); + ret = d_find_alias(inode); + if (!ret) { + dentry->d_op = &sysfs_dentry_ops; + dentry->d_fsdata = sysfs_get(sd); + d_add(dentry, inode); + } else { + d_move(ret, dentry); + iput(inode); + } out_unlock: mutex_unlock(&sysfs_mutex); @@ -763,7 +705,9 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, const struct inode_operations sysfs_dir_inode_operations = { .lookup = sysfs_lookup, + .permission = sysfs_permission, .setattr = sysfs_setattr, + .getattr = sysfs_getattr, .setxattr = sysfs_setxattr, }; @@ -826,141 +770,65 @@ void sysfs_remove_dir(struct kobject * kobj) __sysfs_remove_dir(sd); } -int sysfs_rename_dir(struct kobject * kobj, const char *new_name) +int sysfs_rename(struct sysfs_dirent *sd, + struct sysfs_dirent *new_parent_sd, const char *new_name) { - struct sysfs_dirent *sd = kobj->sd; - struct dentry *parent = NULL; - struct dentry *old_dentry = NULL, *new_dentry = NULL; const char *dup_name = NULL; int error; - mutex_lock(&sysfs_rename_mutex); + mutex_lock(&sysfs_mutex); error = 0; - if (strcmp(sd->s_name, new_name) == 0) + if ((sd->s_parent == new_parent_sd) && + (strcmp(sd->s_name, new_name) == 0)) goto out; /* nothing to rename */ - /* get the original dentry */ - old_dentry = sysfs_get_dentry(sd); - if (IS_ERR(old_dentry)) { - error = PTR_ERR(old_dentry); - old_dentry = NULL; - goto out; - } - - parent = old_dentry->d_parent; - - /* lock parent and get dentry for new name */ - mutex_lock(&parent->d_inode->i_mutex); - mutex_lock(&sysfs_mutex); - error = -EEXIST; - if (sysfs_find_dirent(sd->s_parent, new_name)) - goto out_unlock; - - error = -ENOMEM; - new_dentry = d_alloc_name(parent, new_name); - if (!new_dentry) - goto out_unlock; + if (sysfs_find_dirent(new_parent_sd, new_name)) + goto out; /* rename sysfs_dirent */ - error = -ENOMEM; - new_name = dup_name = kstrdup(new_name, GFP_KERNEL); - if (!new_name) - goto out_unlock; - - dup_name = sd->s_name; - sd->s_name = new_name; + if (strcmp(sd->s_name, new_name) != 0) { + error = -ENOMEM; + new_name = dup_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out; + + dup_name = sd->s_name; + sd->s_name = new_name; + } - /* rename */ - d_add(new_dentry, NULL); - d_move(old_dentry, new_dentry); + /* Remove from old parent's list and insert into new parent's list. */ + if (sd->s_parent != new_parent_sd) { + sysfs_unlink_sibling(sd); + sysfs_get(new_parent_sd); + sysfs_put(sd->s_parent); + sd->s_parent = new_parent_sd; + sysfs_link_sibling(sd); + } error = 0; - out_unlock: + out: mutex_unlock(&sysfs_mutex); - mutex_unlock(&parent->d_inode->i_mutex); kfree(dup_name); - dput(old_dentry); - dput(new_dentry); - out: - mutex_unlock(&sysfs_rename_mutex); return error; } +int sysfs_rename_dir(struct kobject *kobj, const char *new_name) +{ + return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name); +} + int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) { struct sysfs_dirent *sd = kobj->sd; struct sysfs_dirent *new_parent_sd; - struct dentry *old_parent, *new_parent = NULL; - struct dentry *old_dentry = NULL, *new_dentry = NULL; - int error; - mutex_lock(&sysfs_rename_mutex); BUG_ON(!sd->s_parent); - new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ? + new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; - error = 0; - if (sd->s_parent == new_parent_sd) - goto out; /* nothing to move */ - - /* get dentries */ - old_dentry = sysfs_get_dentry(sd); - if (IS_ERR(old_dentry)) { - error = PTR_ERR(old_dentry); - old_dentry = NULL; - goto out; - } - old_parent = old_dentry->d_parent; - - new_parent = sysfs_get_dentry(new_parent_sd); - if (IS_ERR(new_parent)) { - error = PTR_ERR(new_parent); - new_parent = NULL; - goto out; - } - -again: - mutex_lock(&old_parent->d_inode->i_mutex); - if (!mutex_trylock(&new_parent->d_inode->i_mutex)) { - mutex_unlock(&old_parent->d_inode->i_mutex); - goto again; - } - mutex_lock(&sysfs_mutex); - - error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, sd->s_name)) - goto out_unlock; - - error = -ENOMEM; - new_dentry = d_alloc_name(new_parent, sd->s_name); - if (!new_dentry) - goto out_unlock; - - error = 0; - d_add(new_dentry, NULL); - d_move(old_dentry, new_dentry); - - /* Remove from old parent's list and insert into new parent's list. */ - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - drop_nlink(old_parent->d_inode); - sysfs_put(sd->s_parent); - sd->s_parent = new_parent_sd; - inc_nlink(new_parent->d_inode); - sysfs_link_sibling(sd); - - out_unlock: - mutex_unlock(&sysfs_mutex); - mutex_unlock(&new_parent->d_inode->i_mutex); - mutex_unlock(&old_parent->d_inode->i_mutex); - out: - dput(new_parent); - dput(old_dentry); - dput(new_dentry); - mutex_unlock(&sysfs_rename_mutex); - return error; + return sysfs_rename(sd, new_parent_sd, sd->s_name); } /* Relationship between s_mode and the DT_xxx types */ diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f5ea4680f15f..dc30d9e31683 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -579,46 +579,23 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); */ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) { - struct sysfs_dirent *victim_sd = NULL; - struct dentry *victim = NULL; - struct inode * inode; + struct sysfs_dirent *sd; struct iattr newattrs; int rc; - rc = -ENOENT; - victim_sd = sysfs_get_dirent(kobj->sd, attr->name); - if (!victim_sd) - goto out; + mutex_lock(&sysfs_mutex); - mutex_lock(&sysfs_rename_mutex); - victim = sysfs_get_dentry(victim_sd); - mutex_unlock(&sysfs_rename_mutex); - if (IS_ERR(victim)) { - rc = PTR_ERR(victim); - victim = NULL; + rc = -ENOENT; + sd = sysfs_find_dirent(kobj->sd, attr->name); + if (!sd) goto out; - } - - inode = victim->d_inode; - - mutex_lock(&inode->i_mutex); - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - newattrs.ia_ctime = current_fs_time(inode->i_sb); - rc = sysfs_setattr(victim, &newattrs); + newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE; + rc = sysfs_sd_setattr(sd, &newattrs); - if (rc == 0) { - fsnotify_change(victim, newattrs.ia_valid); - mutex_lock(&sysfs_mutex); - victim_sd->s_mode = newattrs.ia_mode; - mutex_unlock(&sysfs_mutex); - } - - mutex_unlock(&inode->i_mutex); out: - dput(victim); - sysfs_put(victim_sd); + mutex_unlock(&sysfs_mutex); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index e28cecf179f5..220b758523ae 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -37,7 +37,9 @@ static struct backing_dev_info sysfs_backing_dev_info = { }; static const struct inode_operations sysfs_inode_operations ={ + .permission = sysfs_permission, .setattr = sysfs_setattr, + .getattr = sysfs_getattr, .setxattr = sysfs_setxattr, }; @@ -46,7 +48,7 @@ int __init sysfs_inode_init(void) return bdi_init(&sysfs_backing_dev_info); } -struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) { struct sysfs_inode_attrs *attrs; struct iattr *iattrs; @@ -64,30 +66,15 @@ struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) return attrs; } -int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) + +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr) { - struct inode * inode = dentry->d_inode; - struct sysfs_dirent * sd = dentry->d_fsdata; struct sysfs_inode_attrs *sd_attrs; struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; - int error; - - if (!sd) - return -EINVAL; sd_attrs = sd->s_iattr; - error = inode_change_ok(inode, iattr); - if (error) - return error; - - iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ - - error = inode_setattr(inode, iattr); - if (error) - return error; - if (!sd_attrs) { /* setting attributes for the first time, allocate now */ sd_attrs = sysfs_init_inode_attrs(sd); @@ -103,42 +90,78 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) if (ia_valid & ATTR_GID) iattrs->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); + iattrs->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); + iattrs->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); + iattrs->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; iattrs->ia_mode = sd->s_mode = mode; } } + return 0; +} + +int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct sysfs_dirent *sd = dentry->d_fsdata; + int error; + + if (!sd) + return -EINVAL; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ + + error = inode_setattr(inode, iattr); + if (error) + return error; + + mutex_lock(&sysfs_mutex); + error = sysfs_sd_setattr(sd, iattr); + mutex_unlock(&sysfs_mutex); + return error; } +static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *secdata_len) +{ + struct sysfs_inode_attrs *iattrs; + void *old_secdata; + size_t old_secdata_len; + + iattrs = sd->s_iattr; + if (!iattrs) + iattrs = sysfs_init_inode_attrs(sd); + if (!iattrs) + return -ENOMEM; + + old_secdata = iattrs->ia_secdata; + old_secdata_len = iattrs->ia_secdata_len; + + iattrs->ia_secdata = *secdata; + iattrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct sysfs_dirent *sd = dentry->d_fsdata; - struct sysfs_inode_attrs *iattrs; void *secdata; int error; u32 secdata_len = 0; if (!sd) return -EINVAL; - if (!sd->s_iattr) - sd->s_iattr = sysfs_init_inode_attrs(sd); - if (!sd->s_iattr) - return -ENOMEM; - - iattrs = sd->s_iattr; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; @@ -150,12 +173,13 @@ int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, &secdata, &secdata_len); if (error) goto out; - if (iattrs->ia_secdata) - security_release_secctx(iattrs->ia_secdata, - iattrs->ia_secdata_len); - iattrs->ia_secdata = secdata; - iattrs->ia_secdata_len = secdata_len; + mutex_lock(&sysfs_mutex); + error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); + mutex_unlock(&sysfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); } else return -EINVAL; out: @@ -170,7 +194,6 @@ static inline void set_default_inode_attr(struct inode * inode, mode_t mode) static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) { - inode->i_mode = iattr->ia_mode; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; inode->i_atime = iattr->ia_atime; @@ -178,17 +201,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } - -/* - * sysfs has a different i_mutex lock order behavior for i_mutex than other - * filesystems; sysfs i_mutex is called in many places with subsystem locks - * held. At the same time, many of the VFS locking rules do not apply to - * sysfs at all (cross directory rename for example). To untangle this mess - * (which gives false positives in lockdep), we're giving sysfs inodes their - * own class for i_mutex. - */ -static struct lock_class_key sysfs_inode_imutex_key; - static int sysfs_count_nlink(struct sysfs_dirent *sd) { struct sysfs_dirent *child; @@ -201,38 +213,55 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd) return nr + 2; } +static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) +{ + struct sysfs_inode_attrs *iattrs = sd->s_iattr; + + inode->i_mode = sd->s_mode; + if (iattrs) { + /* sysfs_dirent has non-default attributes + * get them from persistent copy in sysfs_dirent + */ + set_inode_attr(inode, &iattrs->ia_iattr); + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); + } + + if (sysfs_type(sd) == SYSFS_DIR) + inode->i_nlink = sysfs_count_nlink(sd); +} + +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct inode *inode = dentry->d_inode; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; - struct sysfs_inode_attrs *iattrs; inode->i_private = sysfs_get(sd); inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; - inode->i_ino = sd->s_ino; - lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); - iattrs = sd->s_iattr; - if (iattrs) { - /* sysfs_dirent has non-default attributes - * get them for the new inode from persistent copy - * in sysfs_dirent - */ - set_inode_attr(inode, &iattrs->ia_iattr); - if (iattrs->ia_secdata) - security_inode_notifysecctx(inode, - iattrs->ia_secdata, - iattrs->ia_secdata_len); - } else - set_default_inode_attr(inode, sd->s_mode); + set_default_inode_attr(inode, sd->s_mode); + sysfs_refresh_inode(sd, inode); /* initialize inode according to type */ switch (sysfs_type(sd)) { case SYSFS_DIR: inode->i_op = &sysfs_dir_inode_operations; inode->i_fop = &sysfs_dir_operations; - inode->i_nlink = sysfs_count_nlink(sd); break; case SYSFS_KOBJ_ATTR: inode->i_size = PAGE_SIZE; @@ -315,3 +344,14 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) else return -ENOENT; } + +int sysfs_permission(struct inode *inode, int mask) +{ + struct sysfs_dirent *sd = inode->i_private; + + mutex_lock(&sysfs_mutex); + sysfs_refresh_inode(sd, inode); + mutex_unlock(&sysfs_mutex); + + return generic_permission(inode, mask, NULL); +} diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c5081ad77026..c5eff49fa41b 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -210,10 +210,13 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co } const struct inode_operations sysfs_symlink_inode_operations = { - .setxattr = sysfs_setxattr, - .readlink = generic_readlink, - .follow_link = sysfs_follow_link, - .put_link = sysfs_put_link, + .setxattr = sysfs_setxattr, + .readlink = generic_readlink, + .follow_link = sysfs_follow_link, + .put_link = sysfs_put_link, + .setattr = sysfs_setattr, + .getattr = sysfs_getattr, + .permission = sysfs_permission, }; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index af4c4e7482ac..cdd9377a6e06 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,6 +8,7 @@ * This file is released under the GPLv2. */ +#include <linux/lockdep.h> #include <linux/fs.h> struct sysfs_open_dirent; @@ -50,6 +51,9 @@ struct sysfs_inode_attrs { struct sysfs_dirent { atomic_t s_count; atomic_t s_active; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif struct sysfs_dirent *s_parent; struct sysfs_dirent *s_sibling; const char *s_name; @@ -84,14 +88,23 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd) return sd->s_flags & SYSFS_TYPE_MASK; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define sysfs_dirent_init_lockdep(sd) \ +do { \ + static struct lock_class_key __key; \ + \ + lockdep_init_map(&sd->dep_map, "s_active", &__key, 0); \ +} while(0) +#else +#define sysfs_dirent_init_lockdep(sd) do {} while(0) +#endif + /* * Context structure to be used while adding/removing nodes. */ struct sysfs_addrm_cxt { struct sysfs_dirent *parent_sd; - struct inode *parent_inode; struct sysfs_dirent *removed; - int cnt; }; /* @@ -105,7 +118,6 @@ extern struct kmem_cache *sysfs_dir_cachep; * dir.c */ extern struct mutex sysfs_mutex; -extern struct mutex sysfs_rename_mutex; extern spinlock_t sysfs_assoc_lock; extern const struct file_operations sysfs_dir_operations; @@ -133,6 +145,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); void sysfs_remove_subdir(struct sysfs_dirent *sd); +int sysfs_rename(struct sysfs_dirent *sd, + struct sysfs_dirent *new_parent_sd, const char *new_name); + static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { if (sd) { @@ -155,7 +170,10 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) */ struct inode *sysfs_get_inode(struct sysfs_dirent *sd); void sysfs_delete_inode(struct inode *inode); +int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); +int sysfs_permission(struct inode *inode, int mask); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); diff --git a/fs/timerfd.c b/fs/timerfd.c index b042bd7034b1..1bfc95ad5f71 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -200,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & TFD_SHARED_FCNTL_FLAGS); + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); if (ufd < 0) kfree(ctx); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index dbc093afd946..90492327b383 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -350,13 +350,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) le32_to_cpu(sup->fmt_version)); printk(KERN_DEBUG "\ttime_gran %u\n", le32_to_cpu(sup->time_gran)); - printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X" - "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", - sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3], - sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7], - sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11], - sup->uuid[12], sup->uuid[13], sup->uuid[14], - sup->uuid[15]); + printk(KERN_DEBUG "\tUUID %pUB\n", + sup->uuid); break; } case UBIFS_MST_NODE: @@ -2014,7 +2009,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, inum = key_inum_flash(c, &dent->key); fscki1 = read_add_inode(c, priv, inum); if (IS_ERR(fscki1)) { - err = PTR_ERR(fscki); + err = PTR_ERR(fscki1); ubifs_err("error %d while processing entry node and " "trying to find parent inode node %lu", err, (unsigned long)inum); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 1009adc8d602..16a6444330ec 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -45,7 +45,7 @@ * * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> - * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not + * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not * set as well. However, UBIFS disables readahead. */ @@ -1389,7 +1389,6 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { int err; - ssize_t ret; struct inode *inode = iocb->ki_filp->f_mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1397,17 +1396,7 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) return err; - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (ret < 0) - return ret; - - if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) { - err = ubifs_sync_wbufs_by_inode(c, inode); - if (err) - return err; - } - - return ret; + return generic_file_aio_write(iocb, iov, nr_segs, pos); } static int ubifs_set_page_dirty(struct page *page) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 618c2701d3a7..e5a3d8e96bb7 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -54,6 +54,7 @@ */ #include <linux/pagemap.h> +#include <linux/list_sort.h> #include "ubifs.h" /* @@ -108,101 +109,6 @@ static int switch_gc_head(struct ubifs_info *c) } /** - * list_sort - sort a list. - * @priv: private data, passed to @cmp - * @head: the list to sort - * @cmp: the elements comparison function - * - * This function has been implemented by Mark J Roberts <mjr@znex.org>. It - * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted - * in ascending order. - * - * The comparison function @cmp is supposed to return a negative value if @a is - * than @b, and a positive value if @a is greater than @b. If @a and @b are - * equivalent, then it does not matter what this function returns. - */ -static void list_sort(void *priv, struct list_head *head, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b)) -{ - struct list_head *p, *q, *e, *list, *tail, *oldhead; - int insize, nmerges, psize, qsize, i; - - if (list_empty(head)) - return; - - list = head->next; - list_del(head); - insize = 1; - for (;;) { - p = oldhead = list; - list = tail = NULL; - nmerges = 0; - - while (p) { - nmerges++; - q = p; - psize = 0; - for (i = 0; i < insize; i++) { - psize++; - q = q->next == oldhead ? NULL : q->next; - if (!q) - break; - } - - qsize = insize; - while (psize > 0 || (qsize > 0 && q)) { - if (!psize) { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } else if (!qsize || !q) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else if (cmp(priv, p, q) <= 0) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } - if (tail) - tail->next = e; - else - list = e; - e->prev = tail; - tail = e; - } - p = q; - } - - tail->next = list; - list->prev = tail; - - if (nmerges <= 1) - break; - - insize *= 2; - } - - head->next = list; - head->prev = list->prev; - list->prev->next = head; - list->prev = head; -} - -/** * data_nodes_cmp - compare 2 data nodes. * @priv: UBIFS file-system description object * @a: first data node diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 333e181ee987..43f9d19a6f33 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1393,12 +1393,7 @@ static int mount_ubifs(struct ubifs_info *c) c->leb_size, c->leb_size >> 10); dbg_msg("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); - dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" - "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X", - c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3], - c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], - c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], - c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); + dbg_msg("UUID: %pUB", c->uuid); dbg_msg("big_lpt %d", c->big_lpt); dbg_msg("log LEBs: %d (%d - %d)", c->log_lebs, UBIFS_LOG_LNUM, c->log_last); @@ -1842,22 +1837,32 @@ const struct super_operations ubifs_super_operations = { * @name: UBI volume name * @mode: UBI volume open mode * - * There are several ways to specify UBI volumes when mounting UBIFS: - * o ubiX_Y - UBI device number X, volume Y; - * o ubiY - UBI device number 0, volume Y; + * The primary method of mounting UBIFS is by specifying the UBI volume + * character device node path. However, UBIFS may also be mounted withoug any + * character device node using one of the following methods: + * + * o ubiX_Y - mount UBI device number X, volume Y; + * o ubiY - mount UBI device number 0, volume Y; * o ubiX:NAME - mount UBI device X, volume with name NAME; * o ubi:NAME - mount UBI device 0, volume with name NAME. * * Alternative '!' separator may be used instead of ':' (because some shells * like busybox may interpret ':' as an NFS host name separator). This function - * returns ubi volume object in case of success and a negative error code in - * case of failure. + * returns UBI volume description object in case of success and a negative + * error code in case of failure. */ static struct ubi_volume_desc *open_ubi(const char *name, int mode) { + struct ubi_volume_desc *ubi; int dev, vol; char *endptr; + /* First, try to open using the device node path method */ + ubi = ubi_open_volume_path(name, mode); + if (!IS_ERR(ubi)) + return ubi; + + /* Try the "nodev" method */ if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') return ERR_PTR(-EINVAL); diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 1e068535b58b..82372e332f08 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -440,7 +440,7 @@ static void udf_table_free_blocks(struct super_block *sb, (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", - bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count, + bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, partmap->s_partition_len); goto error_return; } diff --git a/fs/udf/file.c b/fs/udf/file.c index b80cbd78833c..f311d509b6a3 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -196,6 +196,7 @@ static int udf_release_file(struct inode *inode, struct file *filp) mutex_lock(&inode->i_mutex); lock_kernel(); udf_discard_prealloc(inode); + udf_truncate_tail_extent(inode); unlock_kernel(); mutex_unlock(&inode->i_mutex); } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 6d24c2c63f93..f90231eb2916 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -97,15 +97,17 @@ no_delete: */ void udf_clear_inode(struct inode *inode) { - struct udf_inode_info *iinfo; - if (!(inode->i_sb->s_flags & MS_RDONLY)) { - lock_kernel(); - udf_truncate_tail_extent(inode); - unlock_kernel(); - write_inode_now(inode, 0); - invalidate_inode_buffers(inode); + struct udf_inode_info *iinfo = UDF_I(inode); + + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && + inode->i_size != iinfo->i_lenExtents) { + printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " + "inode size %llu different from extent lenght %llu. " + "Filesystem need not be standards compliant.\n", + inode->i_sb->s_id, inode->i_ino, inode->i_mode, + (unsigned long long)inode->i_size, + (unsigned long long)iinfo->i_lenExtents); } - iinfo = UDF_I(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } @@ -198,7 +200,6 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, int newblock; struct buffer_head *dbh = NULL; struct kernel_lb_addr eloc; - uint32_t elen; uint8_t alloctype; struct extent_position epos; @@ -273,12 +274,11 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, eloc.logicalBlockNum = *block; eloc.partitionReferenceNum = iinfo->i_location.partitionReferenceNum; - elen = inode->i_sb->s_blocksize; - iinfo->i_lenExtents = elen; + iinfo->i_lenExtents = inode->i_size; epos.bh = NULL; epos.block = iinfo->i_location; epos.offset = udf_file_entry_alloc_offset(inode); - udf_add_aext(inode, &epos, &eloc, elen, 0); + udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); /* UniqueID stuff */ brelse(epos.bh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 21dad8c608f9..cd2115060fdc 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -408,15 +408,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, } add: - /* Is there any extent whose size we need to round up? */ - if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) { - elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); - if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - epos.offset -= sizeof(struct short_ad); - else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - epos.offset -= sizeof(struct long_ad); - udf_write_aext(dir, &epos, &eloc, elen, 1); - } f_pos += nfidlen; if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && @@ -439,6 +430,7 @@ add: udf_current_aext(dir, &epos, &eloc, &elen, 1); } + /* Entry fits into current block? */ if (sb->s_blocksize - fibh->eoffset >= nfidlen) { fibh->soffset = fibh->eoffset; fibh->eoffset += nfidlen; @@ -462,6 +454,16 @@ add: (fibh->sbh->b_data + fibh->soffset); } } else { + /* Round up last extent in the file */ + elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = (dinfo->i_lenExtents + sb->s_blocksize + - 1) & ~(sb->s_blocksize - 1); + fibh->soffset = fibh->eoffset - sb->s_blocksize; fibh->eoffset += nfidlen - sb->s_blocksize; if (fibh->sbh != fibh->ebh) { @@ -508,6 +510,20 @@ add: dir->i_size += nfidlen; if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) dinfo->i_lenAlloc += nfidlen; + else { + /* Find the last extent and truncate it to proper size */ + while (udf_next_aext(dir, &epos, &eloc, &elen, 1) == + (EXT_RECORDED_ALLOCATED >> 30)) + ; + elen -= dinfo->i_lenExtents - dir->i_size; + if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + epos.offset -= sizeof(struct short_ad); + else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + epos.offset -= sizeof(struct long_ad); + udf_write_aext(dir, &epos, &eloc, elen, 1); + dinfo->i_lenExtents = dir->i_size; + } + mark_inode_dirty(dir); goto out_ok; } else { @@ -922,7 +938,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, block = udf_get_pblock(inode->i_sb, block, iinfo->i_location.partitionReferenceNum, 0); - epos.bh = udf_tread(inode->i_sb, block); + epos.bh = udf_tgetblk(inode->i_sb, block); lock_buffer(epos.bh); memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize); set_buffer_uptodate(epos.bh); @@ -999,6 +1015,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, inode->i_size = elen; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) iinfo->i_lenAlloc = inode->i_size; + else + udf_truncate_tail_extent(inode); mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); diff --git a/fs/udf/super.c b/fs/udf/super.c index 9d1b8c2e6c45..1e4543cbcd27 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1078,21 +1078,39 @@ static int udf_fill_partdesc_info(struct super_block *sb, return 0; } -static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +static void udf_find_vat_block(struct super_block *sb, int p_index, + int type1_index, sector_t start_block) { struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map = &sbi->s_partmaps[p_index]; + sector_t vat_block; struct kernel_lb_addr ino; + + /* + * VAT file entry is in the last recorded block. Some broken disks have + * it a few blocks before so try a bit harder... + */ + ino.partitionReferenceNum = type1_index; + for (vat_block = start_block; + vat_block >= map->s_partition_root && + vat_block >= start_block - 3 && + !sbi->s_vat_inode; vat_block--) { + ino.logicalBlockNum = vat_block - map->s_partition_root; + sbi->s_vat_inode = udf_iget(sb, &ino); + } +} + +static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) +{ + struct udf_sb_info *sbi = UDF_SB(sb); + struct udf_part_map *map = &sbi->s_partmaps[p_index]; struct buffer_head *bh = NULL; struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; - /* VAT file entry is in the last recorded block */ - ino.partitionReferenceNum = type1_index; - ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, &ino); + udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && sbi->s_last_block != blocks - 1) { printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" @@ -1100,9 +1118,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) "block of the device (%lu).\n", (unsigned long)sbi->s_last_block, (unsigned long)blocks - 1); - ino.partitionReferenceNum = type1_index; - ino.logicalBlockNum = blocks - 1 - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, &ino); + udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) return 1; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 6f671f1ac271..22af68f8b682 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -70,13 +70,13 @@ static inline unsigned long ufs_dir_pages(struct inode *inode) return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; } -ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry) +ino_t ufs_inode_by_name(struct inode *dir, struct qstr *qstr) { ino_t res = 0; struct ufs_dir_entry *de; struct page *page; - de = ufs_find_entry(dir, dentry, &page); + de = ufs_find_entry(dir, qstr, &page); if (de) { res = fs32_to_cpu(dir->i_sb, de->d_ino); ufs_put_page(page); @@ -249,12 +249,12 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) * (as a parameter - res_dir). Page is returned mapped and unlocked. * Entry is guaranteed to be valid. */ -struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry, +struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct qstr *qstr, struct page **res_page) { struct super_block *sb = dir->i_sb; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const char *name = qstr->name; + int namelen = qstr->len; unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = ufs_dir_pages(dir); diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 23119fe7ad62..4c26d9e8bc94 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -56,7 +56,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru return ERR_PTR(-ENAMETOOLONG); lock_kernel(); - ino = ufs_inode_by_name(dir, dentry); + ino = ufs_inode_by_name(dir, &dentry->d_name); if (ino) { inode = ufs_iget(dir->i_sb, ino); if (IS_ERR(inode)) { @@ -237,7 +237,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; - de = ufs_find_entry(dir, dentry, &page); + de = ufs_find_entry(dir, &dentry->d_name, &page); if (!de) goto out; @@ -281,7 +281,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; - old_de = ufs_find_entry(old_dir, old_dentry, &old_page); + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; @@ -301,7 +301,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_dir; err = -ENOENT; - new_de = ufs_find_entry(new_dir, new_dentry, &new_page); + new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; inode_inc_link_count(old_inode); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 5faed7954d0a..143c20bfb04b 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -66,6 +66,7 @@ */ +#include <linux/exportfs.h> #include <linux/module.h> #include <linux/bitops.h> @@ -96,6 +97,56 @@ #include "swab.h" #include "util.h" +static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct inode *inode; + + if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg) + return ERR_PTR(-ESTALE); + + inode = ufs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); +} + +static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); +} + +static struct dentry *ufs_get_parent(struct dentry *child) +{ + struct qstr dot_dot = { + .name = "..", + .len = 2, + }; + ino_t ino; + + ino = ufs_inode_by_name(child->d_inode, &dot_dot); + if (!ino) + return ERR_PTR(-ENOENT); + return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino)); +} + +static const struct export_operations ufs_export_ops = { + .fh_to_dentry = ufs_fh_to_dentry, + .fh_to_parent = ufs_fh_to_parent, + .get_parent = ufs_get_parent, +}; + #ifdef CONFIG_UFS_DEBUG /* * Print contents of ufs_super_block, useful for debugging @@ -990,6 +1041,7 @@ magic_found: * Read ufs_super_block into internal data structures */ sb->s_op = &ufs_super_ops; + sb->s_export_op = &ufs_export_ops; sb->dq_op = NULL; /***/ sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 644e77e13599..0b4c39bc0d9e 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned); /* dir.c */ extern const struct inode_operations ufs_dir_inode_operations; extern int ufs_add_link (struct dentry *, struct inode *); -extern ino_t ufs_inode_by_name(struct inode *, struct dentry *); +extern ino_t ufs_inode_by_name(struct inode *, struct qstr *); extern int ufs_make_empty(struct inode *, struct inode *); -extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct dentry *, struct page **); +extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct qstr *, struct page **); extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); extern int ufs_empty_dir (struct inode *); extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); diff --git a/fs/xattr.c b/fs/xattr.c index 6d4f6d3449fb..46f87e828b48 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -615,12 +615,11 @@ ssize_t generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { struct xattr_handler *handler; - struct inode *inode = dentry->d_inode; - handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->get(inode, name, buffer, size); + return handler->get(dentry, name, buffer, size, handler->flags); } /* @@ -630,18 +629,20 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; - struct xattr_handler *handler, **handlers = inode->i_sb->s_xattr; + struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; unsigned int size = 0; if (!buffer) { - for_each_xattr_handler(handlers, handler) - size += handler->list(inode, NULL, 0, NULL, 0); + for_each_xattr_handler(handlers, handler) { + size += handler->list(dentry, NULL, 0, NULL, 0, + handler->flags); + } } else { char *buf = buffer; for_each_xattr_handler(handlers, handler) { - size = handler->list(inode, buf, buffer_size, NULL, 0); + size = handler->list(dentry, buf, buffer_size, + NULL, 0, handler->flags); if (size > buffer_size) return -ERANGE; buf += size; @@ -659,14 +660,13 @@ int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct xattr_handler *handler; - struct inode *inode = dentry->d_inode; if (size == 0) value = ""; /* empty EA, do not remove */ - handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->set(inode, name, value, size, flags); + return handler->set(dentry, name, value, size, 0, handler->flags); } /* @@ -677,12 +677,12 @@ int generic_removexattr(struct dentry *dentry, const char *name) { struct xattr_handler *handler; - struct inode *inode = dentry->d_inode; - handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->set(inode, name, NULL, 0, XATTR_REPLACE); + return handler->set(dentry, name, NULL, 0, + XATTR_REPLACE, handler->flags); } EXPORT_SYMBOL(generic_getxattr); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7a59daed1782..56641fe52a23 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -26,6 +26,8 @@ endif obj-$(CONFIG_XFS_FS) += xfs.o +xfs-y += linux-2.6/xfs_trace.o + xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ xfs_dquot.o \ xfs_dquot_item.o \ @@ -90,8 +92,7 @@ xfs-y += xfs_alloc.o \ xfs_rw.o \ xfs_dmops.o -xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \ - xfs_dir2_trace.o +xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o # Objects in linux/ xfs-y += $(addprefix $(XFS_LINUX)/, \ @@ -113,6 +114,3 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ xfs-y += $(addprefix support/, \ debug.o \ uuid.o) - -xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o - diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index b23a54506446..883ca5ab8af5 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -21,6 +21,7 @@ #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> @@ -250,8 +251,9 @@ xfs_set_mode(struct inode *inode, mode_t mode) if (mode != inode->i_mode) { struct iattr iattr; - iattr.ia_valid = ATTR_MODE; + iattr.ia_valid = ATTR_MODE | ATTR_CTIME; iattr.ia_mode = mode; + iattr.ia_ctime = current_fs_time(inode->i_sb); error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); } @@ -353,37 +355,14 @@ xfs_acl_chmod(struct inode *inode) return error; } -/* - * System xattr handlers. - * - * Currently Posix ACLs are the only system namespace extended attribute - * handlers supported by XFS, so we just implement the handlers here. - * If we ever support other system extended attributes this will need - * some refactoring. - */ - static int -xfs_decode_acl(const char *name) -{ - if (strcmp(name, "posix_acl_access") == 0) - return ACL_TYPE_ACCESS; - else if (strcmp(name, "posix_acl_default") == 0) - return ACL_TYPE_DEFAULT; - return -EINVAL; -} - -static int -xfs_xattr_system_get(struct inode *inode, const char *name, - void *value, size_t size) +xfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) { struct posix_acl *acl; - int type, error; - - type = xfs_decode_acl(name); - if (type < 0) - return type; + int error; - acl = xfs_get_acl(inode, type); + acl = xfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -396,15 +375,13 @@ xfs_xattr_system_get(struct inode *inode, const char *name, } static int -xfs_xattr_system_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +xfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl = NULL; - int error = 0, type; + int error = 0; - type = xfs_decode_acl(name); - if (type < 0) - return type; if (flags & XATTR_CREATE) return -EINVAL; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) @@ -461,8 +438,16 @@ xfs_xattr_system_set(struct inode *inode, const char *name, return error; } -struct xattr_handler xfs_xattr_system_handler = { - .prefix = XATTR_SYSTEM_PREFIX, - .get = xfs_xattr_system_get, - .set = xfs_xattr_system_set, +struct xattr_handler xfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, +}; + +struct xattr_handler xfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, }; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 70f989895d15..66abe36c1213 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -38,6 +38,7 @@ #include "xfs_rw.h" #include "xfs_iomap.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" #include <linux/mpage.h> #include <linux/pagevec.h> #include <linux/writeback.h> @@ -76,7 +77,7 @@ xfs_ioend_wake( wake_up(to_ioend_wq(ip)); } -STATIC void +void xfs_count_page_state( struct page *page, int *delalloc, @@ -98,48 +99,6 @@ xfs_count_page_state( } while ((bh = bh->b_this_page) != head); } -#if defined(XFS_RW_TRACE) -void -xfs_page_trace( - int tag, - struct inode *inode, - struct page *page, - unsigned long pgoff) -{ - xfs_inode_t *ip; - loff_t isize = i_size_read(inode); - loff_t offset = page_offset(page); - int delalloc = -1, unmapped = -1, unwritten = -1; - - if (page_has_buffers(page)) - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - - ip = XFS_I(inode); - if (!ip->i_rwtrace) - return; - - ktrace_enter(ip->i_rwtrace, - (void *)((unsigned long)tag), - (void *)ip, - (void *)inode, - (void *)page, - (void *)pgoff, - (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), - (void *)((unsigned long)((isize >> 32) & 0xffffffff)), - (void *)((unsigned long)(isize & 0xffffffff)), - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)delalloc), - (void *)((unsigned long)unmapped), - (void *)((unsigned long)unwritten), - (void *)((unsigned long)current_pid()), - (void *)NULL); -} -#else -#define xfs_page_trace(tag, inode, page, pgoff) -#endif - STATIC struct block_device * xfs_find_bdev_for_inode( struct xfs_inode *ip) @@ -235,71 +194,36 @@ xfs_setfilesize( } /* - * Buffered IO write completion for delayed allocate extents. - */ -STATIC void -xfs_end_bio_delalloc( - struct work_struct *work) -{ - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); -} - -/* - * Buffered IO write completion for regular, written extents. + * IO write completion. */ STATIC void -xfs_end_bio_written( - struct work_struct *work) -{ - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); -} - -/* - * IO write completion for unwritten extents. - * - * Issue transactions to convert a buffer range from unwritten - * to written extents. - */ -STATIC void -xfs_end_bio_unwritten( +xfs_end_io( struct work_struct *work) { xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); struct xfs_inode *ip = XFS_I(ioend->io_inode); - xfs_off_t offset = ioend->io_offset; - size_t size = ioend->io_size; - - if (likely(!ioend->io_error)) { - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - int error; - error = xfs_iomap_write_unwritten(ip, offset, size); - if (error) - ioend->io_error = error; - } - xfs_setfilesize(ioend); - } - xfs_destroy_ioend(ioend); -} -/* - * IO read completion for regular, written extents. - */ -STATIC void -xfs_end_bio_read( - struct work_struct *work) -{ - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); + /* + * For unwritten extents we need to issue transactions to convert a + * range to normal written extens after the data I/O has finished. + */ + if (ioend->io_type == IOMAP_UNWRITTEN && + likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { + int error; + + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + if (error) + ioend->io_error = error; + } + /* + * We might have to update the on-disk file size after extending + * writes. + */ + if (ioend->io_type != IOMAP_READ) + xfs_setfilesize(ioend); xfs_destroy_ioend(ioend); } @@ -314,10 +238,10 @@ xfs_finish_ioend( int wait) { if (atomic_dec_and_test(&ioend->io_remaining)) { - struct workqueue_struct *wq = xfsdatad_workqueue; - if (ioend->io_work.func == xfs_end_bio_unwritten) - wq = xfsconvertd_workqueue; + struct workqueue_struct *wq; + wq = (ioend->io_type == IOMAP_UNWRITTEN) ? + xfsconvertd_workqueue : xfsdatad_workqueue; queue_work(wq, &ioend->io_work); if (wait) flush_workqueue(wq); @@ -355,15 +279,7 @@ xfs_alloc_ioend( ioend->io_offset = 0; ioend->io_size = 0; - if (type == IOMAP_UNWRITTEN) - INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten); - else if (type == IOMAP_DELAY) - INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc); - else if (type == IOMAP_READ) - INIT_WORK(&ioend->io_work, xfs_end_bio_read); - else - INIT_WORK(&ioend->io_work, xfs_end_bio_written); - + INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; } @@ -380,7 +296,7 @@ xfs_map_blocks( return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); } -STATIC_INLINE int +STATIC int xfs_iomap_valid( xfs_iomap_t *iomapp, loff_t offset) @@ -412,8 +328,9 @@ xfs_end_bio( STATIC void xfs_submit_ioend_bio( - xfs_ioend_t *ioend, - struct bio *bio) + struct writeback_control *wbc, + xfs_ioend_t *ioend, + struct bio *bio) { atomic_inc(&ioend->io_remaining); bio->bi_private = ioend; @@ -426,7 +343,8 @@ xfs_submit_ioend_bio( if (xfs_ioend_new_eof(ioend)) xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); - submit_bio(WRITE, bio); + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC_PLUG : WRITE, bio); ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); bio_put(bio); } @@ -505,6 +423,7 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) */ STATIC void xfs_submit_ioend( + struct writeback_control *wbc, xfs_ioend_t *ioend) { xfs_ioend_t *head = ioend; @@ -533,19 +452,19 @@ xfs_submit_ioend( retry: bio = xfs_alloc_ioend_bio(bh); } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(ioend, bio); + xfs_submit_ioend_bio(wbc, ioend, bio); goto retry; } if (bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(ioend, bio); + xfs_submit_ioend_bio(wbc, ioend, bio); goto retry; } lastblock = bh->b_blocknr; } if (bio) - xfs_submit_ioend_bio(ioend, bio); + xfs_submit_ioend_bio(wbc, ioend, bio); xfs_finish_ioend(ioend, 0); } while ((ioend = next) != NULL); } @@ -1191,7 +1110,7 @@ xfs_page_state_convert( } if (iohead) - xfs_submit_ioend(iohead); + xfs_submit_ioend(wbc, iohead); return page_dirty; @@ -1242,7 +1161,7 @@ xfs_vm_writepage( int delalloc, unmapped, unwritten; struct inode *inode = page->mapping->host; - xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0); + trace_xfs_writepage(inode, page, 0); /* * We need a transaction if: @@ -1347,7 +1266,7 @@ xfs_vm_releasepage( .nr_to_write = 1, }; - xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0); + trace_xfs_releasepage(inode, page, 0); if (!page_has_buffers(page)) return 0; @@ -1528,7 +1447,7 @@ xfs_end_io_direct( * didn't map an unwritten extent so switch it's completion * handler. */ - INIT_WORK(&ioend->io_work, xfs_end_bio_written); + ioend->io_type = IOMAP_NEW; xfs_finish_ioend(ioend, 0); } @@ -1555,19 +1474,13 @@ xfs_vm_direct_IO( bdev = xfs_find_bdev_for_inode(XFS_I(inode)); - if (rw == WRITE) { - iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); - ret = blockdev_direct_IO_own_locking(rw, iocb, inode, - bdev, iov, offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); - } else { - iocb->private = xfs_alloc_ioend(inode, IOMAP_READ); - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, - bdev, iov, offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); - } + iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? + IOMAP_UNWRITTEN : IOMAP_READ); + + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + xfs_end_io_direct); if (unlikely(ret != -EIOCBQUEUED && iocb->private)) xfs_destroy_ioend(iocb->private); @@ -1627,8 +1540,7 @@ xfs_vm_invalidatepage( struct page *page, unsigned long offset) { - xfs_page_trace(XFS_INVALIDPAGE_ENTER, - page->mapping->host, page, offset); + trace_xfs_invalidatepage(page->mapping->host, page, offset); block_invalidatepage(page, offset); } diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 221b3e66ceef..4cfc6ea87df8 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -45,4 +45,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); extern void xfs_ioend_init(void); extern void xfs_ioend_wait(struct xfs_inode *); +extern void xfs_count_page_state(struct page *, int *, int *, int *); + #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 965df1227d64..77b8be81c769 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -39,6 +39,7 @@ #include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" +#include "xfs_trace.h" static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); @@ -53,34 +54,6 @@ static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; struct workqueue_struct *xfsconvertd_workqueue; -#ifdef XFS_BUF_TRACE -void -xfs_buf_trace( - xfs_buf_t *bp, - char *id, - void *data, - void *ra) -{ - ktrace_enter(xfs_buf_trace_buf, - bp, id, - (void *)(unsigned long)bp->b_flags, - (void *)(unsigned long)bp->b_hold.counter, - (void *)(unsigned long)bp->b_sema.count, - (void *)current, - data, ra, - (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), - (void *)(unsigned long)(bp->b_file_offset & 0xffffffff), - (void *)(unsigned long)bp->b_buffer_length, - NULL, NULL, NULL, NULL, NULL); -} -ktrace_t *xfs_buf_trace_buf; -#define XFS_BUF_TRACE_SIZE 4096 -#define XB_TRACE(bp, id, data) \ - xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0)) -#else -#define XB_TRACE(bp, id, data) do { } while (0) -#endif - #ifdef XFS_BUF_LOCK_TRACKING # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) @@ -149,7 +122,7 @@ page_region_mask( return mask; } -STATIC_INLINE void +STATIC void set_page_region( struct page *page, size_t offset, @@ -161,7 +134,7 @@ set_page_region( SetPageUptodate(page); } -STATIC_INLINE int +STATIC int test_page_region( struct page *page, size_t offset, @@ -279,7 +252,8 @@ _xfs_buf_initialize( init_waitqueue_head(&bp->b_waiters); XFS_STATS_INC(xb_create); - XB_TRACE(bp, "initialize", target); + + trace_xfs_buf_init(bp, _RET_IP_); } /* @@ -318,6 +292,7 @@ _xfs_buf_free_pages( { if (bp->b_pages != bp->b_page_array) { kmem_free(bp->b_pages); + bp->b_pages = NULL; } } @@ -332,7 +307,7 @@ void xfs_buf_free( xfs_buf_t *bp) { - XB_TRACE(bp, "free", 0); + trace_xfs_buf_free(bp, _RET_IP_); ASSERT(list_empty(&bp->b_hash_list)); @@ -349,9 +324,8 @@ xfs_buf_free( ASSERT(!PagePrivate(page)); page_cache_release(page); } - _xfs_buf_free_pages(bp); } - + _xfs_buf_free_pages(bp); xfs_buf_deallocate(bp); } @@ -445,7 +419,6 @@ _xfs_buf_lookup_pages( if (page_count == bp->b_page_count) bp->b_flags |= XBF_DONE; - XB_TRACE(bp, "lookup_pages", (long)page_count); return error; } @@ -548,7 +521,6 @@ found: if (down_trylock(&bp->b_sema)) { if (!(flags & XBF_TRYLOCK)) { /* wait for buffer ownership */ - XB_TRACE(bp, "get_lock", 0); xfs_buf_lock(bp); XFS_STATS_INC(xb_get_locked_waited); } else { @@ -571,7 +543,8 @@ found: ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); bp->b_flags &= XBF_MAPPED; } - XB_TRACE(bp, "got_lock", 0); + + trace_xfs_buf_find(bp, flags, _RET_IP_); XFS_STATS_INC(xb_get_locked); return bp; } @@ -582,7 +555,7 @@ found: * although backing storage may not be. */ xfs_buf_t * -xfs_buf_get_flags( +xfs_buf_get( xfs_buftarg_t *target,/* target for buffer */ xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ @@ -627,7 +600,7 @@ xfs_buf_get_flags( bp->b_bn = ioff; bp->b_count_desired = bp->b_buffer_length; - XB_TRACE(bp, "get", (unsigned long)flags); + trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; no_buffer: @@ -644,8 +617,6 @@ _xfs_buf_read( { int status; - XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags); - ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); @@ -661,7 +632,7 @@ _xfs_buf_read( } xfs_buf_t * -xfs_buf_read_flags( +xfs_buf_read( xfs_buftarg_t *target, xfs_off_t ioff, size_t isize, @@ -671,21 +642,20 @@ xfs_buf_read_flags( flags |= XBF_READ; - bp = xfs_buf_get_flags(target, ioff, isize, flags); + bp = xfs_buf_get(target, ioff, isize, flags); if (bp) { + trace_xfs_buf_read(bp, flags, _RET_IP_); + if (!XFS_BUF_ISDONE(bp)) { - XB_TRACE(bp, "read", (unsigned long)flags); XFS_STATS_INC(xb_get_read); _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { - XB_TRACE(bp, "read_async", (unsigned long)flags); /* * Read ahead call which is already satisfied, * drop the buffer */ goto no_buffer; } else { - XB_TRACE(bp, "read_done", (unsigned long)flags); /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; } @@ -718,7 +688,7 @@ xfs_buf_readahead( return; flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); - xfs_buf_read_flags(target, ioff, isize, flags); + xfs_buf_read(target, ioff, isize, flags); } xfs_buf_t * @@ -823,7 +793,7 @@ xfs_buf_get_noaddr( xfs_buf_unlock(bp); - XB_TRACE(bp, "no_daddr", len); + trace_xfs_buf_get_noaddr(bp, _RET_IP_); return bp; fail_free_mem: @@ -845,8 +815,8 @@ void xfs_buf_hold( xfs_buf_t *bp) { + trace_xfs_buf_hold(bp, _RET_IP_); atomic_inc(&bp->b_hold); - XB_TRACE(bp, "hold", 0); } /* @@ -859,7 +829,7 @@ xfs_buf_rele( { xfs_bufhash_t *hash = bp->b_hash; - XB_TRACE(bp, "rele", bp->b_relse); + trace_xfs_buf_rele(bp, _RET_IP_); if (unlikely(!hash)) { ASSERT(!bp->b_relse); @@ -909,21 +879,19 @@ xfs_buf_cond_lock( int locked; locked = down_trylock(&bp->b_sema) == 0; - if (locked) { + if (locked) XB_SET_OWNER(bp); - } - XB_TRACE(bp, "cond_lock", (long)locked); + + trace_xfs_buf_cond_lock(bp, _RET_IP_); return locked ? 0 : -EBUSY; } -#if defined(DEBUG) || defined(XFS_BLI_TRACE) int xfs_buf_lock_value( xfs_buf_t *bp) { return bp->b_sema.count; } -#endif /* * Locks a buffer object. @@ -935,12 +903,14 @@ void xfs_buf_lock( xfs_buf_t *bp) { - XB_TRACE(bp, "lock", 0); + trace_xfs_buf_lock(bp, _RET_IP_); + if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); XB_SET_OWNER(bp); - XB_TRACE(bp, "locked", 0); + + trace_xfs_buf_lock_done(bp, _RET_IP_); } /* @@ -962,7 +932,8 @@ xfs_buf_unlock( XB_CLEAR_OWNER(bp); up(&bp->b_sema); - XB_TRACE(bp, "unlock", 0); + + trace_xfs_buf_unlock(bp, _RET_IP_); } @@ -974,17 +945,18 @@ void xfs_buf_pin( xfs_buf_t *bp) { + trace_xfs_buf_pin(bp, _RET_IP_); atomic_inc(&bp->b_pin_count); - XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter); } void xfs_buf_unpin( xfs_buf_t *bp) { + trace_xfs_buf_unpin(bp, _RET_IP_); + if (atomic_dec_and_test(&bp->b_pin_count)) wake_up_all(&bp->b_waiters); - XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter); } int @@ -1035,7 +1007,7 @@ xfs_buf_iodone_work( */ if ((bp->b_error == EOPNOTSUPP) && (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { - XB_TRACE(bp, "ordered_retry", bp->b_iodone); + trace_xfs_buf_ordered_retry(bp, _RET_IP_); bp->b_flags &= ~XBF_ORDERED; bp->b_flags |= _XFS_BARRIER_FAILED; xfs_buf_iorequest(bp); @@ -1050,12 +1022,12 @@ xfs_buf_ioend( xfs_buf_t *bp, int schedule) { + trace_xfs_buf_iodone(bp, _RET_IP_); + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); if (bp->b_error == 0) bp->b_flags |= XBF_DONE; - XB_TRACE(bp, "iodone", bp->b_iodone); - if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); @@ -1075,7 +1047,7 @@ xfs_buf_ioerror( { ASSERT(error >= 0 && error <= 0xffff); bp->b_error = (unsigned short)error; - XB_TRACE(bp, "ioerror", (unsigned long)error); + trace_xfs_buf_ioerror(bp, error, _RET_IP_); } int @@ -1083,7 +1055,7 @@ xfs_bawrite( void *mp, struct xfs_buf *bp) { - XB_TRACE(bp, "bawrite", 0); + trace_xfs_buf_bawrite(bp, _RET_IP_); ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); @@ -1102,7 +1074,7 @@ xfs_bdwrite( void *mp, struct xfs_buf *bp) { - XB_TRACE(bp, "bdwrite", 0); + trace_xfs_buf_bdwrite(bp, _RET_IP_); bp->b_strat = xfs_bdstrat_cb; bp->b_mount = mp; @@ -1113,7 +1085,7 @@ xfs_bdwrite( xfs_buf_delwri_queue(bp, 1); } -STATIC_INLINE void +STATIC void _xfs_buf_ioend( xfs_buf_t *bp, int schedule) @@ -1177,10 +1149,14 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_ORDERED) { ASSERT(!(bp->b_flags & XBF_READ)); rw = WRITE_BARRIER; - } else if (bp->b_flags & _XBF_RUN_QUEUES) { + } else if (bp->b_flags & XBF_LOG_BUFFER) { ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); bp->b_flags &= ~_XBF_RUN_QUEUES; rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; + } else if (bp->b_flags & _XBF_RUN_QUEUES) { + ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META; } else { rw = (bp->b_flags & XBF_WRITE) ? WRITE : (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; @@ -1253,7 +1229,7 @@ int xfs_buf_iorequest( xfs_buf_t *bp) { - XB_TRACE(bp, "iorequest", 0); + trace_xfs_buf_iorequest(bp, _RET_IP_); if (bp->b_flags & XBF_DELWRI) { xfs_buf_delwri_queue(bp, 1); @@ -1287,11 +1263,13 @@ int xfs_buf_iowait( xfs_buf_t *bp) { - XB_TRACE(bp, "iowait", 0); + trace_xfs_buf_iowait(bp, _RET_IP_); + if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); wait_for_completion(&bp->b_iowait); - XB_TRACE(bp, "iowaited", (long)bp->b_error); + + trace_xfs_buf_iowait_done(bp, _RET_IP_); return bp->b_error; } @@ -1604,7 +1582,8 @@ xfs_buf_delwri_queue( struct list_head *dwq = &bp->b_target->bt_delwrite_queue; spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; - XB_TRACE(bp, "delwri_q", (long)unlock); + trace_xfs_buf_delwri_queue(bp, _RET_IP_); + ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); spin_lock(dwlk); @@ -1644,7 +1623,7 @@ xfs_buf_delwri_dequeue( if (dequeued) xfs_buf_rele(bp); - XB_TRACE(bp, "delwri_dq", (long)dequeued); + trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); } STATIC void @@ -1692,7 +1671,7 @@ xfs_buf_delwri_split( INIT_LIST_HEAD(list); spin_lock(dwlk); list_for_each_entry_safe(bp, n, dwq, b_list) { - XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); + trace_xfs_buf_delwri_split(bp, _RET_IP_); ASSERT(bp->b_flags & XBF_DELWRI); if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { @@ -1816,14 +1795,10 @@ xfs_flush_buftarg( int __init xfs_buf_init(void) { -#ifdef XFS_BUF_TRACE - xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS); -#endif - xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", KM_ZONE_HWALIGN, NULL); if (!xfs_buf_zone) - goto out_free_trace_buf; + goto out; xfslogd_workqueue = create_workqueue("xfslogd"); if (!xfslogd_workqueue) @@ -1846,10 +1821,7 @@ xfs_buf_init(void) destroy_workqueue(xfslogd_workqueue); out_free_buf_zone: kmem_zone_destroy(xfs_buf_zone); - out_free_trace_buf: -#ifdef XFS_BUF_TRACE - ktrace_free(xfs_buf_trace_buf); -#endif + out: return -ENOMEM; } @@ -1861,9 +1833,6 @@ xfs_buf_terminate(void) destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); -#ifdef XFS_BUF_TRACE - ktrace_free(xfs_buf_trace_buf); -#endif } #ifdef CONFIG_KDB_MODULES diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 9b4d666ad31f..a34c7b54822d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -55,6 +55,7 @@ typedef enum { XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ XBF_ORDERED = (1 << 11), /* use ordered writes */ XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ + XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */ /* flags used only as arguments to access routines */ XBF_LOCK = (1 << 14), /* lock requested */ @@ -95,6 +96,28 @@ typedef enum { _XFS_BARRIER_FAILED = (1 << 23), } xfs_buf_flags_t; +#define XFS_BUF_FLAGS \ + { XBF_READ, "READ" }, \ + { XBF_WRITE, "WRITE" }, \ + { XBF_MAPPED, "MAPPED" }, \ + { XBF_ASYNC, "ASYNC" }, \ + { XBF_DONE, "DONE" }, \ + { XBF_DELWRI, "DELWRI" }, \ + { XBF_STALE, "STALE" }, \ + { XBF_FS_MANAGED, "FS_MANAGED" }, \ + { XBF_ORDERED, "ORDERED" }, \ + { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_LOCK, "LOCK" }, /* should never be set */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ + { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ + { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \ + { _XBF_PAGES, "PAGES" }, \ + { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ + { _XFS_BARRIER_FAILED, "BARRIER_FAILED" } + + typedef enum { XBT_FORCE_SLEEP = 0, XBT_FORCE_FLUSH = 1, @@ -186,15 +209,10 @@ extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, #define xfs_incore(buftarg,blkno,len,lockit) \ _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) -extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t, +extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); -#define xfs_buf_get(target, blkno, len, flags) \ - xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) - -extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t, +extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); -#define xfs_buf_read(target, blkno, len, flags) \ - xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); @@ -248,13 +266,6 @@ extern void xfs_buf_delwri_dequeue(xfs_buf_t *); extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -#ifdef XFS_BUF_TRACE -extern ktrace_t *xfs_buf_trace_buf; -extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); -#else -#define xfs_buf_trace(bp,id,ptr,ra) do { } while (0) -#endif - #define xfs_buf_target_name(target) \ ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) @@ -370,10 +381,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) #define xfs_bpin(bp) xfs_buf_pin(bp) #define xfs_bunpin(bp) xfs_buf_unpin(bp) - -#define xfs_buftrace(id, bp) \ - xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) - #define xfs_biodone(bp) xfs_buf_ioend(bp, 0) #define xfs_biomove(bp, off, len, data, rw) \ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index eff61e2732af..e4caeb28ce2e 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -52,7 +52,7 @@ xfs_file_aio_read( loff_t pos) { struct file *file = iocb->ki_filp; - int ioflags = IO_ISAIO; + int ioflags = 0; BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) @@ -71,7 +71,7 @@ xfs_file_aio_write( loff_t pos) { struct file *file = iocb->ki_filp; - int ioflags = IO_ISAIO; + int ioflags = 0; BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 08be36d7326c..7501b85fd860 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -19,6 +19,7 @@ #include "xfs_vnodeops.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_trace.h" int fs_noerr(void) { return 0; } int fs_nosys(void) { return ENOSYS; } @@ -51,6 +52,8 @@ xfs_flushinval_pages( struct address_space *mapping = VFS_I(ip)->i_mapping; int ret = 0; + trace_xfs_pagecache_inval(ip, first, last); + if (mapping->nrpages) { xfs_iflags_clear(ip, XFS_ITRUNCATED); ret = filemap_write_and_wait(mapping); diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 5bb523d7f37e..a034cf624437 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -51,6 +51,7 @@ #include "xfs_quota.h" #include "xfs_inode_item.h" #include "xfs_export.h" +#include "xfs_trace.h" #include <linux/capability.h> #include <linux/dcache.h> diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index eafcc7c18706..be1527b1670c 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -46,6 +46,7 @@ #include "xfs_attr.h" #include "xfs_ioctl.h" #include "xfs_ioctl32.h" +#include "xfs_trace.h" #define _NATIVE_IOC(cmd, type) \ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index cd42ef78f6b5..225946012d0b 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -47,6 +47,7 @@ #include "xfs_buf_item.h" #include "xfs_utils.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -573,8 +574,8 @@ xfs_vn_fallocate( bf.l_len = len; xfs_ilock(ip, XFS_IOLOCK_EXCL); - error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, - 0, XFS_ATTR_NOLOCK); + error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, + 0, XFS_ATTR_NOLOCK); if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) new_size = offset + len; @@ -585,7 +586,7 @@ xfs_vn_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -793,7 +794,7 @@ xfs_setup_inode( struct inode *inode = &ip->i_vnode; inode->i_ino = ip->i_ino; - inode->i_state = I_NEW|I_LOCK; + inode->i_state = I_NEW; inode_add_to_lists(ip->i_mount->m_super, inode); inode->i_mode = ip->i_d.di_mode; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 6127e24062d0..5af0c81ca1ae 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -40,7 +40,6 @@ #include <sv.h> #include <time.h> -#include <support/ktrace.h> #include <support/debug.h> #include <support/uuid.h> diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 072050f8d346..0d32457abef1 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -48,73 +48,12 @@ #include "xfs_utils.h" #include "xfs_iomap.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" #include <linux/capability.h> #include <linux/writeback.h> -#if defined(XFS_RW_TRACE) -void -xfs_rw_enter_trace( - int tag, - xfs_inode_t *ip, - void *data, - size_t segs, - loff_t offset, - int ioflags) -{ - if (ip->i_rwtrace == NULL) - return; - ktrace_enter(ip->i_rwtrace, - (void *)(unsigned long)tag, - (void *)ip, - (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), - (void *)data, - (void *)((unsigned long)segs), - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)ioflags), - (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_new_size & 0xffffffff)), - (void *)((unsigned long)current_pid()), - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL); -} - -void -xfs_inval_cached_trace( - xfs_inode_t *ip, - xfs_off_t offset, - xfs_off_t len, - xfs_off_t first, - xfs_off_t last) -{ - - if (ip->i_rwtrace == NULL) - return; - ktrace_enter(ip->i_rwtrace, - (void *)(__psint_t)XFS_INVAL_CACHED, - (void *)ip, - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)((len >> 32) & 0xffffffff)), - (void *)((unsigned long)(len & 0xffffffff)), - (void *)((unsigned long)((first >> 32) & 0xffffffff)), - (void *)((unsigned long)(first & 0xffffffff)), - (void *)((unsigned long)((last >> 32) & 0xffffffff)), - (void *)((unsigned long)(last & 0xffffffff)), - (void *)((unsigned long)current_pid()), - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL); -} -#endif - /* * xfs_iozero * @@ -250,13 +189,10 @@ xfs_read( } } - xfs_rw_enter_trace(XFS_READ_ENTER, ip, - (void *)iovp, segs, *offset, ioflags); + trace_xfs_file_read(ip, size, *offset, ioflags); iocb->ki_pos = *offset; ret = generic_file_aio_read(iocb, iovp, segs, *offset); - if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) - ret = wait_on_sync_kiocb(iocb); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -294,8 +230,9 @@ xfs_splice_read( return -error; } } - xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip, - pipe, count, *ppos, ioflags); + + trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -344,8 +281,8 @@ xfs_splice_write( ip->i_new_size = new_size; xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip, - pipe, count, *ppos, ioflags); + trace_xfs_file_splice_write(ip, count, *ppos, ioflags); + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); if (ret > 0) XFS_STATS_ADD(xs_write_bytes, ret); @@ -712,8 +649,6 @@ start: if ((ioflags & IO_ISDIRECT)) { if (mapping->nrpages) { WARN_ON(need_i_mutex == 0); - xfs_inval_cached_trace(xip, pos, -1, - (pos & PAGE_CACHE_MASK), -1); error = xfs_flushinval_pages(xip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); @@ -730,8 +665,7 @@ start: need_i_mutex = 0; } - xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs, - *offset, ioflags); + trace_xfs_file_direct_write(xip, count, *offset, ioflags); ret = generic_file_direct_write(iocb, iovp, &segs, pos, offset, count, ocount); @@ -754,8 +688,7 @@ start: ssize_t ret2 = 0; write_retry: - xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, - *offset, ioflags); + trace_xfs_file_buffered_write(xip, count, *offset, ioflags); ret2 = generic_file_buffered_write(iocb, iovp, segs, pos, offset, count, ret); /* @@ -774,9 +707,6 @@ write_retry: current->backing_dev_info = NULL; - if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) - ret = wait_on_sync_kiocb(iocb); - isize = i_size_read(inode); if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) *offset = isize; @@ -811,7 +741,7 @@ write_retry: XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ - if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; int error2; @@ -863,7 +793,7 @@ int xfs_bdstrat_cb(struct xfs_buf *bp) { if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buftrace("XFS__BDSTRAT IOERROR", bp); + trace_xfs_bdstrat_shut(bp, _RET_IP_); /* * Metadata write that didn't get logged but * written delayed anyway. These aren't associated @@ -896,7 +826,7 @@ xfsbdstrat( return; } - xfs_buftrace("XFSBDSTRAT IOERROR", bp); + trace_xfs_bdstrat_shut(bp, _RET_IP_); xfs_bioerror_relse(bp); } diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index e6be37dbd0e9..d1f7789c7ffb 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -20,52 +20,7 @@ struct xfs_mount; struct xfs_inode; -struct xfs_bmbt_irec; struct xfs_buf; -struct xfs_iomap; - -#if defined(XFS_RW_TRACE) -/* - * Defines for the trace mechanisms in xfs_lrw.c. - */ -#define XFS_RW_KTRACE_SIZE 128 - -#define XFS_READ_ENTER 1 -#define XFS_WRITE_ENTER 2 -#define XFS_IOMAP_READ_ENTER 3 -#define XFS_IOMAP_WRITE_ENTER 4 -#define XFS_IOMAP_READ_MAP 5 -#define XFS_IOMAP_WRITE_MAP 6 -#define XFS_IOMAP_WRITE_NOSPACE 7 -#define XFS_ITRUNC_START 8 -#define XFS_ITRUNC_FINISH1 9 -#define XFS_ITRUNC_FINISH2 10 -#define XFS_CTRUNC1 11 -#define XFS_CTRUNC2 12 -#define XFS_CTRUNC3 13 -#define XFS_CTRUNC4 14 -#define XFS_CTRUNC5 15 -#define XFS_CTRUNC6 16 -#define XFS_BUNMAP 17 -#define XFS_INVAL_CACHED 18 -#define XFS_DIORD_ENTER 19 -#define XFS_DIOWR_ENTER 20 -#define XFS_WRITEPAGE_ENTER 22 -#define XFS_RELEASEPAGE_ENTER 23 -#define XFS_INVALIDPAGE_ENTER 24 -#define XFS_IOMAP_ALLOC_ENTER 25 -#define XFS_IOMAP_ALLOC_MAP 26 -#define XFS_IOMAP_UNWRITTEN 27 -#define XFS_SPLICE_READ_ENTER 28 -#define XFS_SPLICE_WRITE_ENTER 29 -extern void xfs_rw_enter_trace(int, struct xfs_inode *, - void *, size_t, loff_t, int); -extern void xfs_inval_cached_trace(struct xfs_inode *, - xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t); -#else -#define xfs_rw_enter_trace(tag, ip, data, size, offset, ioflags) -#define xfs_inval_cached_trace(ip, offset, len, first, last) -#endif /* errors from xfsbdstrat() must be extracted from the buffer */ extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 18a4b8e11df2..77414db10dc2 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -15,6 +15,7 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + #include "xfs.h" #include "xfs_bit.h" #include "xfs_log.h" @@ -52,11 +53,11 @@ #include "xfs_trans_priv.h" #include "xfs_filestream.h" #include "xfs_da_btree.h" -#include "xfs_dir2_trace.h" #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" #include "xfs_sync.h" +#include "xfs_trace.h" #include <linux/namei.h> #include <linux/init.h> @@ -930,13 +931,37 @@ xfs_fs_alloc_inode( */ STATIC void xfs_fs_destroy_inode( - struct inode *inode) + struct inode *inode) { - xfs_inode_t *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(inode); + + xfs_itrace_entry(ip); XFS_STATS_INC(vn_reclaim); - if (xfs_reclaim(ip)) - panic("%s: cannot reclaim 0x%p\n", __func__, inode); + + /* bad inode, get out here ASAP */ + if (is_bad_inode(inode)) + goto out_reclaim; + + xfs_ioend_wait(ip); + + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + + /* + * We should never get here with one of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* + * We always use background reclaim here because even if the + * inode is clean, it still may be under IO and hence we have + * to take the flush lock. The background reclaim path handles + * this more efficiently than we can here, so simply let background + * reclaim tear down all inodes. + */ +out_reclaim: + xfs_inode_set_reclaim_tag(ip); } /* @@ -973,7 +998,6 @@ xfs_fs_inode_init_once( mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); } /* @@ -1075,6 +1099,20 @@ xfs_fs_clear_inode( XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); + /* + * The iolock is used by the file system to coordinate reads, + * writes, and block truncates. Up to this point the lock + * protected concurrent accesses by users of the inode. But + * from here forward we're doing some final processing of the + * inode because we're done with it, and although we reuse the + * iolock for protection it is really a distinct lock class + * (in the lockdep sense) from before. To keep lockdep happy + * (and basically indicate what we are doing), we explicitly + * re-init the iolock here. + */ + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + xfs_inactive(ip); } @@ -1092,8 +1130,6 @@ xfs_fs_put_super( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); - struct xfs_inode *rip = mp->m_rootip; - int unmount_event_flags = 0; xfs_syncd_stop(mp); @@ -1109,20 +1145,7 @@ xfs_fs_put_super( xfs_sync_attr(mp, 0); } -#ifdef HAVE_DMAPI - if (mp->m_flags & XFS_MOUNT_DMAPI) { - unmount_event_flags = - (mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? - 0 : DM_FLAGS_UNWANTED; - /* - * Ignore error from dmapi here, first unmount is not allowed - * to fail anyway, and second we wouldn't want to fail a - * unmount because of dmapi. - */ - XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL, - NULL, NULL, 0, 0, unmount_event_flags); - } -#endif + XFS_SEND_PREUNMOUNT(mp); /* * Blow away any referenced inode in the filestreams cache. @@ -1133,10 +1156,7 @@ xfs_fs_put_super( XFS_bflush(mp->m_ddev_targp); - if (mp->m_flags & XFS_MOUNT_DMAPI) { - XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, - unmount_event_flags); - } + XFS_SEND_UNMOUNT(mp); xfs_unmountfs(mp); xfs_freesb(mp); @@ -1504,8 +1524,6 @@ xfs_fs_fill_super( goto fail_vnrele; kfree(mtpt); - - xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); return 0; out_filestream_unmount: @@ -1581,94 +1599,6 @@ static struct file_system_type xfs_fs_type = { }; STATIC int __init -xfs_alloc_trace_bufs(void) -{ -#ifdef XFS_ALLOC_TRACE - xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_MAYFAIL); - if (!xfs_alloc_trace_buf) - goto out; -#endif -#ifdef XFS_BMAP_TRACE - xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_MAYFAIL); - if (!xfs_bmap_trace_buf) - goto out_free_alloc_trace; -#endif -#ifdef XFS_BTREE_TRACE - xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE, - KM_MAYFAIL); - if (!xfs_allocbt_trace_buf) - goto out_free_bmap_trace; - - xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL); - if (!xfs_inobt_trace_buf) - goto out_free_allocbt_trace; - - xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL); - if (!xfs_bmbt_trace_buf) - goto out_free_inobt_trace; -#endif -#ifdef XFS_ATTR_TRACE - xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL); - if (!xfs_attr_trace_buf) - goto out_free_bmbt_trace; -#endif -#ifdef XFS_DIR2_TRACE - xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_MAYFAIL); - if (!xfs_dir2_trace_buf) - goto out_free_attr_trace; -#endif - - return 0; - -#ifdef XFS_DIR2_TRACE - out_free_attr_trace: -#endif -#ifdef XFS_ATTR_TRACE - ktrace_free(xfs_attr_trace_buf); - out_free_bmbt_trace: -#endif -#ifdef XFS_BTREE_TRACE - ktrace_free(xfs_bmbt_trace_buf); - out_free_inobt_trace: - ktrace_free(xfs_inobt_trace_buf); - out_free_allocbt_trace: - ktrace_free(xfs_allocbt_trace_buf); - out_free_bmap_trace: -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(xfs_bmap_trace_buf); - out_free_alloc_trace: -#endif -#ifdef XFS_ALLOC_TRACE - ktrace_free(xfs_alloc_trace_buf); - out: -#endif - return -ENOMEM; -} - -STATIC void -xfs_free_trace_bufs(void) -{ -#ifdef XFS_DIR2_TRACE - ktrace_free(xfs_dir2_trace_buf); -#endif -#ifdef XFS_ATTR_TRACE - ktrace_free(xfs_attr_trace_buf); -#endif -#ifdef XFS_BTREE_TRACE - ktrace_free(xfs_bmbt_trace_buf); - ktrace_free(xfs_inobt_trace_buf); - ktrace_free(xfs_allocbt_trace_buf); -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(xfs_bmap_trace_buf); -#endif -#ifdef XFS_ALLOC_TRACE - ktrace_free(xfs_alloc_trace_buf); -#endif -} - -STATIC int __init xfs_init_zones(void) { @@ -1809,7 +1739,6 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); - ktrace_init(64); xfs_ioend_init(); xfs_dir_startup(); @@ -1817,13 +1746,9 @@ init_xfs_fs(void) if (error) goto out; - error = xfs_alloc_trace_bufs(); - if (error) - goto out_destroy_zones; - error = xfs_mru_cache_init(); if (error) - goto out_free_trace_buffers; + goto out_destroy_zones; error = xfs_filestream_init(); if (error) @@ -1858,8 +1783,6 @@ init_xfs_fs(void) xfs_filestream_uninit(); out_mru_cache_uninit: xfs_mru_cache_uninit(); - out_free_trace_buffers: - xfs_free_trace_bufs(); out_destroy_zones: xfs_destroy_zones(); out: @@ -1876,9 +1799,7 @@ exit_xfs_fs(void) xfs_buf_terminate(); xfs_filestream_uninit(); xfs_mru_cache_uninit(); - xfs_free_trace_bufs(); xfs_destroy_zones(); - ktrace_uninit(); } module_init(init_xfs_fs); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 18175ebd58ed..233d4b9881b1 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -56,12 +56,6 @@ extern void xfs_qm_exit(void); # define XFS_BIGFS_STRING #endif -#ifdef CONFIG_XFS_TRACE -# define XFS_TRACE_STRING "tracing, " -#else -# define XFS_TRACE_STRING -#endif - #ifdef CONFIG_XFS_DMAPI # define XFS_DMAPI_STRING "dmapi support, " #else @@ -78,7 +72,6 @@ extern void xfs_qm_exit(void); XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ XFS_BIGFS_STRING \ - XFS_TRACE_STRING \ XFS_DMAPI_STRING \ XFS_DBG_STRING /* DBG must be last */ diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 961df0a22c78..1f5e4bb5e970 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -44,6 +44,7 @@ #include "xfs_inode_item.h" #include "xfs_rw.h" #include "xfs_quota.h" +#include "xfs_trace.h" #include <linux/kthread.h> #include <linux/freezer.h> @@ -64,7 +65,6 @@ xfs_inode_ag_lookup( * as the tree is sparse and a gang lookup walks to find * the number of objects requested. */ - read_lock(&pag->pag_ici_lock); if (tag == XFS_ICI_NO_TAG) { nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)&ip, *first_index, 1); @@ -73,7 +73,7 @@ xfs_inode_ag_lookup( (void **)&ip, *first_index, 1, tag); } if (!nr_found) - goto unlock; + return NULL; /* * Update the index for the next lookup. Catch overflows @@ -83,13 +83,8 @@ xfs_inode_ag_lookup( */ *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - goto unlock; - + return NULL; return ip; - -unlock: - read_unlock(&pag->pag_ici_lock); - return NULL; } STATIC int @@ -99,7 +94,8 @@ xfs_inode_ag_walk( int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, - int tag) + int tag, + int exclusive) { struct xfs_perag *pag = &mp->m_perag[ag]; uint32_t first_index; @@ -113,10 +109,20 @@ restart: int error = 0; xfs_inode_t *ip; + if (exclusive) + write_lock(&pag->pag_ici_lock); + else + read_lock(&pag->pag_ici_lock); ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); - if (!ip) + if (!ip) { + if (exclusive) + write_unlock(&pag->pag_ici_lock); + else + read_unlock(&pag->pag_ici_lock); break; + } + /* execute releases pag->pag_ici_lock */ error = execute(ip, pag, flags); if (error == EAGAIN) { skipped++; @@ -124,9 +130,8 @@ restart: } if (error) last_error = error; - /* - * bail out if the filesystem is corrupted. - */ + + /* bail out if the filesystem is corrupted. */ if (error == EFSCORRUPTED) break; @@ -147,7 +152,8 @@ xfs_inode_ag_iterator( int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, - int tag) + int tag, + int exclusive) { int error = 0; int last_error = 0; @@ -156,7 +162,8 @@ xfs_inode_ag_iterator( for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { if (!mp->m_perag[ag].pag_ici_init) continue; - error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); + error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, + exclusive); if (error) { last_error = error; if (error == EFSCORRUPTED) @@ -173,30 +180,31 @@ xfs_sync_inode_valid( struct xfs_perag *pag) { struct inode *inode = VFS_I(ip); + int error = EFSCORRUPTED; /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - read_unlock(&pag->pag_ici_lock); - return EFSCORRUPTED; - } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_unlock; - /* - * If we can't get a reference on the inode, it must be in reclaim. - * Leave it for the reclaim code to flush. Also avoid inodes that - * haven't been fully initialised. - */ - if (!igrab(inode)) { - read_unlock(&pag->pag_ici_lock); - return ENOENT; - } - read_unlock(&pag->pag_ici_lock); + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + error = ENOENT; + if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock; - if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + goto out_unlock; + + if (is_bad_inode(inode)) { IRELE(ip); - return ENOENT; + goto out_unlock; } - return 0; + /* inode is valid */ + error = 0; +out_unlock: + read_unlock(&pag->pag_ici_lock); + return error; } STATIC int @@ -281,7 +289,7 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG); + XFS_ICI_NO_TAG, 0); if (error) return XFS_ERROR(error); @@ -303,7 +311,7 @@ xfs_sync_attr( ASSERT((flags & ~SYNC_WAIT) == 0); return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG); + XFS_ICI_NO_TAG, 0); } STATIC int @@ -663,67 +671,6 @@ xfs_syncd_stop( kthread_stop(mp->m_sync_task); } -int -xfs_reclaim_inode( - xfs_inode_t *ip, - int locked, - int sync_mode) -{ - xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); - - /* The hash lock here protects a thread in xfs_iget_core from - * racing with us on linking the inode back with a vnode. - * Once we have the XFS_IRECLAIM flag set it will not touch - * us. - */ - write_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - if (__xfs_iflags_test(ip, XFS_IRECLAIM) || - !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - if (locked) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - return -EAGAIN; - } - __xfs_iflags_set(ip, XFS_IRECLAIM); - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - xfs_put_perag(ip->i_mount, pag); - - /* - * If the inode is still dirty, then flush it out. If the inode - * is not in the AIL, then it will be OK to flush it delwri as - * long as xfs_iflush() does not keep any references to the inode. - * We leave that decision up to xfs_iflush() since it has the - * knowledge of whether it's OK to simply do a delwri flush of - * the inode or whether we need to wait until the inode is - * pulled from the AIL. - * We get the flush lock regardless, though, just to make sure - * we don't free it while it is being flushed. - */ - if (!locked) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - } - - /* - * In the case of a forced shutdown we rely on xfs_iflush() to - * wait for the inode to be unpinned before returning an error. - */ - if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { - /* synchronize with xfs_iflush_done */ - xfs_iflock(ip); - xfs_ifunlock(ip); - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_ireclaim(ip); - return 0; -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, @@ -766,19 +713,55 @@ __xfs_inode_clear_reclaim_tag( } STATIC int -xfs_reclaim_inode_now( +xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, - int flags) + int sync_mode) { - /* ignore if already under reclaim */ - if (xfs_iflags_test(ip, XFS_IRECLAIM)) { - read_unlock(&pag->pag_ici_lock); + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + */ + spin_lock(&ip->i_flags_lock); + ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* ignore as it is already under reclaim */ + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); return 0; } - read_unlock(&pag->pag_ici_lock); + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); - return xfs_reclaim_inode(ip, 0, flags); + /* + * If the inode is still dirty, then flush it out. If the inode + * is not in the AIL, then it will be OK to flush it delwri as + * long as xfs_iflush() does not keep any references to the inode. + * We leave that decision up to xfs_iflush() since it has the + * knowledge of whether it's OK to simply do a delwri flush of + * the inode or whether we need to wait until the inode is + * pulled from the AIL. + * We get the flush lock regardless, though, just to make sure + * we don't free it while it is being flushed. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); + + /* + * In the case of a forced shutdown we rely on xfs_iflush() to + * wait for the inode to be unpinned before returning an error. + */ + if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { + /* synchronize with xfs_iflush_done */ + xfs_iflock(ip); + xfs_ifunlock(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_ireclaim(ip); + return 0; } int @@ -786,6 +769,6 @@ xfs_reclaim_inodes( xfs_mount_t *mp, int mode) { - return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, - XFS_ICI_RECLAIM_TAG); + return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, + XFS_ICI_RECLAIM_TAG, 1); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 27920eb7a820..ea932b43335d 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -44,7 +44,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); -int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); @@ -55,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, int tag); + int flags, int tag, int write_lock); #endif diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c new file mode 100644 index 000000000000..856eb3c8d605 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_quota.h" +#include "xfs_iomap.h" +#include "xfs_aops.h" +#include "quota/xfs_dquot_item.h" +#include "quota/xfs_dquot.h" + +/* + * Format fsblock number into a static buffer & return it. + */ +STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno) +{ + static char rval[50]; + + if (bno == NULLFSBLOCK) + sprintf(rval, "NULLFSBLOCK"); + else if (isnullstartblock(bno)) + sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno)); + else + sprintf(rval, "%lld", (xfs_dfsbno_t)bno); + return rval; +} + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "xfs_trace.h" diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h new file mode 100644 index 000000000000..c22a608321a3 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -0,0 +1,1422 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xfs + +#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XFS_H + +#include <linux/tracepoint.h> + +struct xfs_agf; +struct xfs_alloc_arg; +struct xfs_attr_list_context; +struct xfs_buf_log_item; +struct xfs_da_args; +struct xfs_da_node_entry; +struct xfs_dquot; +struct xlog_ticket; +struct log; + +DECLARE_EVENT_CLASS(xfs_attr_list_class, + TP_PROTO(struct xfs_attr_list_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + ) +) + +#define DEFINE_ATTR_LIST_EVENT(name) \ +DEFINE_EVENT(xfs_attr_list_class, name, \ + TP_PROTO(struct xfs_attr_list_context *ctx), \ + TP_ARGS(ctx)) +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); + +TRACE_EVENT(xfs_attr_list_node_descend, + TP_PROTO(struct xfs_attr_list_context *ctx, + struct xfs_da_node_entry *btree), + TP_ARGS(ctx, btree), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + __field(u32, bt_hashval) + __field(u32, bt_before) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + __entry->bt_hashval = be32_to_cpu(btree->hashval); + __entry->bt_before = be32_to_cpu(btree->before); + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s " + "node hashval %u, node before %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __entry->bt_hashval, + __entry->bt_before) +); + +TRACE_EVENT(xfs_iext_insert, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, + struct xfs_bmbt_irec *r, int state, unsigned long caller_ip), + TP_ARGS(ip, idx, r, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r->br_startoff; + __entry->startblock = r->br_startblock; + __entry->blockcount = r->br_blockcount; + __entry->state = r->br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %s count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + xfs_fmtfsblock(__entry->startblock), + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_bmap_class, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + unsigned long caller_ip), + TP_ARGS(ip, idx, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? + ip->i_afp : &ip->i_df; + struct xfs_bmbt_irec r; + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r.br_startoff; + __entry->startblock = r.br_startblock; + __entry->blockcount = r.br_blockcount; + __entry->state = r.br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %s count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + xfs_fmtfsblock(__entry->startblock), + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +) + +#define DEFINE_BMAP_EVENT(name) \ +DEFINE_EVENT(xfs_bmap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ + unsigned long caller_ip), \ + TP_ARGS(ip, idx, state, caller_ip)) +DEFINE_BMAP_EVENT(xfs_iext_remove); +DEFINE_BMAP_EVENT(xfs_bmap_pre_update); +DEFINE_BMAP_EVENT(xfs_bmap_post_update); +DEFINE_BMAP_EVENT(xfs_extlist); + +DECLARE_EVENT_CLASS(xfs_buf_class, + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), + TP_ARGS(bp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_EVENT(name) \ +DEFINE_EVENT(xfs_buf_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ + TP_ARGS(bp, caller_ip)) +DEFINE_BUF_EVENT(xfs_buf_init); +DEFINE_BUF_EVENT(xfs_buf_free); +DEFINE_BUF_EVENT(xfs_buf_hold); +DEFINE_BUF_EVENT(xfs_buf_rele); +DEFINE_BUF_EVENT(xfs_buf_pin); +DEFINE_BUF_EVENT(xfs_buf_unpin); +DEFINE_BUF_EVENT(xfs_buf_iodone); +DEFINE_BUF_EVENT(xfs_buf_iorequest); +DEFINE_BUF_EVENT(xfs_buf_bawrite); +DEFINE_BUF_EVENT(xfs_buf_bdwrite); +DEFINE_BUF_EVENT(xfs_buf_lock); +DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_cond_lock); +DEFINE_BUF_EVENT(xfs_buf_unlock); +DEFINE_BUF_EVENT(xfs_buf_ordered_retry); +DEFINE_BUF_EVENT(xfs_buf_iowait); +DEFINE_BUF_EVENT(xfs_buf_iowait_done); +DEFINE_BUF_EVENT(xfs_buf_delwri_queue); +DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); +DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_get_noaddr); +DEFINE_BUF_EVENT(xfs_bdstrat_shut); +DEFINE_BUF_EVENT(xfs_buf_item_relse); +DEFINE_BUF_EVENT(xfs_buf_item_iodone); +DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); +DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_trans_read_buf_io); +DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); + +/* not really buffer traces, but the buf provides useful information */ +DEFINE_BUF_EVENT(xfs_btree_corrupt); +DEFINE_BUF_EVENT(xfs_da_btree_corrupt); +DEFINE_BUF_EVENT(xfs_reset_dqcounts); +DEFINE_BUF_EVENT(xfs_inode_item_push); + +/* pass flags explicitly */ +DECLARE_EVENT_CLASS(xfs_buf_flags_class, + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), + TP_ARGS(bp, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->flags = flags; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_FLAGS_EVENT(name) \ +DEFINE_EVENT(xfs_buf_flags_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ + TP_ARGS(bp, flags, caller_ip)) +DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); + +TRACE_EVENT(xfs_buf_ioerror, + TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip), + TP_ARGS(bp, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(unsigned, flags) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->error = error; + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d error %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __entry->error, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_buf_item_class, + TP_PROTO(struct xfs_buf_log_item *bip), + TP_ARGS(bip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, buf_bno) + __field(size_t, buf_len) + __field(int, buf_hold) + __field(int, buf_pincount) + __field(int, buf_lockval) + __field(unsigned, buf_flags) + __field(unsigned, bli_recur) + __field(int, bli_refcount) + __field(unsigned, bli_flags) + __field(void *, li_desc) + __field(unsigned, li_flags) + ), + TP_fast_assign( + __entry->dev = bip->bli_buf->b_target->bt_dev; + __entry->bli_flags = bip->bli_flags; + __entry->bli_recur = bip->bli_recur; + __entry->bli_refcount = atomic_read(&bip->bli_refcount); + __entry->buf_bno = bip->bli_buf->b_bn; + __entry->buf_len = bip->bli_buf->b_buffer_length; + __entry->buf_flags = bip->bli_buf->b_flags; + __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); + __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); + __entry->li_desc = bip->bli_item.li_desc; + __entry->li_flags = bip->bli_item.li_flags; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s recur %d refcount %d bliflags %s " + "lidesc 0x%p liflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->buf_bno, + __entry->buf_len, + __entry->buf_hold, + __entry->buf_pincount, + __entry->buf_lockval, + __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), + __entry->bli_recur, + __entry->bli_refcount, + __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), + __entry->li_desc, + __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_BUF_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_buf_item_class, name, \ + TP_PROTO(struct xfs_buf_log_item *bip), \ + TP_ARGS(bip)) +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); +DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); + +DECLARE_EVENT_CLASS(xfs_lock_class, + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, + unsigned long caller_ip), + TP_ARGS(ip, lock_flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, lock_flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lock_flags = lock_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_LOCK_EVENT(name) \ +DEFINE_EVENT(xfs_lock_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ + unsigned long caller_ip), \ + TP_ARGS(ip, lock_flags, caller_ip)) +DEFINE_LOCK_EVENT(xfs_ilock); +DEFINE_LOCK_EVENT(xfs_ilock_nowait); +DEFINE_LOCK_EVENT(xfs_ilock_demote); +DEFINE_LOCK_EVENT(xfs_iunlock); + +DECLARE_EVENT_CLASS(xfs_iget_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +) + +#define DEFINE_IGET_EVENT(name) \ +DEFINE_EVENT(xfs_iget_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_IGET_EVENT(xfs_iget_skip); +DEFINE_IGET_EVENT(xfs_iget_reclaim); +DEFINE_IGET_EVENT(xfs_iget_found); +DEFINE_IGET_EVENT(xfs_iget_alloc); + +DECLARE_EVENT_CLASS(xfs_inode_class, + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), + TP_ARGS(ip, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, count) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->count, + (char *)__entry->caller_ip) +) + +#define DEFINE_INODE_EVENT(name) \ +DEFINE_EVENT(xfs_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ + TP_ARGS(ip, caller_ip)) +DEFINE_INODE_EVENT(xfs_ihold); +DEFINE_INODE_EVENT(xfs_irele); +/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ +DEFINE_INODE_EVENT(xfs_inode); +#define xfs_itrace_entry(ip) \ + trace_xfs_inode(ip, _THIS_IP_) + +DECLARE_EVENT_CLASS(xfs_dquot_class, + TP_PROTO(struct xfs_dquot *dqp), + TP_ARGS(dqp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__be32, id) + __field(unsigned, flags) + __field(unsigned, nrefs) + __field(unsigned long long, res_bcount) + __field(unsigned long long, bcount) + __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) + __field(unsigned long long, blk_softlimit) + __field(unsigned long long, ino_hardlimit) + __field(unsigned long long, ino_softlimit) + ), \ + TP_fast_assign( + __entry->dev = dqp->q_mount->m_super->s_dev; + __entry->id = dqp->q_core.d_id; + __entry->flags = dqp->dq_flags; + __entry->nrefs = dqp->q_nrefs; + __entry->res_bcount = dqp->q_res_bcount; + __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); + __entry->icount = be64_to_cpu(dqp->q_core.d_icount); + __entry->blk_hardlimit = + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + __entry->blk_softlimit = + be64_to_cpu(dqp->q_core.d_blk_softlimit); + __entry->ino_hardlimit = + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + __entry->ino_softlimit = + be64_to_cpu(dqp->q_core.d_ino_softlimit); + ), + TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " + "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", + MAJOR(__entry->dev), MINOR(__entry->dev), + be32_to_cpu(__entry->id), + __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __entry->nrefs, + __entry->res_bcount, + __entry->bcount, + __entry->blk_hardlimit, + __entry->blk_softlimit, + __entry->icount, + __entry->ino_hardlimit, + __entry->ino_softlimit) +) + +#define DEFINE_DQUOT_EVENT(name) \ +DEFINE_EVENT(xfs_dquot_class, name, \ + TP_PROTO(struct xfs_dquot *dqp), \ + TP_ARGS(dqp)) +DEFINE_DQUOT_EVENT(xfs_dqadjust); +DEFINE_DQUOT_EVENT(xfs_dqshake_dirty); +DEFINE_DQUOT_EVENT(xfs_dqshake_unlink); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); +DEFINE_DQUOT_EVENT(xfs_dqattach_found); +DEFINE_DQUOT_EVENT(xfs_dqattach_get); +DEFINE_DQUOT_EVENT(xfs_dqinit); +DEFINE_DQUOT_EVENT(xfs_dqreuse); +DEFINE_DQUOT_EVENT(xfs_dqalloc); +DEFINE_DQUOT_EVENT(xfs_dqtobp_read); +DEFINE_DQUOT_EVENT(xfs_dqread); +DEFINE_DQUOT_EVENT(xfs_dqread_fail); +DEFINE_DQUOT_EVENT(xfs_dqlookup_found); +DEFINE_DQUOT_EVENT(xfs_dqlookup_want); +DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); +DEFINE_DQUOT_EVENT(xfs_dqlookup_move); +DEFINE_DQUOT_EVENT(xfs_dqlookup_done); +DEFINE_DQUOT_EVENT(xfs_dqget_hit); +DEFINE_DQUOT_EVENT(xfs_dqget_miss); +DEFINE_DQUOT_EVENT(xfs_dqput); +DEFINE_DQUOT_EVENT(xfs_dqput_wait); +DEFINE_DQUOT_EVENT(xfs_dqput_free); +DEFINE_DQUOT_EVENT(xfs_dqrele); +DEFINE_DQUOT_EVENT(xfs_dqflush); +DEFINE_DQUOT_EVENT(xfs_dqflush_force); +DEFINE_DQUOT_EVENT(xfs_dqflush_done); +/* not really iget events, but we re-use the format */ +DEFINE_IGET_EVENT(xfs_dquot_dqalloc); +DEFINE_IGET_EVENT(xfs_dquot_dqdetach); + +DECLARE_EVENT_CLASS(xfs_loggrant_class, + TP_PROTO(struct log *log, struct xlog_ticket *tic), + TP_ARGS(log, tic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned, trans_type) + __field(char, ocnt) + __field(char, cnt) + __field(int, curr_res) + __field(int, unit_res) + __field(unsigned int, flags) + __field(void *, reserve_headq) + __field(void *, write_headq) + __field(int, grant_reserve_cycle) + __field(int, grant_reserve_bytes) + __field(int, grant_write_cycle) + __field(int, grant_write_bytes) + __field(int, curr_cycle) + __field(int, curr_block) + __field(xfs_lsn_t, tail_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->trans_type = tic->t_trans_type; + __entry->ocnt = tic->t_ocnt; + __entry->cnt = tic->t_cnt; + __entry->curr_res = tic->t_curr_res; + __entry->unit_res = tic->t_unit_res; + __entry->flags = tic->t_flags; + __entry->reserve_headq = log->l_reserve_headq; + __entry->write_headq = log->l_write_headq; + __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; + __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; + __entry->grant_write_cycle = log->l_grant_write_cycle; + __entry->grant_write_bytes = log->l_grant_write_bytes; + __entry->curr_cycle = log->l_curr_cycle; + __entry->curr_block = log->l_curr_block; + __entry->tail_lsn = log->l_tail_lsn; + ), + TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + "t_unit_res %u t_flags %s reserve_headq 0x%p " + "write_headq 0x%p grant_reserve_cycle %d " + "grant_reserve_bytes %d grant_write_cycle %d " + "grant_write_bytes %d curr_cycle %d curr_block %d " + "tail_cycle %d tail_block %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), + __entry->ocnt, + __entry->cnt, + __entry->curr_res, + __entry->unit_res, + __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), + __entry->reserve_headq, + __entry->write_headq, + __entry->grant_reserve_cycle, + __entry->grant_reserve_bytes, + __entry->grant_write_cycle, + __entry->grant_write_bytes, + __entry->curr_cycle, + __entry->curr_block, + CYCLE_LSN(__entry->tail_lsn), + BLOCK_LSN(__entry->tail_lsn) + ) +) + +#define DEFINE_LOGGRANT_EVENT(name) \ +DEFINE_EVENT(xfs_loggrant_class, name, \ + TP_PROTO(struct log *log, struct xlog_ticket *tic), \ + TP_ARGS(log, tic)) +DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); +DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve); +DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); + +#define DEFINE_RW_EVENT(name) \ +TRACE_EVENT(name, \ + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ + TP_ARGS(ip, count, offset, flags), \ + TP_STRUCT__entry( \ + __field(dev_t, dev) \ + __field(xfs_ino_t, ino) \ + __field(xfs_fsize_t, size) \ + __field(xfs_fsize_t, new_size) \ + __field(loff_t, offset) \ + __field(size_t, count) \ + __field(int, flags) \ + ), \ + TP_fast_assign( \ + __entry->dev = VFS_I(ip)->i_sb->s_dev; \ + __entry->ino = ip->i_ino; \ + __entry->size = ip->i_d.di_size; \ + __entry->new_size = ip->i_new_size; \ + __entry->offset = offset; \ + __entry->count = count; \ + __entry->flags = flags; \ + ), \ + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ + "offset 0x%llx count 0x%zx ioflags %s", \ + MAJOR(__entry->dev), MINOR(__entry->dev), \ + __entry->ino, \ + __entry->size, \ + __entry->new_size, \ + __entry->offset, \ + __entry->count, \ + __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \ +) +DEFINE_RW_EVENT(xfs_file_read); +DEFINE_RW_EVENT(xfs_file_buffered_write); +DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_splice_read); +DEFINE_RW_EVENT(xfs_file_splice_write); + + +#define DEFINE_PAGE_EVENT(name) \ +TRACE_EVENT(name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ + TP_ARGS(inode, page, off), \ + TP_STRUCT__entry( \ + __field(dev_t, dev) \ + __field(xfs_ino_t, ino) \ + __field(pgoff_t, pgoff) \ + __field(loff_t, size) \ + __field(unsigned long, offset) \ + __field(int, delalloc) \ + __field(int, unmapped) \ + __field(int, unwritten) \ + ), \ + TP_fast_assign( \ + int delalloc = -1, unmapped = -1, unwritten = -1; \ + \ + if (page_has_buffers(page)) \ + xfs_count_page_state(page, &delalloc, \ + &unmapped, &unwritten); \ + __entry->dev = inode->i_sb->s_dev; \ + __entry->ino = XFS_I(inode)->i_ino; \ + __entry->pgoff = page_offset(page); \ + __entry->size = i_size_read(inode); \ + __entry->offset = off; \ + __entry->delalloc = delalloc; \ + __entry->unmapped = unmapped; \ + __entry->unwritten = unwritten; \ + ), \ + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \ + "delalloc %d unmapped %d unwritten %d", \ + MAJOR(__entry->dev), MINOR(__entry->dev), \ + __entry->ino, \ + __entry->pgoff, \ + __entry->size, \ + __entry->offset, \ + __entry->delalloc, \ + __entry->unmapped, \ + __entry->unwritten) \ +) +DEFINE_PAGE_EVENT(xfs_writepage); +DEFINE_PAGE_EVENT(xfs_releasepage); +DEFINE_PAGE_EVENT(xfs_invalidatepage); + +#define DEFINE_IOMAP_EVENT(name) \ +TRACE_EVENT(name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ + int flags, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, flags, irec), \ + TP_STRUCT__entry( \ + __field(dev_t, dev) \ + __field(xfs_ino_t, ino) \ + __field(loff_t, size) \ + __field(loff_t, new_size) \ + __field(loff_t, offset) \ + __field(size_t, count) \ + __field(int, flags) \ + __field(xfs_fileoff_t, startoff) \ + __field(xfs_fsblock_t, startblock) \ + __field(xfs_filblks_t, blockcount) \ + ), \ + TP_fast_assign( \ + __entry->dev = VFS_I(ip)->i_sb->s_dev; \ + __entry->ino = ip->i_ino; \ + __entry->size = ip->i_d.di_size; \ + __entry->new_size = ip->i_new_size; \ + __entry->offset = offset; \ + __entry->count = count; \ + __entry->flags = flags; \ + __entry->startoff = irec ? irec->br_startoff : 0; \ + __entry->startblock = irec ? irec->br_startblock : 0; \ + __entry->blockcount = irec ? irec->br_blockcount : 0; \ + ), \ + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ + "offset 0x%llx count %zd flags %s " \ + "startoff 0x%llx startblock %s blockcount 0x%llx", \ + MAJOR(__entry->dev), MINOR(__entry->dev), \ + __entry->ino, \ + __entry->size, \ + __entry->new_size, \ + __entry->offset, \ + __entry->count, \ + __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ + __entry->startoff, \ + xfs_fmtfsblock(__entry->startblock), \ + __entry->blockcount) \ +) +DEFINE_IOMAP_EVENT(xfs_iomap_enter); +DEFINE_IOMAP_EVENT(xfs_iomap_found); +DEFINE_IOMAP_EVENT(xfs_iomap_alloc); + +#define DEFINE_SIMPLE_IO_EVENT(name) \ +TRACE_EVENT(name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_ARGS(ip, offset, count), \ + TP_STRUCT__entry( \ + __field(dev_t, dev) \ + __field(xfs_ino_t, ino) \ + __field(loff_t, size) \ + __field(loff_t, new_size) \ + __field(loff_t, offset) \ + __field(size_t, count) \ + ), \ + TP_fast_assign( \ + __entry->dev = VFS_I(ip)->i_sb->s_dev; \ + __entry->ino = ip->i_ino; \ + __entry->size = ip->i_d.di_size; \ + __entry->new_size = ip->i_new_size; \ + __entry->offset = offset; \ + __entry->count = count; \ + ), \ + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ + "offset 0x%llx count %zd", \ + MAJOR(__entry->dev), MINOR(__entry->dev), \ + __entry->ino, \ + __entry->size, \ + __entry->new_size, \ + __entry->offset, \ + __entry->count) \ +); +DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); +DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); + + +TRACE_EVENT(xfs_itruncate_start, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size, int flag, + xfs_off_t toss_start, xfs_off_t toss_finish), + TP_ARGS(ip, new_size, flag, toss_start, toss_finish), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + __field(xfs_off_t, toss_start) + __field(xfs_off_t, toss_finish) + __field(int, flag) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + __entry->toss_start = toss_start; + __entry->toss_finish = toss_finish; + __entry->flag = flag; + ), + TP_printk("dev %d:%d ino 0x%llx %s size 0x%llx new_size 0x%llx " + "toss start 0x%llx toss finish 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->flag, "|", XFS_ITRUNC_FLAGS), + __entry->size, + __entry->new_size, + __entry->toss_start, + __entry->toss_finish) +); + +DECLARE_EVENT_CLASS(xfs_itrunc_class, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), + TP_ARGS(ip, new_size), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size) +) + +#define DEFINE_ITRUNC_EVENT(name) \ +DEFINE_EVENT(xfs_itrunc_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ + TP_ARGS(ip, new_size)) +DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end); + +TRACE_EVENT(xfs_pagecache_inval, + TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), + TP_ARGS(ip, start, finish), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_off_t, start) + __field(xfs_off_t, finish) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->start = start; + __entry->finish = finish; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->start, + __entry->finish) +); + +TRACE_EVENT(xfs_bunmap, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, + int flags, unsigned long caller_ip), + TP_ARGS(ip, bno, len, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fileoff_t, bno) + __field(xfs_filblks_t, len) + __field(unsigned long, caller_ip) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->bno = bno; + __entry->len = len; + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" + "flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->bno, + __entry->len, + __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS), + (void *)__entry->caller_ip) + +); + +TRACE_EVENT(xfs_alloc_busy, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_extlen_t len, int slot), + TP_ARGS(mp, agno, agbno, len, slot), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int, slot) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->slot = slot; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->slot) + +); + +#define XFS_BUSY_STATES \ + { 0, "found" }, \ + { 1, "missing" } + +TRACE_EVENT(xfs_alloc_unbusy, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int slot, int found), + TP_ARGS(mp, agno, slot, found), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, slot) + __field(int, found) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->slot = slot; + __entry->found = found; + ), + TP_printk("dev %d:%d agno %u slot %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->slot, + __print_symbolic(__entry->found, XFS_BUSY_STATES)) +); + +TRACE_EVENT(xfs_alloc_busysearch, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_extlen_t len, xfs_lsn_t lsn), + TP_ARGS(mp, agno, agbno, len, lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->lsn = lsn; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->lsn) +); + +TRACE_EVENT(xfs_agf, + TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, + unsigned long caller_ip), + TP_ARGS(mp, agf, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, flags) + __field(__u32, length) + __field(__u32, bno_root) + __field(__u32, cnt_root) + __field(__u32, bno_level) + __field(__u32, cnt_level) + __field(__u32, flfirst) + __field(__u32, fllast) + __field(__u32, flcount) + __field(__u32, freeblks) + __field(__u32, longest) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = be32_to_cpu(agf->agf_seqno), + __entry->flags = flags; + __entry->length = be32_to_cpu(agf->agf_length), + __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), + __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), + __entry->bno_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), + __entry->cnt_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), + __entry->flfirst = be32_to_cpu(agf->agf_flfirst), + __entry->fllast = be32_to_cpu(agf->agf_fllast), + __entry->flcount = be32_to_cpu(agf->agf_flcount), + __entry->freeblks = be32_to_cpu(agf->agf_freeblks), + __entry->longest = be32_to_cpu(agf->agf_longest); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " + "levels b %u c %u flfirst %u fllast %u flcount %u " + "freeblks %u longest %u caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), + __entry->length, + __entry->bno_root, + __entry->cnt_root, + __entry->bno_level, + __entry->cnt_level, + __entry->flfirst, + __entry->fllast, + __entry->flcount, + __entry->freeblks, + __entry->longest, + (void *)__entry->caller_ip) +); + +TRACE_EVENT(xfs_free_extent, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_extlen_t len, bool isfl, int haveleft, int haveright), + TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int, isfl) + __field(int, haveleft) + __field(int, haveright) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->isfl = isfl; + __entry->haveleft = haveleft; + __entry->haveright = haveright; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->isfl, + __entry->haveleft ? + (__entry->haveright ? "both" : "left") : + (__entry->haveright ? "right" : "none")) + +); + +DECLARE_EVENT_CLASS(xfs_alloc_class, + TP_PROTO(struct xfs_alloc_arg *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, minlen) + __field(xfs_extlen_t, maxlen) + __field(xfs_extlen_t, mod) + __field(xfs_extlen_t, prod) + __field(xfs_extlen_t, minleft) + __field(xfs_extlen_t, total) + __field(xfs_extlen_t, alignment) + __field(xfs_extlen_t, minalignslop) + __field(xfs_extlen_t, len) + __field(short, type) + __field(short, otype) + __field(char, wasdel) + __field(char, wasfromfl) + __field(char, isfl) + __field(char, userdata) + __field(xfs_fsblock_t, firstblock) + ), + TP_fast_assign( + __entry->dev = args->mp->m_super->s_dev; + __entry->agno = args->agno; + __entry->agbno = args->agbno; + __entry->minlen = args->minlen; + __entry->maxlen = args->maxlen; + __entry->mod = args->mod; + __entry->prod = args->prod; + __entry->minleft = args->minleft; + __entry->total = args->total; + __entry->alignment = args->alignment; + __entry->minalignslop = args->minalignslop; + __entry->len = args->len; + __entry->type = args->type; + __entry->otype = args->otype; + __entry->wasdel = args->wasdel; + __entry->wasfromfl = args->wasfromfl; + __entry->isfl = args->isfl; + __entry->userdata = args->userdata; + __entry->firstblock = args->firstblock; + ), + TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + "prod %u minleft %u total %u alignment %u minalignslop %u " + "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " + "userdata %d firstblock 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->minlen, + __entry->maxlen, + __entry->mod, + __entry->prod, + __entry->minleft, + __entry->total, + __entry->alignment, + __entry->minalignslop, + __entry->len, + __print_symbolic(__entry->type, XFS_ALLOC_TYPES), + __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), + __entry->wasdel, + __entry->wasfromfl, + __entry->isfl, + __entry->userdata, + __entry->firstblock) +) + +#define DEFINE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_alloc_class, name, \ + TP_PROTO(struct xfs_alloc_arg *args), \ + TP_ARGS(args)) +DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_near_first); +DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); +DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); +DEFINE_ALLOC_EVENT(xfs_alloc_near_error); +DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); +DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_size_done); +DEFINE_ALLOC_EVENT(xfs_alloc_size_error); +DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); +DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); +DEFINE_ALLOC_EVENT(xfs_alloc_small_done); +DEFINE_ALLOC_EVENT(xfs_alloc_small_error); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); + +DECLARE_EVENT_CLASS(xfs_dir2_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(xfs_dahash_t, hashval) + __field(xfs_ino_t, inumber) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->hashval = args->hashval; + __entry->inumber = args->inumber; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " + "inumber 0x%llx op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->hashval, + __entry->inumber, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_DIR2_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_sf_create); +DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_sf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_sf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8); +DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_block_addname); +DEFINE_DIR2_EVENT(xfs_dir2_block_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_block_replace); +DEFINE_DIR2_EVENT(xfs_dir2_block_removename); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node); +DEFINE_DIR2_EVENT(xfs_dir2_node_addname); +DEFINE_DIR2_EVENT(xfs_dir2_node_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_node_replace); +DEFINE_DIR2_EVENT(xfs_dir2_node_removename); +DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); + +DECLARE_EVENT_CLASS(xfs_dir2_space_class, + TP_PROTO(struct xfs_da_args *args, int idx), + TP_ARGS(args, idx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, idx) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->idx = idx; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->idx) +) + +#define DEFINE_DIR2_SPACE_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_space_class, name, \ + TP_PROTO(struct xfs_da_args *args, int idx), \ + TP_ARGS(args, idx)) +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode); + +TRACE_EVENT(xfs_dir2_leafn_moveents, + TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), + TP_ARGS(args, src_idx, dst_idx, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, src_idx) + __field(int, dst_idx) + __field(int, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->src_idx = src_idx; + __entry->dst_idx = dst_idx; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s " + "src_idx %d dst_idx %d count %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->src_idx, + __entry->dst_idx, + __entry->count) +); + +#endif /* _TRACE_XFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE xfs_trace +#include <trace/define_trace.h> diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index ad7fbead4c97..7c220b4227bc 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -36,10 +36,13 @@ struct attrlist_cursor_kern; /* * Flags for read/write calls - same values as IRIX */ -#define IO_ISAIO 0x00001 /* don't wait for completion */ #define IO_ISDIRECT 0x00004 /* bypass page cache */ #define IO_INVIS 0x00020 /* don't update inode timestamps */ +#define XFS_IO_FLAGS \ + { IO_ISDIRECT, "DIRECT" }, \ + { IO_INVIS, "INVIS"} + /* * Flush/Invalidate options for vop_toss/flush/flushinval_pages. */ diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c index 497c7fb75cc1..0b1878857fc3 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -30,10 +30,10 @@ static int -__xfs_xattr_get(struct inode *inode, const char *name, +xfs_xattr_get(struct dentry *dentry, const char *name, void *value, size_t size, int xflags) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(dentry->d_inode); int error, asize = size; if (strcmp(name, "") == 0) @@ -52,10 +52,10 @@ __xfs_xattr_get(struct inode *inode, const char *name, } static int -__xfs_xattr_set(struct inode *inode, const char *name, const void *value, +xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int xflags) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(dentry->d_inode); if (strcmp(name, "") == 0) return -EINVAL; @@ -71,75 +71,34 @@ __xfs_xattr_set(struct inode *inode, const char *name, const void *value, return -xfs_attr_set(ip, name, (void *)value, size, xflags); } -static int -xfs_xattr_user_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return __xfs_xattr_get(inode, name, value, size, 0); -} - -static int -xfs_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return __xfs_xattr_set(inode, name, value, size, flags, 0); -} - static struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .get = xfs_xattr_user_get, - .set = xfs_xattr_user_set, + .flags = 0, /* no flags implies user namespace */ + .get = xfs_xattr_get, + .set = xfs_xattr_set, }; - -static int -xfs_xattr_trusted_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT); -} - -static int -xfs_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT); -} - static struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .get = xfs_xattr_trusted_get, - .set = xfs_xattr_trusted_set, + .flags = ATTR_ROOT, + .get = xfs_xattr_get, + .set = xfs_xattr_set, }; - -static int -xfs_xattr_secure_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE); -} - -static int -xfs_xattr_secure_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE); -} - static struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = xfs_xattr_secure_get, - .set = xfs_xattr_secure_set, + .flags = ATTR_SECURE, + .get = xfs_xattr_get, + .set = xfs_xattr_set, }; - struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, #ifdef CONFIG_XFS_POSIX_ACL - &xfs_xattr_system_handler, + &xfs_xattr_acl_access_handler, + &xfs_xattr_acl_default_handler, #endif NULL }; diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 2f3f2229eaaf..d7c7eea09fc2 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -47,6 +47,7 @@ #include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" +#include "xfs_trace.h" /* @@ -112,10 +113,7 @@ xfs_qm_dqinit( init_completion(&dqp->q_flush); complete(&dqp->q_flush); -#ifdef XFS_DQUOT_TRACE - dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS); - xfs_dqtrace_entry(dqp, "DQINIT"); -#endif + trace_xfs_dqinit(dqp); } else { /* * Only the q_core portion was zeroed in dqreclaim_one(). @@ -136,10 +134,7 @@ xfs_qm_dqinit( dqp->q_hash = NULL; ASSERT(dqp->dq_flnext == dqp->dq_flprev); -#ifdef XFS_DQUOT_TRACE - ASSERT(dqp->q_trace); - xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT"); -#endif + trace_xfs_dqreuse(dqp); } /* @@ -167,13 +162,8 @@ xfs_qm_dqdestroy( mutex_destroy(&dqp->q_qlock); sv_destroy(&dqp->q_pinwait); - -#ifdef XFS_DQUOT_TRACE - if (dqp->q_trace) - ktrace_free(dqp->q_trace); - dqp->q_trace = NULL; -#endif kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); + atomic_dec(&xfs_Gqm->qm_totaldquots); } @@ -195,49 +185,6 @@ xfs_qm_dqinit_core( d->dd_diskdq.d_flags = type; } - -#ifdef XFS_DQUOT_TRACE -/* - * Dquot tracing for debugging. - */ -/* ARGSUSED */ -void -__xfs_dqtrace_entry( - xfs_dquot_t *dqp, - char *func, - void *retaddr, - xfs_inode_t *ip) -{ - xfs_dquot_t *udqp = NULL; - xfs_ino_t ino = 0; - - ASSERT(dqp->q_trace); - if (ip) { - ino = ip->i_ino; - udqp = ip->i_udquot; - } - ktrace_enter(dqp->q_trace, - (void *)(__psint_t)DQUOT_KTRACE_ENTRY, - (void *)func, - (void *)(__psint_t)dqp->q_nrefs, - (void *)(__psint_t)dqp->dq_flags, - (void *)(__psint_t)dqp->q_res_bcount, - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_bcount), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_icount), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_hardlimit), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_softlimit), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_hardlimit), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_softlimit), - (void *)(__psint_t)be32_to_cpu(dqp->q_core.d_id), - (void *)(__psint_t)current_pid(), - (void *)(__psint_t)ino, - (void *)(__psint_t)retaddr, - (void *)(__psint_t)udqp); - return; -} -#endif - - /* * If default limits are in force, push them into the dquot now. * We overwrite the dquot limits only if they are zero and this @@ -425,7 +372,8 @@ xfs_qm_dqalloc( xfs_trans_t *tp = *tpp; ASSERT(tp != NULL); - xfs_dqtrace_entry(dqp, "DQALLOC"); + + trace_xfs_dqalloc(dqp); /* * Initialize the bmap freelist prior to calling bmapi code. @@ -612,7 +560,8 @@ xfs_qm_dqtobp( * (in which case we already have the buf). */ if (! newdquot) { - xfs_dqtrace_entry(dqp, "DQTOBP READBUF"); + trace_xfs_dqtobp_read(dqp); + if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, XFS_QI_DQCHUNKLEN(mp), @@ -670,11 +619,12 @@ xfs_qm_dqread( ASSERT(tpp); + trace_xfs_dqread(dqp); + /* * get a pointer to the on-disk dquot and the buffer containing it * dqp already knows its own type (GROUP/USER). */ - xfs_dqtrace_entry(dqp, "DQREAD"); if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) { return (error); } @@ -763,7 +713,7 @@ xfs_qm_idtodq( * or if the dquot didn't exist on disk and we ask to * allocate (ENOENT). */ - xfs_dqtrace_entry(dqp, "DQREAD FAIL"); + trace_xfs_dqread_fail(dqp); cancelflags |= XFS_TRANS_ABORT; goto error0; } @@ -817,7 +767,8 @@ xfs_qm_dqlookup( * id can't be modified without the hashlock anyway. */ if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) { - xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP"); + trace_xfs_dqlookup_found(dqp); + /* * All in core dquots must be on the dqlist of mp */ @@ -827,7 +778,7 @@ xfs_qm_dqlookup( if (dqp->q_nrefs == 0) { ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { - xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT"); + trace_xfs_dqlookup_want(dqp); /* * We may have raced with dqreclaim_one() @@ -857,8 +808,7 @@ xfs_qm_dqlookup( /* * take it off the freelist */ - xfs_dqtrace_entry(dqp, - "DQLOOKUP: TAKEOFF FL"); + trace_xfs_dqlookup_freelist(dqp); XQM_FREELIST_REMOVE(dqp); /* xfs_qm_freelist_print(&(xfs_Gqm-> qm_dqfreelist), @@ -878,8 +828,7 @@ xfs_qm_dqlookup( */ ASSERT(mutex_is_locked(&qh->qh_lock)); if (dqp->HL_PREVP != &qh->qh_next) { - xfs_dqtrace_entry(dqp, - "DQLOOKUP: HASH MOVETOFRONT"); + trace_xfs_dqlookup_move(dqp); if ((d = dqp->HL_NEXT)) d->HL_PREVP = dqp->HL_PREVP; *(dqp->HL_PREVP) = d; @@ -889,7 +838,7 @@ xfs_qm_dqlookup( dqp->HL_PREVP = &qh->qh_next; qh->qh_next = dqp; } - xfs_dqtrace_entry(dqp, "LOOKUP END"); + trace_xfs_dqlookup_done(dqp); *O_dqpp = dqp; ASSERT(mutex_is_locked(&qh->qh_lock)); return (0); @@ -971,7 +920,7 @@ xfs_qm_dqget( ASSERT(*O_dqpp); ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); mutex_unlock(&h->qh_lock); - xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)"); + trace_xfs_dqget_hit(*O_dqpp); return (0); /* success */ } XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); @@ -1104,7 +1053,7 @@ xfs_qm_dqget( mutex_unlock(&h->qh_lock); dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); - xfs_dqtrace_entry(dqp, "DQGET DONE"); + trace_xfs_dqget_miss(dqp); *O_dqpp = dqp; return (0); } @@ -1124,7 +1073,8 @@ xfs_qm_dqput( ASSERT(dqp->q_nrefs > 0); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - xfs_dqtrace_entry(dqp, "DQPUT"); + + trace_xfs_dqput(dqp); if (dqp->q_nrefs != 1) { dqp->q_nrefs--; @@ -1137,7 +1087,7 @@ xfs_qm_dqput( * in the right order; but try to get it out-of-order first */ if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { - xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT"); + trace_xfs_dqput_wait(dqp); xfs_dqunlock(dqp); xfs_qm_freelist_lock(xfs_Gqm); xfs_dqlock(dqp); @@ -1148,7 +1098,8 @@ xfs_qm_dqput( /* We can't depend on nrefs being == 1 here */ if (--dqp->q_nrefs == 0) { - xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST"); + trace_xfs_dqput_free(dqp); + /* * insert at end of the freelist. */ @@ -1196,7 +1147,7 @@ xfs_qm_dqrele( if (!dqp) return; - xfs_dqtrace_entry(dqp, "DQRELE"); + trace_xfs_dqrele(dqp); xfs_dqlock(dqp); /* @@ -1229,7 +1180,7 @@ xfs_qm_dqflush( ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); - xfs_dqtrace_entry(dqp, "DQFLUSH"); + trace_xfs_dqflush(dqp); /* * If not dirty, or it's pinned and we are not supposed to @@ -1259,7 +1210,6 @@ xfs_qm_dqflush( * the ondisk-dquot has already been allocated for. */ if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { - xfs_dqtrace_entry(dqp, "DQTOBP FAIL"); ASSERT(error != ENOENT); /* * Quotas could have gotten turned off (ESRCH) @@ -1297,7 +1247,7 @@ xfs_qm_dqflush( * get stuck waiting in the write for too long. */ if (XFS_BUF_ISPINNED(bp)) { - xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE"); + trace_xfs_dqflush_force(dqp); xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); } @@ -1308,7 +1258,9 @@ xfs_qm_dqflush( } else { error = xfs_bwrite(mp, bp); } - xfs_dqtrace_entry(dqp, "DQFLUSH END"); + + trace_xfs_dqflush_done(dqp); + /* * dqp is still locked, but caller is free to unlock it now. */ @@ -1483,7 +1435,7 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { int error; - xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); + /* dqflush unlocks dqflock */ /* * Given that dqpurge is a very rare occurrence, it is OK diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index a2c16bcee90b..a0f7da586d1b 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -85,9 +85,6 @@ typedef struct xfs_dquot { struct completion q_flush; /* flush completion queue */ atomic_t q_pincount; /* dquot pin count */ wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ -#ifdef XFS_DQUOT_TRACE - struct ktrace *q_trace; /* trace header structure */ -#endif } xfs_dquot_t; @@ -144,24 +141,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ (XFS_IS_OQUOTA_ON((d)->q_mount)))) -#ifdef XFS_DQUOT_TRACE -/* - * Dquot Tracing stuff. - */ -#define DQUOT_TRACE_SIZE 64 -#define DQUOT_KTRACE_ENTRY 1 - -extern void __xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func, - void *, xfs_inode_t *); -#define xfs_dqtrace_entry_ino(a,b,ip) \ - __xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip)) -#define xfs_dqtrace_entry(a,b) \ - __xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL) -#else -#define xfs_dqtrace_entry(a,b) -#define xfs_dqtrace_entry_ino(a,b,ip) -#endif - #ifdef QUOTADEBUG extern void xfs_qm_dqprint(xfs_dquot_t *); #else diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 45b1bfef7388..9e627a8b5b0e 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -47,6 +47,7 @@ #include "xfs_trans_space.h" #include "xfs_utils.h" #include "xfs_qm.h" +#include "xfs_trace.h" /* * The global quota manager. There is only one of these for the entire @@ -453,7 +454,7 @@ again: xfs_dqunlock(dqp); continue; } - xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); + /* XXX a sentinel would be better */ recl = XFS_QI_MPLRECLAIMS(mp); if (!xfs_dqflock_nowait(dqp)) { @@ -651,7 +652,7 @@ xfs_qm_dqattach_one( */ dqp = *IO_idqpp; if (dqp) { - xfs_dqtrace_entry(dqp, "DQATTACH: found in ip"); + trace_xfs_dqattach_found(dqp); return 0; } @@ -704,7 +705,7 @@ xfs_qm_dqattach_one( if (error) return error; - xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget"); + trace_xfs_dqattach_get(dqp); /* * dqget may have dropped and re-acquired the ilock, but it guarantees @@ -890,15 +891,15 @@ xfs_qm_dqdetach( if (!(ip->i_udquot || ip->i_gdquot)) return; + trace_xfs_dquot_dqdetach(ip); + ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); if (ip->i_udquot) { - xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip); xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } if (ip->i_gdquot) { - xfs_dqtrace_entry_ino(ip->i_gdquot, "DQDETTACH", ip); xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } @@ -977,7 +978,6 @@ xfs_qm_sync( * across a disk write */ xfs_qm_mplist_unlock(mp); - xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH"); error = xfs_qm_dqflush(dqp, flush_flags); xfs_dqunlock(dqp); if (error && XFS_FORCED_SHUTDOWN(mp)) @@ -1350,7 +1350,8 @@ xfs_qm_reset_dqcounts( xfs_disk_dquot_t *ddq; int j; - xfs_buftrace("RESET DQUOTS", bp); + trace_xfs_reset_dqcounts(bp, _RET_IP_); + /* * Reset all counters and timers. They'll be * started afresh by xfs_qm_quotacheck. @@ -1543,7 +1544,9 @@ xfs_qm_quotacheck_dqadjust( xfs_qcnt_t rtblks) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); - xfs_dqtrace_entry(dqp, "QCHECK DQADJUST"); + + trace_xfs_dqadjust(dqp); + /* * Adjust the inode count and the block count to reflect this inode's * resource usage. @@ -1994,7 +1997,9 @@ xfs_qm_shake_freelist( */ if (XFS_DQ_IS_DIRTY(dqp)) { int error; - xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); + + trace_xfs_dqshake_dirty(dqp); + /* * We flush it delayed write, so don't bother * releasing the mplock. @@ -2038,7 +2043,9 @@ xfs_qm_shake_freelist( return nreclaimed; goto tryagain; } - xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING"); + + trace_xfs_dqshake_unlink(dqp); + #ifdef QUOTADEBUG cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n", dqp, be32_to_cpu(dqp->q_core.d_id)); @@ -2125,7 +2132,9 @@ xfs_qm_dqreclaim_one(void) */ if (dqp->dq_flags & XFS_DQ_WANT) { ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT"); + + trace_xfs_dqreclaim_want(dqp); + xfs_dqunlock(dqp); xfs_qm_freelist_unlock(xfs_Gqm); if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) @@ -2171,7 +2180,9 @@ xfs_qm_dqreclaim_one(void) */ if (XFS_DQ_IS_DIRTY(dqp)) { int error; - xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); + + trace_xfs_dqreclaim_dirty(dqp); + /* * We flush it delayed write, so don't bother * releasing the freelist lock. @@ -2194,8 +2205,9 @@ xfs_qm_dqreclaim_one(void) if (!mutex_trylock(&dqp->q_hash->qh_lock)) goto mplistunlock; + trace_xfs_dqreclaim_unlink(dqp); + ASSERT(dqp->q_nrefs == 0); - xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING"); XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); XQM_FREELIST_REMOVE(dqp); @@ -2430,7 +2442,7 @@ xfs_qm_vop_dqalloc( } } if (uq) - xfs_dqtrace_entry_ino(uq, "DQALLOC", ip); + trace_xfs_dquot_dqalloc(ip); xfs_iunlock(ip, lockflags); if (O_udqpp) diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 5d1a3b98a6e6..873e07e29074 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -49,6 +49,7 @@ #include "xfs_buf_item.h" #include "xfs_utils.h" #include "xfs_qm.h" +#include "xfs_trace.h" #ifdef DEBUG # define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args) @@ -496,7 +497,6 @@ xfs_qm_scall_setqlim( ASSERT(error != ENOENT); return (error); } - xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET"); xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -602,7 +602,6 @@ xfs_qm_scall_setqlim( dqp->dq_flags |= XFS_DQ_DIRTY; xfs_trans_log_dquot(tp, dqp); - xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT"); error = xfs_trans_commit(tp, 0); xfs_qm_dqprint(dqp); xfs_qm_dqrele(dqp); @@ -630,7 +629,6 @@ xfs_qm_scall_getquota( return (error); } - xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS"); /* * If everything's NULL, this dquot doesn't quite exist as far as * our utility programs are concerned. @@ -893,7 +891,7 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); } /*------------------------------------------------------------------------*/ diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index 6f4fd37c67af..d2d20462fd4f 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -41,10 +41,6 @@ extern void assfail(char *expr, char *f, int l); # define STATIC static noinline #endif -#ifndef STATIC_INLINE -# define STATIC_INLINE static inline -#endif - #else /* DEBUG */ #define ASSERT(expr) \ @@ -54,19 +50,5 @@ extern void assfail(char *expr, char *f, int l); # define STATIC noinline #endif -/* - * We stop inlining of inline functions in debug mode. - * Unfortunately, this means static inline in header files - * get multiple definitions, so they need to remain static. - * This then gives tonnes of warnings about unused but defined - * functions, so we need to add the unused attribute to prevent - * these spurious warnings. - */ -#ifndef STATIC_INLINE -# define STATIC_INLINE static __attribute__ ((unused)) noinline -#endif - #endif /* DEBUG */ - - #endif /* __XFS_SUPPORT_DEBUG_H__ */ diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c deleted file mode 100644 index 2d494c26717f..000000000000 --- a/fs/xfs/support/ktrace.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <xfs.h> - -static kmem_zone_t *ktrace_hdr_zone; -static kmem_zone_t *ktrace_ent_zone; -static int ktrace_zentries; - -void __init -ktrace_init(int zentries) -{ - ktrace_zentries = roundup_pow_of_two(zentries); - - ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t), - "ktrace_hdr"); - ASSERT(ktrace_hdr_zone); - - ktrace_ent_zone = kmem_zone_init(ktrace_zentries - * sizeof(ktrace_entry_t), - "ktrace_ent"); - ASSERT(ktrace_ent_zone); -} - -void __exit -ktrace_uninit(void) -{ - kmem_zone_destroy(ktrace_hdr_zone); - kmem_zone_destroy(ktrace_ent_zone); -} - -/* - * ktrace_alloc() - * - * Allocate a ktrace header and enough buffering for the given - * number of entries. Round the number of entries up to a - * power of 2 so we can do fast masking to get the index from - * the atomic index counter. - */ -ktrace_t * -ktrace_alloc(int nentries, unsigned int __nocast sleep) -{ - ktrace_t *ktp; - ktrace_entry_t *ktep; - int entries; - - ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep); - - if (ktp == (ktrace_t*)NULL) { - /* - * KM_SLEEP callers don't expect failure. - */ - if (sleep & KM_SLEEP) - panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); - - return NULL; - } - - /* - * Special treatment for buffers with the ktrace_zentries entries - */ - entries = roundup_pow_of_two(nentries); - if (entries == ktrace_zentries) { - ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone, - sleep); - } else { - ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)), - sleep | KM_LARGE); - } - - if (ktep == NULL) { - /* - * KM_SLEEP callers don't expect failure. - */ - if (sleep & KM_SLEEP) - panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); - - kmem_free(ktp); - - return NULL; - } - - ktp->kt_entries = ktep; - ktp->kt_nentries = entries; - ASSERT(is_power_of_2(entries)); - ktp->kt_index_mask = entries - 1; - atomic_set(&ktp->kt_index, 0); - ktp->kt_rollover = 0; - return ktp; -} - - -/* - * ktrace_free() - * - * Free up the ktrace header and buffer. It is up to the caller - * to ensure that no-one is referencing it. - */ -void -ktrace_free(ktrace_t *ktp) -{ - if (ktp == (ktrace_t *)NULL) - return; - - /* - * Special treatment for the Vnode trace buffer. - */ - if (ktp->kt_nentries == ktrace_zentries) - kmem_zone_free(ktrace_ent_zone, ktp->kt_entries); - else - kmem_free(ktp->kt_entries); - - kmem_zone_free(ktrace_hdr_zone, ktp); -} - - -/* - * Enter the given values into the "next" entry in the trace buffer. - * kt_index is always the index of the next entry to be filled. - */ -void -ktrace_enter( - ktrace_t *ktp, - void *val0, - void *val1, - void *val2, - void *val3, - void *val4, - void *val5, - void *val6, - void *val7, - void *val8, - void *val9, - void *val10, - void *val11, - void *val12, - void *val13, - void *val14, - void *val15) -{ - int index; - ktrace_entry_t *ktep; - - ASSERT(ktp != NULL); - - /* - * Grab an entry by pushing the index up to the next one. - */ - index = atomic_add_return(1, &ktp->kt_index); - index = (index - 1) & ktp->kt_index_mask; - if (!ktp->kt_rollover && index == ktp->kt_nentries - 1) - ktp->kt_rollover = 1; - - ASSERT((index >= 0) && (index < ktp->kt_nentries)); - - ktep = &(ktp->kt_entries[index]); - - ktep->val[0] = val0; - ktep->val[1] = val1; - ktep->val[2] = val2; - ktep->val[3] = val3; - ktep->val[4] = val4; - ktep->val[5] = val5; - ktep->val[6] = val6; - ktep->val[7] = val7; - ktep->val[8] = val8; - ktep->val[9] = val9; - ktep->val[10] = val10; - ktep->val[11] = val11; - ktep->val[12] = val12; - ktep->val[13] = val13; - ktep->val[14] = val14; - ktep->val[15] = val15; -} - -/* - * Return the number of entries in the trace buffer. - */ -int -ktrace_nentries( - ktrace_t *ktp) -{ - int index; - if (ktp == NULL) - return 0; - - index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask; - return (ktp->kt_rollover ? ktp->kt_nentries : index); -} - -/* - * ktrace_first() - * - * This is used to find the start of the trace buffer. - * In conjunction with ktrace_next() it can be used to - * iterate through the entire trace buffer. This code does - * not do any locking because it is assumed that it is called - * from the debugger. - * - * The caller must pass in a pointer to a ktrace_snap - * structure in which we will keep some state used to - * iterate through the buffer. This state must not touched - * by any code outside of this module. - */ -ktrace_entry_t * -ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp) -{ - ktrace_entry_t *ktep; - int index; - int nentries; - - if (ktp->kt_rollover) - index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask; - else - index = 0; - - ktsp->ks_start = index; - ktep = &(ktp->kt_entries[index]); - - nentries = ktrace_nentries(ktp); - index++; - if (index < nentries) { - ktsp->ks_index = index; - } else { - ktsp->ks_index = 0; - if (index > nentries) - ktep = NULL; - } - return ktep; -} - -/* - * ktrace_next() - * - * This is used to iterate through the entries of the given - * trace buffer. The caller must pass in the ktrace_snap_t - * structure initialized by ktrace_first(). The return value - * will be either a pointer to the next ktrace_entry or NULL - * if all of the entries have been traversed. - */ -ktrace_entry_t * -ktrace_next( - ktrace_t *ktp, - ktrace_snap_t *ktsp) -{ - int index; - ktrace_entry_t *ktep; - - index = ktsp->ks_index; - if (index == ktsp->ks_start) { - ktep = NULL; - } else { - ktep = &ktp->kt_entries[index]; - } - - index++; - if (index == ktrace_nentries(ktp)) { - ktsp->ks_index = 0; - } else { - ktsp->ks_index = index; - } - - return ktep; -} - -/* - * ktrace_skip() - * - * Skip the next "count" entries and return the entry after that. - * Return NULL if this causes us to iterate past the beginning again. - */ -ktrace_entry_t * -ktrace_skip( - ktrace_t *ktp, - int count, - ktrace_snap_t *ktsp) -{ - int index; - int new_index; - ktrace_entry_t *ktep; - int nentries = ktrace_nentries(ktp); - - index = ktsp->ks_index; - new_index = index + count; - while (new_index >= nentries) { - new_index -= nentries; - } - if (index == ktsp->ks_start) { - /* - * We've iterated around to the start, so we're done. - */ - ktep = NULL; - } else if ((new_index < index) && (index < ktsp->ks_index)) { - /* - * We've skipped past the start again, so we're done. - */ - ktep = NULL; - ktsp->ks_index = ktsp->ks_start; - } else { - ktep = &(ktp->kt_entries[new_index]); - new_index++; - if (new_index == nentries) { - ktsp->ks_index = 0; - } else { - ktsp->ks_index = new_index; - } - } - return ktep; -} diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h deleted file mode 100644 index 741d6947ca60..000000000000 --- a/fs/xfs/support/ktrace.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_KTRACE_H__ -#define __XFS_SUPPORT_KTRACE_H__ - -/* - * Trace buffer entry structure. - */ -typedef struct ktrace_entry { - void *val[16]; -} ktrace_entry_t; - -/* - * Trace buffer header structure. - */ -typedef struct ktrace { - int kt_nentries; /* number of entries in trace buf */ - atomic_t kt_index; /* current index in entries */ - unsigned int kt_index_mask; - int kt_rollover; - ktrace_entry_t *kt_entries; /* buffer of entries */ -} ktrace_t; - -/* - * Trace buffer snapshot structure. - */ -typedef struct ktrace_snap { - int ks_start; /* kt_index at time of snap */ - int ks_index; /* current index */ -} ktrace_snap_t; - - -#ifdef CONFIG_XFS_TRACE - -extern void ktrace_init(int zentries); -extern void ktrace_uninit(void); - -extern ktrace_t *ktrace_alloc(int, unsigned int __nocast); -extern void ktrace_free(ktrace_t *); - -extern void ktrace_enter( - ktrace_t *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *); - -extern ktrace_entry_t *ktrace_first(ktrace_t *, ktrace_snap_t *); -extern int ktrace_nentries(ktrace_t *); -extern ktrace_entry_t *ktrace_next(ktrace_t *, ktrace_snap_t *); -extern ktrace_entry_t *ktrace_skip(ktrace_t *, int, ktrace_snap_t *); - -#else -#define ktrace_init(x) do { } while (0) -#define ktrace_uninit() do { } while (0) -#endif /* CONFIG_XFS_TRACE */ - -#endif /* __XFS_SUPPORT_KTRACE_H__ */ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 17254b529c54..5ad8ad3a1dcd 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -25,21 +25,5 @@ /* #define QUOTADEBUG 1 */ #endif -#ifdef CONFIG_XFS_TRACE -#define XFS_ALLOC_TRACE 1 -#define XFS_ATTR_TRACE 1 -#define XFS_BLI_TRACE 1 -#define XFS_BMAP_TRACE 1 -#define XFS_BTREE_TRACE 1 -#define XFS_DIR2_TRACE 1 -#define XFS_DQUOT_TRACE 1 -#define XFS_ILOCK_TRACE 1 -#define XFS_LOG_TRACE 1 -#define XFS_RW_TRACE 1 -#define XFS_BUF_TRACE 1 -#define XFS_INODE_TRACE 1 -#define XFS_FILESTREAMS_TRACE 1 -#endif - #include <linux-2.6/xfs_linux.h> #endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 947b150df8ed..00fd357c3e46 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -49,7 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); -extern struct xattr_handler xfs_xattr_system_handler; +extern struct xattr_handler xfs_xattr_acl_access_handler; +extern struct xattr_handler xfs_xattr_acl_default_handler; #else # define xfs_check_acl NULL # define xfs_get_acl(inode, type) NULL diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index a5d54bf4931b..6702bd865811 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -86,6 +86,20 @@ typedef struct xfs_agf { #define XFS_AGF_NUM_BITS 12 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) +#define XFS_AGF_FLAGS \ + { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ + { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ + { XFS_AGF_SEQNO, "SEQNO" }, \ + { XFS_AGF_LENGTH, "LENGTH" }, \ + { XFS_AGF_ROOTS, "ROOTS" }, \ + { XFS_AGF_LEVELS, "LEVELS" }, \ + { XFS_AGF_FLFIRST, "FLFIRST" }, \ + { XFS_AGF_FLLAST, "FLLAST" }, \ + { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ + { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ + { XFS_AGF_LONGEST, "LONGEST" }, \ + { XFS_AGF_BTREEBLKS, "BTREEBLKS" } + /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 2cf944eb796d..275b1f4f9430 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -38,6 +38,7 @@ #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_trace.h" #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) @@ -51,30 +52,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp, xfs_agblock_t bno, xfs_extlen_t len); -#if defined(XFS_ALLOC_TRACE) -ktrace_t *xfs_alloc_trace_buf; - -#define TRACE_ALLOC(s,a) \ - xfs_alloc_trace_alloc(__func__, s, a, __LINE__) -#define TRACE_FREE(s,a,b,x,f) \ - xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__) -#define TRACE_MODAGF(s,a,f) \ - xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__) -#define TRACE_BUSY(__func__,s,ag,agb,l,sl,tp) \ - xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) -#define TRACE_UNBUSY(__func__,s,ag,sl,tp) \ - xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) -#define TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp) \ - xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) -#else -#define TRACE_ALLOC(s,a) -#define TRACE_FREE(s,a,b,x,f) -#define TRACE_MODAGF(s,a,f) -#define TRACE_BUSY(s,a,ag,agb,l,sl,tp) -#define TRACE_UNBUSY(fname,s,ag,sl,tp) -#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp) -#endif /* XFS_ALLOC_TRACE */ - /* * Prototypes for per-ag allocation routines */ @@ -498,124 +475,6 @@ xfs_alloc_read_agfl( return 0; } -#if defined(XFS_ALLOC_TRACE) -/* - * Add an allocation trace entry for an alloc call. - */ -STATIC void -xfs_alloc_trace_alloc( - const char *name, /* function tag string */ - char *str, /* additional string */ - xfs_alloc_arg_t *args, /* allocation argument structure */ - int line) /* source line number */ -{ - ktrace_enter(xfs_alloc_trace_buf, - (void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)), - (void *)name, - (void *)str, - (void *)args->mp, - (void *)(__psunsigned_t)args->agno, - (void *)(__psunsigned_t)args->agbno, - (void *)(__psunsigned_t)args->minlen, - (void *)(__psunsigned_t)args->maxlen, - (void *)(__psunsigned_t)args->mod, - (void *)(__psunsigned_t)args->prod, - (void *)(__psunsigned_t)args->minleft, - (void *)(__psunsigned_t)args->total, - (void *)(__psunsigned_t)args->alignment, - (void *)(__psunsigned_t)args->len, - (void *)((((__psint_t)args->type) << 16) | - (__psint_t)args->otype), - (void *)(__psint_t)((args->wasdel << 3) | - (args->wasfromfl << 2) | - (args->isfl << 1) | - (args->userdata << 0))); -} - -/* - * Add an allocation trace entry for a free call. - */ -STATIC void -xfs_alloc_trace_free( - const char *name, /* function tag string */ - char *str, /* additional string */ - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* a.g. relative block number */ - xfs_extlen_t len, /* length of extent */ - int isfl, /* set if is freelist allocation/free */ - int line) /* source line number */ -{ - ktrace_enter(xfs_alloc_trace_buf, - (void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)), - (void *)name, - (void *)str, - (void *)mp, - (void *)(__psunsigned_t)agno, - (void *)(__psunsigned_t)agbno, - (void *)(__psunsigned_t)len, - (void *)(__psint_t)isfl, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); -} - -/* - * Add an allocation trace entry for modifying an agf. - */ -STATIC void -xfs_alloc_trace_modagf( - const char *name, /* function tag string */ - char *str, /* additional string */ - xfs_mount_t *mp, /* file system mount point */ - xfs_agf_t *agf, /* new agf value */ - int flags, /* logging flags for agf */ - int line) /* source line number */ -{ - ktrace_enter(xfs_alloc_trace_buf, - (void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)), - (void *)name, - (void *)str, - (void *)mp, - (void *)(__psint_t)flags, - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_seqno), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_length), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_flfirst), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_fllast), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_flcount), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_freeblks), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_longest)); -} - -STATIC void -xfs_alloc_trace_busy( - const char *name, /* function tag string */ - char *str, /* additional string */ - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* a.g. relative block number */ - xfs_extlen_t len, /* length of extent */ - int slot, /* perag Busy slot */ - xfs_trans_t *tp, - int trtype, /* type: add, delete, search */ - int line) /* source line number */ -{ - ktrace_enter(xfs_alloc_trace_buf, - (void *)(__psint_t)(trtype | (line << 16)), - (void *)name, - (void *)str, - (void *)mp, - (void *)(__psunsigned_t)agno, - (void *)(__psunsigned_t)agbno, - (void *)(__psunsigned_t)len, - (void *)(__psint_t)slot, - (void *)tp, - NULL, NULL, NULL, NULL, NULL, NULL, NULL); -} -#endif /* XFS_ALLOC_TRACE */ - /* * Allocation group level functions. */ @@ -665,9 +524,6 @@ xfs_alloc_ag_vextent( */ if (args->agbno != NULLAGBLOCK) { xfs_agf_t *agf; /* allocation group freelist header */ -#ifdef XFS_ALLOC_TRACE - xfs_mount_t *mp = args->mp; -#endif long slen = (long)args->len; ASSERT(args->len >= args->minlen && args->len <= args->maxlen); @@ -682,7 +538,6 @@ xfs_alloc_ag_vextent( args->pag->pagf_freeblks -= args->len; ASSERT(be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length)); - TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS); xfs_alloc_log_agf(args->tp, args->agbp, XFS_AGF_FREEBLKS); /* search the busylist for these blocks */ @@ -792,13 +647,14 @@ xfs_alloc_ag_vextent_exact( } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - TRACE_ALLOC("normal", args); + + trace_xfs_alloc_exact_done(args); args->wasfromfl = 0; return 0; error0: xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); - TRACE_ALLOC("error", args); + trace_xfs_alloc_exact_error(args); return error; } @@ -958,7 +814,7 @@ xfs_alloc_ag_vextent_near( args->len = blen; if (!xfs_alloc_fix_minleft(args)) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - TRACE_ALLOC("nominleft", args); + trace_xfs_alloc_near_nominleft(args); return 0; } blen = args->len; @@ -981,7 +837,8 @@ xfs_alloc_ag_vextent_near( goto error0; xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - TRACE_ALLOC("first", args); + + trace_xfs_alloc_near_first(args); return 0; } /* @@ -1272,7 +1129,7 @@ xfs_alloc_ag_vextent_near( * If we couldn't get anything, give up. */ if (bno_cur_lt == NULL && bno_cur_gt == NULL) { - TRACE_ALLOC("neither", args); + trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; return 0; } @@ -1299,7 +1156,7 @@ xfs_alloc_ag_vextent_near( args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); if (!xfs_alloc_fix_minleft(args)) { - TRACE_ALLOC("nominleft", args); + trace_xfs_alloc_near_nominleft(args); xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); return 0; @@ -1314,13 +1171,18 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, ltnew, rlen, XFSA_FIXUP_BNO_OK))) goto error0; - TRACE_ALLOC(j ? "gt" : "lt", args); + + if (j) + trace_xfs_alloc_near_greater(args); + else + trace_xfs_alloc_near_lesser(args); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); return 0; error0: - TRACE_ALLOC("error", args); + trace_xfs_alloc_near_error(args); if (cnt_cur != NULL) xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); if (bno_cur_lt != NULL) @@ -1371,7 +1233,7 @@ xfs_alloc_ag_vextent_size( goto error0; if (i == 0 || flen == 0) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - TRACE_ALLOC("noentry", args); + trace_xfs_alloc_size_noentry(args); return 0; } ASSERT(i == 1); @@ -1448,7 +1310,7 @@ xfs_alloc_ag_vextent_size( xfs_alloc_fix_len(args); if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - TRACE_ALLOC("nominleft", args); + trace_xfs_alloc_size_nominleft(args); args->agbno = NULLAGBLOCK; return 0; } @@ -1471,11 +1333,11 @@ xfs_alloc_ag_vextent_size( args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); - TRACE_ALLOC("normal", args); + trace_xfs_alloc_size_done(args); return 0; error0: - TRACE_ALLOC("error", args); + trace_xfs_alloc_size_error(args); if (cnt_cur) xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); if (bno_cur) @@ -1534,7 +1396,7 @@ xfs_alloc_ag_vextent_small( be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); args->wasfromfl = 1; - TRACE_ALLOC("freelist", args); + trace_xfs_alloc_small_freelist(args); *stat = 0; return 0; } @@ -1556,17 +1418,17 @@ xfs_alloc_ag_vextent_small( */ if (flen < args->minlen) { args->agbno = NULLAGBLOCK; - TRACE_ALLOC("notenough", args); + trace_xfs_alloc_small_notenough(args); flen = 0; } *fbnop = fbno; *flenp = flen; *stat = 1; - TRACE_ALLOC("normal", args); + trace_xfs_alloc_small_done(args); return 0; error0: - TRACE_ALLOC("error", args); + trace_xfs_alloc_small_error(args); return error; } @@ -1809,17 +1671,14 @@ xfs_free_ag_extent( be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length), error0); - TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS); xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); if (!isfl) xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); XFS_STATS_INC(xs_freex); XFS_STATS_ADD(xs_freeb, len); } - TRACE_FREE(haveleft ? - (haveright ? "both" : "left") : - (haveright ? "right" : "none"), - agno, bno, len, isfl); + + trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); /* * Since blocks move to the free list without the coordination @@ -1836,7 +1695,7 @@ xfs_free_ag_extent( return 0; error0: - TRACE_FREE("error", agno, bno, len, isfl); + trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -2122,7 +1981,6 @@ xfs_alloc_get_freelist( logflags |= XFS_AGF_BTREEBLKS; } - TRACE_MODAGF(NULL, agf, logflags); xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; @@ -2165,6 +2023,8 @@ xfs_alloc_log_agf( sizeof(xfs_agf_t) }; + trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); } @@ -2230,13 +2090,11 @@ xfs_alloc_put_freelist( logflags |= XFS_AGF_BTREEBLKS; } - TRACE_MODAGF(NULL, agf, logflags); xfs_alloc_log_agf(tp, agbp, logflags); ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); - TRACE_MODAGF(NULL, agf, logflags); xfs_alloc_log_agf(tp, agbp, logflags); xfs_trans_log_buf(tp, agflbp, (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), @@ -2399,7 +2257,7 @@ xfs_alloc_vextent( args->minlen > args->maxlen || args->minlen > agsize || args->mod >= args->prod) { args->fsbno = NULLFSBLOCK; - TRACE_ALLOC("badargs", args); + trace_xfs_alloc_vextent_badargs(args); return 0; } minleft = args->minleft; @@ -2418,12 +2276,12 @@ xfs_alloc_vextent( error = xfs_alloc_fix_freelist(args, 0); args->minleft = minleft; if (error) { - TRACE_ALLOC("nofix", args); + trace_xfs_alloc_vextent_nofix(args); goto error0; } if (!args->agbp) { up_read(&mp->m_peraglock); - TRACE_ALLOC("noagbp", args); + trace_xfs_alloc_vextent_noagbp(args); break; } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); @@ -2488,7 +2346,7 @@ xfs_alloc_vextent( error = xfs_alloc_fix_freelist(args, flags); args->minleft = minleft; if (error) { - TRACE_ALLOC("nofix", args); + trace_xfs_alloc_vextent_nofix(args); goto error0; } /* @@ -2499,7 +2357,9 @@ xfs_alloc_vextent( goto error0; break; } - TRACE_ALLOC("loopfailed", args); + + trace_xfs_alloc_vextent_loopfailed(args); + /* * Didn't work, figure out the next iteration. */ @@ -2526,7 +2386,7 @@ xfs_alloc_vextent( if (args->agno == sagno) { if (no_min == 1) { args->agbno = NULLAGBLOCK; - TRACE_ALLOC("allfailed", args); + trace_xfs_alloc_vextent_allfailed(args); break; } if (flags == 0) { @@ -2642,16 +2502,16 @@ xfs_alloc_mark_busy(xfs_trans_t *tp, } } + trace_xfs_alloc_busy(mp, agno, bno, len, n); + if (n < XFS_PAGB_NUM_SLOTS) { bsy = &mp->m_perag[agno].pagb_list[n]; mp->m_perag[agno].pagb_count++; - TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp); bsy->busy_start = bno; bsy->busy_length = len; bsy->busy_tp = tp; xfs_trans_add_busy(tp, agno, n); } else { - TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp); /* * The busy list is full! Since it is now not possible to * track the free block, make this a synchronous transaction @@ -2678,12 +2538,12 @@ xfs_alloc_clear_busy(xfs_trans_t *tp, list = mp->m_perag[agno].pagb_list; ASSERT(idx < XFS_PAGB_NUM_SLOTS); + + trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp); + if (list[idx].busy_tp == tp) { - TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp); list[idx].busy_tp = NULL; mp->m_perag[agno].pagb_count--; - } else { - TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp); } spin_unlock(&mp->m_perag[agno].pagb_lock); @@ -2703,45 +2563,41 @@ xfs_alloc_search_busy(xfs_trans_t *tp, xfs_mount_t *mp; xfs_perag_busy_t *bsy; xfs_agblock_t uend, bend; - xfs_lsn_t lsn; + xfs_lsn_t lsn = 0; int cnt; mp = tp->t_mountp; spin_lock(&mp->m_perag[agno].pagb_lock); - cnt = mp->m_perag[agno].pagb_count; uend = bno + len - 1; - /* search pagb_list for this slot, skipping open slots */ - for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) { + /* + * search pagb_list for this slot, skipping open slots. We have to + * search the entire array as there may be multiple overlaps and + * we have to get the most recent LSN for the log force to push out + * all the transactions that span the range. + */ + for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) { + bsy = &mp->m_perag[agno].pagb_list[cnt]; + if (!bsy->busy_tp) + continue; - /* - * (start1,length1) within (start2, length2) - */ - if (bsy->busy_tp != NULL) { - bend = bsy->busy_start + bsy->busy_length - 1; - if ((bno > bend) || (uend < bsy->busy_start)) { - cnt--; - } else { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", - "found1", agno, bno, len, tp); - break; - } - } + bend = bsy->busy_start + bsy->busy_length - 1; + if (bno > bend || uend < bsy->busy_start) + continue; + + /* (start1,length1) within (start2, length2) */ + if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) + lsn = bsy->busy_tp->t_commit_lsn; } + spin_unlock(&mp->m_perag[agno].pagb_lock); + trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); /* * If a block was found, force the log through the LSN of the * transaction that freed the block */ - if (cnt) { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp); - lsn = bsy->busy_tp->t_commit_lsn; - spin_unlock(&mp->m_perag[agno].pagb_lock); + if (lsn) xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); - } else { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp); - spin_unlock(&mp->m_perag[agno].pagb_lock); - } } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index e704caee10df..599bffa39784 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -37,6 +37,15 @@ typedef enum xfs_alloctype XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */ } xfs_alloctype_t; +#define XFS_ALLOC_TYPES \ + { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ + { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ + { XFS_ALLOCTYPE_START_AG, "START_AG" }, \ + { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ + { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ + { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ + { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" } + /* * Flags for xfs_alloc_fix_freelist. */ @@ -109,24 +118,6 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ -#if defined(XFS_ALLOC_TRACE) -/* - * Allocation tracing buffer size. - */ -#define XFS_ALLOC_TRACE_SIZE 4096 -extern ktrace_t *xfs_alloc_trace_buf; - -/* - * Types for alloc tracing. - */ -#define XFS_ALLOC_KTRACE_ALLOC 1 -#define XFS_ALLOC_KTRACE_FREE 2 -#define XFS_ALLOC_KTRACE_MODAGF 3 -#define XFS_ALLOC_KTRACE_BUSY 4 -#define XFS_ALLOC_KTRACE_UNBUSY 5 -#define XFS_ALLOC_KTRACE_BUSYSEARCH 6 -#endif - void xfs_alloc_mark_busy(xfs_trans_t *tp, xfs_agnumber_t agno, diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index c10c3a292d30..adbd9141aea1 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -39,6 +39,7 @@ #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_trace.h" STATIC struct xfs_btree_cur * diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 4ece1906bd41..e953b6cfb2a8 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -47,6 +47,7 @@ #include "xfs_trans_space.h" #include "xfs_rw.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" /* * xfs_attr.c @@ -89,10 +90,6 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ -#if defined(XFS_ATTR_TRACE) -ktrace_t *xfs_attr_trace_buf; -#endif - STATIC int xfs_attr_name_to_xname( struct xfs_name *xname, @@ -123,9 +120,13 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ -int -xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name, - char *value, int *valuelenp, int flags) +STATIC int +xfs_attr_get_int( + struct xfs_inode *ip, + struct xfs_name *name, + char *value, + int *valuelenp, + int flags) { xfs_da_args_t args; int error; @@ -188,7 +189,7 @@ xfs_attr_get( return error; xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_attr_fetch(ip, &xname, value, valuelenp, flags); + error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); return(error); } @@ -636,7 +637,6 @@ xfs_attr_list_int(xfs_attr_list_context_t *context) return EIO; xfs_ilock(dp, XFS_ILOCK_SHARED); - xfs_attr_trace_l_c("syscall start", context); /* * Decide on what work routines to call based on the inode size. @@ -652,7 +652,6 @@ xfs_attr_list_int(xfs_attr_list_context_t *context) } xfs_iunlock(dp, XFS_ILOCK_SHARED); - xfs_attr_trace_l_c("syscall end", context); return error; } @@ -698,7 +697,7 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags, context->count * sizeof(alist->al_offset[0]); context->firstu -= ATTR_ENTSIZE(namelen); if (context->firstu < arraytop) { - xfs_attr_trace_l_c("buffer full", context); + trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; return 1; @@ -710,7 +709,7 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags, aep->a_name[namelen] = 0; alist->al_offset[context->count++] = context->firstu; alist->al_count = context->count; - xfs_attr_trace_l_c("add", context); + trace_xfs_attr_list_add(context); return 0; } @@ -1849,7 +1848,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) node = bp->data; switch (be16_to_cpu(node->hdr.info.magic)) { case XFS_DA_NODE_MAGIC: - xfs_attr_trace_l_cn("wrong blk", context, node); + trace_xfs_attr_list_wrong_blk(context); xfs_da_brelse(NULL, bp); bp = NULL; break; @@ -1857,20 +1856,18 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) leaf = bp->data; if (cursor->hashval > be32_to_cpu(leaf->entries[ be16_to_cpu(leaf->hdr.count)-1].hashval)) { - xfs_attr_trace_l_cl("wrong blk", - context, leaf); + trace_xfs_attr_list_wrong_blk(context); xfs_da_brelse(NULL, bp); bp = NULL; } else if (cursor->hashval <= be32_to_cpu(leaf->entries[0].hashval)) { - xfs_attr_trace_l_cl("maybe wrong blk", - context, leaf); + trace_xfs_attr_list_wrong_blk(context); xfs_da_brelse(NULL, bp); bp = NULL; } break; default: - xfs_attr_trace_l_c("wrong blk - ??", context); + trace_xfs_attr_list_wrong_blk(context); xfs_da_brelse(NULL, bp); bp = NULL; } @@ -1915,8 +1912,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (cursor->hashval <= be32_to_cpu(btree->hashval)) { cursor->blkno = be32_to_cpu(btree->before); - xfs_attr_trace_l_cb("descending", - context, btree); + trace_xfs_attr_list_node_descend(context, + btree); break; } } @@ -2143,8 +2140,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt, - XFS_BUF_LOCK | XBF_DONT_BLOCK); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK); ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); @@ -2266,85 +2263,3 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) } return(0); } - -#if defined(XFS_ATTR_TRACE) -/* - * Add a trace buffer entry for an attr_list context structure. - */ -void -xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context) -{ - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, context, - (__psunsigned_t)NULL, - (__psunsigned_t)NULL, - (__psunsigned_t)NULL); -} - -/* - * Add a trace buffer entry for a context structure and a Btree node. - */ -void -xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, - struct xfs_da_intnode *node) -{ - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, context, - (__psunsigned_t)be16_to_cpu(node->hdr.count), - (__psunsigned_t)be32_to_cpu(node->btree[0].hashval), - (__psunsigned_t)be32_to_cpu(node->btree[ - be16_to_cpu(node->hdr.count)-1].hashval)); -} - -/* - * Add a trace buffer entry for a context structure and a Btree element. - */ -void -xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, - struct xfs_da_node_entry *btree) -{ - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, context, - (__psunsigned_t)be32_to_cpu(btree->hashval), - (__psunsigned_t)be32_to_cpu(btree->before), - (__psunsigned_t)NULL); -} - -/* - * Add a trace buffer entry for a context structure and a leaf block. - */ -void -xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, - struct xfs_attr_leafblock *leaf) -{ - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, context, - (__psunsigned_t)be16_to_cpu(leaf->hdr.count), - (__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval), - (__psunsigned_t)be32_to_cpu(leaf->entries[ - be16_to_cpu(leaf->hdr.count)-1].hashval)); -} - -/* - * Add a trace buffer entry for the arguments given to the routine, - * generic form. - */ -void -xfs_attr_trace_enter(int type, char *where, - struct xfs_attr_list_context *context, - __psunsigned_t a13, __psunsigned_t a14, - __psunsigned_t a15) -{ - ASSERT(xfs_attr_trace_buf); - ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type), - (void *)((__psunsigned_t)where), - (void *)((__psunsigned_t)context->dp), - (void *)((__psunsigned_t)context->cursor->hashval), - (void *)((__psunsigned_t)context->cursor->blkno), - (void *)((__psunsigned_t)context->cursor->offset), - (void *)((__psunsigned_t)context->alist), - (void *)((__psunsigned_t)context->bufsize), - (void *)((__psunsigned_t)context->count), - (void *)((__psunsigned_t)context->firstu), - NULL, - (void *)((__psunsigned_t)context->dupcnt), - (void *)((__psunsigned_t)context->flags), - (void *)a13, (void *)a14, (void *)a15); -} -#endif /* XFS_ATTR_TRACE */ diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index fb3b2a68b9b9..59b410ce69a1 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -48,6 +48,16 @@ struct xfs_attr_list_context; #define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ +#define XFS_ATTR_FLAGS \ + { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ + { ATTR_ROOT, "ROOT" }, \ + { ATTR_TRUST, "TRUST" }, \ + { ATTR_SECURE, "SECURE" }, \ + { ATTR_CREATE, "CREATE" }, \ + { ATTR_REPLACE, "REPLACE" }, \ + { ATTR_KERNOTIME, "KERNOTIME" }, \ + { ATTR_KERNOVAL, "KERNOVAL" } + /* * The maximum size (into the kernel or returned from the kernel) of an * attribute value or the buffer used for an attr_list() call. Larger @@ -131,7 +141,6 @@ typedef struct xfs_attr_list_context { */ int xfs_attr_calc_size(struct xfs_inode *, int, int, int *); int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_list_int(struct xfs_attr_list_context *); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index afdc8911637d..baf41b5af756 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -42,6 +42,7 @@ #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" +#include "xfs_trace.h" /* * xfs_attr_leaf.c @@ -98,7 +99,7 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); * If namespace bits don't match return 0. * If all match then return 1. */ -STATIC_INLINE int +STATIC int xfs_attr_namesp_match(int arg_flags, int ondisk_flags) { return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); @@ -594,7 +595,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor = context->cursor; ASSERT(cursor != NULL); - xfs_attr_trace_l_c("sf start", context); + trace_xfs_attr_list_sf(context); /* * If the buffer is large enough and the cursor is at the start, @@ -627,7 +628,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) return error; sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } - xfs_attr_trace_l_c("sf big-gulp", context); + trace_xfs_attr_list_sf_all(context); return(0); } @@ -653,7 +654,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe); - xfs_attr_trace_l_c("sf corrupted", context); kmem_free(sbuf); return XFS_ERROR(EFSCORRUPTED); } @@ -693,7 +693,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) } if (i == nsbuf) { kmem_free(sbuf); - xfs_attr_trace_l_c("blk end", context); return(0); } @@ -719,7 +718,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) } kmem_free(sbuf); - xfs_attr_trace_l_c("sf E-O-F", context); return(0); } @@ -2323,7 +2321,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) cursor = context->cursor; cursor->initted = 1; - xfs_attr_trace_l_cl("blk start", context, leaf); + trace_xfs_attr_list_leaf(context); /* * Re-find our place in the leaf block if this is a new syscall. @@ -2344,7 +2342,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) } } if (i == be16_to_cpu(leaf->hdr.count)) { - xfs_attr_trace_l_c("not found", context); + trace_xfs_attr_list_notfound(context); return(0); } } else { @@ -2419,7 +2417,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) break; cursor->offset++; } - xfs_attr_trace_l_cl("blk end", context, leaf); + trace_xfs_attr_list_leaf_end(context); return(retval); } diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h index ea22839caed2..76ab7b0cbb3a 100644 --- a/fs/xfs/xfs_attr_sf.h +++ b/fs/xfs/xfs_attr_sf.h @@ -25,8 +25,6 @@ * to fit into the literal area of the inode. */ -struct xfs_inode; - /* * Entries are packed toward the top as tight as possible. */ @@ -69,42 +67,4 @@ typedef struct xfs_attr_sf_sort { (be16_to_cpu(((xfs_attr_shortform_t *) \ ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) -#if defined(XFS_ATTR_TRACE) -/* - * Kernel tracing support for attribute lists - */ -struct xfs_attr_list_context; -struct xfs_da_intnode; -struct xfs_da_node_entry; -struct xfs_attr_leafblock; - -#define XFS_ATTR_TRACE_SIZE 4096 /* size of global trace buffer */ -extern ktrace_t *xfs_attr_trace_buf; - -/* - * Trace record types. - */ -#define XFS_ATTR_KTRACE_L_C 1 /* context */ -#define XFS_ATTR_KTRACE_L_CN 2 /* context, node */ -#define XFS_ATTR_KTRACE_L_CB 3 /* context, btree */ -#define XFS_ATTR_KTRACE_L_CL 4 /* context, leaf */ - -void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context); -void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, - struct xfs_da_intnode *node); -void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, - struct xfs_da_node_entry *btree); -void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, - struct xfs_attr_leafblock *leaf); -void xfs_attr_trace_enter(int type, char *where, - struct xfs_attr_list_context *context, - __psunsigned_t a13, __psunsigned_t a14, - __psunsigned_t a15); -#else -#define xfs_attr_trace_l_c(w,c) -#define xfs_attr_trace_l_cn(w,c,n) -#define xfs_attr_trace_l_cb(w,c,b) -#define xfs_attr_trace_l_cl(w,c,l) -#endif /* XFS_ATTR_TRACE */ - #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8971fb09d387..98251cdc52aa 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -54,6 +54,7 @@ #include "xfs_buf_item.h" #include "xfs_filestream.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" #ifdef DEBUG @@ -272,71 +273,6 @@ xfs_bmap_isaeof( int whichfork, /* data or attribute fork */ char *aeof); /* return value */ -#ifdef XFS_BMAP_TRACE -/* - * Add bmap trace entry prior to a call to xfs_iext_remove. - */ -STATIC void -xfs_bmap_trace_delete( - const char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(entries) deleted */ - xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */ - int whichfork); /* data or attr fork */ - -/* - * Add bmap trace entry prior to a call to xfs_iext_insert, or - * reading in the extents list from the disk (in the btree). - */ -STATIC void -xfs_bmap_trace_insert( - const char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(entries) inserted */ - xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */ - xfs_bmbt_irec_t *r1, /* inserted record 1 */ - xfs_bmbt_irec_t *r2, /* inserted record 2 or null */ - int whichfork); /* data or attr fork */ - -/* - * Add bmap trace entry after updating an extent record in place. - */ -STATIC void -xfs_bmap_trace_post_update( - const char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry updated */ - int whichfork); /* data or attr fork */ - -/* - * Add bmap trace entry prior to updating an extent record in place. - */ -STATIC void -xfs_bmap_trace_pre_update( - const char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry to be updated */ - int whichfork); /* data or attr fork */ - -#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \ - xfs_bmap_trace_delete(__func__,d,ip,i,c,w) -#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \ - xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w) -#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \ - xfs_bmap_trace_post_update(__func__,d,ip,i,w) -#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \ - xfs_bmap_trace_pre_update(__func__,d,ip,i,w) -#else -#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) -#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) -#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) -#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) -#endif /* XFS_BMAP_TRACE */ - /* * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". @@ -363,18 +299,6 @@ xfs_bmap_validate_ret( #define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) #endif /* DEBUG */ -#if defined(XFS_RW_TRACE) -STATIC void -xfs_bunmap_trace( - xfs_inode_t *ip, - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - inst_t *ra); -#else -#define xfs_bunmap_trace(ip, bno, len, flags, ra) -#endif /* XFS_RW_TRACE */ - STATIC int xfs_bmap_count_tree( xfs_mount_t *mp, @@ -590,9 +514,9 @@ xfs_bmap_add_extent( * already extents in the list. */ if (nextents == 0) { - XFS_BMAP_TRACE_INSERT("insert empty", ip, 0, 1, new, NULL, - whichfork); - xfs_iext_insert(ifp, 0, 1, new); + xfs_iext_insert(ip, 0, 1, new, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + ASSERT(cur == NULL); ifp->if_lastex = 0; if (!isnullstartblock(new->br_startblock)) { @@ -759,26 +683,10 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for dnew calculations */ xfs_filblks_t temp2=0;/* value for dnew calculations */ int tmp_rval; /* partial logging flags */ - enum { /* bit number definitions for state */ - LEFT_CONTIG, RIGHT_CONTIG, - LEFT_FILLING, RIGHT_FILLING, - LEFT_DELAY, RIGHT_DELAY, - LEFT_VALID, RIGHT_VALID - }; #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] -#define MASK(b) (1 << (b)) -#define MASK2(a,b) (MASK(a) | MASK(b)) -#define MASK3(a,b,c) (MASK2(a,b) | MASK(c)) -#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d)) -#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) -#define STATE_TEST(b) (state & MASK(b)) -#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ - ((state &= ~MASK(b)), 0)) -#define SWITCH_STATE \ - (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG)) /* * Set up a bunch of variables to make the tests simpler. @@ -790,69 +698,80 @@ xfs_bmap_add_extent_delay_real( new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + /* * Set flags determining what part of the previous delayed allocation * extent is being replaced by a real allocation. */ - STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff); - STATE_SET(RIGHT_FILLING, - PREV.br_startoff + PREV.br_blockcount == new_endoff); + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + /* * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + if (idx > 0) { + state |= BMAP_LEFT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); - STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock)); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; } - STATE_SET(LEFT_CONTIG, - STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && - LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && - LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && - LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN); + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == new->br_state && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + /* * Check and set flags if this segment has a right neighbor. * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (STATE_SET_TEST(RIGHT_VALID, - idx < - ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { + if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); - STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock)); + + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; } - STATE_SET(RIGHT_CONTIG, - STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && - new_endoff == RIGHT.br_startoff && - new->br_startblock + new->br_blockcount == - RIGHT.br_startblock && - new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && - ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) != - MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) || - LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)); + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + new->br_state == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + error = 0; /* * Switch out based on the FILLING and CONTIG state bits. */ - switch (SWITCH_STATE) { - - case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 2); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + + xfs_iext_remove(ip, idx, 2, state); ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents--; if (cur == NULL) @@ -885,20 +804,18 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount; break; - case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: /* * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + ip->i_df.if_lastex = idx - 1; - XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 1); + xfs_iext_remove(ip, idx, 1, state); if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -921,19 +838,19 @@ xfs_bmap_add_extent_delay_real( PREV.br_blockcount; break; - case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; - XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK); - xfs_iext_remove(ifp, idx + 1, 1); + xfs_iext_remove(ip, idx + 1, 1, state); if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -956,15 +873,16 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount; break; - case MASK2(LEFT_FILLING, RIGHT_FILLING): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Filling in all of a previously delayed allocation extent. * Neither the left nor right neighbors are contiguous with * the new one. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) @@ -987,19 +905,20 @@ xfs_bmap_add_extent_delay_real( temp2 = new->br_blockcount; break; - case MASK2(LEFT_FILLING, LEFT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: /* * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + temp = PREV.br_blockcount - new->br_blockcount; - XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); ip->i_df.if_lastex = idx - 1; if (cur == NULL) @@ -1021,7 +940,7 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ temp = LEFT.br_startoff; @@ -1029,18 +948,16 @@ xfs_bmap_add_extent_delay_real( PREV.br_blockcount; break; - case MASK(LEFT_FILLING): + case BMAP_LEFT_FILLING: /* * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL, - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx, 1, new); + xfs_iext_insert(ip, idx, 1, new, state); ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) @@ -1071,27 +988,27 @@ xfs_bmap_add_extent_delay_real( (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx + 1); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); *dnew = temp; /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; temp2 = PREV.br_blockcount; break; - case MASK2(RIGHT_FILLING, RIGHT_CONTIG): + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in the last part of a previous delayed allocation. * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); - XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); ip->i_df.if_lastex = idx + 1; if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -1112,7 +1029,7 @@ xfs_bmap_add_extent_delay_real( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), startblockval(PREV.br_startblock)); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ temp = PREV.br_startoff; @@ -1120,17 +1037,15 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount; break; - case MASK(RIGHT_FILLING): + case BMAP_RIGHT_FILLING: /* * Filling in the last part of a previous delayed allocation. * The right neighbor is not contiguous. */ temp = PREV.br_blockcount - new->br_blockcount; - XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL, - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx + 1, 1, new); + xfs_iext_insert(ip, idx + 1, 1, new, state); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) @@ -1161,7 +1076,7 @@ xfs_bmap_add_extent_delay_real( (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; /* DELTA: One in-core extent is split in two. */ temp = PREV.br_startoff; @@ -1175,7 +1090,7 @@ xfs_bmap_add_extent_delay_real( * This case is avoided almost all the time. */ temp = new->br_startoff - PREV.br_startoff; - XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); r[0] = *new; r[1].br_state = PREV.br_state; @@ -1183,9 +1098,7 @@ xfs_bmap_add_extent_delay_real( r[1].br_startoff = new_endoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_blockcount = temp2; - XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1], - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx + 1, 2, &r[0]); + xfs_iext_insert(ip, idx + 1, 2, &r[0], state); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) @@ -1242,24 +1155,24 @@ xfs_bmap_add_extent_delay_real( } ep = xfs_iext_get_ext(ifp, idx); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); - XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), nullstartblock((int)temp2)); - XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); *dnew = temp + temp2; /* DELTA: One in-core extent is split in three. */ temp = PREV.br_startoff; temp2 = PREV.br_blockcount; break; - case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): - case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): - case MASK2(LEFT_FILLING, RIGHT_CONTIG): - case MASK2(RIGHT_FILLING, LEFT_CONTIG): - case MASK2(LEFT_CONTIG, RIGHT_CONTIG): - case MASK(LEFT_CONTIG): - case MASK(RIGHT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: /* * These cases are all impossible. */ @@ -1279,14 +1192,6 @@ done: #undef LEFT #undef RIGHT #undef PREV -#undef MASK -#undef MASK2 -#undef MASK3 -#undef MASK4 -#undef STATE_SET -#undef STATE_TEST -#undef STATE_SET_TEST -#undef SWITCH_STATE } /* @@ -1316,27 +1221,10 @@ xfs_bmap_add_extent_unwritten_real( int state = 0;/* state bits, accessed thru macros */ xfs_filblks_t temp=0; xfs_filblks_t temp2=0; - enum { /* bit number definitions for state */ - LEFT_CONTIG, RIGHT_CONTIG, - LEFT_FILLING, RIGHT_FILLING, - LEFT_DELAY, RIGHT_DELAY, - LEFT_VALID, RIGHT_VALID - }; #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] -#define MASK(b) (1 << (b)) -#define MASK2(a,b) (MASK(a) | MASK(b)) -#define MASK3(a,b,c) (MASK2(a,b) | MASK(c)) -#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d)) -#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) -#define STATE_TEST(b) (state & MASK(b)) -#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ - ((state &= ~MASK(b)), 0)) -#define SWITCH_STATE \ - (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG)) - /* * Set up a bunch of variables to make the tests simpler. */ @@ -1352,68 +1240,78 @@ xfs_bmap_add_extent_unwritten_real( new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + /* * Set flags determining what part of the previous oldext allocation * extent is being replaced by a newext allocation. */ - STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff); - STATE_SET(RIGHT_FILLING, - PREV.br_startoff + PREV.br_blockcount == new_endoff); + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + /* * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + if (idx > 0) { + state |= BMAP_LEFT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); - STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock)); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; } - STATE_SET(LEFT_CONTIG, - STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && - LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && - LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && - LEFT.br_state == newext && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN); + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == newext && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + /* * Check and set flags if this segment has a right neighbor. * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (STATE_SET_TEST(RIGHT_VALID, - idx < - ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { + if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); - STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock)); + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; } - STATE_SET(RIGHT_CONTIG, - STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && - new_endoff == RIGHT.br_startoff && - new->br_startblock + new->br_blockcount == - RIGHT.br_startblock && - newext == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && - ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) != - MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) || - LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)); + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + newext == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + /* * Switch out based on the FILLING and CONTIG state bits. */ - switch (SWITCH_STATE) { - - case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - XFS_BMAP_TRACE_DELETE("LF|RF|LC|RC", ip, idx, 2, XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 2); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + + xfs_iext_remove(ip, idx, 2, state); ip->i_df.if_lastex = idx - 1; ip->i_d.di_nextents -= 2; if (cur == NULL) @@ -1450,20 +1348,18 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_blockcount; break; - case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: /* * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|LC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + PREV.br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LF|RF|LC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + ip->i_df.if_lastex = idx - 1; - XFS_BMAP_TRACE_DELETE("LF|RF|LC", ip, idx, 1, XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 1); + xfs_iext_remove(ip, idx, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1492,21 +1388,18 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_blockcount; break; - case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|RF|RC", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); xfs_bmbt_set_state(ep, newext); - XFS_BMAP_TRACE_POST_UPDATE("LF|RF|RC", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); ip->i_df.if_lastex = idx; - XFS_BMAP_TRACE_DELETE("LF|RF|RC", ip, idx + 1, 1, XFS_DATA_FORK); - xfs_iext_remove(ifp, idx + 1, 1); + xfs_iext_remove(ip, idx + 1, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1535,17 +1428,16 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_blockcount; break; - case MASK2(LEFT_FILLING, RIGHT_FILLING): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Setting all of a previous oldext extent to newext. * Neither the left nor right neighbors are contiguous with * the new one. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|RF", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_state(ep, newext); - XFS_BMAP_TRACE_POST_UPDATE("LF|RF", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -1566,27 +1458,25 @@ xfs_bmap_add_extent_unwritten_real( temp2 = new->br_blockcount; break; - case MASK2(LEFT_FILLING, LEFT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: /* * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx - 1, - XFS_DATA_FORK); - XFS_BMAP_TRACE_PRE_UPDATE("LF|LC", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx - 1; if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -1617,22 +1507,21 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_blockcount; break; - case MASK(LEFT_FILLING): + case BMAP_LEFT_FILLING: /* * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ - XFS_BMAP_TRACE_PRE_UPDATE("LF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); xfs_bmbt_set_startoff(ep, new_endoff); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx, XFS_DATA_FORK); - XFS_BMAP_TRACE_INSERT("LF", ip, idx, 1, new, NULL, - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx, 1, new); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + + xfs_iext_insert(ip, idx, 1, new, state); ip->i_df.if_lastex = idx; ip->i_d.di_nextents++; if (cur == NULL) @@ -1660,24 +1549,21 @@ xfs_bmap_add_extent_unwritten_real( temp2 = PREV.br_blockcount; break; - case MASK2(RIGHT_FILLING, RIGHT_CONTIG): + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ - XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx, - XFS_DATA_FORK); - XFS_BMAP_TRACE_PRE_UPDATE("RF|RC", ip, idx + 1, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, newext); - XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx + 1, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); + ip->i_df.if_lastex = idx + 1; if (cur == NULL) rval = XFS_ILOG_DEXT; @@ -1707,18 +1593,17 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_blockcount; break; - case MASK(RIGHT_FILLING): + case BMAP_RIGHT_FILLING: /* * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. */ - XFS_BMAP_TRACE_PRE_UPDATE("RF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); - XFS_BMAP_TRACE_INSERT("RF", ip, idx + 1, 1, new, NULL, - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx + 1, 1, new); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + + xfs_iext_insert(ip, idx + 1, 1, new, state); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents++; if (cur == NULL) @@ -1756,19 +1641,18 @@ xfs_bmap_add_extent_unwritten_real( * newext. Contiguity is impossible here. * One extent becomes three extents. */ - XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, new->br_startoff - PREV.br_startoff); - XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + r[0] = *new; r[1].br_startoff = new_endoff; r[1].br_blockcount = PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; r[1].br_state = oldext; - XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 2, &r[0], &r[1], - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx + 1, 2, &r[0]); + xfs_iext_insert(ip, idx + 1, 2, &r[0], state); ip->i_df.if_lastex = idx + 1; ip->i_d.di_nextents += 2; if (cur == NULL) @@ -1813,13 +1697,13 @@ xfs_bmap_add_extent_unwritten_real( temp2 = PREV.br_blockcount; break; - case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): - case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): - case MASK2(LEFT_FILLING, RIGHT_CONTIG): - case MASK2(RIGHT_FILLING, LEFT_CONTIG): - case MASK2(LEFT_CONTIG, RIGHT_CONTIG): - case MASK(LEFT_CONTIG): - case MASK(RIGHT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: /* * These cases are all impossible. */ @@ -1839,14 +1723,6 @@ done: #undef LEFT #undef RIGHT #undef PREV -#undef MASK -#undef MASK2 -#undef MASK3 -#undef MASK4 -#undef STATE_SET -#undef STATE_TEST -#undef STATE_SET_TEST -#undef SWITCH_STATE } /* @@ -1872,62 +1748,57 @@ xfs_bmap_add_extent_hole_delay( int state; /* state bits, accessed thru macros */ xfs_filblks_t temp=0; /* temp for indirect calculations */ xfs_filblks_t temp2=0; - enum { /* bit number definitions for state */ - LEFT_CONTIG, RIGHT_CONTIG, - LEFT_DELAY, RIGHT_DELAY, - LEFT_VALID, RIGHT_VALID - }; - -#define MASK(b) (1 << (b)) -#define MASK2(a,b) (MASK(a) | MASK(b)) -#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) -#define STATE_TEST(b) (state & MASK(b)) -#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ - ((state &= ~MASK(b)), 0)) -#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG)) ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, idx); state = 0; ASSERT(isnullstartblock(new->br_startblock)); + /* * Check and set flags if this segment has a left neighbor */ - if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + if (idx > 0) { + state |= BMAP_LEFT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); - STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock)); + + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; } + /* * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (STATE_SET_TEST(RIGHT_VALID, - idx < - ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(ep, &right); - STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock)); + + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; } + /* * Set contiguity flags on the left and right neighbors. * Don't let extents get too large, even if the pieces are contiguous. */ - STATE_SET(LEFT_CONTIG, - STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN); - STATE_SET(RIGHT_CONTIG, - STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && - (!STATE_TEST(LEFT_CONTIG) || - (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN))); + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN))) + state |= BMAP_RIGHT_CONTIG; + /* * Switch out based on the contiguity flags. */ - switch (SWITCH_STATE) { - - case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with delayed allocations * on the left and on the right. @@ -1935,8 +1806,8 @@ xfs_bmap_add_extent_hole_delay( */ temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; - XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, - XFS_DATA_FORK); + + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + @@ -1944,53 +1815,52 @@ xfs_bmap_add_extent_hole_delay( newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), nullstartblock((int)newlen)); - XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, - XFS_DATA_FORK); - XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 1); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + + xfs_iext_remove(ip, idx, 1, state); ip->i_df.if_lastex = idx - 1; /* DELTA: Two in-core extents were replaced by one. */ temp2 = temp; temp = left.br_startoff; break; - case MASK(LEFT_CONTIG): + case BMAP_LEFT_CONTIG: /* * New allocation is contiguous with a delayed allocation * on the left. * Merge the new allocation with the left neighbor. */ temp = left.br_blockcount + new->br_blockcount; - XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), nullstartblock((int)newlen)); - XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + ip->i_df.if_lastex = idx - 1; /* DELTA: One in-core extent grew into a hole. */ temp2 = temp; temp = left.br_startoff; break; - case MASK(RIGHT_CONTIG): + case BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with a delayed allocation * on the right. * Merge the new allocation with the right neighbor. */ - XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_allf(ep, new->br_startoff, nullstartblock((int)newlen), temp, right.br_state); - XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ip->i_df.if_lastex = idx; /* DELTA: One in-core extent grew into a hole. */ temp2 = temp; @@ -2004,9 +1874,7 @@ xfs_bmap_add_extent_hole_delay( * Insert a new entry. */ oldlen = newlen = 0; - XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx, 1, new); + xfs_iext_insert(ip, idx, 1, new, state); ip->i_df.if_lastex = idx; /* DELTA: A new in-core extent was added in a hole. */ temp2 = new->br_blockcount; @@ -2030,12 +1898,6 @@ xfs_bmap_add_extent_hole_delay( } *logflagsp = 0; return 0; -#undef MASK -#undef MASK2 -#undef STATE_SET -#undef STATE_TEST -#undef STATE_SET_TEST -#undef SWITCH_STATE } /* @@ -2062,83 +1924,75 @@ xfs_bmap_add_extent_hole_real( int state; /* state bits, accessed thru macros */ xfs_filblks_t temp=0; xfs_filblks_t temp2=0; - enum { /* bit number definitions for state */ - LEFT_CONTIG, RIGHT_CONTIG, - LEFT_DELAY, RIGHT_DELAY, - LEFT_VALID, RIGHT_VALID - }; - -#define MASK(b) (1 << (b)) -#define MASK2(a,b) (MASK(a) | MASK(b)) -#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) -#define STATE_TEST(b) (state & MASK(b)) -#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ - ((state &= ~MASK(b)), 0)) -#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG)) ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); ep = xfs_iext_get_ext(ifp, idx); state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + /* * Check and set flags if this segment has a left neighbor. */ - if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { + if (idx > 0) { + state |= BMAP_LEFT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); - STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock)); + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; } + /* * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (STATE_SET_TEST(RIGHT_VALID, - idx < - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(ep, &right); - STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock)); + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; } + /* * We're inserting a real allocation between "left" and "right". * Set the contiguity flags. Don't let extents get too large. */ - STATE_SET(LEFT_CONTIG, - STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_startblock + left.br_blockcount == new->br_startblock && - left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN); - STATE_SET(RIGHT_CONTIG, - STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_startblock + new->br_blockcount == - right.br_startblock && - new->br_state == right.br_state && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && - (!STATE_TEST(LEFT_CONTIG) || - left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN)); + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_startblock + left.br_blockcount == new->br_startblock && + left.br_state == new->br_state && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_startblock + new->br_blockcount == right.br_startblock && + new->br_state == right.br_state && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; error = 0; /* * Select which case we're in here, and implement it. */ - switch (SWITCH_STATE) { - - case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with real allocations on the * left and on the right. * Merge all three into a single extent record. */ - XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, - whichfork); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount + right.br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, - whichfork); - XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, whichfork); - xfs_iext_remove(ifp, idx, 1); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + + xfs_iext_remove(ip, idx, 1, state); ifp->if_lastex = idx - 1; XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); @@ -2173,16 +2027,17 @@ xfs_bmap_add_extent_hole_real( right.br_blockcount; break; - case MASK(LEFT_CONTIG): + case BMAP_LEFT_CONTIG: /* * New allocation is contiguous with a real allocation * on the left. * Merge the new allocation with the left neighbor. */ - XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, whichfork); + trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), left.br_blockcount + new->br_blockcount); - XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork); + trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); + ifp->if_lastex = idx - 1; if (cur == NULL) { rval = xfs_ilog_fext(whichfork); @@ -2207,17 +2062,18 @@ xfs_bmap_add_extent_hole_real( new->br_blockcount; break; - case MASK(RIGHT_CONTIG): + case BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with a real allocation * on the right. * Merge the new allocation with the right neighbor. */ - XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, whichfork); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + ifp->if_lastex = idx; if (cur == NULL) { rval = xfs_ilog_fext(whichfork); @@ -2248,8 +2104,7 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - XFS_BMAP_TRACE_INSERT("0", ip, idx, 1, new, NULL, whichfork); - xfs_iext_insert(ifp, idx, 1, new); + xfs_iext_insert(ip, idx, 1, new, state); ifp->if_lastex = idx; XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); @@ -2283,12 +2138,6 @@ xfs_bmap_add_extent_hole_real( done: *logflagsp = rval; return error; -#undef MASK -#undef MASK2 -#undef STATE_SET -#undef STATE_TEST -#undef STATE_SET_TEST -#undef SWITCH_STATE } /* @@ -3115,8 +2964,13 @@ xfs_bmap_del_extent( uint qfield; /* quota field to update */ xfs_filblks_t temp; /* for indirect length calculations */ xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT((idx >= 0) && (idx < ifp->if_bytes / @@ -3196,8 +3050,8 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ - XFS_BMAP_TRACE_DELETE("3", ip, idx, 1, whichfork); - xfs_iext_remove(ifp, idx, 1); + xfs_iext_remove(ip, idx, 1, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); ifp->if_lastex = idx; if (delay) break; @@ -3217,7 +3071,7 @@ xfs_bmap_del_extent( /* * Deleting the first part of the extent. */ - XFS_BMAP_TRACE_PRE_UPDATE("2", ip, idx, whichfork); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, del_endoff); temp = got.br_blockcount - del->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); @@ -3226,13 +3080,12 @@ xfs_bmap_del_extent( temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, - whichfork); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); da_new = temp; break; } xfs_bmbt_set_startblock(ep, del_endblock); - XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; @@ -3248,19 +3101,18 @@ xfs_bmap_del_extent( * Deleting the last part of the extent. */ temp = got.br_blockcount - del->br_blockcount; - XFS_BMAP_TRACE_PRE_UPDATE("1", ip, idx, whichfork); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, - whichfork); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); da_new = temp; break; } - XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); if (!cur) { flags |= xfs_ilog_fext(whichfork); break; @@ -3277,7 +3129,7 @@ xfs_bmap_del_extent( * Deleting the middle of the extent. */ temp = del->br_startoff - got.br_startoff; - XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx, whichfork); + trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); new.br_startoff = del_endoff; temp2 = got_endoff - del_endoff; @@ -3364,10 +3216,8 @@ xfs_bmap_del_extent( } } } - XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, whichfork); - XFS_BMAP_TRACE_INSERT("0", ip, idx + 1, 1, &new, NULL, - whichfork); - xfs_iext_insert(ifp, idx + 1, 1, &new); + trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); + xfs_iext_insert(ip, idx + 1, 1, &new, state); ifp->if_lastex = idx + 1; break; } @@ -3687,7 +3537,9 @@ xfs_bmap_local_to_extents( xfs_iext_add(ifp, 0, 1); ep = xfs_iext_get_ext(ifp, 0); xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); - XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork); + trace_xfs_bmap_post_update(ip, 0, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, + _THIS_IP_); XFS_IFORK_NEXT_SET(ip, whichfork, 1); ip->i_d.di_nblocks = 1; xfs_trans_mod_dquot_byino(tp, ip, @@ -3800,158 +3652,6 @@ xfs_bmap_search_extents( return ep; } - -#ifdef XFS_BMAP_TRACE -ktrace_t *xfs_bmap_trace_buf; - -/* - * Add a bmap trace buffer entry. Base routine for the others. - */ -STATIC void -xfs_bmap_trace_addentry( - int opcode, /* operation */ - const char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(ies) */ - xfs_extnum_t cnt, /* count of entries, 1 or 2 */ - xfs_bmbt_rec_host_t *r1, /* first record */ - xfs_bmbt_rec_host_t *r2, /* second record or null */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_host_t tr2; - - ASSERT(cnt == 1 || cnt == 2); - ASSERT(r1 != NULL); - if (cnt == 1) { - ASSERT(r2 == NULL); - r2 = &tr2; - memset(&tr2, 0, sizeof(tr2)); - } else - ASSERT(r2 != NULL); - ktrace_enter(xfs_bmap_trace_buf, - (void *)(__psint_t)(opcode | (whichfork << 16)), - (void *)fname, (void *)desc, (void *)ip, - (void *)(__psint_t)idx, - (void *)(__psint_t)cnt, - (void *)(__psunsigned_t)(ip->i_ino >> 32), - (void *)(__psunsigned_t)(unsigned)ip->i_ino, - (void *)(__psunsigned_t)(r1->l0 >> 32), - (void *)(__psunsigned_t)(unsigned)(r1->l0), - (void *)(__psunsigned_t)(r1->l1 >> 32), - (void *)(__psunsigned_t)(unsigned)(r1->l1), - (void *)(__psunsigned_t)(r2->l0 >> 32), - (void *)(__psunsigned_t)(unsigned)(r2->l0), - (void *)(__psunsigned_t)(r2->l1 >> 32), - (void *)(__psunsigned_t)(unsigned)(r2->l1) - ); - ASSERT(ip->i_xtrace); - ktrace_enter(ip->i_xtrace, - (void *)(__psint_t)(opcode | (whichfork << 16)), - (void *)fname, (void *)desc, (void *)ip, - (void *)(__psint_t)idx, - (void *)(__psint_t)cnt, - (void *)(__psunsigned_t)(ip->i_ino >> 32), - (void *)(__psunsigned_t)(unsigned)ip->i_ino, - (void *)(__psunsigned_t)(r1->l0 >> 32), - (void *)(__psunsigned_t)(unsigned)(r1->l0), - (void *)(__psunsigned_t)(r1->l1 >> 32), - (void *)(__psunsigned_t)(unsigned)(r1->l1), - (void *)(__psunsigned_t)(r2->l0 >> 32), - (void *)(__psunsigned_t)(unsigned)(r2->l0), - (void *)(__psunsigned_t)(r2->l1 >> 32), - (void *)(__psunsigned_t)(unsigned)(r2->l1) - ); -} - -/* - * Add bmap trace entry prior to a call to xfs_iext_remove. - */ -STATIC void -xfs_bmap_trace_delete( - const char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(entries) deleted */ - xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */ - int whichfork) /* data or attr fork */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx, - cnt, xfs_iext_get_ext(ifp, idx), - cnt == 2 ? xfs_iext_get_ext(ifp, idx + 1) : NULL, - whichfork); -} - -/* - * Add bmap trace entry prior to a call to xfs_iext_insert, or - * reading in the extents list from the disk (in the btree). - */ -STATIC void -xfs_bmap_trace_insert( - const char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(entries) inserted */ - xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */ - xfs_bmbt_irec_t *r1, /* inserted record 1 */ - xfs_bmbt_irec_t *r2, /* inserted record 2 or null */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_host_t tr1; /* compressed record 1 */ - xfs_bmbt_rec_host_t tr2; /* compressed record 2 if needed */ - - xfs_bmbt_set_all(&tr1, r1); - if (cnt == 2) { - ASSERT(r2 != NULL); - xfs_bmbt_set_all(&tr2, r2); - } else { - ASSERT(cnt == 1); - ASSERT(r2 == NULL); - } - xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx, - cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork); -} - -/* - * Add bmap trace entry after updating an extent record in place. - */ -STATIC void -xfs_bmap_trace_post_update( - const char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry updated */ - int whichfork) /* data or attr fork */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx, - 1, xfs_iext_get_ext(ifp, idx), NULL, whichfork); -} - -/* - * Add bmap trace entry prior to updating an extent record in place. - */ -STATIC void -xfs_bmap_trace_pre_update( - const char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry to be updated */ - int whichfork) /* data or attr fork */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1, - xfs_iext_get_ext(ifp, idx), NULL, whichfork); -} -#endif /* XFS_BMAP_TRACE */ - /* * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". @@ -3983,37 +3683,6 @@ xfs_bmap_worst_indlen( return rval; } -#if defined(XFS_RW_TRACE) -STATIC void -xfs_bunmap_trace( - xfs_inode_t *ip, - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - inst_t *ra) -{ - if (ip->i_rwtrace == NULL) - return; - ktrace_enter(ip->i_rwtrace, - (void *)(__psint_t)XFS_BUNMAP, - (void *)ip, - (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff), - (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff), - (void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff), - (void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff), - (void *)(__psint_t)len, - (void *)(__psint_t)flags, - (void *)(unsigned long)current_cpu(), - (void *)ra, - (void *)0, - (void *)0, - (void *)0, - (void *)0, - (void *)0, - (void *)0); -} -#endif - /* * Convert inode from non-attributed to attributed. * Must not be in a transaction, ip must not be locked. @@ -4702,34 +4371,30 @@ error0: return XFS_ERROR(EFSCORRUPTED); } -#ifdef XFS_BMAP_TRACE +#ifdef DEBUG /* * Add bmap trace insert entries for all the contents of the extent records. */ void xfs_bmap_trace_exlist( - const char *fname, /* function name */ xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork) /* data or attr fork */ + int whichfork, /* data or attr fork */ + unsigned long caller_ip) { - xfs_bmbt_rec_host_t *ep; /* current extent record */ xfs_extnum_t idx; /* extent record index */ xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_irec_t s; /* file extent record */ + int state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - for (idx = 0; idx < cnt; idx++) { - ep = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_get_all(ep, &s); - XFS_BMAP_TRACE_INSERT("exlist", ip, idx, 1, &s, NULL, - whichfork); - } + for (idx = 0; idx < cnt; idx++) + trace_xfs_extlist(ip, idx, whichfork, caller_ip); } -#endif -#ifdef DEBUG /* * Validate that the bmbt_irecs being returned from bmapi are valid * given the callers original parameters. Specifically check the @@ -5478,7 +5143,8 @@ xfs_bunmapi( int rsvd; /* OK to allocate reserved blocks */ xfs_fsblock_t sum; - xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address); + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; ifp = XFS_IFORK_PTR(ip, whichfork); diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 56f62d2edc35..419dafb9d87d 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -95,6 +95,21 @@ typedef struct xfs_bmap_free /* need write cache flushing and no */ /* additional allocation alignments */ +#define XFS_BMAPI_FLAGS \ + { XFS_BMAPI_WRITE, "WRITE" }, \ + { XFS_BMAPI_DELAY, "DELAY" }, \ + { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ + { XFS_BMAPI_METADATA, "METADATA" }, \ + { XFS_BMAPI_EXACT, "EXACT" }, \ + { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ + { XFS_BMAPI_ASYNC, "ASYNC" }, \ + { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ + { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ + { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ + { XFS_BMAPI_CONTIG, "CONTIG" }, \ + { XFS_BMAPI_CONVERT, "CONVERT" } + + static inline int xfs_bmapi_aflag(int w) { return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); @@ -135,36 +150,43 @@ typedef struct xfs_bmalloca { char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; -#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE) /* - * Trace operations for bmap extent tracing + * Flags for xfs_bmap_add_extent*. */ -#define XFS_BMAP_KTRACE_DELETE 1 -#define XFS_BMAP_KTRACE_INSERT 2 -#define XFS_BMAP_KTRACE_PRE_UP 3 -#define XFS_BMAP_KTRACE_POST_UP 4 - -#define XFS_BMAP_TRACE_SIZE 4096 /* size of global trace buffer */ -#define XFS_BMAP_KTRACE_SIZE 32 /* size of per-inode trace buffer */ -extern ktrace_t *xfs_bmap_trace_buf; +#define BMAP_LEFT_CONTIG (1 << 0) +#define BMAP_RIGHT_CONTIG (1 << 1) +#define BMAP_LEFT_FILLING (1 << 2) +#define BMAP_RIGHT_FILLING (1 << 3) +#define BMAP_LEFT_DELAY (1 << 4) +#define BMAP_RIGHT_DELAY (1 << 5) +#define BMAP_LEFT_VALID (1 << 6) +#define BMAP_RIGHT_VALID (1 << 7) +#define BMAP_ATTRFORK (1 << 8) + +#define XFS_BMAP_EXT_FLAGS \ + { BMAP_LEFT_CONTIG, "LC" }, \ + { BMAP_RIGHT_CONTIG, "RC" }, \ + { BMAP_LEFT_FILLING, "LF" }, \ + { BMAP_RIGHT_FILLING, "RF" }, \ + { BMAP_ATTRFORK, "ATTR" } /* * Add bmap trace insert entries for all the contents of the extent list. + * + * Quite excessive tracing. Only do this for debug builds. */ +#if defined(__KERNEL) && defined(DEBUG) void xfs_bmap_trace_exlist( - const char *fname, /* function name */ struct xfs_inode *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in list */ - int whichfork); /* data or attr fork */ + int whichfork, + unsigned long caller_ip); /* data or attr fork */ #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ - xfs_bmap_trace_exlist(__func__,ip,c,w) - -#else /* __KERNEL__ && XFS_BMAP_TRACE */ - + xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) +#else #define XFS_BMAP_TRACE_EXLIST(ip,c,w) - -#endif /* __KERNEL__ && XFS_BMAP_TRACE */ +#endif /* * Convert inode from non-attributed to attributed. diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index eb7b702d0690..38751d5fac6f 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -98,8 +98,7 @@ xfs_bmdr_to_bmbt( * This code must be in sync with the routines xfs_bmbt_get_startoff, * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. */ - -STATIC_INLINE void +STATIC void __xfs_bmbt_get_all( __uint64_t l0, __uint64_t l1, @@ -769,12 +768,6 @@ xfs_bmbt_trace_enter( (void *)a0, (void *)a1, (void *)a2, (void *)a3, (void *)a4, (void *)a5, (void *)a6, (void *)a7, (void *)a8, (void *)a9, (void *)a10); - ktrace_enter(ip->i_btrace, - (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), - (void *)func, (void *)s, (void *)ip, (void *)cur, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10); } STATIC void diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 5549d495947f..cf07ca7c22e7 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -46,20 +46,12 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTBLOCK_BITLEN 52 #define BMBT_BLOCKCOUNT_BITLEN 21 - -#define BMBT_USE_64 1 - -typedef struct xfs_bmbt_rec_32 -{ - __uint32_t l0, l1, l2, l3; -} xfs_bmbt_rec_32_t; -typedef struct xfs_bmbt_rec_64 -{ +typedef struct xfs_bmbt_rec { __be64 l0, l1; -} xfs_bmbt_rec_64_t; +} xfs_bmbt_rec_t; typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ -typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t; +typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; typedef struct xfs_bmbt_rec_host { __uint64_t l0, l1; diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 52b5f14d0c32..36a0992dd669 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -39,6 +39,7 @@ #include "xfs_btree_trace.h" #include "xfs_ialloc.h" #include "xfs_error.h" +#include "xfs_trace.h" /* * Cursor allocation zone. @@ -81,7 +82,7 @@ xfs_btree_check_lblock( XFS_ERRTAG_BTREE_CHECK_LBLOCK, XFS_RANDOM_BTREE_CHECK_LBLOCK))) { if (bp) - xfs_buftrace("LBTREE ERROR", bp); + trace_xfs_btree_corrupt(bp, _RET_IP_); XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); @@ -119,7 +120,7 @@ xfs_btree_check_sblock( XFS_ERRTAG_BTREE_CHECK_SBLOCK, XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) - xfs_buftrace("SBTREE ERROR", bp); + trace_xfs_btree_corrupt(bp, _RET_IP_); XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW, cur->bc_mp, block); return XFS_ERROR(EFSCORRUPTED); diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h index b3f5eb3c3c6c..2d8a309873ea 100644 --- a/fs/xfs/xfs_btree_trace.h +++ b/fs/xfs/xfs_btree_trace.h @@ -58,8 +58,6 @@ void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *, struct xfs_buf *, int, int); void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *, struct xfs_buf *, int, int, int); -void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *, - xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int); void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int); void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int, union xfs_btree_ptr, union xfs_btree_key *, int); @@ -71,24 +69,10 @@ void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *, union xfs_btree_rec *, int); void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int); - -#define XFS_ALLOCBT_TRACE_SIZE 4096 /* size of global trace buffer */ -extern ktrace_t *xfs_allocbt_trace_buf; - -#define XFS_INOBT_TRACE_SIZE 4096 /* size of global trace buffer */ -extern ktrace_t *xfs_inobt_trace_buf; - -#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */ -#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */ -extern ktrace_t *xfs_bmbt_trace_buf; - - #define XFS_BTREE_TRACE_ARGBI(c, b, i) \ xfs_btree_trace_argbi(__func__, c, b, i, __LINE__) #define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \ xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__) -#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) \ - xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__) #define XFS_BTREE_TRACE_ARGI(c, i) \ xfs_btree_trace_argi(__func__, c, i, __LINE__) #define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \ @@ -104,7 +88,6 @@ extern ktrace_t *xfs_bmbt_trace_buf; #else #define XFS_BTREE_TRACE_ARGBI(c, b, i) #define XFS_BTREE_TRACE_ARGBII(c, b, i, j) -#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) #define XFS_BTREE_TRACE_ARGI(c, i) #define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) #define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 92af4098c7e8..a30f7e9eb2b9 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -29,6 +29,7 @@ #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" +#include "xfs_trace.h" kmem_zone_t *xfs_buf_item_zone; @@ -164,7 +165,7 @@ xfs_buf_item_size( * is the buf log format structure with the * cancel flag in it. */ - xfs_buf_item_trace("SIZE STALE", bip); + trace_xfs_buf_item_size_stale(bip); ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); return 1; } @@ -206,7 +207,7 @@ xfs_buf_item_size( } } - xfs_buf_item_trace("SIZE NORM", bip); + trace_xfs_buf_item_size(bip); return nvecs; } @@ -259,7 +260,7 @@ xfs_buf_item_format( * is the buf log format structure with the * cancel flag in it. */ - xfs_buf_item_trace("FORMAT STALE", bip); + trace_xfs_buf_item_format_stale(bip); ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); bip->bli_format.blf_size = nvecs; return; @@ -335,7 +336,7 @@ xfs_buf_item_format( /* * Check to make sure everything is consistent. */ - xfs_buf_item_trace("FORMAT NORM", bip); + trace_xfs_buf_item_format(bip); xfs_buf_item_log_check(bip); } @@ -355,8 +356,7 @@ xfs_buf_item_pin( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); - xfs_buf_item_trace("PIN", bip); - xfs_buftrace("XFS_PIN", bp); + trace_xfs_buf_item_pin(bip); xfs_bpin(bp); } @@ -383,8 +383,7 @@ xfs_buf_item_unpin( ASSERT(bp != NULL); ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); - xfs_buf_item_trace("UNPIN", bip); - xfs_buftrace("XFS_UNPIN", bp); + trace_xfs_buf_item_unpin(bip); freed = atomic_dec_and_test(&bip->bli_refcount); ailp = bip->bli_item.li_ailp; @@ -395,8 +394,8 @@ xfs_buf_item_unpin( ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - xfs_buf_item_trace("UNPIN STALE", bip); - xfs_buftrace("XFS_UNPIN STALE", bp); + trace_xfs_buf_item_unpin_stale(bip); + /* * If we get called here because of an IO error, we may * or may not have the item on the AIL. xfs_trans_ail_delete() @@ -440,8 +439,8 @@ xfs_buf_item_unpin_remove( if ((atomic_read(&bip->bli_refcount) == 1) && (bip->bli_flags & XFS_BLI_STALE)) { ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); - xfs_buf_item_trace("UNPIN REMOVE", bip); - xfs_buftrace("XFS_UNPIN_REMOVE", bp); + trace_xfs_buf_item_unpin_stale(bip); + /* * yes -- clear the xaction descriptor in-use flag * and free the chunk if required. We can safely @@ -495,7 +494,7 @@ xfs_buf_item_trylock( XFS_BUF_HOLD(bp); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - xfs_buf_item_trace("TRYLOCK SUCCESS", bip); + trace_xfs_buf_item_trylock(bip); return XFS_ITEM_SUCCESS; } @@ -524,7 +523,6 @@ xfs_buf_item_unlock( uint hold; bp = bip->bli_buf; - xfs_buftrace("XFS_UNLOCK", bp); /* * Clear the buffer's association with this transaction. @@ -547,7 +545,7 @@ xfs_buf_item_unlock( */ if (bip->bli_flags & XFS_BLI_STALE) { bip->bli_flags &= ~XFS_BLI_LOGGED; - xfs_buf_item_trace("UNLOCK STALE", bip); + trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); if (!aborted) return; @@ -574,7 +572,7 @@ xfs_buf_item_unlock( * release the buffer at the end of this routine. */ hold = bip->bli_flags & XFS_BLI_HOLD; - xfs_buf_item_trace("UNLOCK", bip); + trace_xfs_buf_item_unlock(bip); /* * If the buf item isn't tracking any data, free it. @@ -618,7 +616,8 @@ xfs_buf_item_committed( xfs_buf_log_item_t *bip, xfs_lsn_t lsn) { - xfs_buf_item_trace("COMMITTED", bip); + trace_xfs_buf_item_committed(bip); + if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && (bip->bli_item.li_lsn != 0)) { return bip->bli_item.li_lsn; @@ -640,7 +639,7 @@ xfs_buf_item_push( xfs_buf_t *bp; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - xfs_buf_item_trace("PUSH", bip); + trace_xfs_buf_item_push(bip); bp = bip->bli_buf; @@ -738,9 +737,6 @@ xfs_buf_item_init( bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); bip->bli_format.blf_map_size = map_size; -#ifdef XFS_BLI_TRACE - bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS); -#endif #ifdef XFS_TRANS_DEBUG /* @@ -878,9 +874,6 @@ xfs_buf_item_free( kmem_free(bip->bli_logged); #endif /* XFS_TRANS_DEBUG */ -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif kmem_zone_free(xfs_buf_item_zone, bip); } @@ -897,7 +890,8 @@ xfs_buf_item_relse( { xfs_buf_log_item_t *bip; - xfs_buftrace("XFS_RELSE", bp); + trace_xfs_buf_item_relse(bp, _RET_IP_); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && @@ -994,7 +988,7 @@ xfs_buf_iodone_callbacks( if (XFS_FORCED_SHUTDOWN(mp)) { ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); XFS_BUF_SUPER_STALE(bp); - xfs_buftrace("BUF_IODONE_CB", bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); xfs_buf_do_callbacks(bp, lip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); @@ -1030,7 +1024,7 @@ xfs_buf_iodone_callbacks( XFS_BUF_SET_START(bp); } ASSERT(XFS_BUF_IODONE_FUNC(bp)); - xfs_buftrace("BUF_IODONE ASYNC", bp); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); xfs_buf_relse(bp); } else { /* @@ -1053,9 +1047,7 @@ xfs_buf_iodone_callbacks( } return; } -#ifdef XFSERRORDEBUG - xfs_buftrace("XFS BUFCB NOERR", bp); -#endif + xfs_buf_do_callbacks(bp, lip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); @@ -1081,7 +1073,9 @@ xfs_buf_error_relse( XFS_BUF_DONE(bp); XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_ERROR(bp,0); - xfs_buftrace("BUF_ERROR_RELSE", bp); + + trace_xfs_buf_error_relse(bp, _RET_IP_); + if (! XFS_FORCED_SHUTDOWN(mp)) xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); /* @@ -1128,34 +1122,3 @@ xfs_buf_iodone( xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); xfs_buf_item_free(bip); } - -#if defined(XFS_BLI_TRACE) -void -xfs_buf_item_trace( - char *id, - xfs_buf_log_item_t *bip) -{ - xfs_buf_t *bp; - ASSERT(bip->bli_trace != NULL); - - bp = bip->bli_buf; - ktrace_enter(bip->bli_trace, - (void *)id, - (void *)bip->bli_buf, - (void *)((unsigned long)bip->bli_flags), - (void *)((unsigned long)bip->bli_recur), - (void *)((unsigned long)atomic_read(&bip->bli_refcount)), - (void *)((unsigned long) - (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)), - (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))), - (void *)((unsigned long)XFS_BUF_COUNT(bp)), - (void *)((unsigned long)XFS_BUF_BFLAGS(bp)), - XFS_BUF_FSPRIVATE(bp, void *), - XFS_BUF_FSPRIVATE2(bp, void *), - (void *)(unsigned long)XFS_BUF_ISPINNED(bp), - (void *)XFS_BUF_IODONE_FUNC(bp), - (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))), - (void *)bip->bli_item.li_desc, - (void *)((unsigned long)bip->bli_item.li_flags)); -} -#endif /* XFS_BLI_TRACE */ diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 5a41c348bb1c..217f34af00cb 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -70,22 +70,21 @@ typedef struct xfs_buf_log_format_t { #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_FLAGS \ + { XFS_BLI_HOLD, "HOLD" }, \ + { XFS_BLI_DIRTY, "DIRTY" }, \ + { XFS_BLI_STALE, "STALE" }, \ + { XFS_BLI_LOGGED, "LOGGED" }, \ + { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ + { XFS_BLI_STALE_INODE, "STALE_INODE" } + #ifdef __KERNEL__ struct xfs_buf; -struct ktrace; struct xfs_mount; struct xfs_buf_log_item; -#if defined(XFS_BLI_TRACE) -#define XFS_BLI_TRACE_SIZE 32 - -void xfs_buf_item_trace(char *, struct xfs_buf_log_item *); -#else -#define xfs_buf_item_trace(id, bip) -#endif - /* * This is the in core log item structure used to track information * needed to log buffers. It tracks how many times the lock has been @@ -97,9 +96,6 @@ typedef struct xfs_buf_log_item { unsigned int bli_flags; /* misc flags */ unsigned int bli_recur; /* lock recursion count */ atomic_t bli_refcount; /* cnt of tp refs */ -#ifdef XFS_BLI_TRACE - struct ktrace *bli_trace; /* event trace buf */ -#endif #ifdef XFS_TRANS_DEBUG char *bli_orig; /* original buffer copy */ char *bli_logged; /* bytes logged (bitmap) */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 2847bbc1c534..c0c8869115b1 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -46,6 +46,7 @@ #include "xfs_dir2_block.h" #include "xfs_dir2_node.h" #include "xfs_error.h" +#include "xfs_trace.h" /* * xfs_da_btree.c @@ -2107,7 +2108,7 @@ xfs_da_do_buf( (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC), mp, XFS_ERRTAG_DA_READ_BUF, XFS_RANDOM_DA_READ_BUF))) { - xfs_buftrace("DA READ ERROR", rbp->bps[0]); + trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_); XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", XFS_ERRLEVEL_LOW, mp, info); error = XFS_ERROR(EFSCORRUPTED); diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 8c536167bf75..30cd08f56a3a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -125,6 +125,13 @@ typedef struct xfs_da_args { #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ +#define XFS_DA_OP_FLAGS \ + { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ + { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ + { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ + { XFS_DA_OP_CILOOKUP, "CILOOKUP" } + /* * Structure to describe buffer(s) for a block. * This is needed in the directory version 2 format case, when diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index ab89a7e94a0f..84ca1cf16a1e 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -43,6 +43,7 @@ #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" /* * Syssgi interface for swapext @@ -113,10 +114,82 @@ xfs_swapext( return error; } +/* + * We need to check that the format of the data fork in the temporary inode is + * valid for the target inode before doing the swap. This is not a problem with + * attr1 because of the fixed fork offset, but attr2 has a dynamically sized + * data fork depending on the space the attribute fork is taking so we can get + * invalid formats on the target inode. + * + * E.g. target has space for 7 extents in extent format, temp inode only has + * space for 6. If we defragment down to 7 extents, then the tmp format is a + * btree, but when swapped it needs to be in extent format. Hence we can't just + * blindly swap data forks on attr2 filesystems. + * + * Note that we check the swap in both directions so that we don't end up with + * a corrupt temporary inode, either. + * + * Note that fixing the way xfs_fsr sets up the attribute fork in the source + * inode will prevent this situation from occurring, so all we do here is + * reject and log the attempt. basically we are putting the responsibility on + * userspace to get this right. + */ +static int +xfs_swap_extents_check_format( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip) /* tmp inode */ +{ + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return EINVAL; + + /* + * if the target inode has less extents that then temporary inode then + * why did userspace call us? + */ + if (ip->i_d.di_nextents < tip->i_d.di_nextents) + return EINVAL; + + /* + * if the target inode is in extent form and the temp inode is in btree + * form then we will end up with the target inode in the wrong format + * as we already know there are less extents in the temp inode. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + return EINVAL; + + /* Check temp in extent form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) + return EINVAL; + + /* Check target in extent form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) + return EINVAL; + + /* Check root block of temp in btree form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && + XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + return EINVAL; + + /* Check root block of target in btree form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + return EINVAL; + + return 0; +} + int xfs_swap_extents( - xfs_inode_t *ip, - xfs_inode_t *tip, + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ xfs_swapext_t *sxp) { xfs_mount_t *mp; @@ -160,15 +233,7 @@ xfs_swap_extents( goto out_unlock; } - /* Should never get a local format */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || - tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { - error = XFS_ERROR(EINVAL); - goto out_unlock; - } - if (VN_CACHED(VFS_I(tip)) != 0) { - xfs_inval_cached_trace(tip, 0, -1, 0, -1); error = xfs_flushinval_pages(tip, 0, -1, FI_REMAPF_LOCKED); if (error) @@ -189,13 +254,12 @@ xfs_swap_extents( goto out_unlock; } - /* - * If the target has extended attributes, the tmp file - * must also in order to ensure the correct data fork - * format. - */ - if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { - error = XFS_ERROR(EINVAL); + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_fs_cmn_err(CE_NOTE, mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __FILE__, ip->i_ino); goto out_unlock; } @@ -276,6 +340,16 @@ xfs_swap_extents( *tifp = *tempifp; /* struct copy */ /* + * Fix the in-memory data fork values that are dependent on the fork + * offset in the inode. We can't assume they remain the same as attr2 + * has dynamic fork offsets. + */ + ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + + /* * Fix the on-disk inode values */ tmp = (__uint64_t)ip->i_d.di_nblocks; diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index bb1d58eb3982..93634a7e90e9 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -40,9 +40,9 @@ #include "xfs_dir2_leaf.h" #include "xfs_dir2_block.h" #include "xfs_dir2_node.h" -#include "xfs_dir2_trace.h" #include "xfs_error.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" struct xfs_name xfs_name_dotdot = {"..", 2}; @@ -525,7 +525,8 @@ xfs_dir2_grow_inode( xfs_trans_t *tp; xfs_drfsbno_t nblks; - xfs_dir2_trace_args_s("grow_inode", args, space); + trace_xfs_dir2_grow_inode(args, space); + dp = args->dp; tp = args->trans; mp = dp->i_mount; @@ -703,7 +704,8 @@ xfs_dir2_shrink_inode( xfs_mount_t *mp; xfs_trans_t *tp; - xfs_dir2_trace_args_db("shrink_inode", args, db, bp); + trace_xfs_dir2_shrink_inode(args, db); + dp = args->dp; mp = dp->i_mount; tp = args->trans; diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index ab52e9e1c1ee..ddc4ecc7807f 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -36,8 +36,8 @@ #include "xfs_dir2_data.h" #include "xfs_dir2_leaf.h" #include "xfs_dir2_block.h" -#include "xfs_dir2_trace.h" #include "xfs_error.h" +#include "xfs_trace.h" /* * Local function prototypes. @@ -94,7 +94,8 @@ xfs_dir2_block_addname( __be16 *tagp; /* pointer to tag value */ xfs_trans_t *tp; /* transaction structure */ - xfs_dir2_trace_args("block_addname", args); + trace_xfs_dir2_block_addname(args); + dp = args->dp; tp = args->trans; mp = dp->i_mount; @@ -590,7 +591,8 @@ xfs_dir2_block_lookup( int error; /* error return value */ xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_trace_args("block_lookup", args); + trace_xfs_dir2_block_lookup(args); + /* * Get the buffer, look up the entry. * If not found (ENOENT) then return, have no buffer. @@ -747,7 +749,8 @@ xfs_dir2_block_removename( int size; /* shortform size */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args("block_removename", args); + trace_xfs_dir2_block_removename(args); + /* * Look up the entry in the block. Gets the buffer and entry index. * It will always be there, the vnodeops level does a lookup first. @@ -823,7 +826,8 @@ xfs_dir2_block_replace( int error; /* error return value */ xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_trace_args("block_replace", args); + trace_xfs_dir2_block_replace(args); + /* * Lookup the entry in the directory. Get buffer and entry index. * This will always succeed since the caller has already done a lookup. @@ -897,7 +901,8 @@ xfs_dir2_leaf_to_block( int to; /* block/leaf to index */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp); + trace_xfs_dir2_leaf_to_block(args); + dp = args->dp; tp = args->trans; mp = dp->i_mount; @@ -1044,7 +1049,8 @@ xfs_dir2_sf_to_block( xfs_trans_t *tp; /* transaction pointer */ struct xfs_name name; - xfs_dir2_trace_args("sf_to_block", args); + trace_xfs_dir2_sf_to_block(args); + dp = args->dp; tp = args->trans; mp = dp->i_mount; diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 41ad537c49e9..29f484c11b3a 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -38,8 +38,8 @@ #include "xfs_dir2_leaf.h" #include "xfs_dir2_block.h" #include "xfs_dir2_node.h" -#include "xfs_dir2_trace.h" #include "xfs_error.h" +#include "xfs_trace.h" /* * Local function declarations. @@ -80,7 +80,8 @@ xfs_dir2_block_to_leaf( int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args_b("block_to_leaf", args, dbp); + trace_xfs_dir2_block_to_leaf(args); + dp = args->dp; mp = dp->i_mount; tp = args->trans; @@ -188,7 +189,8 @@ xfs_dir2_leaf_addname( xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ - xfs_dir2_trace_args("leaf_addname", args); + trace_xfs_dir2_leaf_addname(args); + dp = args->dp; tp = args->trans; mp = dp->i_mount; @@ -1266,7 +1268,8 @@ xfs_dir2_leaf_lookup( xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args("leaf_lookup", args); + trace_xfs_dir2_leaf_lookup(args); + /* * Look up name in the leaf block, returning both buffers and index. */ @@ -1454,7 +1457,8 @@ xfs_dir2_leaf_removename( xfs_dir2_data_off_t oldbest; /* old value of best free */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args("leaf_removename", args); + trace_xfs_dir2_leaf_removename(args); + /* * Lookup the leaf entry, get the leaf and data blocks read in. */ @@ -1586,7 +1590,8 @@ xfs_dir2_leaf_replace( xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args("leaf_replace", args); + trace_xfs_dir2_leaf_replace(args); + /* * Look up the entry. */ @@ -1766,7 +1771,9 @@ xfs_dir2_node_to_leaf( if (state->path.active > 1) return 0; args = state->args; - xfs_dir2_trace_args("node_to_leaf", args); + + trace_xfs_dir2_node_to_leaf(args); + mp = state->mp; dp = args->dp; tp = args->trans; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5a81ccd1045b..ce6e355199b5 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -37,8 +37,8 @@ #include "xfs_dir2_leaf.h" #include "xfs_dir2_block.h" #include "xfs_dir2_node.h" -#include "xfs_dir2_trace.h" #include "xfs_error.h" +#include "xfs_trace.h" /* * Function declarations. @@ -123,7 +123,8 @@ xfs_dir2_leaf_to_node( __be16 *to; /* pointer to freespace entry */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args_b("leaf_to_node", args, lbp); + trace_xfs_dir2_leaf_to_node(args); + dp = args->dp; mp = dp->i_mount; tp = args->trans; @@ -196,7 +197,8 @@ xfs_dir2_leafn_add( xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args_sb("leafn_add", args, index, bp); + trace_xfs_dir2_leafn_add(args, index); + dp = args->dp; mp = dp->i_mount; tp = args->trans; @@ -711,8 +713,8 @@ xfs_dir2_leafn_moveents( int stale; /* count stale leaves copied */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d, - start_d, count); + trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); + /* * Silently return if nothing to do. */ @@ -933,7 +935,8 @@ xfs_dir2_leafn_remove( int needscan; /* need to rescan data frees */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args_sb("leafn_remove", args, index, bp); + trace_xfs_dir2_leafn_remove(args, index); + dp = args->dp; tp = args->trans; mp = dp->i_mount; @@ -1363,7 +1366,8 @@ xfs_dir2_node_addname( int rval; /* sub-return value */ xfs_da_state_t *state; /* btree cursor */ - xfs_dir2_trace_args("node_addname", args); + trace_xfs_dir2_node_addname(args); + /* * Allocate and initialize the state (btree cursor). */ @@ -1822,7 +1826,8 @@ xfs_dir2_node_lookup( int rval; /* operation return value */ xfs_da_state_t *state; /* btree cursor */ - xfs_dir2_trace_args("node_lookup", args); + trace_xfs_dir2_node_lookup(args); + /* * Allocate and initialize the btree cursor. */ @@ -1875,7 +1880,8 @@ xfs_dir2_node_removename( int rval; /* operation return value */ xfs_da_state_t *state; /* btree cursor */ - xfs_dir2_trace_args("node_removename", args); + trace_xfs_dir2_node_removename(args); + /* * Allocate and initialize the btree cursor. */ @@ -1944,7 +1950,8 @@ xfs_dir2_node_replace( int rval; /* internal return value */ xfs_da_state_t *state; /* btree cursor */ - xfs_dir2_trace_args("node_replace", args); + trace_xfs_dir2_node_replace(args); + /* * Allocate and initialize the btree cursor. */ diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index e89734e84646..9d4f17a69676 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -37,7 +37,7 @@ #include "xfs_dir2_data.h" #include "xfs_dir2_leaf.h" #include "xfs_dir2_block.h" -#include "xfs_dir2_trace.h" +#include "xfs_trace.h" /* * Prototypes for internal functions. @@ -169,7 +169,8 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_t *sfp; /* shortform structure */ xfs_ino_t temp; - xfs_dir2_trace_args_sb("block_to_sf", args, size, bp); + trace_xfs_dir2_block_to_sf(args); + dp = args->dp; mp = dp->i_mount; @@ -281,7 +282,8 @@ xfs_dir2_sf_addname( xfs_dir2_sf_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ - xfs_dir2_trace_args("sf_addname", args); + trace_xfs_dir2_sf_addname(args); + ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); @@ -654,7 +656,8 @@ xfs_dir2_sf_create( xfs_dir2_sf_t *sfp; /* shortform structure */ int size; /* directory size */ - xfs_dir2_trace_args_i("sf_create", args, pino); + trace_xfs_dir2_sf_create(args); + dp = args->dp; ASSERT(dp != NULL); @@ -808,7 +811,8 @@ xfs_dir2_sf_lookup( enum xfs_dacmp cmp; /* comparison result */ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ - xfs_dir2_trace_args("sf_lookup", args); + trace_xfs_dir2_sf_lookup(args); + xfs_dir2_sf_check(args); dp = args->dp; @@ -891,7 +895,8 @@ xfs_dir2_sf_removename( xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_t *sfp; /* shortform structure */ - xfs_dir2_trace_args("sf_removename", args); + trace_xfs_dir2_sf_removename(args); + dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); @@ -982,7 +987,8 @@ xfs_dir2_sf_replace( xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_t *sfp; /* shortform structure */ - xfs_dir2_trace_args("sf_replace", args); + trace_xfs_dir2_sf_replace(args); + dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); @@ -1125,7 +1131,8 @@ xfs_dir2_sf_toino4( xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_t *sfp; /* new sf directory */ - xfs_dir2_trace_args("sf_toino4", args); + trace_xfs_dir2_sf_toino4(args); + dp = args->dp; /* @@ -1202,7 +1209,8 @@ xfs_dir2_sf_toino8( xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_t *sfp; /* new sf directory */ - xfs_dir2_trace_args("sf_toino8", args); + trace_xfs_dir2_sf_toino8(args); + dp = args->dp; /* diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c deleted file mode 100644 index 6cc7c0c681ac..000000000000 --- a/fs/xfs/xfs_dir2_trace.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_inum.h" -#include "xfs_dir2.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_dir2_trace.h" - -#ifdef XFS_DIR2_TRACE -ktrace_t *xfs_dir2_trace_buf; - -/* - * Enter something in the trace buffers. - */ -static void -xfs_dir2_trace_enter( - xfs_inode_t *dp, - int type, - char *where, - char *name, - int namelen, - void *a0, - void *a1, - void *a2, - void *a3, - void *a4, - void *a5, - void *a6, - void *a7) -{ - void *n[5]; - - ASSERT(xfs_dir2_trace_buf); - ASSERT(dp->i_dir_trace); - if (name) - memcpy(n, name, min((int)sizeof(n), namelen)); - else - memset((char *)n, 0, sizeof(n)); - ktrace_enter(xfs_dir2_trace_buf, - (void *)(long)type, (void *)where, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)(long)namelen, - (void *)n[0], (void *)n[1], (void *)n[2], - (void *)n[3], (void *)n[4]); - ktrace_enter(dp->i_dir_trace, - (void *)(long)type, (void *)where, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)(long)namelen, - (void *)n[0], (void *)n[1], (void *)n[2], - (void *)n[3], (void *)n[4]); -} - -void -xfs_dir2_trace_args( - char *where, - xfs_da_args_t *args) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), - NULL, NULL); -} - -void -xfs_dir2_trace_args_b( - char *where, - xfs_da_args_t *args, - xfs_dabuf_t *bp) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), - (void *)(bp ? bp->bps[0] : NULL), NULL); -} - -void -xfs_dir2_trace_args_bb( - char *where, - xfs_da_args_t *args, - xfs_dabuf_t *lbp, - xfs_dabuf_t *dbp) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), - (void *)(lbp ? lbp->bps[0] : NULL), - (void *)(dbp ? dbp->bps[0] : NULL)); -} - -void -xfs_dir2_trace_args_bibii( - char *where, - xfs_da_args_t *args, - xfs_dabuf_t *bs, - int ss, - xfs_dabuf_t *bd, - int sd, - int c) -{ - xfs_buf_t *bpbs = bs ? bs->bps[0] : NULL; - xfs_buf_t *bpbd = bd ? bd->bps[0] : NULL; - - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where, - (char *)args->name, (int)args->namelen, - (void *)args->dp, (void *)args->trans, - (void *)bpbs, (void *)(long)ss, (void *)bpbd, (void *)(long)sd, - (void *)(long)c, NULL); -} - -void -xfs_dir2_trace_args_db( - char *where, - xfs_da_args_t *args, - xfs_dir2_db_t db, - xfs_dabuf_t *bp) -{ - xfs_buf_t *dbp = bp ? bp->bps[0] : NULL; - - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), - (void *)(long)db, (void *)dbp); -} - -void -xfs_dir2_trace_args_i( - char *where, - xfs_da_args_t *args, - xfs_ino_t i) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), - (void *)((unsigned long)(i >> 32)), - (void *)((unsigned long)(i & 0xFFFFFFFF))); -} - -void -xfs_dir2_trace_args_s( - char *where, - xfs_da_args_t *args, - int s) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), - (void *)(long)s, NULL); -} - -void -xfs_dir2_trace_args_sb( - char *where, - xfs_da_args_t *args, - int s, - xfs_dabuf_t *bp) -{ - xfs_buf_t *dbp = bp ? bp->bps[0] : NULL; - - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), - (void *)(long)s, (void *)dbp); -} -#endif /* XFS_DIR2_TRACE */ diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h deleted file mode 100644 index ca3c754f4822..000000000000 --- a/fs/xfs/xfs_dir2_trace.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_TRACE_H__ -#define __XFS_DIR2_TRACE_H__ - -/* - * Tracing for xfs v2 directories. - */ - -#if defined(XFS_DIR2_TRACE) - -struct ktrace; -struct xfs_dabuf; -struct xfs_da_args; - -#define XFS_DIR2_GTRACE_SIZE 4096 /* global buffer */ -#define XFS_DIR2_KTRACE_SIZE 32 /* per-inode buffer */ -extern struct ktrace *xfs_dir2_trace_buf; - -#define XFS_DIR2_KTRACE_ARGS 1 /* args only */ -#define XFS_DIR2_KTRACE_ARGS_B 2 /* args + buffer */ -#define XFS_DIR2_KTRACE_ARGS_BB 3 /* args + 2 buffers */ -#define XFS_DIR2_KTRACE_ARGS_DB 4 /* args, db, buffer */ -#define XFS_DIR2_KTRACE_ARGS_I 5 /* args, inum */ -#define XFS_DIR2_KTRACE_ARGS_S 6 /* args, int */ -#define XFS_DIR2_KTRACE_ARGS_SB 7 /* args, int, buffer */ -#define XFS_DIR2_KTRACE_ARGS_BIBII 8 /* args, buf/int/buf/int/int */ - -void xfs_dir2_trace_args(char *where, struct xfs_da_args *args); -void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args, - struct xfs_dabuf *bp); -void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args, - struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); -void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args, - struct xfs_dabuf *bs, int ss, - struct xfs_dabuf *bd, int sd, int c); -void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args, - xfs_dir2_db_t db, struct xfs_dabuf *bp); -void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i); -void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s); -void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s, - struct xfs_dabuf *bp); - -#else /* XFS_DIR2_TRACE */ - -#define xfs_dir2_trace_args(where, args) -#define xfs_dir2_trace_args_b(where, args, bp) -#define xfs_dir2_trace_args_bb(where, args, lbp, dbp) -#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c) -#define xfs_dir2_trace_args_db(where, args, db, bp) -#define xfs_dir2_trace_args_i(where, args, i) -#define xfs_dir2_trace_args_s(where, args, s) -#define xfs_dir2_trace_args_sb(where, args, s, bp) - -#endif /* XFS_DIR2_TRACE */ - -#endif /* __XFS_DIR2_TRACE_H__ */ diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index edf8bdf4141f..a631e1451abb 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -34,6 +34,7 @@ #include "xfs_utils.h" #include "xfs_mru_cache.h" #include "xfs_filestream.h" +#include "xfs_trace.h" #ifdef XFS_FILESTREAMS_TRACE @@ -394,9 +395,7 @@ xfs_filestream_init(void) item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); if (!item_zone) return -ENOMEM; -#ifdef XFS_FILESTREAMS_TRACE - xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS); -#endif + return 0; } @@ -407,9 +406,6 @@ xfs_filestream_init(void) void xfs_filestream_uninit(void) { -#ifdef XFS_FILESTREAMS_TRACE - ktrace_free(xfs_filestreams_trace_buf); -#endif kmem_zone_destroy(item_zone); } diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index f655f7dc334c..4aba67c5f64f 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -79,7 +79,7 @@ extern ktrace_t *xfs_filestreams_trace_buf; * the cache that reference per-ag array elements that have since been * reallocated. */ -STATIC_INLINE int +static inline int xfs_filestream_peek_ag( xfs_mount_t *mp, xfs_agnumber_t agno) @@ -87,7 +87,7 @@ xfs_filestream_peek_ag( return atomic_read(&mp->m_perag[agno].pagf_fstrms); } -STATIC_INLINE int +static inline int xfs_filestream_get_ag( xfs_mount_t *mp, xfs_agnumber_t agno) @@ -95,7 +95,7 @@ xfs_filestream_get_ag( return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); } -STATIC_INLINE int +static inline int xfs_filestream_put_ag( xfs_mount_t *mp, xfs_agnumber_t agno) @@ -122,7 +122,7 @@ int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); /* filestreams for the inode? */ -STATIC_INLINE int +static inline int xfs_inode_is_filestream( struct xfs_inode *ip) { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2d0b3e1da9e6..a13919a6a364 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -45,6 +45,7 @@ #include "xfs_rtalloc.h" #include "xfs_rw.h" #include "xfs_filestream.h" +#include "xfs_trace.h" /* * File system operations @@ -201,8 +202,8 @@ xfs_growfs_data_private( * AG freelist header block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); agf = XFS_BUF_TO_AGF(bp); memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -233,8 +234,8 @@ xfs_growfs_data_private( * AG inode header block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); agi = XFS_BUF_TO_AGI(bp); memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -257,8 +258,9 @@ xfs_growfs_data_private( * BNO btree root block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); @@ -278,8 +280,9 @@ xfs_growfs_data_private( * CNT btree root block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); @@ -300,8 +303,9 @@ xfs_growfs_data_private( * INO btree root block */ bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), + XBF_LOCK | XBF_MAPPED); block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); @@ -344,6 +348,7 @@ xfs_growfs_data_private( be32_add_cpu(&agf->agf_length, new); ASSERT(be32_to_cpu(agf->agf_length) == be32_to_cpu(agi->agi_length)); + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); /* * Free the new space. @@ -611,7 +616,7 @@ xfs_fs_log_dummy( xfs_inode_t *ip; int error; - tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); + tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); if (error) { xfs_trans_cancel(tp, 0); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 0785797db828..cb907ba69c4c 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -425,7 +425,7 @@ xfs_ialloc_ag_alloc( return 0; } -STATIC_INLINE xfs_agnumber_t +STATIC xfs_agnumber_t xfs_ialloc_next_ag( xfs_mount_t *mp) { diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 80e526489be5..155e798f30a1 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -43,7 +43,7 @@ #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_btree_trace.h" -#include "xfs_dir2_trace.h" +#include "xfs_trace.h" /* @@ -74,6 +74,8 @@ xfs_inode_alloc( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -87,30 +89,8 @@ xfs_inode_alloc( ip->i_size = 0; ip->i_new_size = 0; - /* - * Initialize inode's trace buffers. - */ -#ifdef XFS_INODE_TRACE - ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_BMAP_TRACE - ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_BTREE_TRACE - ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_RW_TRACE - ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_ILOCK_TRACE - ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_DIR2_TRACE - ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); -#endif - /* prevent anyone from using this yet */ - VFS_I(ip)->i_state = I_NEW|I_LOCK; + VFS_I(ip)->i_state = I_NEW; return ip; } @@ -130,25 +110,6 @@ xfs_inode_free( if (ip->i_afp) xfs_idestroy_fork(ip, XFS_ATTR_FORK); -#ifdef XFS_INODE_TRACE - ktrace_free(ip->i_trace); -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(ip->i_xtrace); -#endif -#ifdef XFS_BTREE_TRACE - ktrace_free(ip->i_btrace); -#endif -#ifdef XFS_RW_TRACE - ktrace_free(ip->i_rwtrace); -#endif -#ifdef XFS_ILOCK_TRACE - ktrace_free(ip->i_lock_trace); -#endif -#ifdef XFS_DIR2_TRACE - ktrace_free(ip->i_dir_trace); -#endif - if (ip->i_itemp) { /* * Only if we are shutting down the fs will we see an @@ -207,6 +168,7 @@ xfs_iget_cache_hit( * instead of polling for it. */ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); XFS_STATS_INC(xs_ig_frecycle); error = EAGAIN; goto out_error; @@ -225,7 +187,7 @@ xfs_iget_cache_hit( * Need to carefully get it back into useable state. */ if (ip->i_flags & XFS_IRECLAIMABLE) { - xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); + trace_xfs_iget_reclaim(ip); /* * We need to set XFS_INEW atomically with clearing the @@ -251,9 +213,10 @@ xfs_iget_cache_hit( ip->i_flags &= ~XFS_INEW; ip->i_flags |= XFS_IRECLAIMABLE; __xfs_inode_set_reclaim_tag(pag, ip); + trace_xfs_iget_reclaim(ip); goto out_error; } - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -270,8 +233,9 @@ xfs_iget_cache_hit( xfs_ilock(ip, lock_flags); xfs_iflags_clear(ip, XFS_ISTALE); - xfs_itrace_exit_tag(ip, "xfs_iget.found"); XFS_STATS_INC(xs_ig_found); + + trace_xfs_iget_found(ip); return 0; out_error: @@ -290,7 +254,7 @@ xfs_iget_cache_miss( struct xfs_inode **ipp, xfs_daddr_t bno, int flags, - int lock_flags) __releases(pag->pag_ici_lock) + int lock_flags) { struct xfs_inode *ip; int error; @@ -305,7 +269,7 @@ xfs_iget_cache_miss( if (error) goto out_destroy; - xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); + xfs_itrace_entry(ip); if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { error = ENOENT; @@ -350,6 +314,8 @@ xfs_iget_cache_miss( write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); + + trace_xfs_iget_alloc(ip); *ipp = ip; return 0; @@ -511,17 +477,21 @@ xfs_ireclaim( { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); XFS_STATS_INC(xs_ig_reclaims); /* - * Remove the inode from the per-AG radix tree. It doesn't matter - * if it was never added to it because radix_tree_delete can deal - * with that case just fine. + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. */ pag = xfs_get_perag(mp, ip->i_ino); write_lock(&pag->pag_ici_lock); - radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); + if (!radix_tree_delete(&pag->pag_ici_root, agino)) + ASSERT(0); write_unlock(&pag->pag_ici_lock); xfs_put_perag(mp, pag); @@ -636,7 +606,7 @@ xfs_ilock( else if (lock_flags & XFS_ILOCK_SHARED) mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); + trace_xfs_ilock(ip, lock_flags, _RET_IP_); } /* @@ -681,7 +651,7 @@ xfs_ilock_nowait( if (!mrtryaccess(&ip->i_lock)) goto out_undo_iolock; } - xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address); + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); return 1; out_undo_iolock: @@ -743,7 +713,7 @@ xfs_iunlock( xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp, (xfs_log_item_t*)(ip->i_itemp)); } - xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); } /* @@ -762,6 +732,8 @@ xfs_ilock_demote( mrdemote(&ip->i_lock); if (lock_flags & XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } #ifdef DEBUG @@ -792,52 +764,3 @@ xfs_isilocked( return 1; } #endif - -#ifdef XFS_INODE_TRACE - -#define KTRACE_ENTER(ip, vk, s, line, ra) \ - ktrace_enter((ip)->i_trace, \ -/* 0 */ (void *)(__psint_t)(vk), \ -/* 1 */ (void *)(s), \ -/* 2 */ (void *)(__psint_t) line, \ -/* 3 */ (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \ -/* 4 */ (void *)(ra), \ -/* 5 */ NULL, \ -/* 6 */ (void *)(__psint_t)current_cpu(), \ -/* 7 */ (void *)(__psint_t)current_pid(), \ -/* 8 */ (void *)__return_address, \ -/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL) - -/* - * Vnode tracing code. - */ -void -_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra); -} - -void -_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra); -} - -void -xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra); -} - -void -_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra); -} - -void -xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra); -} -#endif /* XFS_INODE_TRACE */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b92a4fa2a0a1..ef77fd88c8e3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -47,10 +47,10 @@ #include "xfs_rw.h" #include "xfs_error.h" #include "xfs_utils.h" -#include "xfs_dir2_trace.h" #include "xfs_quota.h" #include "xfs_filestream.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; @@ -1291,42 +1291,6 @@ xfs_file_last_byte( return last_byte; } -#if defined(XFS_RW_TRACE) -STATIC void -xfs_itrunc_trace( - int tag, - xfs_inode_t *ip, - int flag, - xfs_fsize_t new_size, - xfs_off_t toss_start, - xfs_off_t toss_finish) -{ - if (ip->i_rwtrace == NULL) { - return; - } - - ktrace_enter(ip->i_rwtrace, - (void*)((long)tag), - (void*)ip, - (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff), - (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff), - (void*)((long)flag), - (void*)(unsigned long)((new_size >> 32) & 0xffffffff), - (void*)(unsigned long)(new_size & 0xffffffff), - (void*)(unsigned long)((toss_start >> 32) & 0xffffffff), - (void*)(unsigned long)(toss_start & 0xffffffff), - (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff), - (void*)(unsigned long)(toss_finish & 0xffffffff), - (void*)(unsigned long)current_cpu(), - (void*)(unsigned long)current_pid(), - (void*)NULL, - (void*)NULL, - (void*)NULL); -} -#else -#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish) -#endif - /* * Start the truncation of the file to new_size. The new size * must be smaller than the current size. This routine will @@ -1409,8 +1373,7 @@ xfs_itruncate_start( return 0; } last_byte = xfs_file_last_byte(ip); - xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start, - last_byte); + trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte); if (last_byte > toss_start) { if (flags & XFS_ITRUNC_DEFINITE) { xfs_tosspages(ip, toss_start, @@ -1514,7 +1477,8 @@ xfs_itruncate_finish( new_size = 0LL; } first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0); + trace_xfs_itruncate_finish_start(ip, new_size); + /* * The first thing we do is set the size to new_size permanently * on disk. This way we don't have to worry about anyone ever @@ -1731,7 +1695,7 @@ xfs_itruncate_finish( ASSERT((new_size != 0) || (fork == XFS_ATTR_FORK) || (ip->i_d.di_nextents == 0)); - xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0); + trace_xfs_itruncate_finish_end(ip, new_size); return 0; } @@ -2877,8 +2841,8 @@ xfs_iflush( mp = ip->i_mount; /* - * If the inode isn't dirty, then just release the inode - * flush lock and do nothing. + * If the inode isn't dirty, then just release the inode flush lock and + * do nothing. */ if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); @@ -2904,6 +2868,19 @@ xfs_iflush( xfs_iunpin_wait(ip); /* + * For stale inodes we cannot rely on the backing buffer remaining + * stale in cache for the remaining life of the stale inode and so + * xfs_itobp() below may give us a buffer that no longer contains + * inodes below. We have to check this after ensuring the inode is + * unpinned so that it is safe to reclaim the stale inode after the + * flush call. + */ + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_ifunlock(ip); + return 0; + } + + /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this inode * to disk, because the log record didn't make it to disk! @@ -3252,23 +3229,6 @@ corrupt_out: return XFS_ERROR(EFSCORRUPTED); } - - -#ifdef XFS_ILOCK_TRACE -void -xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) -{ - ktrace_enter(ip->i_lock_trace, - (void *)ip, - (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */ - (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */ - (void *)ra, /* caller of ilock */ - (void *)(unsigned long)current_cpu(), - (void *)(unsigned long)current_pid(), - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); -} -#endif - /* * Return a pointer to the extent record at file index idx. */ @@ -3300,13 +3260,17 @@ xfs_iext_get_ext( */ void xfs_iext_insert( - xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* starting index of new items */ xfs_extnum_t count, /* number of inserted items */ - xfs_bmbt_irec_t *new) /* items to insert */ + xfs_bmbt_irec_t *new, /* items to insert */ + int state) /* type of extent conversion */ { + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; xfs_extnum_t i; /* extent record index */ + trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); xfs_iext_add(ifp, idx, count); for (i = idx; i < idx + count; i++, new++) @@ -3549,13 +3513,17 @@ xfs_iext_add_indirect_multi( */ void xfs_iext_remove( - xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ + int ext_diff, /* number of extents to remove */ + int state) /* type of extent conversion */ { + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; xfs_extnum_t nextents; /* number of extents in file */ int new_size; /* size of extents after removal */ + trace_xfs_iext_remove(ip, idx, state, _RET_IP_); + ASSERT(ext_diff > 0); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 41555de1d1db..ec1f28c4fc4f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -213,7 +213,6 @@ typedef struct xfs_icdinode { struct bhv_desc; struct cred; -struct ktrace; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -222,13 +221,6 @@ struct xfs_mount; struct xfs_trans; struct xfs_dquot; -#if defined(XFS_ILOCK_TRACE) -#define XFS_ILOCK_KTRACE_SIZE 32 -extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *); -#else -#define xfs_ilock_trace(i,n,f,ra) -#endif - typedef struct dm_attrs_s { __uint32_t da_dmevmask; /* DMIG event mask */ __uint16_t da_dmstate; /* DMIG state info */ @@ -271,26 +263,6 @@ typedef struct xfs_inode { /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ - - /* Trace buffers per inode. */ -#ifdef XFS_INODE_TRACE - struct ktrace *i_trace; /* general inode trace */ -#endif -#ifdef XFS_BMAP_TRACE - struct ktrace *i_xtrace; /* inode extent list trace */ -#endif -#ifdef XFS_BTREE_TRACE - struct ktrace *i_btrace; /* inode bmap btree trace */ -#endif -#ifdef XFS_RW_TRACE - struct ktrace *i_rwtrace; /* inode read/write trace */ -#endif -#ifdef XFS_ILOCK_TRACE - struct ktrace *i_lock_trace; /* inode lock/unlock trace */ -#endif -#ifdef XFS_DIR2_TRACE - struct ktrace *i_dir_trace; /* inode directory trace */ -#endif } xfs_inode_t; #define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ @@ -406,6 +378,14 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) +#define XFS_LOCK_FLAGS \ + { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ + { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ + { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ + { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ + { XFS_IUNLOCK_NONOTIFY, "IUNLOCK_NONOTIFY" } + + /* * Flags for lockdep annotations. * @@ -455,6 +435,10 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_ITRUNC_DEFINITE 0x1 #define XFS_ITRUNC_MAYBE 0x2 +#define XFS_ITRUNC_FLAGS \ + { XFS_ITRUNC_DEFINITE, "DEFINITE" }, \ + { XFS_ITRUNC_MAYBE, "MAYBE" } + /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and @@ -507,48 +491,16 @@ void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); void xfs_synchronize_times(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); -#if defined(XFS_INODE_TRACE) - -#define INODE_TRACE_SIZE 16 /* number of trace entries */ -#define INODE_KTRACE_ENTRY 1 -#define INODE_KTRACE_EXIT 2 -#define INODE_KTRACE_HOLD 3 -#define INODE_KTRACE_REF 4 -#define INODE_KTRACE_RELE 5 - -extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *); -extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *); -extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *); -extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *); -extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *); -#define xfs_itrace_entry(ip) \ - _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address) -#define xfs_itrace_exit(ip) \ - _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address) -#define xfs_itrace_exit_tag(ip, tag) \ - _xfs_itrace_exit(ip, tag, (inst_t *)__return_address) -#define xfs_itrace_ref(ip) \ - _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address) - -#else -#define xfs_itrace_entry(a) -#define xfs_itrace_exit(a) -#define xfs_itrace_exit_tag(a, b) -#define xfs_itrace_hold(a, b, c, d) -#define xfs_itrace_ref(a) -#define xfs_itrace_rele(a, b, c, d) -#endif - #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ atomic_inc(&(VFS_I(ip)->i_count)); \ - xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ + trace_xfs_ihold(ip, _THIS_IP_); \ } while (0) #define IRELE(ip) \ do { \ - xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ + trace_xfs_irele(ip, _THIS_IP_); \ iput(VFS_I(ip)); \ } while (0) @@ -577,11 +529,11 @@ int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int); xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); -void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, - xfs_bmbt_irec_t *); +void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t, + xfs_bmbt_irec_t *, int); void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int); -void xfs_iext_remove(xfs_ifork_t *, xfs_extnum_t, int); +void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int); void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9794b876d6ff..f38855d21ea5 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -41,6 +41,7 @@ #include "xfs_ialloc.h" #include "xfs_rw.h" #include "xfs_error.h" +#include "xfs_trace.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -800,7 +801,9 @@ xfs_inode_item_pushbuf( !completion_done(&ip->i_flush)); iip->ili_pushbuf_flag = 0; xfs_iunlock(ip, XFS_ILOCK_SHARED); - xfs_buftrace("INODE ITEM PUSH", bp); + + trace_xfs_inode_item_push(bp, _RET_IP_); + if (XFS_BUF_ISPINNED(bp)) { xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 65bae4c9b8bf..cc8df1ac7783 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -127,7 +127,7 @@ static inline int xfs_ilog_fdata(int w) #ifdef __KERNEL__ struct xfs_buf; -struct xfs_bmbt_rec_64; +struct xfs_bmbt_rec; struct xfs_inode; struct xfs_mount; @@ -140,9 +140,9 @@ typedef struct xfs_inode_log_item { unsigned short ili_flags; /* misc flags */ unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ - struct xfs_bmbt_rec_64 *ili_extents_buf; /* array of logged + struct xfs_bmbt_rec *ili_extents_buf; /* array of logged data exts */ - struct xfs_bmbt_rec_64 *ili_aextents_buf; /* array of logged + struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged attr exts */ unsigned int ili_pushbuf_flag; /* one bit used in push_ail */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 67ae5555a30a..0b65039951a0 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -47,72 +47,8 @@ #include "xfs_trans_space.h" #include "xfs_utils.h" #include "xfs_iomap.h" +#include "xfs_trace.h" -#if defined(XFS_RW_TRACE) -void -xfs_iomap_enter_trace( - int tag, - xfs_inode_t *ip, - xfs_off_t offset, - ssize_t count) -{ - if (!ip->i_rwtrace) - return; - - ktrace_enter(ip->i_rwtrace, - (void *)((unsigned long)tag), - (void *)ip, - (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)count), - (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_new_size & 0xffffffff)), - (void *)((unsigned long)current_pid()), - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL); -} - -void -xfs_iomap_map_trace( - int tag, - xfs_inode_t *ip, - xfs_off_t offset, - ssize_t count, - xfs_iomap_t *iomapp, - xfs_bmbt_irec_t *imapp, - int flags) -{ - if (!ip->i_rwtrace) - return; - - ktrace_enter(ip->i_rwtrace, - (void *)((unsigned long)tag), - (void *)ip, - (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)count), - (void *)((unsigned long)flags), - (void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)), - (void *)((unsigned long)(iomapp->iomap_delta)), - (void *)((unsigned long)(iomapp->iomap_bsize)), - (void *)((unsigned long)(iomapp->iomap_bn)), - (void *)(__psint_t)(imapp->br_startoff), - (void *)((unsigned long)(imapp->br_blockcount)), - (void *)(__psint_t)(imapp->br_startblock)); -} -#else -#define xfs_iomap_enter_trace(tag, io, offset, count) -#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags) -#endif #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) @@ -187,21 +123,20 @@ xfs_iomap( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + trace_xfs_iomap_enter(ip, offset, count, flags, NULL); + switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) { case BMAPI_READ: - xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, ip, offset, count); lockmode = xfs_ilock_map_shared(ip); bmapi_flags = XFS_BMAPI_ENTIRE; break; case BMAPI_WRITE: - xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, ip, offset, count); lockmode = XFS_ILOCK_EXCL; if (flags & BMAPI_IGNSTATE) bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; xfs_ilock(ip, lockmode); break; case BMAPI_ALLOCATE: - xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, ip, offset, count); lockmode = XFS_ILOCK_SHARED; bmapi_flags = XFS_BMAPI_ENTIRE; @@ -237,8 +172,7 @@ xfs_iomap( if (nimaps && (imap.br_startblock != HOLESTARTBLOCK) && (imap.br_startblock != DELAYSTARTBLOCK)) { - xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, - offset, count, iomapp, &imap, flags); + trace_xfs_iomap_found(ip, offset, count, flags, &imap); break; } @@ -250,8 +184,7 @@ xfs_iomap( &imap, &nimaps); } if (!error) { - xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, ip, - offset, count, iomapp, &imap, flags); + trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); } iomap_flags = IOMAP_NEW; break; @@ -261,8 +194,7 @@ xfs_iomap( lockmode = 0; if (nimaps && !isnullstartblock(imap.br_startblock)) { - xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, - offset, count, iomapp, &imap, flags); + trace_xfs_iomap_found(ip, offset, count, flags, &imap); break; } @@ -623,8 +555,7 @@ retry: * delalloc blocks and retry without EOF preallocation. */ if (nimaps == 0) { - xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, - ip, offset, count); + trace_xfs_delalloc_enospc(ip, offset, count); if (flushed) return XFS_ERROR(ENOSPC); @@ -837,7 +768,7 @@ xfs_iomap_write_unwritten( int committed; int error; - xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, ip, offset, count); + trace_xfs_unwritten_convert(ip, offset, count); offset_fsb = XFS_B_TO_FSBT(mp, offset); count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); @@ -860,8 +791,15 @@ xfs_iomap_write_unwritten( * set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. + * + * Note that we open code the transaction allocation here + * to pass KM_NOFS--we can't risk to recursing back into + * the filesystem here as we might be asked to write out + * the same inode that we complete here and might deadlock + * on the iolock. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index fdcf7b82747f..174f29990991 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -43,6 +43,14 @@ typedef enum { BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */ } bmapi_flags_t; +#define BMAPI_FLAGS \ + { BMAPI_READ, "READ" }, \ + { BMAPI_WRITE, "WRITE" }, \ + { BMAPI_ALLOCATE, "ALLOCATE" }, \ + { BMAPI_IGNSTATE, "IGNSTATE" }, \ + { BMAPI_DIRECT, "DIRECT" }, \ + { BMAPI_MMAP, "MMAP" }, \ + { BMAPI_TRYLOCK, "TRYLOCK" } /* * xfs_iomap_t: File system I/O map diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 9dbdff3ea484..600b5b06aaeb 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -40,6 +40,7 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_rw.h" +#include "xfs_trace.h" kmem_zone_t *xfs_log_ticket_zone; @@ -122,85 +123,6 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, STATIC int xlog_iclogs_empty(xlog_t *log); -#if defined(XFS_LOG_TRACE) - -#define XLOG_TRACE_LOGGRANT_SIZE 2048 -#define XLOG_TRACE_ICLOG_SIZE 256 - -void -xlog_trace_loggrant_alloc(xlog_t *log) -{ - log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); -} - -void -xlog_trace_loggrant_dealloc(xlog_t *log) -{ - ktrace_free(log->l_grant_trace); -} - -void -xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) -{ - unsigned long cnts; - - /* ticket counts are 1 byte each */ - cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; - - ktrace_enter(log->l_grant_trace, - (void *)tic, - (void *)log->l_reserve_headq, - (void *)log->l_write_headq, - (void *)((unsigned long)log->l_grant_reserve_cycle), - (void *)((unsigned long)log->l_grant_reserve_bytes), - (void *)((unsigned long)log->l_grant_write_cycle), - (void *)((unsigned long)log->l_grant_write_bytes), - (void *)((unsigned long)log->l_curr_cycle), - (void *)((unsigned long)log->l_curr_block), - (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)), - (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)), - (void *)string, - (void *)((unsigned long)tic->t_trans_type), - (void *)cnts, - (void *)((unsigned long)tic->t_curr_res), - (void *)((unsigned long)tic->t_unit_res)); -} - -void -xlog_trace_iclog_alloc(xlog_in_core_t *iclog) -{ - iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); -} - -void -xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) -{ - ktrace_free(iclog->ic_trace); -} - -void -xlog_trace_iclog(xlog_in_core_t *iclog, uint state) -{ - ktrace_enter(iclog->ic_trace, - (void *)((unsigned long)state), - (void *)((unsigned long)current_pid()), - (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, - (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, - (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, - (void *)NULL, (void *)NULL); -} -#else - -#define xlog_trace_loggrant_alloc(log) -#define xlog_trace_loggrant_dealloc(log) -#define xlog_trace_loggrant(log,tic,string) - -#define xlog_trace_iclog_alloc(iclog) -#define xlog_trace_iclog_dealloc(iclog) -#define xlog_trace_iclog(iclog,state) - -#endif /* XFS_LOG_TRACE */ - static void xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) @@ -353,15 +275,17 @@ xfs_log_done(xfs_mount_t *mp, if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || (flags & XFS_LOG_REL_PERM_RESERV)) { + trace_xfs_log_done_nonperm(log, ticket); + /* * Release ticket if not permanent reservation or a specific * request has been made to release a permanent reservation. */ - xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); xlog_ungrant_log_space(log, ticket); xfs_log_ticket_put(ticket); } else { - xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); + trace_xfs_log_done_perm(log, ticket); + xlog_regrant_reserve_log_space(log, ticket); /* If this ticket was a permanent reservation and we aren't * trying to release it, reset the inited flags; so next time @@ -505,10 +429,13 @@ xfs_log_reserve(xfs_mount_t *mp, XFS_STATS_INC(xs_try_logspace); + if (*ticket != NULL) { ASSERT(flags & XFS_LOG_PERM_RESERV); internal_ticket = (xlog_ticket_t *)*ticket; - xlog_trace_loggrant(log, internal_ticket, "xfs_log_reserve: existing ticket (permanent trans)"); + + trace_xfs_log_reserve(log, internal_ticket); + xlog_grant_push_ail(mp, internal_ticket->t_unit_res); retval = xlog_regrant_write_log_space(log, internal_ticket); } else { @@ -519,10 +446,9 @@ xfs_log_reserve(xfs_mount_t *mp, return XFS_ERROR(ENOMEM); internal_ticket->t_trans_type = t_type; *ticket = internal_ticket; - xlog_trace_loggrant(log, internal_ticket, - (internal_ticket->t_flags & XLOG_TIC_PERM_RESERV) ? - "xfs_log_reserve: create new ticket (permanent trans)" : - "xfs_log_reserve: create new ticket"); + + trace_xfs_log_reserve(log, internal_ticket); + xlog_grant_push_ail(mp, (internal_ticket->t_unit_res * internal_ticket->t_cnt)); @@ -734,7 +660,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) spin_unlock(&log->l_icloglock); } if (tic) { - xlog_trace_loggrant(log, tic, "unmount rec"); + trace_xfs_log_umount_write(log, tic); xlog_ungrant_log_space(log, tic); xfs_log_ticket_put(tic); } @@ -1030,7 +956,6 @@ xlog_iodone(xfs_buf_t *bp) xfs_fs_cmn_err(CE_WARN, l->l_mp, "xlog_iodone: Barriers are no longer supported" " by device. Disabling barriers\n"); - xfs_buftrace("XLOG_IODONE BARRIERS OFF", bp); } /* @@ -1085,13 +1010,10 @@ xlog_bdstrat_cb(struct xfs_buf *bp) return 0; } - xfs_buftrace("XLOG__BDSTRAT IOERROR", bp); XFS_BUF_ERROR(bp, EIO); XFS_BUF_STALE(bp); xfs_biodone(bp); return XFS_ERROR(EIO); - - } /* @@ -1246,7 +1168,6 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_grant_lock); sv_init(&log->l_flush_wait, 0, "flush_wait"); - xlog_trace_loggrant_alloc(log); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1305,8 +1226,6 @@ xlog_alloc_log(xfs_mount_t *mp, sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); - xlog_trace_iclog_alloc(iclog); - iclogp = &iclog->ic_next; } *iclogp = log->l_iclog; /* complete ring */ @@ -1321,13 +1240,11 @@ out_free_iclog: sv_destroy(&iclog->ic_force_wait); sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); - xlog_trace_iclog_dealloc(iclog); } kmem_free(iclog); } spinlock_destroy(&log->l_icloglock); spinlock_destroy(&log->l_grant_lock); - xlog_trace_loggrant_dealloc(log); xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); @@ -1524,6 +1441,7 @@ xlog_sync(xlog_t *log, XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; /* * Do an ordered write for the log block. * Its unnecessary to flush the first split block in the log wrap case. @@ -1561,6 +1479,7 @@ xlog_sync(xlog_t *log, XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) XFS_BUF_ORDERED(bp); dptr = XFS_BUF_PTR(bp); @@ -1607,7 +1526,6 @@ xlog_dealloc_log(xlog_t *log) sv_destroy(&iclog->ic_force_wait); sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); - xlog_trace_iclog_dealloc(iclog); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; @@ -1616,7 +1534,6 @@ xlog_dealloc_log(xlog_t *log) spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); - xlog_trace_loggrant_dealloc(log); log->l_mp->m_log = NULL; kmem_free(log); } /* xlog_dealloc_log */ @@ -2414,7 +2331,6 @@ restart: iclog = log->l_iclog; if (iclog->ic_state != XLOG_STATE_ACTIVE) { - xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); XFS_STATS_INC(xs_log_noiclogs); /* Wait for log writes to have flushed */ @@ -2520,13 +2436,15 @@ xlog_grant_log_space(xlog_t *log, /* Is there space or do we need to sleep? */ spin_lock(&log->l_grant_lock); - xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter"); + + trace_xfs_log_grant_enter(log, tic); /* something is already sleeping; insert new transaction at end */ if (log->l_reserve_headq) { xlog_ins_ticketq(&log->l_reserve_headq, tic); - xlog_trace_loggrant(log, tic, - "xlog_grant_log_space: sleep 1"); + + trace_xfs_log_grant_sleep1(log, tic); + /* * Gotta check this before going to sleep, while we're * holding the grant lock. @@ -2540,8 +2458,7 @@ xlog_grant_log_space(xlog_t *log, * If we got an error, and the filesystem is shutting down, * we'll catch it down below. So just continue... */ - xlog_trace_loggrant(log, tic, - "xlog_grant_log_space: wake 1"); + trace_xfs_log_grant_wake1(log, tic); spin_lock(&log->l_grant_lock); } if (tic->t_flags & XFS_LOG_PERM_RESERV) @@ -2558,8 +2475,9 @@ redo: if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_reserve_headq, tic); - xlog_trace_loggrant(log, tic, - "xlog_grant_log_space: sleep 2"); + + trace_xfs_log_grant_sleep2(log, tic); + spin_unlock(&log->l_grant_lock); xlog_grant_push_ail(log->l_mp, need_bytes); spin_lock(&log->l_grant_lock); @@ -2571,8 +2489,8 @@ redo: if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - xlog_trace_loggrant(log, tic, - "xlog_grant_log_space: wake 2"); + trace_xfs_log_grant_wake2(log, tic); + goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); @@ -2592,7 +2510,7 @@ redo: ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); } #endif - xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit"); + trace_xfs_log_grant_exit(log, tic); xlog_verify_grant_head(log, 1); spin_unlock(&log->l_grant_lock); return 0; @@ -2600,7 +2518,9 @@ redo: error_return: if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); - xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); + + trace_xfs_log_grant_error(log, tic); + /* * If we are failing, make sure the ticket doesn't have any * current reservations. We don't want to add this back when @@ -2640,7 +2560,8 @@ xlog_regrant_write_log_space(xlog_t *log, #endif spin_lock(&log->l_grant_lock); - xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter"); + + trace_xfs_log_regrant_write_enter(log, tic); if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; @@ -2669,8 +2590,8 @@ xlog_regrant_write_log_space(xlog_t *log, if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_write_headq, tic); - xlog_trace_loggrant(log, tic, - "xlog_regrant_write_log_space: sleep 1"); + trace_xfs_log_regrant_write_sleep1(log, tic); + spin_unlock(&log->l_grant_lock); xlog_grant_push_ail(log->l_mp, need_bytes); spin_lock(&log->l_grant_lock); @@ -2685,8 +2606,7 @@ xlog_regrant_write_log_space(xlog_t *log, if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - xlog_trace_loggrant(log, tic, - "xlog_regrant_write_log_space: wake 1"); + trace_xfs_log_regrant_write_wake1(log, tic); } } @@ -2704,6 +2624,8 @@ redo: spin_lock(&log->l_grant_lock); XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_regrant_write_sleep2(log, tic); + sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already off the queue */ @@ -2711,8 +2633,7 @@ redo: if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - xlog_trace_loggrant(log, tic, - "xlog_regrant_write_log_space: wake 2"); + trace_xfs_log_regrant_write_wake2(log, tic); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_write_headq, tic); @@ -2727,7 +2648,8 @@ redo: } #endif - xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit"); + trace_xfs_log_regrant_write_exit(log, tic); + xlog_verify_grant_head(log, 1); spin_unlock(&log->l_grant_lock); return 0; @@ -2736,7 +2658,9 @@ redo: error_return: if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); - xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); + + trace_xfs_log_regrant_write_error(log, tic); + /* * If we are failing, make sure the ticket doesn't have any * current reservations. We don't want to add this back when @@ -2760,8 +2684,8 @@ STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket) { - xlog_trace_loggrant(log, ticket, - "xlog_regrant_reserve_log_space: enter"); + trace_xfs_log_regrant_reserve_enter(log, ticket); + if (ticket->t_cnt > 0) ticket->t_cnt--; @@ -2769,8 +2693,9 @@ xlog_regrant_reserve_log_space(xlog_t *log, xlog_grant_sub_space(log, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); - xlog_trace_loggrant(log, ticket, - "xlog_regrant_reserve_log_space: sub current res"); + + trace_xfs_log_regrant_reserve_sub(log, ticket); + xlog_verify_grant_head(log, 1); /* just return if we still have some of the pre-reserved space */ @@ -2780,8 +2705,9 @@ xlog_regrant_reserve_log_space(xlog_t *log, } xlog_grant_add_space_reserve(log, ticket->t_unit_res); - xlog_trace_loggrant(log, ticket, - "xlog_regrant_reserve_log_space: exit"); + + trace_xfs_log_regrant_reserve_exit(log, ticket); + xlog_verify_grant_head(log, 0); spin_unlock(&log->l_grant_lock); ticket->t_curr_res = ticket->t_unit_res; @@ -2811,11 +2737,11 @@ xlog_ungrant_log_space(xlog_t *log, ticket->t_cnt--; spin_lock(&log->l_grant_lock); - xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); + trace_xfs_log_ungrant_enter(log, ticket); xlog_grant_sub_space(log, ticket->t_curr_res); - xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); + trace_xfs_log_ungrant_sub(log, ticket); /* If this is a permanent reservation ticket, we may be able to free * up more space based on the remaining count. @@ -2825,7 +2751,8 @@ xlog_ungrant_log_space(xlog_t *log, xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); } - xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); + trace_xfs_log_ungrant_exit(log, ticket); + xlog_verify_grant_head(log, 1); spin_unlock(&log->l_grant_lock); xfs_log_move_tail(log->l_mp, 1); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 679c7c4926a2..d55662db7077 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -19,7 +19,6 @@ #define __XFS_LOG_PRIV_H__ struct xfs_buf; -struct ktrace; struct log; struct xlog_ticket; struct xfs_buf_cancel; @@ -135,6 +134,12 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_TIC_INITED 0x1 /* has been initialized */ #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ #define XLOG_TIC_IN_Q 0x4 + +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ + { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" } + #endif /* __KERNEL__ */ #define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ @@ -361,9 +366,6 @@ typedef struct xlog_in_core { int ic_bwritecnt; unsigned short ic_state; char *ic_datap; /* pointer to iclog data */ -#ifdef XFS_LOG_TRACE - struct ktrace *ic_trace; -#endif /* Callback structures need their own cacheline */ spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; @@ -429,10 +431,6 @@ typedef struct log { int l_grant_write_cycle; int l_grant_write_bytes; -#ifdef XFS_LOG_TRACE - struct ktrace *l_grant_trace; -#endif - /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG char *l_iclog_bak[XLOG_MAX_ICLOGS]; @@ -456,12 +454,6 @@ extern void xlog_put_bp(struct xfs_buf *); extern kmem_zone_t *xfs_log_ticket_zone; -/* iclog tracing */ -#define XLOG_TRACE_GRAB_FLUSH 1 -#define XLOG_TRACE_REL_FLUSH 2 -#define XLOG_TRACE_SLEEP_FLUSH 3 -#define XLOG_TRACE_WAKE_FLUSH 4 - /* * Unmount record type is used as a pseudo transaction type for the ticket. * It's value must be outside the range of XFS_TRANS_* values. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index fb17f8226b09..69ac2e5ef20c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -46,6 +46,7 @@ #include "xfs_quota.h" #include "xfs_rw.h" #include "xfs_utils.h" +#include "xfs_trace.h" STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); @@ -225,16 +226,10 @@ xlog_header_check_dump( xfs_mount_t *mp, xlog_rec_header_t *head) { - int b; - - cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); - for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]); - cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); - cmn_err(CE_DEBUG, " log : uuid = "); - for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]); - cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); + cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", + __func__, &mp->m_sb.sb_uuid, XLOG_FMT); + cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", + &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); } #else #define xlog_header_check_dump(mp, head) @@ -2206,6 +2201,7 @@ xlog_recover_do_buffer_trans( xfs_daddr_t blkno; int len; ushort flags; + uint buf_flags; buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; @@ -2246,12 +2242,11 @@ xlog_recover_do_buffer_trans( } mp = log->l_mp; - if (flags & XFS_BLI_INODE_BUF) { - bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len, - XFS_BUF_LOCK); - } else { - bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0); - } + buf_flags = XFS_BUF_LOCK; + if (!(flags & XFS_BLI_INODE_BUF)) + buf_flags |= XFS_BUF_MAPPED; + + bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); if (XFS_BUF_ISERROR(bp)) { xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, bp, blkno); @@ -2350,8 +2345,8 @@ xlog_recover_do_inode_trans( goto error; } - bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno, - in_f->ilf_len, XFS_BUF_LOCK); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + XFS_BUF_LOCK); if (XFS_BUF_ISERROR(bp)) { xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, bp, in_f->ilf_blkno); @@ -3517,7 +3512,7 @@ xlog_do_recovery_pass( { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; - xfs_caddr_t bufaddr, offset; + xfs_caddr_t offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size; int bblks, split_bblks; @@ -3610,7 +3605,7 @@ xlog_do_recovery_pass( /* * Check for header wrapping around physical end-of-log */ - offset = NULL; + offset = XFS_BUF_PTR(hbp); split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { @@ -3646,9 +3641,8 @@ xlog_do_recovery_pass( * - order is important. */ wrapped_hblks = hblks - split_hblks; - bufaddr = XFS_BUF_PTR(hbp); error = XFS_BUF_SET_PTR(hbp, - bufaddr + BBTOB(split_hblks), + offset + BBTOB(split_hblks), BBTOB(hblks - split_hblks)); if (error) goto bread_err2; @@ -3658,14 +3652,10 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = XFS_BUF_SET_PTR(hbp, bufaddr, + error = XFS_BUF_SET_PTR(hbp, offset, BBTOB(hblks)); if (error) goto bread_err2; - - if (!offset) - offset = xlog_align(log, 0, - wrapped_hblks, hbp); } rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, @@ -3685,7 +3675,7 @@ xlog_do_recovery_pass( } else { /* This log record is split across the * physical end of log */ - offset = NULL; + offset = XFS_BUF_PTR(dbp); split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical @@ -3714,9 +3704,8 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. */ - bufaddr = XFS_BUF_PTR(dbp); error = XFS_BUF_SET_PTR(dbp, - bufaddr + BBTOB(split_bblks), + offset + BBTOB(split_bblks), BBTOB(bblks - split_bblks)); if (error) goto bread_err2; @@ -3727,13 +3716,9 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size); + error = XFS_BUF_SET_PTR(dbp, offset, h_size); if (error) goto bread_err2; - - if (!offset) - offset = xlog_align(log, wrapped_hblks, - bblks - split_bblks, dbp); } xlog_unpack_data(rhead, offset, log); if ((error = xlog_recover_process_data(log, rhash, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 8b6c9e807efb..eb403b40e120 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -44,6 +44,8 @@ #include "xfs_quota.h" #include "xfs_fsops.h" #include "xfs_utils.h" +#include "xfs_trace.h" + STATIC void xfs_unmountfs_wait(xfs_mount_t *); @@ -583,8 +585,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; - bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), extra_flags); + bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), + extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { xfs_fs_mount_cmn_err(flags, "SB read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; @@ -624,8 +626,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) XFS_BUF_UNMANAGE(bp); xfs_buf_relse(bp); sector_size = mp->m_sb.sb_sectsize; - bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), extra_flags); + bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { xfs_fs_mount_cmn_err(flags, "SB re-read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; @@ -1471,7 +1473,7 @@ xfs_log_sbcount( if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) return 0; - tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT); + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, XFS_DEFAULT_LOG_COUNT); if (error) { @@ -2123,7 +2125,7 @@ xfs_icsb_destroy_counters( mutex_destroy(&mp->m_icsb_mutex); } -STATIC_INLINE void +STATIC void xfs_icsb_lock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -2132,7 +2134,7 @@ xfs_icsb_lock_cntr( } } -STATIC_INLINE void +STATIC void xfs_icsb_unlock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -2140,7 +2142,7 @@ xfs_icsb_unlock_cntr( } -STATIC_INLINE void +STATIC void xfs_icsb_lock_all_counters( xfs_mount_t *mp) { @@ -2153,7 +2155,7 @@ xfs_icsb_lock_all_counters( } } -STATIC_INLINE void +STATIC void xfs_icsb_unlock_all_counters( xfs_mount_t *mp) { @@ -2389,12 +2391,12 @@ xfs_icsb_modify_counters( { xfs_icsb_cnts_t *icsbp; long long lcounter; /* long counter for 64 bit fields */ - int cpu, ret = 0; + int ret = 0; might_sleep(); again: - cpu = get_cpu(); - icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu); + preempt_disable(); + icsbp = this_cpu_ptr(mp->m_sb_cnts); /* * if the counter is disabled, go to slow path @@ -2438,11 +2440,11 @@ again: break; } xfs_icsb_unlock_cntr(icsbp); - put_cpu(); + preempt_enable(); return 0; slow_path: - put_cpu(); + preempt_enable(); /* * serialise with a mutex so we don't burn lots of cpu on @@ -2490,7 +2492,7 @@ slow_path: balance_counter: xfs_icsb_unlock_cntr(icsbp); - put_cpu(); + preempt_enable(); /* * We may have multiple threads here if multiple per-cpu diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a6c023bc0fb2..1df7e4502967 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -93,6 +93,9 @@ typedef struct xfs_dmops { xfs_send_unmount_t xfs_send_unmount; } xfs_dmops_t; +#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \ + (((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED) + #define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \ (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock) #define XFS_SEND_MMAP(mp, vma,fl) \ @@ -101,12 +104,24 @@ typedef struct xfs_dmops { (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right) #define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) -#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ - (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) #define XFS_SEND_MOUNT(mp,right,path,name) \ (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name) -#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \ - (*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl) +#define XFS_SEND_PREUNMOUNT(mp) \ +do { \ + if (mp->m_flags & XFS_MOUNT_DMAPI) { \ + (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \ + (mp)->m_rootip, DM_RIGHT_NULL, \ + (mp)->m_rootip, DM_RIGHT_NULL, \ + NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \ + } \ +} while (0) +#define XFS_SEND_UNMOUNT(mp) \ +do { \ + if (mp->m_flags & XFS_MOUNT_DMAPI) { \ + (*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \ + DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \ + } \ +} while (0) #ifdef HAVE_PERCPU_SB @@ -387,13 +402,13 @@ xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag) * Per-cpu superblock locking functions */ #ifdef HAVE_PERCPU_SB -STATIC_INLINE void +static inline void xfs_icsb_lock(xfs_mount_t *mp) { mutex_lock(&mp->m_icsb_mutex); } -STATIC_INLINE void +static inline void xfs_icsb_unlock(xfs_mount_t *mp) { mutex_unlock(&mp->m_icsb_mutex); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 3ec91ac74c2a..91bfd60f4c74 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -92,6 +92,14 @@ typedef struct xfs_dqblk { #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) +#define XFS_DQ_FLAGS \ + { XFS_DQ_USER, "USER" }, \ + { XFS_DQ_PROJ, "PROJ" }, \ + { XFS_DQ_GROUP, "GROUP" }, \ + { XFS_DQ_DIRTY, "DIRTY" }, \ + { XFS_DQ_WANT, "WANT" }, \ + { XFS_DQ_INACTIVE, "INACTIVE" } + /* * In the worst case, when both user and group quotas are on, * we can have a max of three dquots changing in a single transaction. diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index b81deea0ce19..fc1cda23b817 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -39,6 +39,7 @@ #include "xfs_utils.h" #include "xfs_trans_space.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 385f6dceba5d..6be05f756d59 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -45,6 +45,7 @@ #include "xfs_inode_item.h" #include "xfs_trans_space.h" #include "xfs_utils.h" +#include "xfs_trace.h" /* @@ -1516,6 +1517,8 @@ xfs_rtfree_range( */ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, &postblock); + if (error) + return error; /* * If there are blocks not being freed at the front of the * old extent, add summary data for them to be allocated. diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 3f816ad7ff19..5aa07caea5f1 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -44,6 +44,7 @@ #include "xfs_error.h" #include "xfs_buf_item.h" #include "xfs_rw.h" +#include "xfs_trace.h" /* * This is a subroutine for xfs_write() and other writers (xfs_ioctl) @@ -171,7 +172,6 @@ xfs_bioerror( * No need to wait until the buffer is unpinned. * We aren't flushing it. */ - xfs_buftrace("XFS IOERROR", bp); XFS_BUF_ERROR(bp, EIO); /* * We're calling biodone, so delete B_DONE flag. Either way @@ -205,7 +205,6 @@ xfs_bioerror_relse( ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks); ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone); - xfs_buftrace("XFS IOERRELSE", bp); fl = XFS_BUF_BFLAGS(bp); /* * No need to wait until the buffer is unpinned. @@ -277,10 +276,10 @@ xfs_read_buf( xfs_buf_t *bp; int error; - if (flags) - bp = xfs_buf_read_flags(target, blkno, len, flags); - else - bp = xfs_buf_read(target, blkno, len, flags); + if (!flags) + flags = XBF_LOCK | XBF_MAPPED; + + bp = xfs_buf_read(target, blkno, len, flags); if (!bp) return XFS_ERROR(EIO); error = XFS_BUF_GETERROR(bp); @@ -336,3 +335,25 @@ xfs_bwrite( } return (error); } + +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + xfs_extlen_t extsz; + + if (unlikely(XFS_IS_REALTIME_INODE(ip))) { + extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + ? ip->i_d.di_extsize + : ip->i_mount->m_sb.sb_rextsize; + ASSERT(extsz); + } else { + extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) + ? ip->i_d.di_extsize : 0; + } + + return extsz; +} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index f5e4874c37d8..571f2174435c 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -37,34 +37,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) } /* - * Flags for xfs_free_eofblocks - */ -#define XFS_FREE_EOF_LOCK (1<<0) -#define XFS_FREE_EOF_NOLOCK (1<<1) - - -/* - * helper function to extract extent size hint from inode - */ -STATIC_INLINE xfs_extlen_t -xfs_get_extsz_hint( - xfs_inode_t *ip) -{ - xfs_extlen_t extsz; - - if (unlikely(XFS_IS_REALTIME_INODE(ip))) { - extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) - ? ip->i_d.di_extsize - : ip->i_mount->m_sb.sb_rextsize; - ASSERT(extsz); - } else { - extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) - ? ip->i_d.di_extsize : 0; - } - return extsz; -} - -/* * Prototypes for functions in xfs_rw.c. */ extern int xfs_write_clear_setuid(struct xfs_inode *ip); @@ -76,5 +48,6 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, struct xfs_buf **bpp); extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, xfs_buf_t *bp, xfs_daddr_t blkno); +extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 66b849358e62..237badcbac3b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -236,19 +236,20 @@ xfs_trans_alloc( uint type) { xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); - return _xfs_trans_alloc(mp, type); + return _xfs_trans_alloc(mp, type, KM_SLEEP); } xfs_trans_t * _xfs_trans_alloc( xfs_mount_t *mp, - uint type) + uint type, + uint memflags) { xfs_trans_t *tp; atomic_inc(&mp->m_active_trans); - tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); + tp = kmem_zone_zalloc(xfs_trans_zone, memflags); tp->t_magic = XFS_TRANS_MAGIC; tp->t_type = type; tp->t_mountp = mp; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index ed47fc77759c..ca64f33c63a3 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -100,6 +100,49 @@ typedef struct xfs_trans_header { #define XFS_TRANS_TYPE_MAX 41 /* new transaction types need to be reflected in xfs_logprint(8) */ +#define XFS_TRANS_TYPES \ + { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ + { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ + { XFS_TRANS_INACTIVE, "INACTIVE" }, \ + { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ + { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ + { XFS_TRANS_REMOVE, "REMOVE" }, \ + { XFS_TRANS_LINK, "LINK" }, \ + { XFS_TRANS_RENAME, "RENAME" }, \ + { XFS_TRANS_MKDIR, "MKDIR" }, \ + { XFS_TRANS_RMDIR, "RMDIR" }, \ + { XFS_TRANS_SYMLINK, "SYMLINK" }, \ + { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ + { XFS_TRANS_GROWFS, "GROWFS" }, \ + { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ + { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ + { XFS_TRANS_WRITEID, "WRITEID" }, \ + { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ + { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ + { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ + { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ + { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ + { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ + { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ + { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ + { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ + { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ + { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ + { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ + { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ + { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ + { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ + { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ + { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ + { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ + { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ + { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ + { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } + /* * This structure is used to track log items associated with * a transaction. It points to the log item and keeps some @@ -782,6 +825,10 @@ typedef struct xfs_log_item { #define XFS_LI_IN_AIL 0x1 #define XFS_LI_ABORTED 0x2 +#define XFS_LI_FLAGS \ + { XFS_LI_IN_AIL, "IN_AIL" }, \ + { XFS_LI_ABORTED, "ABORTED" } + typedef struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); @@ -924,7 +971,7 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces. */ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); -xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); +xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, uint, uint); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 218829e6a152..49130628d5ef 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -38,6 +38,7 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" #include "xfs_rw.h" +#include "xfs_trace.h" STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, @@ -79,11 +80,8 @@ xfs_trans_get_buf(xfs_trans_t *tp, /* * Default to a normal get_buf() call if the tp is NULL. */ - if (tp == NULL) { - bp = xfs_buf_get_flags(target_dev, blkno, len, - flags | BUF_BUSY); - return(bp); - } + if (tp == NULL) + return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); /* * If we find the buffer in the cache with this transaction @@ -98,26 +96,23 @@ xfs_trans_get_buf(xfs_trans_t *tp, } if (bp != NULL) { ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); - if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { - xfs_buftrace("TRANS GET RECUR SHUT", bp); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) XFS_BUF_SUPER_STALE(bp); - } + /* * If the buffer is stale then it was binval'ed * since last read. This doesn't matter since the * caller isn't allowed to use the data anyway. */ - else if (XFS_BUF_ISSTALE(bp)) { - xfs_buftrace("TRANS GET RECUR STALE", bp); + else if (XFS_BUF_ISSTALE(bp)) ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - } + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; - xfs_buftrace("TRANS GET RECUR", bp); - xfs_buf_item_trace("GET RECUR", bip); + trace_xfs_trans_get_buf_recur(bip); return (bp); } @@ -129,7 +124,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, * easily deadlock with our current transaction as well as cause * us to run out of stack space. */ - bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); if (bp == NULL) { return NULL; } @@ -169,8 +164,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, */ XFS_BUF_SET_FSPRIVATE2(bp, tp); - xfs_buftrace("TRANS GET", bp); - xfs_buf_item_trace("GET", bip); + trace_xfs_trans_get_buf(bip); return (bp); } @@ -210,7 +204,7 @@ xfs_trans_getsb(xfs_trans_t *tp, ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; - xfs_buf_item_trace("GETSB RECUR", bip); + trace_xfs_trans_getsb_recur(bip); return (bp); } @@ -252,7 +246,7 @@ xfs_trans_getsb(xfs_trans_t *tp, */ XFS_BUF_SET_FSPRIVATE2(bp, tp); - xfs_buf_item_trace("GETSB", bip); + trace_xfs_trans_getsb(bip); return (bp); } @@ -302,7 +296,7 @@ xfs_trans_read_buf( * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) { - bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); if (!bp) return (flags & XFS_BUF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -350,7 +344,7 @@ xfs_trans_read_buf( ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); ASSERT((XFS_BUF_ISERROR(bp)) == 0); if (!(XFS_BUF_ISDONE(bp))) { - xfs_buftrace("READ_BUF_INCORE !DONE", bp); + trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); XFS_BUF_READ(bp); xfsbdstrat(tp->t_mountp, bp); @@ -375,7 +369,7 @@ xfs_trans_read_buf( * brelse it either. Just get out. */ if (XFS_FORCED_SHUTDOWN(mp)) { - xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp); + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); *bpp = NULL; return XFS_ERROR(EIO); } @@ -385,7 +379,7 @@ xfs_trans_read_buf( bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); - xfs_buf_item_trace("READ RECUR", bip); + trace_xfs_trans_read_buf_recur(bip); *bpp = bp; return 0; } @@ -398,14 +392,13 @@ xfs_trans_read_buf( * easily deadlock with our current transaction as well as cause * us to run out of stack space. */ - bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); if (bp == NULL) { *bpp = NULL; return 0; } if (XFS_BUF_GETERROR(bp) != 0) { XFS_BUF_SUPER_STALE(bp); - xfs_buftrace("READ ERROR", bp); error = XFS_BUF_GETERROR(bp); xfs_ioerror_alert("xfs_trans_read_buf", mp, @@ -464,8 +457,7 @@ xfs_trans_read_buf( */ XFS_BUF_SET_FSPRIVATE2(bp, tp); - xfs_buftrace("TRANS READ", bp); - xfs_buf_item_trace("READ", bip); + trace_xfs_trans_read_buf(bip); *bpp = bp; return 0; @@ -483,7 +475,7 @@ shutdown_abort: ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) != (XFS_B_STALE|XFS_B_DELWRI)); - xfs_buftrace("READ_BUF XFSSHUTDN", bp); + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); xfs_buf_relse(bp); *bpp = NULL; return XFS_ERROR(EIO); @@ -549,13 +541,14 @@ xfs_trans_brelse(xfs_trans_t *tp, lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); ASSERT(lidp != NULL); + trace_xfs_trans_brelse(bip); + /* * If the release is just for a recursive lock, * then decrement the count and return. */ if (bip->bli_recur > 0) { bip->bli_recur--; - xfs_buf_item_trace("RELSE RECUR", bip); return; } @@ -563,10 +556,8 @@ xfs_trans_brelse(xfs_trans_t *tp, * If the buffer is dirty within this transaction, we can't * release it until we commit. */ - if (lidp->lid_flags & XFS_LID_DIRTY) { - xfs_buf_item_trace("RELSE DIRTY", bip); + if (lidp->lid_flags & XFS_LID_DIRTY) return; - } /* * If the buffer has been invalidated, then we can't release @@ -574,13 +565,10 @@ xfs_trans_brelse(xfs_trans_t *tp, * as part of this transaction. This prevents us from pulling * the item from the AIL before we should. */ - if (bip->bli_flags & XFS_BLI_STALE) { - xfs_buf_item_trace("RELSE STALE", bip); + if (bip->bli_flags & XFS_BLI_STALE) return; - } ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - xfs_buf_item_trace("RELSE", bip); /* * Free up the log item descriptor tracking the released item. @@ -677,7 +665,7 @@ xfs_trans_bjoin(xfs_trans_t *tp, */ XFS_BUF_SET_FSPRIVATE2(bp, tp); - xfs_buf_item_trace("BJOIN", bip); + trace_xfs_trans_bjoin(bip); } /* @@ -701,7 +689,7 @@ xfs_trans_bhold(xfs_trans_t *tp, ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; - xfs_buf_item_trace("BHOLD", bip); + trace_xfs_trans_bhold(bip); } /* @@ -724,7 +712,8 @@ xfs_trans_bhold_release(xfs_trans_t *tp, ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); bip->bli_flags &= ~XFS_BLI_HOLD; - xfs_buf_item_trace("BHOLD RELEASE", bip); + + trace_xfs_trans_bhold_release(bip); } /* @@ -770,6 +759,8 @@ xfs_trans_log_buf(xfs_trans_t *tp, XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; + trace_xfs_trans_log_buf(bip); + /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer @@ -777,7 +768,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, * because we have a reference to the buffer this entire time. */ if (bip->bli_flags & XFS_BLI_STALE) { - xfs_buf_item_trace("BLOG UNSTALE", bip); bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); @@ -792,7 +782,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, lidp->lid_flags &= ~XFS_LID_BUF_STALE; bip->bli_flags |= XFS_BLI_LOGGED; xfs_buf_item_log(bip, first, last); - xfs_buf_item_trace("BLOG", bip); } @@ -831,6 +820,8 @@ xfs_trans_binval( ASSERT(lidp != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); + trace_xfs_trans_binval(bip); + if (bip->bli_flags & XFS_BLI_STALE) { /* * If the buffer is already invalidated, then @@ -843,8 +834,6 @@ xfs_trans_binval( ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); ASSERT(lidp->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); - xfs_buftrace("XFS_BINVAL RECUR", bp); - xfs_buf_item_trace("BINVAL RECUR", bip); return; } @@ -878,8 +867,6 @@ xfs_trans_binval( (bip->bli_format.blf_map_size * sizeof(uint))); lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; tp->t_flags |= XFS_TRANS_DIRTY; - xfs_buftrace("XFS_BINVAL", bp); - xfs_buf_item_trace("BINVAL", bip); } /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index b572f7e840e0..6f268756bf36 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -53,6 +53,7 @@ #include "xfs_log_priv.h" #include "xfs_filestream.h" #include "xfs_vnodeops.h" +#include "xfs_trace.h" int xfs_setattr( @@ -69,7 +70,6 @@ xfs_setattr( uint commit_flags=0; uid_t uid=0, iuid=0; gid_t gid=0, igid=0; - int timeflags = 0; struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; int need_iolock = 1; @@ -134,16 +134,13 @@ xfs_setattr( if (flags & XFS_ATTR_NOLOCK) need_iolock = 0; if (!(mask & ATTR_SIZE)) { - if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || - (mp->m_flags & XFS_MOUNT_WSYNC)) { - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - commit_flags = 0; - if ((code = xfs_trans_reserve(tp, 0, - XFS_ICHANGE_LOG_RES(mp), 0, - 0, 0))) { - lock_flags = 0; - goto error_return; - } + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + commit_flags = 0; + code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), + 0, 0, 0); + if (code) { + lock_flags = 0; + goto error_return; } } else { if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && @@ -294,15 +291,23 @@ xfs_setattr( * or we are explicitly asked to change it. This handles * the semantic difference between truncate() and ftruncate() * as implemented in the VFS. + * + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME + * is a special case where we need to update the times despite + * not having these flags set. For all other operations the + * VFS set these flags explicitly if it wants a timestamp + * update. */ - if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) - timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + if (iattr->ia_size != ip->i_size && + (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + iattr->ia_ctime = iattr->ia_mtime = + current_fs_time(inode->i_sb); + mask |= ATTR_CTIME | ATTR_MTIME; + } if (iattr->ia_size > ip->i_size) { ip->i_d.di_size = iattr->ia_size; ip->i_size = iattr->ia_size; - if (!(flags & XFS_ATTR_DMI)) - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } else if (iattr->ia_size <= ip->i_size || (iattr->ia_size == 0 && ip->i_d.di_nextents)) { @@ -373,9 +378,6 @@ xfs_setattr( ip->i_d.di_gid = gid; inode->i_gid = gid; } - - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; } /* @@ -392,51 +394,37 @@ xfs_setattr( inode->i_mode &= S_IFMT; inode->i_mode |= mode & ~S_IFMT; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; } /* * Change file access or modified times. */ - if (mask & (ATTR_ATIME|ATTR_MTIME)) { - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - ip->i_update_core = 1; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - timeflags &= ~XFS_ICHGTIME_MOD; - timeflags |= XFS_ICHGTIME_CHG; - } - if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET))) - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); + if (mask & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + ip->i_update_core = 1; } - - /* - * Change file inode change time only if ATTR_CTIME set - * AND we have been called by a DMI function. - */ - - if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) { + if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; ip->i_update_core = 1; - timeflags &= ~XFS_ICHGTIME_CHG; + } + if (mask & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + ip->i_update_core = 1; } /* - * Send out timestamp changes that need to be set to the - * current time. Not done when called by a DMI function. + * And finally, log the inode core if any attribute in it + * has been changed. */ - if (timeflags && !(flags & XFS_ATTR_DMI)) - xfs_ichgtime(ip, timeflags); + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE| + ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); @@ -451,12 +439,10 @@ xfs_setattr( * mix so this probably isn't worth the trouble to optimize. */ code = 0; - if (tp) { - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(tp); + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); - code = xfs_trans_commit(tp, commit_flags); - } + code = xfs_trans_commit(tp, commit_flags); xfs_iunlock(ip, lock_flags); @@ -538,9 +524,8 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), - XBF_LOCK | XBF_MAPPED | - XBF_DONT_BLOCK); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), + XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); error = XFS_BUF_GETERROR(bp); if (error) { xfs_ioerror_alert("xfs_readlink", @@ -709,6 +694,11 @@ xfs_fsync( } /* + * Flags for xfs_free_eofblocks + */ +#define XFS_FREE_EOF_TRYLOCK (1<<0) + +/* * This is called by xfs_inactive to free any blocks beyond eof * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. @@ -726,7 +716,6 @@ xfs_free_eofblocks( xfs_filblks_t map_len; int nimaps; xfs_bmbt_irec_t imap; - int use_iolock = (flags & XFS_FREE_EOF_LOCK); /* * Figure out if there are any blocks beyond the end @@ -768,14 +757,19 @@ xfs_free_eofblocks( * cache and we can't * do that within a transaction. */ - if (use_iolock) + if (flags & XFS_FREE_EOF_TRYLOCK) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + xfs_trans_cancel(tp, 0); + return 0; + } + } else { xfs_ilock(ip, XFS_IOLOCK_EXCL); + } error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, ip->i_size); if (error) { xfs_trans_cancel(tp, 0); - if (use_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; } @@ -812,8 +806,7 @@ xfs_free_eofblocks( error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); } - xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL) - : XFS_ILOCK_EXCL)); + xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL); } return error; } @@ -1113,7 +1106,17 @@ xfs_release( (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { - error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); + + /* + * If we can't get the iolock just skip truncating + * the blocks past EOF because we could deadlock + * with the mmap_sem otherwise. We'll get another + * chance to drop them once the last reference to + * the inode is dropped, so we'll never leak blocks + * permanently. + */ + error = xfs_free_eofblocks(mp, ip, + XFS_FREE_EOF_TRYLOCK); if (error) return error; } @@ -1184,7 +1187,7 @@ xfs_inactive( (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || (ip->i_delayed_blks != 0)))) { - error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK); + error = xfs_free_eofblocks(mp, ip, 0); if (error) return VN_INACTIVE_CACHE; } @@ -1380,7 +1383,6 @@ xfs_lookup( if (error) goto out_free_name; - xfs_itrace_ref(*ipp); return 0; out_free_name: @@ -1526,7 +1528,6 @@ xfs_create( * At this point, we've gotten a newly allocated inode. * It is locked (and joined to the transaction). */ - xfs_itrace_ref(ip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* @@ -1986,9 +1987,6 @@ xfs_remove( if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) xfs_filestream_deassociate(ip); - xfs_itrace_exit(ip); - xfs_itrace_exit(dp); - std_return: if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL, @@ -2285,7 +2283,6 @@ xfs_symlink( goto error_return; goto error1; } - xfs_itrace_ref(ip); /* * An error after we've joined dp to the transaction will result in the @@ -2456,46 +2453,6 @@ xfs_set_dmattrs( return error; } -int -xfs_reclaim( - xfs_inode_t *ip) -{ - - xfs_itrace_entry(ip); - - ASSERT(!VN_MAPPED(VFS_I(ip))); - - /* bad inode, get out here ASAP */ - if (is_bad_inode(VFS_I(ip))) { - xfs_ireclaim(ip); - return 0; - } - - xfs_ioend_wait(ip); - - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); - - /* - * If we have nothing to flush with this inode then complete the - * teardown now, otherwise break the link between the xfs inode and the - * linux inode and clean up the xfs inode later. This avoids flushing - * the inode to disk during the delete operation itself. - * - * When breaking the link, we need to set the XFS_IRECLAIMABLE flag - * first to ensure that xfs_iunpin() will never see an xfs inode - * that has a linux inode being reclaimed. Synchronisation is provided - * by the i_flags_lock. - */ - if (!ip->i_update_core && (ip->i_itemp == NULL)) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_IRECLAIMABLE); - return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); - } - xfs_inode_set_reclaim_tag(ip); - return 0; -} - /* * xfs_alloc_file_space() * This routine allocates disk space for the given file. @@ -2868,7 +2825,6 @@ xfs_free_file_space( ioffset = offset & ~(rounding - 1); if (VN_CACHED(VFS_I(ip)) != 0) { - xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1); error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); if (error) goto out_unlock_iolock; diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index a9e102de71a1..167a467403a5 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -38,7 +38,6 @@ int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, mode_t mode, struct xfs_inode **ipp, cred_t *credp); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); -int xfs_reclaim(struct xfs_inode *ip); int xfs_change_file_space(struct xfs_inode *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, |