Diffstat (limited to 'fs')
284 files changed, 16268 insertions, 10691 deletions
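Note on a recurring conversion in the binfmt hunks below (binfmt_aout.c, binfmt_elf_fdpic.c, binfmt_flat.c): open-coded bprm->file->f_op->read() calls followed by flush_icache_range() are replaced by a single read_code() call. The helper itself is added to fs/exec.c by the same series, beyond the files quoted in this excerpt; the following is a sketch of it reproduced from the upstream tree for reference, not part of the diff body:

	/* fs/exec.c (sketch): common helper for binfmt loaders that read
	 * program text/data straight into already-mapped memory. */
	ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
	{
		/* read len bytes at file offset pos into the target address */
		ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
		/* flush the icache once here instead of in every loader */
		if (res > 0)
			flush_icache_range(addr, addr + len);
		return res;
	}
	EXPORT_SYMBOL(read_code);

The afs/proc.c hunks below make a similar accessor conversion, replacing direct PDE(inode)->data dereferences with PDE_DATA(inode).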
diff --git a/fs/Kconfig b/fs/Kconfig index 780725a463b1..c229f828eb01 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -211,6 +211,7 @@ source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" source "fs/f2fs/Kconfig" +source "fs/efivarfs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 0efd1524b977..370b24cee4d8 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -65,6 +65,20 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS This config option changes the default setting of coredump_filter seen at boot time. If unsure, say Y. +config BINFMT_SCRIPT + tristate "Kernel support for scripts starting with #!" + default y + help + Say Y here if you want to execute interpreted scripts starting with + #! followed by the path to an interpreter. + + You can build this support as a module; however, until that module + gets loaded, you cannot run scripts. Thus, if you want to load this + module from an initramfs, the portion of the initramfs before loading + this module must consist of compiled binaries only. + + Most systems will not boot if you say M or N here. If unsure, say Y. + config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU && (!FRV || BROKEN) diff --git a/fs/Makefile b/fs/Makefile index 9d53192236fc..4fe6df3ec28f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,10 +7,10 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ + ioctl.o readdir.o select.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ - pnode.o drop_caches.o splice.o sync.o utimes.o \ + pnode.o splice.o sync.o utimes.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) @@ -34,10 +34,7 @@ obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o - -# binfmt_script is always there -obj-y += binfmt_script.o - +obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o @@ -49,6 +46,7 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o +obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o @@ -127,3 +125,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/ obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ +obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 096b23f821a1..526e4bbbde59 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -190,7 +190,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) return ret; m = file->private_data; - m->private = PDE(inode)->data; + m->private = PDE_DATA(inode); return 0; } @@ -448,7 +448,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - cell = PDE(inode)->data; + cell = PDE_DATA(inode); if (!cell) return -ENOENT; @@ -554,7 +554,7 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - cell = PDE(inode)->data; + cell = PDE_DATA(inode); if (!cell) return -ENOENT; @@ -659,7 +659,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file 
*file) struct seq_file *m; int ret; - cell = PDE(inode)->data; + cell = PDE_DATA(inode); if (!cell) return -ENOENT; diff --git a/fs/aio.c b/fs/aio.c --- a/fs/aio.c +++ b/fs/aio.c @@ -1029,9 +1029,9 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) spin_unlock(&info->ring_lock); out: - kunmap_atomic(ring); dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, (unsigned long)ring->head, (unsigned long)ring->tail); + kunmap_atomic(ring); return ret; } @@ -1324,6 +1324,8 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) if (iocb->ki_pos < 0) return -EINVAL; + if (opcode == IOCB_CMD_PWRITEV) + file_start_write(file); do { ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], iocb->ki_nr_segs - iocb->ki_cur_seg, @@ -1336,6 +1338,8 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) } while (ret > 0 && iocb->ki_left > 0 && (opcode == IOCB_CMD_PWRITEV || (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); + if (opcode == IOCB_CMD_PWRITEV) + file_end_write(file); /* This means we must have transferred all that we could */ /* No need to retry anymore */ @@ -1790,7 +1794,5 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, ret = read_events(ioctx, min_nr, nr, events, timeout); put_ioctx(ioctx); } - - asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); return ret; } diff --git a/fs/befs/btree.c b/fs/befs/btree.c index a66c9b1136e0..74e397db0b8b 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -436,8 +436,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, goto error; } - if ((this_node = (befs_btree_node *) - kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { + if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { befs_error(sb, "befs_btree_read() failed to allocate %u " "bytes of memory", sizeof (befs_btree_node)); goto error; } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index bbc8f8827eac..bce87694f7b0 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -62,7 +62,6 @@ static int aout_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; - current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); dump.signal = cprm->siginfo->si_signo; @@ -287,15 +286,12 @@ static int load_aout_binary(struct linux_binprm * bprm) return error; } - error = bprm->file->f_op->read(bprm->file, - (char __user *)text_addr, - ex.a_text+ex.a_data, &pos); + error = read_code(bprm->file, text_addr, pos, + ex.a_text+ex.a_data); if ((signed long)error < 0) { send_sig(SIGKILL, current, 0); return error; } - - flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); } else { if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) @@ -311,14 +307,9 @@ static int load_aout_binary(struct linux_binprm * bprm) } if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { - loff_t pos = fd_offset; vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - bprm->file->f_op->read(bprm->file, - (char __user *)N_TXTADDR(ex), - ex.a_text+ex.a_data, &pos); - flush_icache_range((unsigned long) N_TXTADDR(ex), - (unsigned long) N_TXTADDR(ex) + - ex.a_text+ex.a_data); + read_code(bprm->file, N_TXTADDR(ex), fd_offset, + ex.a_text + ex.a_data); goto beyond_if; } @@ -397,8 +388,6 @@ static int load_aout_library(struct file *file) start_addr = ex.a_entry & 0xfffff000; if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { - loff_t pos = N_TXTOFF(ex); - if (printk_ratelimit()) { printk(KERN_WARNING @@ -407,11 +396,8 @@ static int load_aout_library(struct file *file) }
vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - file->f_op->read(file, (char __user *)start_addr, - ex.a_text + ex.a_data, &pos); - flush_icache_range((unsigned long) start_addr, - (unsigned long) start_addr + ex.a_text + ex.a_data); - + read_code(file, start_addr, N_TXTOFF(ex), + ex.a_text + ex.a_data); retval = 0; goto out; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3939829f6c5c..f8a0b0efda44 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); +#ifdef ELF_HWCAP2 + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, @@ -803,7 +806,8 @@ static int load_elf_binary(struct linux_binprm *bprm) * follow the loader, and is not movable. */ #ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE /* Memory randomization might have been switched off - * in runtime via sysctl. + * in runtime via sysctl or explicit setting of + * personality flags. * If that is the case, retain the original non-zero * load_bias value in order to establish proper * non-randomized mappings. @@ -1137,6 +1141,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, goto whole; if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) goto whole; + return 0; } /* Do not dump I/O mapped devices or special mappings */ @@ -2090,8 +2095,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto cleanup; has_dumped = 1; - current->flags |= PF_DUMPCORE; - + fs = get_fs(); set_fs(KERNEL_DS); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9c13e023e2b7..c166f325a183 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -483,7 +483,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, size_t platform_len = 0, len; char *k_platform, *k_base_platform; char __user *u_platform, *u_base_platform, *p; - long hwcap; int loop; int nr; /* reset for each csp adjustment */ @@ -502,8 +501,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, return -EFAULT; #endif - hwcap = ELF_HWCAP; - /* * If this architecture has a platform capability string, copy it * to userspace. 
In some cases (Sparc), this info is impossible @@ -617,7 +614,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, nr = 0; csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_HWCAP, hwcap); + NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); +#ifdef ELF_HWCAP2 + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); @@ -926,7 +926,6 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; - loff_t fpos; int loop, ret; load_addr = params->load_addr; @@ -964,14 +963,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( if (params->phdrs[loop].p_type != PT_LOAD) continue; - fpos = phdr->p_offset; - seg->addr = maddr + (phdr->p_vaddr - base); seg->p_vaddr = phdr->p_vaddr; seg->p_memsz = phdr->p_memsz; - ret = file->f_op->read(file, (void *) seg->addr, - phdr->p_filesz, &fpos); + ret = read_code(file, seg->addr, phdr->p_offset, + phdr->p_filesz); if (ret < 0) return ret; @@ -1687,8 +1684,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fill_elf_fdpic_header(elf, e_phnum); has_dumped = 1; - current->flags |= PF_DUMPCORE; - /* * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 2036d21baaef..d50bbe59da1e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -207,11 +207,12 @@ static int decompress_exec( /* Read in first chunk of data and parse gzip header. */ fpos = offset; - ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + ret = kernel_read(bprm->file, offset, buf, LBUFSIZE); strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; + fpos += ret; retval = -ENOEXEC; @@ -277,7 +278,7 @@ static int decompress_exec( } while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { - ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE); if (ret <= 0) break; len -= ret; @@ -285,6 +286,7 @@ static int decompress_exec( strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; + fpos += ret; } if (ret < 0) { @@ -428,6 +430,7 @@ static int load_flat_file(struct linux_binprm * bprm, unsigned long textpos = 0, datapos = 0, result; unsigned long realdatastart = 0; unsigned long text_len, data_len, bss_len, stack_len, flags; + unsigned long full_data; unsigned long len, memp = 0; unsigned long memp_size, extra, rlim; unsigned long *reloc = 0, *rp; @@ -451,6 +454,7 @@ static int load_flat_file(struct linux_binprm * bprm, relocs = ntohl(hdr->reloc_count); flags = ntohl(hdr->flags); rev = ntohl(hdr->rev); + full_data = data_len + relocs * sizeof(unsigned long); if (strncmp(hdr->magic, "bFLT", 4)) { /* @@ -577,12 +581,12 @@ static int load_flat_file(struct linux_binprm * bprm, #ifdef CONFIG_BINFMT_ZFLAT if (flags & FLAT_FLAG_GZDATA) { result = decompress_exec(bprm, fpos, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), 0); + full_data, 0); } else #endif { - result = bprm->file->f_op->read(bprm->file, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), &fpos); + result = read_code(bprm->file, datapos, fpos, + full_data); } if (IS_ERR_VALUE(result)) { printk("Unable to read data+bss, errno %d\n", (int)-result); @@ -627,30 +631,25 @@ static int load_flat_file(struct linux_binprm * bprm, if (flags & FLAT_FLAG_GZIP) { result = decompress_exec(bprm, sizeof 
(struct flat_hdr), (((char *) textpos) + sizeof (struct flat_hdr)), - (text_len + data_len + (relocs * sizeof(unsigned long)) + (text_len + full_data - sizeof (struct flat_hdr)), 0); memmove((void *) datapos, (void *) realdatastart, - data_len + (relocs * sizeof(unsigned long))); + full_data); } else if (flags & FLAT_FLAG_GZDATA) { - fpos = 0; - result = bprm->file->f_op->read(bprm->file, - (char *) textpos, text_len, &fpos); + result = read_code(bprm->file, textpos, 0, text_len); if (!IS_ERR_VALUE(result)) result = decompress_exec(bprm, text_len, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), 0); + full_data, 0); } else #endif { - fpos = 0; - result = bprm->file->f_op->read(bprm->file, - (char *) textpos, text_len, &fpos); - if (!IS_ERR_VALUE(result)) { - fpos = ntohl(hdr->data_start); - result = bprm->file->f_op->read(bprm->file, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), &fpos); - } + result = read_code(bprm->file, textpos, 0, text_len); + if (!IS_ERR_VALUE(result)) + result = read_code(bprm->file, datapos, + ntohl(hdr->data_start), + full_data); } if (IS_ERR_VALUE(result)) { printk("Unable to read code+data+bss, errno %d\n",(int)-result); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 751df5e4f61a..1c740e152f38 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -23,6 +23,7 @@ #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/ctype.h> +#include <linux/string_helpers.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/namei.h> @@ -234,24 +235,6 @@ static char *scanarg(char *s, char del) return s; } -static int unquote(char *from) -{ - char c = 0, *s = from, *p = from; - - while ((c = *s++) != '\0') { - if (c == '\\' && *s == 'x') { - s++; - c = toupper(*s++); - *p = (c - (isdigit(c) ? '0' : 'A' - 10)) << 4; - c = toupper(*s++); - *p++ |= c - (isdigit(c) ? 
'0' : 'A' - 10); - continue; - } - *p++ = c; - } - return p - from; -} - static char * check_special_flags (char * sfs, Node * e) { char * p = sfs; @@ -354,8 +337,9 @@ static Node *create_entry(const char __user *buffer, size_t count) p[-1] = '\0'; if (!e->mask[0]) e->mask = NULL; - e->size = unquote(e->magic); - if (e->mask && unquote(e->mask) != e->size) + e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); + if (e->mask && + string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto Einval; if (e->size + e->offset > BINPRM_BUF_SIZE) goto Einval; diff --git a/fs/bio.c b/fs/bio.c --- a/fs/bio.c +++ b/fs/bio.c @@ -1428,8 +1428,6 @@ void bio_endio(struct bio *bio, int error) else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - trace_block_bio_complete(bio, error); - if (bio->bi_end_io) bio->bi_end_io(bio, error); } diff --git a/fs/block_dev.c b/fs/block_dev.c index aea605c98ba6..ce08de7467a3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -551,6 +551,7 @@ struct block_device *bdgrab(struct block_device *bdev) ihold(bdev->bd_inode); return bdev; } +EXPORT_SYMBOL(bdgrab); long nr_blockdev_pages(void) { @@ -616,11 +617,9 @@ void bd_forget(struct inode *inode) struct block_device *bdev = NULL; spin_lock(&bdev_lock); - if (inode->i_bdev) { - if (!sb_is_blkdev_sb(inode->i_sb)) - bdev = inode->i_bdev; - __bd_forget(inode); - } + if (!sb_is_blkdev_sb(inode->i_sb)) + bdev = inode->i_bdev; + __bd_forget(inode); spin_unlock(&bdev_lock); if (bdev) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ecd25a1b4e51..ca9d8f1a3bb6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -651,6 +651,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; + __tree_mod_log_free_eb(fs_info, old_root); + ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) goto out; @@ -736,7 +738,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) + unsigned long src_offset, int nr_items, int log_removal) { int ret; int i; @@ -750,10 +752,12 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, } for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (log_removal) { + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, - MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } ret = tree_mod_log_insert_key_locked(fs_info, dst, i + dst_offset, MOD_LOG_KEY_ADD); @@ -927,7 +931,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -1046,6 +1049,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1750,7 +1754,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); @@ -2995,7 +2998,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items =
min(src_nritems - 8, push_items); tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3066,7 +3069,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, sizeof(struct btrfs_key_ptr)); tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + src_nritems - push_items, push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3218,12 +3221,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans, int mid; int ret; u32 c_nritems; + int tree_mod_log_removal = 1; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { /* trying to split the root, lets make a new one */ ret = insert_new_root(trans, root, path, level + 1); + /* + * removal of root nodes has been logged by + * tree_mod_log_set_root_pointer due to locking + */ + tree_mod_log_removal = 0; if (ret) return ret; } else { @@ -3261,7 +3270,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, + tree_mod_log_removal); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7d84651e850b..6d19a0a554aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1291,6 +1291,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); + leaf = NULL; goto fail; } @@ -1334,11 +1335,16 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(leaf); + return root; + fail: - if (ret) - return ERR_PTR(ret); + if (leaf) { + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + } + kfree(root); - return root; + return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, @@ -3253,7 +3259,7 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); btrfs_free_log_root_tree(NULL, fs_info); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9ac2eca681eb..3d551231caba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -257,7 +257,8 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -265,13 +266,17 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + kfree(logical); + return ret; + } } kfree(logical); @@ -4438,7 +4443,7 @@ static void 
update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = num_bytes; + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -4793,14 +4798,49 @@ out_fail: * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; + } spin_unlock(&BTRFS_I(inode)->lock); if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -7947,7 +7987,17 @@ int btrfs_read_block_groups(struct btrfs_root *root) * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. */ - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + goto error; + } /* * check for two cases, either we are full, and therefore @@ -8089,7 +8139,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. 
+ */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + return ret; + } add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5af6461..cdee391fc7bf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1257,6 +1257,39 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) GFP_NOFS); } +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + return 0; +} + /* * helper function to set both pages and extents in the tree writeback */ diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6068a1985560..258c92156857 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -325,6 +325,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ec160202be3e..c4628a201cb3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,9 +118,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -728,7 +730,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums->sums; - trans->adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -899,7 +900,6 @@ next_sector: goto again; } out: - trans->adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5b4ea5f55b8f..bb8b7a0e28a6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1514,8 +1514,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, size_t count, ocount; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); - sb_start_write(inode->i_sb); - mutex_lock(&inode->i_mutex); err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); @@ -1617,7 +1615,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); out: - sb_end_write(inode->i_sb); 
current->backing_dev_info = NULL; return num_written ? num_written : err; } @@ -2142,6 +2139,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2169,6 +2167,11 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); + if (ret) + goto out_reserve_fail; + } /* * wait for ordered IO before we have any locks. We'll loop again @@ -2272,6 +2275,9 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, alloc_end - alloc_start); +out_reserve_fail: /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca1b767d51f7..09c58a35b429 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -353,6 +353,7 @@ static noinline int compress_file_range(struct inode *inode, int i; int will_compress; int compress_type = root->fs_info->compress_type; + int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ if ((end - start + 1) < 16 * 1024 && @@ -415,6 +416,17 @@ again: if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; + /* + * we need to call clear_page_dirty_for_io on each + * page in the range. Otherwise applications with the file + * mmap'd can wander in and change the page contents while + * we are compressing them. + * + * If the compression fails for any reason, we set the pages + * dirty again later on. + */ + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; ret = btrfs_compress_pages(compress_type, inode->i_mapping, start, total_compressed, pages, @@ -554,6 +566,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } + if (redirty) + extent_range_redirty_for_io(inode, start, end); add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; @@ -1743,8 +1757,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { + trans->adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); + trans->adding_csums = 0; } return 0; } @@ -3679,11 +3695,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, * 1 for the dir item * 1 for the dir index * 1 for the inode ref - * 1 for the inode ref in the tree log - * 2 for the dir entries in the log * 1 for the inode */ - trans = btrfs_start_transaction(root, 8); + trans = btrfs_start_transaction(root, 5); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -8127,7 +8141,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. 
*/ - trans = btrfs_start_transaction(root, 20); + trans = btrfs_start_transaction(root, 11); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index dc08d77b717e..005c45db699e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -557,6 +557,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); + mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); while (!list_empty(&splice)) { @@ -600,6 +601,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) cond_resched(); } + mutex_unlock(&root->fs_info->ordered_operations_mutex); } /* diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5471e47d6559..b44124dd2370 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1153,7 +1153,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, sgn > 0 ? node->seq - 1 : node->seq, &roots); if (ret < 0) - goto out; + return ret; spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -1275,7 +1275,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = 0; unlock: spin_unlock(&fs_info->qgroup_lock); -out: ulist_free(roots); ulist_free(tmp); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 53c3501fa4ca..85e072b956d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -542,7 +542,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -558,7 +557,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret < 0 ? -1 : ref_level, ret < 0 ? 
-1 : ref_root); } while (ret != 1); + btrfs_release_path(path); } else { + btrfs_release_path(path); swarn.path = path; swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f7a8b861058b..c85e7c6b4598 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3945,12 +3945,10 @@ static int is_extent_unchanged(struct send_ctx *sctx, found_key.type != key.type) { key.offset += right_len; break; - } else { - if (found_key.offset != key.offset + right_len) { - /* Should really not happen */ - ret = -EIO; - goto out; - } + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; } key = found_key; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 451fad96ecd1..ef96381569a4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -317,6 +317,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, unsigned long src_ptr; unsigned long dst_ptr; int overwrite_root = 0; + bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) overwrite_root = 1; @@ -326,6 +327,9 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, /* look for the key in the destination tree */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + if (ret == 0) { char *src_copy; char *dst_copy; @@ -367,6 +371,30 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, return 0; } + /* + * We need to load the old nbytes into the inode so when we + * replay the extents we've logged we get the right nbytes. + */ + if (inode_item) { + struct btrfs_inode_item *item; + u64 nbytes; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + nbytes = btrfs_inode_nbytes(path->nodes[0], item); + item = btrfs_item_ptr(eb, slot, + struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, nbytes); + } + } else if (inode_item) { + struct btrfs_inode_item *item; + + /* + * New inode, set nbytes to 0 so that the nbytes comes out + * properly when we replay the extents. + */ + item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, 0); } insert: btrfs_release_path(path); @@ -486,7 +514,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, int found_type; u64 extent_end; u64 start = key->offset; - u64 saved_nbytes; + u64 nbytes = 0; struct btrfs_file_extent_item *item; struct inode *inode = NULL; unsigned long size; @@ -496,10 +524,19 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(eb, item); if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) - extent_end = start + btrfs_file_extent_num_bytes(eb, item); - else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + nbytes = btrfs_file_extent_num_bytes(eb, item); + extent_end = start + nbytes; + + /* + * We don't add to the inodes nbytes if we are prealloc or a + * hole. 
+ */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + nbytes = 0; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_inline_len(eb, item); + nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, root->sectorsize); } else { ret = 0; @@ -548,7 +585,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); BUG_ON(ret); @@ -635,7 +671,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } - inode_set_bytes(inode, saved_nbytes); + inode_add_bytes(inode, nbytes); ret = btrfs_update_inode(trans, root, inode); out: if (inode) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5989a92236f7..2854c824ab64 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4935,7 +4935,18 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, em = lookup_extent_mapping(em_tree, chunk_start, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start != chunk_start); + if (!em) { + printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + chunk_start); + return -EIO; + } + + if (em->start != chunk_start) { + printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + em->start, chunk_start); + free_extent_map(em); + return -EIO; + } map = (struct map_lookup *)em->bdev; length = em->len; diff --git a/fs/buffer.c b/fs/buffer.c index b4dcb34c9635..bc1fe14aaa3e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -865,8 +865,6 @@ try_again: /* Link the buffer to its page */ set_bh_page(bh, page, offset); - - init_buffer(bh, NULL, NULL); } return head; /* @@ -2949,7 +2947,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) } } -int submit_bh(int rw, struct buffer_head * bh) +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) { struct bio *bio; int ret = 0; @@ -2984,10 +2982,16 @@ int submit_bh(int rw, struct buffer_head * bh) bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; + bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ guard_bh_eod(rw, bio, bh); + if (buffer_meta(bh)) + rw |= REQ_META; + if (buffer_prio(bh)) + rw |= REQ_PRIO; + bio_get(bio); submit_bio(rw, bio); @@ -2997,6 +3001,12 @@ int submit_bh(int rw, struct buffer_head * bh) bio_put(bio); return ret; } +EXPORT_SYMBOL_GPL(_submit_bh); + +int submit_bh(int rw, struct buffer_head *bh) +{ + return _submit_bh(rw, bh, 0); +} EXPORT_SYMBOL(submit_bh); /** diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 480992259707..317f9ee9c991 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -962,12 +962,14 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) } data = kmap(page); + file_start_write(file); old_fs = get_fs(); set_fs(KERNEL_DS); ret = file->f_op->write( file, (const void __user *) data, len, &pos); set_fs(old_fs); kunmap(page); + file_end_write(file); if (ret != len) ret = -EIO; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 991c63c6bdd0..21b3a291c327 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1575,14 +1575,24 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } break; case Opt_blank_pass: - vol->password = NULL; - break; - case Opt_pass: /* passwords have to be handled differently * to allow the character used for deliminator * to be passed within them */ + /* + * 
Check if this is a case where the password + * starts with a delimiter + */ + tmp_end = strchr(data, '='); + tmp_end++; + if (!(tmp_end < end && tmp_end[1] == delim)) { + /* No it is not. Set the password to NULL */ + vol->password = NULL; + break; + } + /* Yes it is. Drop down to Opt_pass below.*/ + case Opt_pass: /* Obtain the value string */ value = strchr(data, '='); value++; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7a0dd99e4507..2d4a231dd70b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2520,8 +2520,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); - /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. @@ -2545,7 +2543,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, } up_read(&cinode->lock_sem); - sb_end_write(inode->i_sb); return rc; } diff --git a/fs/coda/file.c b/fs/coda/file.c index fa4c100bdc7d..380b798f8443 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -79,6 +79,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo return -EINVAL; host_inode = file_inode(host_file); + file_start_write(host_file); mutex_lock(&coda_inode->i_mutex); ret = host_file->f_op->write(host_file, buf, count, ppos); @@ -87,6 +88,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; mutex_unlock(&coda_inode->i_mutex); + file_end_write(host_file); return ret; } diff --git a/fs/compat.c b/fs/compat.c index d487985dd0ea..d0560c93973d 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -44,7 +44,6 @@ #include <linux/signal.h> #include <linux/poll.h> #include <linux/mm.h> -#include <linux/eventpoll.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/pagemap.h> @@ -1069,210 +1068,6 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, } #endif /* ! 
__ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ -static ssize_t compat_do_readv_writev(int type, struct file *file, - const struct compat_iovec __user *uvector, - unsigned long nr_segs, loff_t *pos) -{ - compat_ssize_t tot_len; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - ssize_t ret; - io_fn_t fn; - iov_fn_t fnv; - - ret = -EINVAL; - if (!file->f_op) - goto out; - - ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); - if (ret <= 0) - goto out; - - tot_len = ret; - ret = rw_verify_area(type, file, pos, tot_len); - if (ret < 0) - goto out; - - fnv = NULL; - if (type == READ) { - fn = file->f_op->read; - fnv = file->f_op->aio_read; - } else { - fn = (io_fn_t)file->f_op->write; - fnv = file->f_op->aio_write; - } - - if (fnv) - ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, - pos, fnv); - else - ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); - -out: - if (iov != iovstack) - kfree(iov); - if ((ret + (type == READ)) > 0) { - if (type == READ) - fsnotify_access(file); - else - fsnotify_modify(file); - } - return ret; -} - -static size_t compat_readv(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) -{ - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_READ)) - goto out; - - ret = -EINVAL; - if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) - goto out; - - ret = compat_do_readv_writev(READ, file, vec, vlen, pos); - -out: - if (ret > 0) - add_rchar(current, ret); - inc_syscr(current); - return ret; -} - -asmlinkage ssize_t -compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen) -{ - struct fd f = fdget(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_readv(f.file, vec, vlen, &pos); - f.file->f_pos = pos; - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PREAD) - ret = compat_readv(f.file, vec, vlen, &pos); - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high) -{ - loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_preadv64(fd, vec, vlen, pos); -} - -static size_t compat_writev(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) -{ - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_WRITE)) - goto out; - - ret = -EINVAL; - if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) - goto out; - - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); - -out: - if (ret > 0) - add_wchar(current, ret); - inc_syscw(current); - return ret; -} - -asmlinkage ssize_t -compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen) -{ - struct fd f = fdget(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_writev(f.file, vec, vlen, &pos); - f.file->f_pos = pos; - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - 
return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PWRITE) - ret = compat_writev(f.file, vec, vlen, &pos); - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high) -{ - loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_pwritev64(fd, vec, vlen, pos); -} - -asmlinkage long -compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, - unsigned int nr_segs, unsigned int flags) -{ - unsigned i; - struct iovec __user *iov; - if (nr_segs > UIO_MAXIOV) - return -EINVAL; - iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); - for (i = 0; i < nr_segs; i++) { - struct compat_iovec v; - if (get_user(v.iov_base, &iov32[i].iov_base) || - get_user(v.iov_len, &iov32[i].iov_len) || - put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || - put_user(v.iov_len, &iov[i].iov_len)) - return -EFAULT; - } - return sys_vmsplice(fd, iov, nr_segs, flags); -} - /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. @@ -1658,84 +1453,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, return ret; } -#ifdef CONFIG_EPOLL - -asmlinkage long compat_sys_epoll_pwait(int epfd, - struct compat_epoll_event __user *events, - int maxevents, int timeout, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - long err; - compat_sigset_t csigmask; - sigset_t ksigmask, sigsaved; - - /* - * If the caller wants a certain signal mask to be set during the wait, - * we apply it here. - */ - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) - return -EFAULT; - sigset_from_compat(&ksigmask, &csigmask); - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - err = sys_epoll_wait(epfd, events, maxevents, timeout); - - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored. 
- */ - if (sigmask) { - if (err == -EINTR) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - } - - return err; -} - -#endif /* CONFIG_EPOLL */ - -#ifdef CONFIG_SIGNALFD - -asmlinkage long compat_sys_signalfd4(int ufd, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize, int flags) -{ - compat_sigset_t ss32; - sigset_t tmp; - sigset_t __user *ksigmask; - - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&tmp, &ss32); - ksigmask = compat_alloc_user_space(sizeof(sigset_t)); - if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) - return -EFAULT; - - return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); -} - -asmlinkage long compat_sys_signalfd(int ufd, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); -} -#endif /* CONFIG_SIGNALFD */ - #ifdef CONFIG_FHANDLE /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it @@ -1747,25 +1464,3 @@ COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, return do_handle_open(mountdirfd, handle, flags); } #endif - -#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE -asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, - compat_off_t __user *offset, compat_size_t count) -{ - loff_t pos; - off_t off; - ssize_t ret; - - if (offset) { - if (unlikely(get_user(off, offset))) - return -EFAULT; - pos = off; - ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); - if (unlikely(put_user(pos, offset))) - return -EFAULT; - return ret; - } - - return do_sendfile(out_fd, in_fd, NULL, count, 0); -} -#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */ diff --git a/fs/coredump.c b/fs/coredump.c index c6479658d487..a9abe313e8d5 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -263,7 +263,6 @@ static int zap_process(struct task_struct *start, int exit_code) struct task_struct *t; int nr = 0; - start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -280,8 +279,8 @@ static int zap_process(struct task_struct *start, int exit_code) return nr; } -static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, - struct core_state *core_state, int exit_code) +static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + struct core_state *core_state, int exit_code) { struct task_struct *g, *p; unsigned long flags; @@ -291,11 +290,16 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; nr = zap_process(tsk, exit_code); + tsk->signal->group_exit_task = tsk; + /* ignore all signals except SIGKILL, see prepare_signal() */ + tsk->signal->flags = SIGNAL_GROUP_COREDUMP; + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); } spin_unlock_irq(&tsk->sighand->siglock); if (unlikely(nr < 0)) return nr; + tsk->flags = PF_DUMPCORE; if (atomic_read(&mm->mm_users) == nr + 1) goto done; /* @@ -340,6 +344,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); nr += zap_process(p, exit_code); + p->signal->flags = SIGNAL_GROUP_EXIT; unlock_task_sighand(p, &flags); } break; @@ -386,11 +391,18 @@ static int coredump_wait(int exit_code, struct core_state *core_state) return core_waiters; } -static void coredump_finish(struct mm_struct
*mm) +static void coredump_finish(struct mm_struct *mm, bool core_dumped) { struct core_thread *curr, *next; struct task_struct *task; + spin_lock_irq(&current->sighand->siglock); + if (core_dumped && !__fatal_signal_pending(current)) + current->signal->group_exit_code |= 0x80; + current->signal->group_exit_task = NULL; + current->signal->flags = SIGNAL_GROUP_EXIT; + spin_unlock_irq(&current->sighand->siglock); + next = mm->core_state->dumper.next; while ((curr = next) != NULL) { next = curr->next; @@ -407,26 +419,38 @@ static void coredump_finish(struct mm_struct *mm) mm->core_state = NULL; } -static void wait_for_dump_helpers(struct file *file) +static bool dump_interrupted(void) { - struct pipe_inode_info *pipe; + /* + * SIGKILL or freezing() interrupt the coredumping. Perhaps we + * can do try_to_freeze() and check __fatal_signal_pending(), + * but then we need to teach dump_write() to restart and clear + * TIF_SIGPENDING. + */ + return signal_pending(current); +} - pipe = file_inode(file)->i_pipe; +static void wait_for_dump_helpers(struct file *file) +{ + struct pipe_inode_info *pipe = file->private_data; pipe_lock(pipe); pipe->readers++; pipe->writers--; + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_unlock(pipe); - while ((pipe->readers > 1) && (!signal_pending(current))) { - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - pipe_wait(pipe); - } + /* + * We actually want wait_event_freezable() but then we need + * to clear TIF_SIGPENDING and improve dump_interrupted(). + */ + wait_event_interruptible(pipe->wait, pipe->readers == 1); + pipe_lock(pipe); pipe->readers--; pipe->writers++; pipe_unlock(pipe); - } /* @@ -471,6 +495,7 @@ void do_coredump(siginfo_t *siginfo) int ispipe; struct files_struct *displaced; bool need_nonrelative = false; + bool core_dumped = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, @@ -514,17 +539,12 @@ void do_coredump(siginfo_t *siginfo) old_cred = override_creds(cred); - /* - * Clear any false indication of pending signals that might - * be seen by the filesystem code called to write the core file.
- */ - clear_thread_flag(TIF_SIGPENDING); - ispipe = format_corename(&cn, &cprm); - if (ispipe) { + if (ispipe) { int dump_count; char **helper_argv; + struct subprocess_info *sub_info; if (ispipe < 0) { printk(KERN_WARNING "format_corename failed\n"); @@ -571,15 +591,20 @@ void do_coredump(siginfo_t *siginfo) goto fail_dropcount; } - retval = call_usermodehelper_fns(helper_argv[0], helper_argv, - NULL, UMH_WAIT_EXEC, umh_pipe_setup, - NULL, &cprm); + retval = -ENOMEM; + sub_info = call_usermodehelper_setup(helper_argv[0], + helper_argv, NULL, GFP_KERNEL, + umh_pipe_setup, NULL, &cprm); + if (sub_info) + retval = call_usermodehelper_exec(sub_info, + UMH_WAIT_EXEC); + argv_free(helper_argv); if (retval) { - printk(KERN_INFO "Core dump to %s pipe failed\n", + printk(KERN_INFO "Core dump to %s pipe failed\n", cn.corename); goto close_fail; - } + } } else { struct inode *inode; @@ -629,9 +654,9 @@ void do_coredump(siginfo_t *siginfo) goto close_fail; if (displaced) put_files_struct(displaced); - retval = binfmt->core_dump(&cprm); - if (retval) - current->signal->group_exit_code |= 0x80; + file_start_write(cprm.file); + core_dumped = !dump_interrupted() && binfmt->core_dump(&cprm); + file_end_write(cprm.file); if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -644,7 +669,7 @@ fail_dropcount: fail_unlock: kfree(cn.corename); fail_corename: - coredump_finish(mm); + coredump_finish(mm, core_dumped); revert_creds(old_cred); fail_creds: put_cred(cred); @@ -659,7 +684,9 @@ fail: */ int dump_write(struct file *file, const void *addr, int nr) { - return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; + return !dump_interrupted() && + access_ok(VERIFY_READ, addr, nr) && + file->f_op->write(file, addr, nr, &file->f_pos) == nr; } EXPORT_SYMBOL(dump_write); @@ -668,7 +695,8 @@ int dump_seek(struct file *file, loff_t off) int ret = 1; if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + if (dump_interrupted() || + file->f_op->llseek(file, off, SEEK_CUR) < 0) return 0; } else { char *buf = (char *)get_zeroed_page(GFP_KERNEL); diff --git a/fs/dcache.c b/fs/dcache.c index e8bc3420d63e..e689268046c3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1230,8 +1230,10 @@ void shrink_dcache_parent(struct dentry * parent) LIST_HEAD(dispose); int found; - while ((found = select_parent(parent, &dispose)) != 0) + while ((found = select_parent(parent, &dispose)) != 0) { shrink_dentry_list(&dispose); + cond_resched(); + } } EXPORT_SYMBOL(shrink_dcache_parent); diff --git a/fs/dcookies.c b/fs/dcookies.c index 17c779967828..ab5954b50267 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -25,6 +25,7 @@ #include <linux/dcookies.h> #include <linux/mutex.h> #include <linux/path.h> +#include <linux/compat.h> #include <asm/uaccess.h> /* The dcookies are allocated from a kmem_cache and @@ -145,7 +146,7 @@ out: /* And here is where the userspace process can look up the cookie value * to retrieve the path. 
*/ -SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) +SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) { unsigned long cookie = (unsigned long)cookie64; int err = -EINVAL; @@ -201,12 +202,16 @@ out: mutex_unlock(&dcookie_mutex); return err; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len) + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) { - return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len); +#ifdef __BIG_ENDIAN + return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); +#else + return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); +#endif } -SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie); #endif static int dcookie_init(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index f853263cf74f..cfb816dc6d9f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -672,12 +672,6 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, if (sdio->final_block_in_bio != sdio->cur_page_block || cur_offset != bio_next_offset) dio_bio_submit(dio, sdio); - /* - * Submit now if the underlying fs is about to perform a - * metadata read - */ - else if (sdio->boundary) - dio_bio_submit(dio, sdio); } if (sdio->bio == NULL) { @@ -737,16 +731,6 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_block + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { sdio->cur_page_len += len; - - /* - * If sdio->boundary then we want to schedule the IO now to - * avoid metadata seeks. - */ - if (sdio->boundary) { - ret = dio_send_cur_page(dio, sdio, map_bh); - page_cache_release(sdio->cur_page); - sdio->cur_page = NULL; - } goto out; } @@ -758,7 +742,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, page_cache_release(sdio->cur_page); sdio->cur_page = NULL; if (ret) - goto out; + return ret; } page_cache_get(page); /* It is in dio */ @@ -768,6 +752,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_block = blocknr; sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: + /* + * If sdio->boundary then we want to schedule the IO now to + * avoid metadata seeks. 
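
The "boundary" flag referenced here comes from the filesystem's get_block callback: it marks the last data block that sits just before an on-disk metadata block (e.g. an indirect block), so pages queued so far should be submitted before the disk seeks to read that metadata. A toy illustration of where such boundaries fall, assuming an ext2-style layout with 12 direct pointers and single indirection only (not kernel code):

#include <stdbool.h>
#include <stdint.h>

/* Block 11 is the last direct block; after that, every run of
 * ptrs_per_block data blocks ends just before the next indirect
 * block must be read. */
static bool at_metadata_boundary(uint64_t file_block,
                                 unsigned ptrs_per_block)
{
        return file_block >= 11 &&
               (file_block - 11) % ptrs_per_block == 0;
}

The structural point of this change is that submit_page_section() now funnels every boundary through the single submission site at its out: label, instead of two special-cased early submits.
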
+ */ + if (sdio->boundary) { + ret = dio_send_cur_page(dio, sdio, map_bh); + dio_bio_submit(dio, sdio); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; + } return ret; } @@ -969,7 +963,8 @@ do_holes: this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - sdio->boundary = buffer_boundary(map_bh); + if (this_chunk_blocks == sdio->blocks_available) + sdio->boundary = buffer_boundary(map_bh); ret = submit_page_section(dio, sdio, page, offset_in_page, this_chunk_bytes, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4f5ad246582f..d0ccd2fd79eb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -52,8 +52,8 @@ #include <linux/mutex.h> #include <linux/sctp.h> #include <linux/slab.h> +#include <linux/sctp.h> #include <net/sctp/sctp.h> -#include <net/sctp/user.h> #include <net/ipv6.h> #include "dlm_internal.h" diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 01fd5c11a7fb..f704458ea5f5 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -247,6 +247,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct dlm_ls *ls; struct plock_op *op; int rv; + unsigned char fl_flags = fl->fl_flags; ls = dlm_find_lockspace_local(lockspace); if (!ls) @@ -258,9 +259,18 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, goto out; } - if (posix_lock_file_wait(file, fl) < 0) - log_error(ls, "dlm_posix_unlock: vfs unlock error %llx", - (unsigned long long)number); + /* cause the vfs unlock to return ENOENT if lock is not found */ + fl->fl_flags |= FL_EXISTS; + + rv = posix_lock_file_wait(file, fl); + if (rv == -ENOENT) { + rv = 0; + goto out_free; + } + if (rv < 0) { + log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx", + rv, (unsigned long long)number); + } op->info.optype = DLM_PLOCK_OP_UNLOCK; op->info.pid = fl->fl_pid; @@ -296,9 +306,11 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (rv == -ENOENT) rv = 0; +out_free: kfree(op); out: dlm_put_lockspace(ls); + fl->fl_flags = fl_flags; return rv; } EXPORT_SYMBOL_GPL(dlm_posix_unlock); diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 412e6eda25f8..e4141f257495 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -80,13 +80,6 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) int rc; mutex_lock(&ecryptfs_daemon_hash_mux); - rc = try_module_get(THIS_MODULE); - if (rc == 0) { - rc = -EIO; - printk(KERN_ERR "%s: Error attempting to increment module use " - "count; rc = [%d]\n", __func__, rc); - goto out_unlock_daemon_list; - } rc = ecryptfs_find_daemon_by_euid(&daemon); if (!rc) { rc = -EINVAL; @@ -96,7 +89,7 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) if (rc) { printk(KERN_ERR "%s: Error attempting to spawn daemon; " "rc = [%d]\n", __func__, rc); - goto out_module_put_unlock_daemon_list; + goto out_unlock_daemon_list; } mutex_lock(&daemon->mux); if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { @@ -108,9 +101,6 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) atomic_inc(&ecryptfs_num_miscdev_opens); out_unlock_daemon: mutex_unlock(&daemon->mux); -out_module_put_unlock_daemon_list: - if (rc) - module_put(THIS_MODULE); out_unlock_daemon_list: mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; @@ -147,7 +137,6 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file) "bug.\n", __func__, rc); BUG(); } - module_put(THIS_MODULE); return rc; } @@ -471,6 +460,7 @@ out_free: static const struct file_operations 
ecryptfs_miscdev_fops = { + .owner = THIS_MODULE, .open = ecryptfs_miscdev_open, .poll = ecryptfs_miscdev_poll, .read = ecryptfs_miscdev_read, diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig new file mode 100644 index 000000000000..367bbb10c543 --- /dev/null +++ b/fs/efivarfs/Kconfig @@ -0,0 +1,12 @@ +config EFIVAR_FS + tristate "EFI Variable filesystem" + depends on EFI + help + efivarfs is a replacement filesystem for the old EFI + variable support via sysfs, as it doesn't suffer from the + same 1024-byte variable size limit. + + To compile this file system support as a module, choose M + here. The module will be called efivarfs. + + If unsure, say N. diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile new file mode 100644 index 000000000000..955d478177d5 --- /dev/null +++ b/fs/efivarfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the efivarfs filesystem +# + +obj-$(CONFIG_EFIVAR_FS) += efivarfs.o + +efivarfs-objs := inode.o file.o super.o diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c new file mode 100644 index 000000000000..bfb531564319 --- /dev/null +++ b/fs/efivarfs/file.c @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/slab.h> + +#include "internal.h" + +static ssize_t efivarfs_file_write(struct file *file, + const char __user *userbuf, size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + void *data; + u32 attributes; + struct inode *inode = file->f_mapping->host; + unsigned long datasize = count - sizeof(attributes); + ssize_t bytes = 0; + bool set = false; + + if (count < sizeof(attributes)) + return -EINVAL; + + if (copy_from_user(&attributes, userbuf, sizeof(attributes))) + return -EFAULT; + + if (attributes & ~(EFI_VARIABLE_MASK)) + return -EINVAL; + + data = kmalloc(datasize, GFP_KERNEL); + if (!data) + return -ENOMEM; + + if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) { + bytes = -EFAULT; + goto out; + } + + bytes = efivar_entry_set_get_size(var, attributes, &datasize, + data, &set); + if (!set && bytes) + goto out; + + if (bytes == -ENOENT) { + drop_nlink(inode); + d_delete(file->f_dentry); + dput(file->f_dentry); + } else { + mutex_lock(&inode->i_mutex); + i_size_write(inode, datasize + sizeof(attributes)); + mutex_unlock(&inode->i_mutex); + } + + bytes = count; + +out: + kfree(data); + + return bytes; +} + +static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + unsigned long datasize = 0; + u32 attributes; + void *data; + ssize_t size = 0; + int err; + + err = efivar_entry_size(var, &datasize); + if (err) + return err; + + data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + size = efivar_entry_get(var, &attributes, &datasize, + data + sizeof(attributes)); + if (size) + goto out_free; + + memcpy(data, &attributes, sizeof(attributes)); + size = simple_read_from_buffer(userbuf, count, ppos, + data, datasize + sizeof(attributes)); +out_free: + kfree(data); + + return size; +} + +const struct file_operations efivarfs_file_operations = { + .open = simple_open, + .read = efivarfs_file_read, + .write = efivarfs_file_write, + .llseek = 
no_llseek, +}; diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c new file mode 100644 index 000000000000..7e787fb90293 --- /dev/null +++ b/fs/efivarfs/inode.c @@ -0,0 +1,174 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/ctype.h> +#include <linux/slab.h> + +#include "internal.h" + +struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &efivarfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &efivarfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + break; + } + } + return inode; +} + +/* + * Return true if 'str' is a valid efivarfs filename of the form, + * + * VariableName-12345678-1234-1234-1234-1234567891bc + */ +bool efivarfs_valid_name(const char *str, int len) +{ + static const char dashes[EFI_VARIABLE_GUID_LEN] = { + [8] = 1, [13] = 1, [18] = 1, [23] = 1 + }; + const char *s = str + len - EFI_VARIABLE_GUID_LEN; + int i; + + /* + * We need a GUID, plus at least one letter for the variable name, + * plus the '-' separator + */ + if (len < EFI_VARIABLE_GUID_LEN + 2) + return false; + + /* GUID must be preceded by a '-' */ + if (*(s - 1) != '-') + return false; + + /* + * Validate that 's' is of the correct format, e.g. 
+ * + * 12345678-1234-1234-1234-123456789abc + */ + for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) { + if (dashes[i]) { + if (*s++ != '-') + return false; + } else { + if (!isxdigit(*s++)) + return false; + } + } + + return true; +} + +static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid) +{ + guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]); + guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]); + guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]); + guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]); + guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]); + guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]); + guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]); + guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]); + guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]); + guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]); + guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]); + guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]); + guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]); + guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]); + guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]); + guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]); +} + +static int efivarfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + struct inode *inode; + struct efivar_entry *var; + int namelen, i = 0, err = 0; + + if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len)) + return -EINVAL; + + inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOMEM; + + var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL); + if (!var) { + err = -ENOMEM; + goto out; + } + + /* length of the variable name itself: remove GUID and separator */ + namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; + + efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, + &var->var.VendorGuid); + + for (i = 0; i < namelen; i++) + var->var.VariableName[i] = dentry->d_name.name[i]; + + var->var.VariableName[i] = '\0'; + + inode->i_private = var; + + efivar_entry_add(var, &efivarfs_list); + d_instantiate(dentry, inode); + dget(dentry); +out: + if (err) { + kfree(var); + iput(inode); + } + return err; +} + +static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct efivar_entry *var = dentry->d_inode->i_private; + + if (efivar_entry_delete(var)) + return -EINVAL; + + drop_nlink(dentry->d_inode); + dput(dentry); + return 0; +}; + +/* + * Handle negative dentry. + */ +static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; +} + +const struct inode_operations efivarfs_dir_inode_operations = { + .lookup = efivarfs_lookup, + .unlink = efivarfs_unlink, + .create = efivarfs_create, +}; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h new file mode 100644 index 000000000000..b5ff16addb7c --- /dev/null +++ b/fs/efivarfs/internal.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
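
Given the file layout implemented in efivarfs_file_read()/write() above (a 4-byte attribute word followed by the raw variable data) and the Name-GUID naming validated here, reading a variable from userspace is a plain read. A sketch, assuming the conventional mount point; BootCurrent lives under the EFI global-variable GUID:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const char *path = "/sys/firmware/efi/efivars/"
                "BootCurrent-8be4df61-93ca-11d2-aa0d-00e098032b8c";
        uint32_t attrs;
        uint16_t cur;   /* BootCurrent's payload is a 16-bit boot index */
        FILE *f = fopen(path, "rb");

        if (!f)
                return 1;
        /* first 4 bytes are the EFI variable attributes, then the data */
        if (fread(&attrs, sizeof(attrs), 1, f) == 1 &&
            fread(&cur, sizeof(cur), 1, f) == 1)
                printf("attrs=0x%x BootCurrent=Boot%04X\n", attrs, cur);
        fclose(f);
        return 0;
}
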
+ */ +#ifndef EFIVAR_FS_INTERNAL_H +#define EFIVAR_FS_INTERNAL_H + +#include <linux/list.h> + +extern const struct file_operations efivarfs_file_operations; +extern const struct inode_operations efivarfs_dir_inode_operations; +extern bool efivarfs_valid_name(const char *str, int len); +extern struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev); + +extern struct list_head efivarfs_list; + +#endif /* EFIVAR_FS_INTERNAL_H */ diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c new file mode 100644 index 000000000000..141aee31884f --- /dev/null +++ b/fs/efivarfs/super.c @@ -0,0 +1,270 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ctype.h> +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/ucs2_string.h> +#include <linux/slab.h> +#include <linux/magic.h> + +#include "internal.h" + +LIST_HEAD(efivarfs_list); + +static void efivarfs_evict_inode(struct inode *inode) +{ + clear_inode(inode); +} + +static const struct super_operations efivarfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = efivarfs_evict_inode, + .show_options = generic_show_options, +}; + +static struct super_block *efivarfs_sb; + +/* + * Compare two efivarfs file names. + * + * An efivarfs filename is composed of two parts, + * + * 1. A case-sensitive variable name + * 2. A case-insensitive GUID + * + * So we need to perform a case-sensitive match on part 1 and a + * case-insensitive match on part 2. + */ +static int efivarfs_d_compare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, + const struct qstr *name) +{ + int guid = len - EFI_VARIABLE_GUID_LEN; + + if (name->len != len) + return 1; + + /* Case-sensitive compare for the variable name */ + if (memcmp(str, name->name, guid)) + return 1; + + /* Case-insensitive compare for the GUID */ + return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); +} + +static int efivarfs_d_hash(const struct dentry *dentry, + const struct inode *inode, struct qstr *qstr) +{ + unsigned long hash = init_name_hash(); + const unsigned char *s = qstr->name; + unsigned int len = qstr->len; + + if (!efivarfs_valid_name(s, len)) + return -EINVAL; + + while (len-- > EFI_VARIABLE_GUID_LEN) + hash = partial_name_hash(*s++, hash); + + /* GUID is case-insensitive. */ + while (len--) + hash = partial_name_hash(tolower(*s++), hash); + + qstr->hash = end_name_hash(hash); + return 0; +} + +/* + * Retaining negative dentries for an in-memory filesystem just wastes + * memory and lookup time: arrange for them to be deleted immediately. 
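
The split comparison above (case-sensitive variable name, case-insensitive trailing GUID) reduces to a two-part string test; a userspace sketch of the same rule:

#include <string.h>
#include <strings.h>
#include <stddef.h>

#define GUID_LEN 36     /* strlen("12345678-1234-1234-1234-123456789abc") */

static int efivarfs_names_match(const char *a, const char *b, size_t len)
{
        if (strlen(b) != len || len < GUID_LEN + 2)
                return 0;
        /* exact match on the name part, caseless match on the GUID */
        return memcmp(a, b, len - GUID_LEN) == 0 &&
               strncasecmp(a + len - GUID_LEN, b + len - GUID_LEN,
                           GUID_LEN) == 0;
}
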
+ */ +static int efivarfs_delete_dentry(const struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations efivarfs_d_ops = { + .d_compare = efivarfs_d_compare, + .d_hash = efivarfs_d_hash, + .d_delete = efivarfs_delete_dentry, +}; + +static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) +{ + struct dentry *d; + struct qstr q; + int err; + + q.name = name; + q.len = strlen(name); + + err = efivarfs_d_hash(NULL, NULL, &q); + if (err) + return ERR_PTR(err); + + d = d_alloc(parent, &q); + if (d) + return d; + + return ERR_PTR(-ENOMEM); +} + +static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, + unsigned long name_size, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct efivar_entry *entry; + struct inode *inode = NULL; + struct dentry *dentry, *root = sb->s_root; + unsigned long size = 0; + char *name; + int len, i; + int err = -ENOMEM; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return err; + + memcpy(entry->var.VariableName, name16, name_size); + memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); + + len = ucs2_strlen(entry->var.VariableName); + + /* name, plus '-', plus GUID, plus NUL*/ + name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL); + if (!name) + goto fail; + + for (i = 0; i < len; i++) + name[i] = entry->var.VariableName[i] & 0xFF; + + name[len] = '-'; + + efi_guid_unparse(&entry->var.VendorGuid, name + len + 1); + + name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; + + inode = efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0); + if (!inode) + goto fail_name; + + dentry = efivarfs_alloc_dentry(root, name); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto fail_inode; + } + + /* copied by the above to local storage in the dentry. 
*/ + kfree(name); + + efivar_entry_size(entry, &size); + efivar_entry_add(entry, &efivarfs_list); + + mutex_lock(&inode->i_mutex); + inode->i_private = entry; + i_size_write(inode, size + sizeof(entry->var.Attributes)); + mutex_unlock(&inode->i_mutex); + d_add(dentry, inode); + + return 0; + +fail_inode: + iput(inode); +fail_name: + kfree(name); +fail: + kfree(entry); + return err; +} + +static int efivarfs_destroy(struct efivar_entry *entry, void *data) +{ + efivar_entry_remove(entry); + kfree(entry); + return 0; +} + +static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode = NULL; + struct dentry *root; + int err; + + efivarfs_sb = sb; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = EFIVARFS_MAGIC; + sb->s_op = &efivarfs_ops; + sb->s_d_op = &efivarfs_d_ops; + sb->s_time_gran = 1; + + inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0); + if (!inode) + return -ENOMEM; + inode->i_op = &efivarfs_dir_inode_operations; + + root = d_make_root(inode); + sb->s_root = root; + if (!root) + return -ENOMEM; + + INIT_LIST_HEAD(&efivarfs_list); + + err = efivar_init(efivarfs_callback, (void *)sb, false, + true, &efivarfs_list); + if (err) + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); + + return err; +} + +static struct dentry *efivarfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, efivarfs_fill_super); +} + +static void efivarfs_kill_sb(struct super_block *sb) +{ + kill_litter_super(sb); + efivarfs_sb = NULL; + + /* Remove all entries and destroy */ + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); +} + +static struct file_system_type efivarfs_type = { + .name = "efivarfs", + .mount = efivarfs_mount, + .kill_sb = efivarfs_kill_sb, +}; + +static __init int efivarfs_init(void) +{ + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return 0; + + if (!efivars_kobject()) + return 0; + + return register_filesystem(&efivarfs_type); +} + +MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr"); +MODULE_DESCRIPTION("EFI Variable Filesystem"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS("efivarfs"); + +module_init(efivarfs_init); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9fec1836057a..deecc7294a67 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -40,6 +40,7 @@ #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/compat.h> /* * LOCKING: @@ -104,7 +105,7 @@ struct epoll_filefd { struct file *file; int fd; -}; +} __packed; /* * Structure used to track possible nested calls, for too deep recursions @@ -128,6 +129,8 @@ struct nested_calls { /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. + * Avoid increasing the size of this struct, there can be many thousands + * of these on a server and we do not want this to take another cache line. 
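
The size concern spelled out in that comment is enforced later in this patch by a BUILD_BUG_ON() in eventpoll_init(). The same guard in standalone C11, for illustration:

#include <assert.h>

struct item { char payload[120]; };     /* stand-in for struct epitem */

/* Fail the build on 64-bit (and smaller) targets if the structure
 * would spill past two 64-byte cache lines. */
static_assert(sizeof(void *) > 8 || sizeof(struct item) <= 128,
              "struct item must fit in 128 bytes");
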
*/ struct epitem { /* RB tree node used to link this structure to the eventpoll RB tree */ @@ -158,7 +161,7 @@ struct epitem { struct list_head fllink; /* wakeup_source used when EPOLLWAKEUP is set */ - struct wakeup_source *ws; + struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; @@ -536,6 +539,38 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) } } +/* call only when ep->mtx is held */ +static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) +{ + return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); +} + +/* call only when ep->mtx is held */ +static inline void ep_pm_stay_awake(struct epitem *epi) +{ + struct wakeup_source *ws = ep_wakeup_source(epi); + + if (ws) + __pm_stay_awake(ws); +} + +static inline bool ep_has_wakeup_source(struct epitem *epi) +{ + return rcu_access_pointer(epi->ws) ? true : false; +} + +/* call when ep->mtx cannot be held (ep_poll_callback) */ +static inline void ep_pm_stay_awake_rcu(struct epitem *epi) +{ + struct wakeup_source *ws; + + rcu_read_lock(); + ws = rcu_dereference(epi->ws); + if (ws) + __pm_stay_awake(ws); + rcu_read_unlock(); +} + /** * ep_scan_ready_list - Scans the ready list in a way that makes possible for * the scan code, to call f_op->poll(). Also allows for @@ -599,7 +634,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); } } /* @@ -668,7 +703,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); - wakeup_source_unregister(epi->ws); + wakeup_source_unregister(ep_wakeup_source(epi)); /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); @@ -711,11 +746,15 @@ static void ep_free(struct eventpoll *ep) * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit * us during this operation. So we can avoid the lock on "ep->lock". + * We do not need to lock ep->mtx, either, we only do it to prevent + * a lockdep warning. */ + mutex_lock(&ep->mtx); while ((rbp = rb_first(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); } + mutex_unlock(&ep->mtx); mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); @@ -734,6 +773,13 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } +static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +{ + pt->_key = epi->event.events; + + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; +} + static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) { @@ -741,10 +787,9 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, poll_table pt; init_poll_funcptr(&pt, NULL); + list_for_each_entry_safe(epi, tmp, head, rdllink) { - pt._key = epi->event.events; - if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & - epi->event.events) + if (ep_item_poll(epi, &pt)) return POLLIN | POLLRDNORM; else { /* @@ -752,7 +797,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. 
*/ - __pm_relax(epi->ws); + __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } @@ -984,7 +1029,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k /* If this file is already in the ready list we exit soon */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake_rcu(epi); } /* @@ -1146,6 +1191,7 @@ static int reverse_path_check(void) static int ep_create_wakeup_source(struct epitem *epi) { const char *name; + struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register("eventpoll"); @@ -1154,17 +1200,29 @@ static int ep_create_wakeup_source(struct epitem *epi) } name = epi->ffd.file->f_path.dentry->d_name.name; - epi->ws = wakeup_source_register(name); - if (!epi->ws) + ws = wakeup_source_register(name); + + if (!ws) return -ENOMEM; + rcu_assign_pointer(epi->ws, ws); return 0; } -static void ep_destroy_wakeup_source(struct epitem *epi) +/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ +static noinline void ep_destroy_wakeup_source(struct epitem *epi) { - wakeup_source_unregister(epi->ws); - epi->ws = NULL; + struct wakeup_source *ws = ep_wakeup_source(epi); + + RCU_INIT_POINTER(epi->ws, NULL); + + /* + * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is + * used internally by wakeup_source_remove, too (called by + * wakeup_source_unregister), so we cannot use call_rcu + */ + synchronize_rcu(); + wakeup_source_unregister(ws); } /* @@ -1199,13 +1257,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, if (error) goto error_create_wakeup_source; } else { - epi->ws = NULL; + RCU_INIT_POINTER(epi->ws, NULL); } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); - epq.pt._key = event->events; /* * Attach the item to the poll hooks and get current event bits. @@ -1214,7 +1271,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = tfile->f_op->poll(tfile, &epq.pt); + revents = ep_item_poll(epi, &epq.pt); /* * We have to check if something went wrong during the poll wait queue @@ -1247,7 +1304,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1288,7 +1345,7 @@ error_unregister: list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); - wakeup_source_unregister(epi->ws); + wakeup_source_unregister(ep_wakeup_source(epi)); error_create_wakeup_source: kmem_cache_free(epi_cache, epi); @@ -1314,12 +1371,11 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * f_op->poll() call and the new event set registering. 
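
ep_destroy_wakeup_source() above is an instance of the standard RCU removal sequence: unpublish the pointer, wait out all readers, then free. A userspace analogue using liburcu (the kernel patch uses the equivalent in-kernel primitives):

#include <urcu.h>       /* liburcu; each thread must rcu_register_thread() */
#include <stdlib.h>

struct ws { int active; };
static struct ws *gp_ws;        /* RCU-protected, like epi->ws */

static void reader(void)
{
        struct ws *w;

        rcu_read_lock();
        w = rcu_dereference(gp_ws);
        if (w)
                w->active = 1;  /* stands in for __pm_stay_awake() */
        rcu_read_unlock();
}

static void destroy(void)
{
        struct ws *w = gp_ws;

        rcu_assign_pointer(gp_ws, NULL);  /* 1. unpublish */
        synchronize_rcu();                /* 2. wait for readers */
        free(w);                          /* 3. now safe to free */
}

call_rcu() would avoid the blocking wait, but as the patch comment notes, wakeup_source_unregister() itself ends up in synchronize_rcu(), which cannot be done from a call_rcu() callback.
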
*/ epi->event.events = event->events; /* need barrier below */ - pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { - if (!epi->ws) + if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); - } else if (epi->ws) { + } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } @@ -1347,7 +1403,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt); + revents = ep_item_poll(epi, &pt); /* * If the item is "hot" and it is not registered inside the ready @@ -1357,7 +1413,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1383,6 +1439,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; + struct wakeup_source *ws; poll_table pt; init_poll_funcptr(&pt, NULL); @@ -1405,14 +1462,16 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ - if (epi->ws && epi->ws->active) - __pm_stay_awake(ep->ws); - __pm_relax(epi->ws); + ws = ep_wakeup_source(epi); + if (ws) { + if (ws->active) + __pm_stay_awake(ep->ws); + __pm_relax(ws); + } + list_del_init(&epi->rdllink); - pt._key = epi->event.events; - revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & - epi->event.events; + revents = ep_item_poll(epi, &pt); /* * If the event mask intersect the caller-requested one, @@ -1424,7 +1483,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); return eventcnt ? eventcnt : -EFAULT; } eventcnt++; @@ -1444,7 +1503,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); } } } @@ -1940,6 +1999,52 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, + struct epoll_event __user *, events, + int, maxevents, int, timeout, + const compat_sigset_t __user *, sigmask, + compat_size_t, sigsetsize) +{ + long err; + compat_sigset_t csigmask; + sigset_t ksigmask, sigsaved; + + /* + * If the caller wants a certain signal mask to be set during the wait, + * we apply it here. + */ + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + return -EFAULT; + sigset_from_compat(&ksigmask, &csigmask); + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + err = sys_epoll_wait(epfd, events, maxevents, timeout); + + /* + * If we changed the signal mask, we need to restore the original one. 
+ * In case we've got a signal while waiting, we do not restore the + * signal mask yet, and we allow do_signal() to deliver the signal on + * the way back to userspace, before the signal mask is restored. + */ + if (sigmask) { + if (err == -EINTR) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + } + + return err; +} +#endif + static int __init eventpoll_init(void) { struct sysinfo si; @@ -1964,6 +2069,12 @@ static int __init eventpoll_init(void) /* Initialize the structure used to perform file's f_op->poll() calls */ ep_nested_calls_init(&poll_readywalk_ncalls); + /* + * We can have many thousands of epitems, so prevent this from + * using an extra cache line on 64-bit (and smaller) CPUs + */ + BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); + /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); diff --git a/fs/exec.c b/fs/exec.c index a96a4885bbbf..643019585574 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -613,7 +613,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : 0); + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -622,7 +622,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : 0); + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb, new_end, old_end); @@ -802,6 +802,15 @@ int kernel_read(struct file *file, loff_t offset, EXPORT_SYMBOL(kernel_read); +ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) +{ + ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); + if (res > 0) + flush_icache_range(addr, addr + len); + return res; +} +EXPORT_SYMBOL(read_code); + static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; @@ -898,11 +907,13 @@ static int de_thread(struct task_struct *tsk) sig->notify_count = -1; /* for exit_notify() */ for (;;) { + threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); + threadgroup_change_end(tsk); schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; @@ -960,6 +971,7 @@ static int de_thread(struct task_struct *tsk) if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); + threadgroup_change_end(tsk); release_task(leader); } @@ -1027,17 +1039,7 @@ EXPORT_SYMBOL_GPL(get_task_comm); void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); - trace_task_rename(tsk, buf); - - /* - * Threads may access current->comm without holding - * the task lock, so write the string carefully. - * Readers without a lock may see incomplete new - * names but are safe from non-terminating string reads.
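
From userspace, epoll_pwait() (including the compat wrapper added above) behaves like epoll_wait() with an atomic, temporary signal-mask swap. Typical usage:

#include <sys/epoll.h>
#include <signal.h>

/* Block every blockable signal for the duration of the wait; only
 * SIGKILL and SIGSTOP, which cannot be blocked, get through. */
int wait_quietly(int epfd, struct epoll_event *ev, int maxev, int timeout)
{
        sigset_t mask;

        sigfillset(&mask);
        return epoll_pwait(epfd, ev, maxev, timeout, &mask);
}

On return the original mask is back in place; if a signal did arrive during the wait, the kernel arranges (via set_restore_sigmask() above) for it to be delivered before the mask is restored.
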
- */ - memset(tsk->comm, 0, TASK_COMM_LEN); - wmb(); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index fb5120a5505c..3dc48cc8b6eb 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2067,7 +2067,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - sb->s_flags |= MS_SNAP_STABLE; return 0; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 987358740cb9..efea5d5c44ce 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -71,4 +71,5 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. If you select Y here, then you will be able to turn on debugging - with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" + with a command such as: + echo 1 > /sys/module/ext4/parameters/mballoc_debug diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 92e68b33fffd..d0f13eada0ed 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, */ /* + * Calculate block group number for a given block number + */ +ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block) +{ + ext4_group_t group; + + if (test_opt2(sb, STD_GROUP_SIZE)) + group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + + block) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &group, NULL); + return group; +} + +/* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ @@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, } -static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, - ext4_group_t block_group) +/* + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so + * and 0 otherwise. + */ +static inline int ext4_block_in_group(struct super_block *sb, + ext4_fsblk_t block, + ext4_group_t block_group) { ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); - if (actual_group == block_group) - return 1; - return 0; + + actual_group = ext4_get_group_number(sb, block); + return (actual_group == block_group) ? 
1 : 0; } /* Return the number of clusters used for file system metadata; this @@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) trace_ext4_read_block_bitmap_load(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); return bh; verify: ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { - s64 free_clusters, dirty_clusters, root_clusters; + s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); + resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plain bit shift only. */ - root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + + resv_clusters; - if (free_clusters - (nclusters + root_clusters + dirty_clusters) < + if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); @@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ - if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) + if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + if (free_clusters >= (nclusters + dirty_clusters + + resv_clusters)) + return 1; + } + /* No free blocks.
Let's see if we can dip into reserved pool */ + if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d8cd1f0f4661..f8d56e4254e0 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode) if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_COMPAT_DIR_INDEX) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) + ((inode->i_size >> sb->s_blocksize_bits) == 1) || + ext4_has_inline_data(inode))) return 1; return 0; @@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp, int ret = 0; int dir_has_error = 0; - if (ext4_has_inline_data(inode)) { - int has_inline_data = 1; - ret = ext4_read_inline_dir(filp, dirent, filldir, - &has_inline_data); - if (has_inline_data) - return ret; - } - if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { @@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp, ext4_clear_inode_flag(file_inode(filp), EXT4_INODE_INDEX); } + + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + ret = ext4_read_inline_dir(filp, dirent, filldir, + &has_inline_data); + if (has_inline_data) + return ret; + } + stored = 0; offset = filp->f_pos & (sb->s_blocksize - 1); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3b83cd604796..0aabb344b02e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -196,19 +198,8 @@ struct mpage_da_data { #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_DIRECT 0x0004 -struct ext4_io_page { - struct page *p_page; - atomic_t p_count; -}; - -#define MAX_IO_PAGES 128 - /* * For converting uninitialized extents on a work queue. - * - * 'page' is only used from the writepage() path; 'pages' is only used for - * buffered writes; they are used to keep page references until conversion - * takes place. For AIO/DIO, neither field is filled in. */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ @@ -218,15 +209,13 @@ typedef struct ext4_io_end { ssize_t size; /* size of the extent */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ - int num_io_pages; /* for writepages() */ - struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ + atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { int io_op; struct bio *io_bio; ext4_io_end_t *io_end; - struct ext4_io_page *io_page; sector_t io_next_block; }; @@ -403,7 +392,7 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. 
*/ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ @@ -557,9 +546,8 @@ enum { #define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) - /* Caller is from the delayed allocation writeout path, - so set the magic i_delalloc_reserve_flag after taking the - inode allocation semaphore for */ + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request the creation of uninitialized extents if not allocated, split the uninitialized @@ -571,8 +559,9 @@ enum { /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Punch out blocks of an extent */ -#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (used for fallocate) */ @@ -616,6 +605,7 @@ enum { #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -949,7 +939,7 @@ struct ext4_inode_info { #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* - * Mount flags + * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ @@ -981,8 +971,16 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default.
+ */ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1179,6 +1177,7 @@ struct ext4_sb_info { unsigned int s_mount_flags; unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; kuid_t s_resuid; kgid_t s_resgid; unsigned short s_mount_state; @@ -1333,6 +1332,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) return ino == EXT4_ROOT_INO || ino == EXT4_USR_QUOTA_INO || ino == EXT4_GRP_QUOTA_INO || + ino == EXT4_BOOT_LOADER_INO || ino == EXT4_JOURNAL_INO || ino == EXT4_RESIZE_INO || (ino >= EXT4_FIRST_INO(sb) && @@ -1374,6 +1374,7 @@ enum { EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) */ #define ERR_BAD_DX_DIR -75000 -void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); - /* * Timeout and state flag for lazy initialization inode thread. */ @@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct buffer_head *bh); /* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + extern void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, unsigned int block_group, @@ -2108,8 +2113,9 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, unsigned long nr_segs); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); -extern void ext4_ind_truncate(struct inode *inode); -extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2117,6 +2123,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ extern int ext4_dirent_csum_verify(struct inode *inode, @@ -2511,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle, extern int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data); +extern int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, @@ -2547,6 +2559,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); +#define 
S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; @@ -2573,9 +2603,9 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(struct inode *); -extern int ext4_ext_punch_hole(struct file *file, loff_t offset, - loff_t length); +extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, @@ -2609,17 +2639,26 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2); +void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); /* page-io.c */ extern int __init ext4_init_pageio(void); -extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); extern void ext4_ioend_shutdown(struct inode *); -extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); extern void ext4_end_io_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 8643ff5bbeb7..51bc821ade90 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path); + #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 7058975e3a55..451eb4045330 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, { journal_t *journal; + might_sleep(); + 
trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); @@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, { int err = 0; + might_sleep(); + if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) @@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, { int err = 0; + might_sleep(); + + set_buffer_meta(bh); + set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 4c216b1bf20c..c8c6885406db 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -29,11 +29,13 @@ * block to complete the transaction. * * For extents-enabled fs we may have to allocate and modify up to - * 5 levels of tree + root which are stored in the inode. */ + * 5 levels of tree, data block (for each of these we need bitmap + group + * summaries), root which is stored in the inode, sb + */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - ? 27U : 8U) + ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle, * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister + * Return true if object was successfully removed */ -static inline void ext4_journal_callback_del(handle_t *handle, +static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) { + bool deleted; struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); spin_lock(&sbi->s_md_lock); + deleted = !list_empty(&jce->jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); + return deleted; } int diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 56efcaadf848..107936db244e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -static int __ext4_ext_dirty(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path) { int err; if (path->p_bh) { @@ -1813,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } depth = ext_depth(inode); ex = path[depth].p_ext; + eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EIO; } /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) - && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), - ext4_ext_pblock(ex)); - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - return err; + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO))
{ /* - * ext4_can_extents_be_merged should have checked that either - * both extents are uninitialized, or both aren't. Thus we - * need to check only one of them here. + * Try to see whether we should rather test the extent on + * right from ex, or from the left of ex. This is because + * ext4_ext_find_extent() can return either extent on the + * left, or on the right from the searched position. This + * will make merging more effective. */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; - ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + if (ex < EXT_LAST_EXTENT(eh) && + (le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex) < + le32_to_cpu(newext->ee_block))) { + ex += 1; + goto prepend; + } else if ((ex > EXT_FIRST_EXTENT(eh)) && + (le32_to_cpu(newext->ee_block) + + ext4_ext_get_actual_len(newext) < + le32_to_cpu(ex->ee_block))) + ex -= 1; + + /* Try to append newex to the ex */ + if (ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %u:[%d]%d" + "(from %llu)\n", + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked + * that either both extents are uninitialized, or + * both aren't. Thus we need to check only one of + * them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); - eh = path[depth].p_hdr; - nearex = ex; - goto merge; + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +prepend: + /* Try to prepend newex to the ex */ + if (ext4_can_extents_be_merged(inode, newext, ex)) { + ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + "(from %llu)\n", + le32_to_cpu(newext->ee_block), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked + * that either both extents are uninitialized, or + * both aren't. Thus we need to check only one of + * them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_block = newext->ee_block; + ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } } depth = ext_depth(inode); @@ -1880,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. 
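The append/prepend branches above turn on picking the right merge candidate: ext4_ext_find_extent() may return the extent on either side of the insertion point, so the code probes the right neighbor when the found extent ends before the new one starts, and the left neighbor in the mirror case. A minimal userspace sketch of that selection, with simplified extent records (the struct and function names here are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

struct extent { uint32_t lblk; uint64_t pblk; uint16_t len; };

/* Pick which neighbor of the extent the lookup returned is worth
 * testing for a merge: probe right if 'found' ends before the new
 * extent begins, left if the new extent ends before 'found' begins. */
static struct extent *merge_candidate(struct extent *found,
                                      struct extent *first,
                                      struct extent *last,
                                      const struct extent *newext)
{
    if (found < last && found->lblk + found->len < newext->lblk)
        return found + 1;
    if (found > first && newext->lblk + newext->len < found->lblk)
        return found - 1;
    return found;
}

int main(void)
{
    struct extent leaf[] = { { 0, 500, 10 }, { 100, 900, 10 } };
    struct extent newext = { 99, 899, 1 };   /* abuts leaf[1] on the left */
    struct extent *c = merge_candidate(&leaf[0], &leaf[0], &leaf[1], &newext);

    printf("candidate starts at lblk %u\n", (unsigned)c->lblk);   /* 100 */
    return 0;
}

Probing the neighbor first is what makes the subsequent append and prepend attempts effective no matter which side the lookup landed on.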
*/ - if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) - flags = EXT4_MB_USE_ROOT_BLOCKS; + if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) + flags = EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); if (err) goto cleanup; @@ -2599,8 +2658,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2667,12 +2726,14 @@ again: /* * Split the extent in two so that 'end' is the last - * block in the first new extent + * block in the first new extent. Also we should not + * fail removing space due to ENOSPC so try to use + * reserved block if that happens. */ err = ext4_split_extent_at(handle, inode, path, - end + 1, split_flag, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + end + 1, split_flag, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); if (err < 0) goto out; @@ -2999,20 +3060,23 @@ static int ext4_split_extent_at(handle_t *handle, if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); zero_ex.ee_block = ex2->ee_block; - zero_ex.ee_len = ext4_ext_get_actual_len(ex2); + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex2)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex2)); } else { err = ext4_ext_zeroout(inode, ex); zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = ext4_ext_get_actual_len(ex); + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); } } else { err = ext4_ext_zeroout(inode, &orig_ex); zero_ex.ee_block = orig_ex.ee_block; - zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(&orig_ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(&orig_ex)); } @@ -3144,35 +3208,35 @@ out: static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path *path, + int flags) { struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; - struct ext4_extent *ex; + struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; - unsigned int ee_len, depth; - int allocated, max_zeroout = 0; + unsigned int ee_len, depth, map_len = map->m_len; + int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = 0; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; - if (eof_block < map->m_lblk + map->m_len) - eof_block = map->m_lblk + map->m_len; + if (eof_block < map->m_lblk + map_len) + eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (map->m_lblk - ee_block); zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3183,77 +3247,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its left neighbor. 
This is much cheaper + * uninitialized extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly - * memmove() calls. This is the common case in steady state for - * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append - * writes. + * memmove() calls. Transferring to the left is the common case in + * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) + * followed by append writes. * * Limitations of the current logic: - * - L1: we only deal with writes at the start of the extent. - * The approach could be extended to writes at the end - * of the extent but this scenario was deemed less common. - * - L2: we do not deal with writes covering the whole extent. + * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. - * - L3: we only attempt to merge with an extent stored in the + * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ - if ((map->m_lblk == ee_block) && /*L1*/ - (map->m_len < ee_len) && /*L2*/ - (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ - struct ext4_extent *prev_ex; + if ((map->m_lblk == ee_block) && + /* See if we can merge left */ + (map_len < ee_len) && /*L1*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; - unsigned int prev_len, write_len; + unsigned int prev_len; - prev_ex = ex - 1; - prev_lblk = le32_to_cpu(prev_ex->ee_block); - prev_len = ext4_ext_get_actual_len(prev_ex); - prev_pblk = ext4_ext_pblock(prev_ex); + abut_ex = ex - 1; + prev_lblk = le32_to_cpu(abut_ex->ee_block); + prev_len = ext4_ext_get_actual_len(abut_ex); + prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); - write_len = map->m_len; /* - * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: - * - C1: prev_ex is initialized, - * - C2: prev_ex is logically abutting ex, - * - C3: prev_ex is physically abutting ex, - * - C4: prev_ex can receive the additional blocks without + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. 
*/ - if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ - (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, - map, ex, prev_ex); + map, ex, abut_ex); - /* Shift the start of ex by 'write_len' blocks */ - ex->ee_block = cpu_to_le32(ee_block + write_len); - ext4_ext_store_pblock(ex, ee_pblk + write_len); - ex->ee_len = cpu_to_le16(ee_len - write_len); + /* Shift the start of ex by 'map_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + map_len); + ext4_ext_store_pblock(ex, ee_pblk + map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_uninitialized(ex); /* Restore the flag */ - /* Extend prev_ex by 'write_len' blocks */ - prev_ex->ee_len = cpu_to_le16(prev_len + write_len); + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(prev_len + map_len); - /* Mark the block containing both extents as dirty */ - ext4_ext_dirty(handle, inode, path + depth); + /* Result: number of initialized blocks past m_lblk */ + allocated = map_len; + } + } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && + (map_len < ee_len) && /*L1*/ + ex < EXT_LAST_EXTENT(eh)) { /*L2*/ + /* See if we can merge right */ + ext4_lblk_t next_lblk; + ext4_fsblk_t next_pblk, ee_pblk; + unsigned int next_len; + + abut_ex = ex + 1; + next_lblk = le32_to_cpu(abut_ex->ee_block); + next_len = ext4_ext_get_actual_len(abut_ex); + next_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); - /* Update path to point to the right extent */ - path[depth].p_ext = prev_ex; + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. 
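The C1-C4 checklist is the same for both merge directions; only which extent plays abut_ex changes. A hedged restatement as a self-contained predicate (simplified types; 32768 stands in for the kernel's EXT_INIT_MAX_LEN cap on initialized extents):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define INIT_MAX_LEN 32768   /* illustrative: max initialized extent length */

struct xext { uint32_t lblk; uint64_t pblk; uint16_t len; bool uninit; };

/* Can 'len' blocks be shifted from the start of 'ex' onto the tail of
 * its left neighbor 'abut'? Mirrors conditions C1..C4 above. */
static bool can_transfer_left(const struct xext *abut,
                              const struct xext *ex, unsigned len)
{
    return !abut->uninit &&                        /* C1 */
           abut->lblk + abut->len == ex->lblk &&   /* C2 */
           abut->pblk + abut->len == ex->pblk &&   /* C3 */
           abut->len < INIT_MAX_LEN - len;         /* C4 */
}

int main(void)
{
    struct xext prev = { 0, 500, 10, false };
    struct xext ex   = { 10, 510, 20, true };

    printf("%d\n", can_transfer_left(&prev, &ex, 4));   /* prints 1 */
    return 0;
}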
+ */ + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + ((map->m_lblk + map_len) == next_lblk) && /*C2*/ + ((ee_pblk + ee_len) == next_pblk) && /*C3*/ + (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); + + /* Shift the start of abut_ex by 'map_len' blocks */ + abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); + ext4_ext_store_pblock(abut_ex, next_pblk - map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = write_len; - goto out; + allocated = map_len; } } + if (allocated) { + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = abut_ex; + goto out; + } else + allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* @@ -3272,7 +3380,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (err) goto out; zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = ext4_ext_get_actual_len(ex); + zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); @@ -3327,7 +3435,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); + &split_map, split_flag, flags); if (allocated < 0) err = allocated; @@ -3647,6 +3755,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); + /* + * When writing into uninitialized space, we should not fail to + * allocate metadata blocks for the new extent block if needed. + */ + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; + trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, allocated, newblock); @@ -3710,7 +3824,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path); + ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -4254,48 +4368,13 @@ out3: return err ? 
err : allocated; } -void ext4_ext_truncate(struct inode *inode) +void ext4_ext_truncate(handle_t *handle, struct inode *inode) { - struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; - handle_t *handle; - loff_t page_len; int err = 0; /* - * finish any pending end_io work so we won't run the risk of - * converting any truncated blocks to initialized later - */ - ext4_flush_unwritten_io(inode); - - /* - * probably first extent we're gonna free will be last in block - */ - err = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err); - if (IS_ERR(handle)) - return; - - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - down_write(&EXT4_I(inode)->i_data_sem); - - ext4_discard_preallocations(inode); - - /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. @@ -4310,29 +4389,6 @@ void ext4_ext_truncate(struct inode *inode) err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - - /* In a multi-transaction truncate, we only make the final - * transaction synchronous. - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out_stop: - /* - * If this was a simple ftruncate() and the file will remain alive, - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); } static void ext4_falloc_update_inode(struct inode *inode, @@ -4620,187 +4676,6 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } -/* - * ext4_ext_punch_hole - * - * Punches a hole of "length" bytes in a file starting - * at byte "offset" - * - * @inode: The inode of the file to punch a hole in - * @offset: The starting byte offset of the hole - * @length: The length of the hole - * - * Returns the number of blocks removed or negative on err - */ -int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - handle_t *handle; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int credits, err = 0; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. 
- */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - - if (err) - return err; - } - - mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - err = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - err = -ETXTBSY; - goto out_mutex; - } - - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; - - /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; - } - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); - } - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - err = ext4_flush_unwritten_io(inode); - if (err) - goto out_dio; - inode_dio_wait(inode); - - credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto out_dio; - } - - - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zeroed. 
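The page arithmetic in the code being removed here (and re-created in the shared punch-hole path) rounds the hole inward: only pages that lie fully inside [offset, offset + length) may be dropped from the page cache, while the partial head and tail ranges must be zeroed in place. A standalone illustration of the rounding, assuming 4 KiB pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

int main(void)
{
    uint64_t offset = 5000, length = 20000;   /* example hole */

    /* First whole page at or above offset; last page boundary at or
     * below offset + length. Everything between can be truncated. */
    uint64_t first_page = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
    uint64_t last_page  = (offset + length) >> PAGE_SHIFT;

    printf("truncate page cache [%llu, %llu)\n",
           (unsigned long long)(first_page << PAGE_SHIFT),
           (unsigned long long)(last_page << PAGE_SHIFT));
    /* Head [5000, 8192) and tail [24576, 25000) are zeroed instead. */
    return 0;
}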
- */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - - if (err) - goto out; - } else { - /* - * zero out and unmap the partial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (err) - goto out; - } - - /* - * zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (err) - goto out; - } - } - - /* - * If i_size is contained in the last page, we need to - * unmap and zero the partial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out; - } - } - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - /* If there are no blocks to remove, return now */ - if (first_block >= stop_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - - ext4_discard_preallocations(inode); - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); -out_dio: - ext4_inode_resume_unlocked_dio(inode); -out_mutex: - mutex_unlock(&inode->i_mutex); - return err; -} - int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 3278e64e57b6..e0ba8a408def 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) needs_barrier = true; - jbd2_log_start_commit(journal, commit_tid); - ret = jbd2_log_wait_commit(journal, commit_tid); + ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 6c5bb8d993fe..00a818d67b54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) trace_ext4_load_inode_bitmap(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { put_bh(bh); @@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + /* + * Initialize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + *
transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + if (!goal) goal = sbi->s_inode_goal; @@ -697,7 +714,7 @@ got_group: gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) - goto fail; + goto out; /* * Check free inodes count before loading bitmap. @@ -711,7 +728,7 @@ got_group: brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (!inode_bitmap_bh) - goto fail; + goto out; repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) @@ -733,13 +750,16 @@ repeat_in_this_group: handle_type, nblocks); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto fail; + ext4_std_error(sb, err); + goto out; } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); ext4_unlock_group(sb, group); @@ -755,8 +775,10 @@ repeat_in_this_group: got: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && @@ -768,7 +790,8 @@ got: err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { brelse(block_bitmap_bh); - goto fail; + ext4_std_error(sb, err); + goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); @@ -787,14 +810,18 @@ got: ext4_unlock_group(sb, group); brelse(block_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } } BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { @@ -840,8 +867,10 @@ got: BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) @@ -851,16 +880,6 @@ got: flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - if (owner) { - inode->i_mode = mode; - i_uid_write(inode, owner[0]); - i_gid_write(inode, owner[1]); - } else if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ @@ -889,7 +908,9 @@ got: * twice. 
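The reordering in __ext4_new_inode() above moves ownership setup and dquot_initialize() in front of ext4_journal_start(), so the quota-initialization worst case no longer has to be charged to the handle's credit count. The shape of that change, as a hedged stub model (the functions below are stand-ins, not ext4's):

#include <stdio.h>

static void init_owner(void)    { puts("set i_mode/i_uid/i_gid"); }
static void init_quota(void)    { puts("dquot_initialize: may block and do I/O"); }
static void journal_start(void) { puts("start handle: credits cover bitmaps only"); }
static void claim_inode(void)   { puts("scan bitmap, set bit, dirty metadata"); }
static void journal_stop(void)  { puts("stop handle"); }

int main(void)
{
    /* Anything that needs no journal credits runs before the handle
     * exists, keeping the transaction's worst case small. */
    init_owner();
    init_quota();
    journal_start();
    claim_inode();
    journal_stop();
    return 0;
}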
*/ err = -EIO; - goto fail; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + goto out; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -899,7 +920,6 @@ got: if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { __u32 csum; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, @@ -918,7 +938,6 @@ got: ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; - dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; @@ -952,24 +971,17 @@ got: ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext4_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); - fail_drop: - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; clear_nlink(inode); unlock_new_inode(inode); +out: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b505a145a593..98be6f697463 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -292,131 +292,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, } /** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. 
That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - WARN(1, KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** * ext4_alloc_branch - allocate and set up a chain of blocks. * @handle: handle for this transaction * @inode: owner @@ -448,60 +323,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, int *blks, ext4_fsblk_t goal, ext4_lblk_t *offsets, Indirect *branch) { - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; + struct ext4_allocation_request ar; + struct buffer_head * bh; + ext4_fsblk_t b, new_blocks[4]; + __le32 *p; + int i, j, err, len = 1; - branch[0].key = cpu_to_le32(new_blocks[0]); /* - * metadata blocks and data blocks are allocated. + * Set up for the direct block allocation */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. 
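For contrast with the two-pass helper removed above, the replacement ext4_alloc_branch() just below allocates one metadata block per tree level and then the data run in a single loop, wiring each parent block to what the next iteration allocated. A simplified userspace model of the wiring (illustrative sizes, no journaling):

#include <stdint.h>
#include <stdio.h>

#define LEVELS 3   /* indirect blocks in the branch */
#define PTRS   4   /* data blocks in the final run  */

int main(void)
{
    uint64_t next_blk = 1000;               /* stand-in allocator cursor */
    uint32_t ptrs[LEVELS][PTRS] = { 0 };    /* block pointer tables */

    for (int i = 0; i <= LEVELS; i++) {
        int run = (i == LEVELS) ? PTRS : 1; /* data run comes last */
        uint64_t b = next_blk;
        next_blk += run;

        /* Level i-1's table now points at what this pass allocated. */
        if (i > 0)
            for (int j = 0; j < run; j++)
                ptrs[i - 1][j] = (uint32_t)(b + j);
    }

    for (int j = 0; j < PTRS; j++)
        printf("leaf[%d] -> block %u\n", j, (unsigned)ptrs[LEVELS - 1][j]);
    return 0;
}

On failure the real code walks the same array backwards, forgetting and freeing whatever was already allocated, which is why one loop index can double as the unwind cursor.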
- */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.len = *blks; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + + for (i = 0; i <= indirect_blks; i++) { + if (i == indirect_blks) { + ar.goal = goal; + new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); + } else + goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, + goal, 0, NULL, &err); + if (err) { + i--; + goto failed; + } + branch[i].key = cpu_to_le32(new_blocks[i]); + if (i == 0) + continue; + + bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } - - branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, bh); if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ unlock_buffer(bh); goto failed; } - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } + memset(bh->b_data, 0, bh->b_size); + p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; + b = new_blocks[i]; + + if (i == indirect_blks) + len = ar.len; + for (j = 0; j < len; j++) + *p++ = cpu_to_le32(b++); + BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -511,25 +385,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, if (err) goto failed; } - *blks = num; - return err; + *blks = ar.len; + return 0; failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); + for (; i >= 0; i--) { + if (i != indirect_blks && branch[i].bh) + ext4_forget(handle, 1, inode, branch[i].bh, + branch[i].bh->b_blocknr); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar.len : 1, 0); } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - return err; } @@ -941,26 +806,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If + * Try to extend this transaction for the purposes of truncation. If * extend fails, we need to propagate the failure up and restart the * transaction in the top-level truncate loop. 
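The surviving comment above still describes what drives indirect truncate: before each step that dirties metadata, try to stretch the running handle, and restart it at a consistent point when that fails. Schematically (stand-in helpers, not the jbd2 API):

#include <stdbool.h>
#include <stdio.h>

static bool try_extend(int *credits) { *credits += 8; return *credits < 64; }
static void restart(int *credits)    { *credits = 8; puts("commit + restart handle"); }

int main(void)
{
    int credits = 8;

    for (int step = 0; step < 20; step++) {
        /* Make room before dirtying more metadata; a restart is only
         * legal because each step leaves the tree consistent. */
        if (!try_extend(&credits))
            restart(&credits);
        /* ... free one chunk of indirect blocks ... */
    }
    return 0;
}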
--sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. * * Returns 0 if we managed to create more room. If we can't create more * room, and the transaction must be restarted we return 1. @@ -1353,68 +1201,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } } -void ext4_ind_truncate(struct inode *inode) +void ext4_ind_truncate(handle_t *handle, struct inode *inode) { - handle_t *handle; struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; - loff_t page_len; unsigned blocksize = inode->i_sb->s_blocksize; - int err; - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) - goto out_stop; /* error */ + return; } - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* @@ -1431,7 +1241,7 @@ void ext4_ind_truncate(struct inode *inode) * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ - goto out_unlock; + return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); @@ -1491,31 +1301,6 @@ do_indirects: case EXT4_TIND_BLOCK: ; } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. 
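The orphan-list handling deleted here does not go away; it moves up into the common caller that now owns the handle. The crash-safety shape is worth spelling out as a hedged stub model:

#include <stdio.h>
#include <stdbool.h>

static void orphan_add(void) { puts("link inode onto on-disk orphan list"); }
static void orphan_del(void) { puts("unlink inode from orphan list"); }

/* The inode stays on the orphan list for the whole multi-transaction
 * truncate, so journal recovery can finish the job after a crash. */
static void truncate_model(bool still_linked)
{
    orphan_add();
    puts("free blocks, possibly across several transactions");
    if (still_linked)   /* plain ftruncate(): the inode lives on */
        orphan_del();   /* real unlink leaves cleanup to eviction */
}

int main(void)
{
    truncate_model(true);
    return 0;
}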
- */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - ext4_journal_stop(handle); - trace_ext4_truncate_exit(inode); } static int free_hole_blocks(handle_t *handle, struct inode *inode, @@ -1539,9 +1324,9 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, blk = *i_data; if (level > 0) { ext4_lblk_t first2; - bh = sb_bread(inode->i_sb, blk); + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, blk, + EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), "Read failure"); return -EIO; } @@ -1569,8 +1354,8 @@ err: return ret; } -static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop) +int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop) { int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); int level, ret = 0; @@ -1604,157 +1389,3 @@ err: return ret; } -int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - handle_t *handle = NULL; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int err = 0; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - if (err) - return err; - } - - mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - err = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - err = -ETXTBSY; - goto out_mutex; - } - - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; - - /* - * If the hole extents beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; - } - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); - } - - /* Wait all existing dio works, newcomers will block on i_mutex */ - inode_dio_wait(inode); - - handle = start_transaction(inode); - if (IS_ERR(handle)) - goto out_mutex; - - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zerod. 
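The sb_bread() change a few hunks above is an endianness fix: i_data[] entries are on-disk __le32 values, so using one as a block number without le32_to_cpu() reads the wrong block on big-endian CPUs. A minimal demonstration with a userspace stand-in for the kernel helper:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's le32_to_cpu(). */
static uint32_t my_le32_to_cpu(uint32_t v)
{
    const uint8_t *p = (const uint8_t *)&v;
    return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
           (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

int main(void)
{
    uint8_t on_disk[4] = { 0x39, 0x30, 0x00, 0x00 };   /* 12345, little endian */
    uint32_t raw;

    memcpy(&raw, on_disk, sizeof(raw));
    /* 'raw' equals 12345 only on little-endian hosts; the helper makes
     * the result 12345 everywhere. */
    printf("block %u\n", (unsigned)my_le32_to_cpu(raw));
    return 0;
}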
- */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - if (err) - goto out; - } else { - /* - * Zero out and unmap the paritial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (err) - goto out; - } - - /* - * Zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (err) - goto out; - } - } - - /* - * If i_size contained in the last page, we need to - * unmap and zero the paritial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - if (err) - goto out; - } - } - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - if (first_block >= stop_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - err = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - - ext4_discard_preallocations(inode); - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - -out_mutex: - mutex_unlock(&inode->i_mutex); - - return err; -} diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c0fd1a123f7d..3e2bf873e8a8 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -19,7 +19,8 @@ #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) -#define EXT4_INLINE_DOTDOT_SIZE 4 +#define EXT4_INLINE_DOTDOT_OFFSET 2 +#define EXT4_INLINE_DOTDOT_SIZE 4 int ext4_get_inline_size(struct inode *inode) { @@ -1289,6 +1290,120 @@ out: return ret; } +/* + * This function fills a red-black tree with information from an + * inlined dir. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. 
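Since an inline directory stores no '.' entry at all and only the raw inode number of '..' (four bytes at offset 2), the function documented above must synthesize complete directory entries for both before it can hash them. A sketch of that synthesis with simplified structures (the field layout is illustrative, not the on-disk format):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct fake_dirent {
    uint32_t inode;
    uint16_t rec_len;
    uint8_t  name_len, file_type;
    char     name[4];
};

/* Build the "." / ".." entries an inline dir never stores on disk. */
static void make_dot(struct fake_dirent *de, uint32_t ino, const char *name)
{
    memset(de, 0, sizeof(*de));
    de->inode = ino;
    de->name_len = (uint8_t)strlen(name);
    memcpy(de->name, name, de->name_len);
    de->file_type = 2;   /* directory marker, illustrative value */
}

int main(void)
{
    struct fake_dirent dot, dotdot;

    make_dot(&dot, 12, ".");      /* the dir's own inode number */
    make_dot(&dotdot, 2, "..");   /* parent ino read from offset 2 */
    printf("%.*s -> %u\n", (int)dot.name_len, dot.name, (unsigned)dot.inode);
    printf("%.*s -> %u\n", (int)dotdot.name_len, dotdot.name,
           (unsigned)dotdot.inode);
    return 0;
}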
+ */ +int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data) +{ + int err = 0, count = 0; + unsigned int parent_ino; + int pos; + struct ext4_dir_entry_2 *de; + struct inode *inode = file_inode(dir_file); + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + struct ext4_dir_entry_2 fake; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + pos = 0; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + while (pos < inline_size) { + /* + * As inlined dir doesn't store any information about '.' and + * only the inode number of '..' is stored, we have to handle + * them differently. + */ + if (pos == 0) { + fake.inode = cpu_to_le32(inode->i_ino); + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_OFFSET; + } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { + fake.inode = cpu_to_le32(parent_ino); + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_SIZE; + } else { + de = (struct ext4_dir_entry_2 *)(dir_buf + pos); + pos += ext4_rec_len_from_disk(de->rec_len, inline_size); + if (ext4_check_dir_entry(inode, dir_file, de, + iloc.bh, dir_buf, + inline_size, pos)) { + ret = count; + goto out; + } + } + + ext4fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de); + if (err) { + count = err; + goto out; + } + count++; + } + ret = count; +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + +/* + * So this function is called when the volume is mkfsed with + * dir_index disabled. In order to keep f_pos persistent + * after we convert from an inlined dir to a blocked based, + * we just pretend that we are a normal dir and return the + * offset as if '.' and '..' really take place. + * + */ int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data) @@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp, int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; + int dotdot_offset, dotdot_size, extra_offset, extra_size; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp, sb = inode->i_sb; stored = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + offset = filp->f_pos; - while (!error && !stored && filp->f_pos < inode->i_size) { + /* + * dotdot_offset and dotdot_size is the real offset and + * size for ".." and "." 
if the dir is block based while + * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. + * So we will use extra_offset and extra_size to indicate them + * during the inline dir iteration. + */ + dotdot_offset = EXT4_DIR_REC_LEN(1); + dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); + extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; + extra_size = extra_offset + inline_size; + + while (!error && !stored && filp->f_pos < extra_size) { revalidate: /* * If the version has changed since the last call to @@ -1340,15 +1469,23 @@ revalidate: * dir to make sure. */ if (filp->f_version != inode->i_version) { - for (i = 0; - i < inode->i_size && i < offset;) { + for (i = 0; i < extra_size && i < offset;) { + /* + * "." is with offset 0 and + * ".." is dotdot_offset. + */ if (!i) { - /* skip "." and ".." if needed. */ - i += EXT4_INLINE_DOTDOT_SIZE; + i = dotdot_offset; + continue; + } else if (i == dotdot_offset) { + i = dotdot_size; continue; } + /* for other entry, the real offset in + * the buf has to be tuned accordingly. + */ de = (struct ext4_dir_entry_2 *) - (dir_buf + i); + (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at @@ -1356,43 +1493,47 @@ revalidate: * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, - inline_size) < EXT4_DIR_REC_LEN(1)) + extra_size) < EXT4_DIR_REC_LEN(1)) break; i += ext4_rec_len_from_disk(de->rec_len, - inline_size); + extra_size); } offset = i; filp->f_pos = offset; filp->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size) { + while (!error && filp->f_pos < extra_size) { if (filp->f_pos == 0) { error = filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR); if (error) break; stored++; + filp->f_pos = dotdot_offset; + continue; + } - error = filldir(dirent, "..", 2, 0, parent_ino, - DT_DIR); + if (filp->f_pos == dotdot_offset) { + error = filldir(dirent, "..", 2, + dotdot_offset, + parent_ino, DT_DIR); if (error) break; stored++; - filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; + filp->f_pos = dotdot_size; continue; } - de = (struct ext4_dir_entry_2 *)(dir_buf + offset); + de = (struct ext4_dir_entry_2 *) + (dir_buf + filp->f_pos - extra_offset); if (ext4_check_dir_entry(inode, filp, de, iloc.bh, dir_buf, - inline_size, offset)) { + extra_size, filp->f_pos)) { ret = stored; goto out; } - offset += ext4_rec_len_from_disk(de->rec_len, - inline_size); if (le32_to_cpu(de->inode)) { /* We might block in the next section * if the data destination is @@ -1415,9 +1556,8 @@ revalidate: stored++; } filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - inline_size); + extra_size); } - offset = 0; } out: kfree(dir_buf); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3a5213bc73e..793d44b84d7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -55,21 +55,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, __u16 csum_hi = 0; __u32 csum; - csum_lo = raw->i_checksum_lo; + csum_lo = le16_to_cpu(raw->i_checksum_lo); raw->i_checksum_lo = 0; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum_hi = raw->i_checksum_hi; + csum_hi = le16_to_cpu(raw->i_checksum_hi); raw->i_checksum_hi = 0; } csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, EXT4_INODE_SIZE(inode->i_sb)); - raw->i_checksum_lo = csum_lo; + raw->i_checksum_lo = cpu_to_le16(csum_lo); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 
EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) - raw->i_checksum_hi = csum_hi; + raw->i_checksum_hi = cpu_to_le16(csum_hi); return csum; } @@ -210,8 +210,7 @@ void ext4_evict_inode(struct inode *inode) journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - jbd2_log_start_commit(journal, commit_tid); - jbd2_log_wait_commit(journal, commit_tid); + jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); @@ -1081,20 +1080,42 @@ retry_journal: /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { + int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - return ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + clear_buffer_meta(bh); + clear_buffer_prio(bh); + return ret; } -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext4_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - int i_size_changed = 0; - struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int i_size_changed = 0; + + trace_ext4_write_end(inode, pos, len, copied); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + unlock_page(page); + page_cache_release(page); + goto errout; + } + } if (ext4_has_inline_data(inode)) copied = ext4_write_inline_data_end(inode, pos, len, @@ -1105,7 +1126,7 @@ static int ext4_generic_write_end(struct file *file, /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. + * cannot change under us because we hold i_mutex. * * But it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1115,10 +1136,10 @@ static int ext4_generic_write_end(struct file *file, i_size_changed = 1; } - if (pos + copied > EXT4_I(inode)->i_disksize) { + if (pos + copied > EXT4_I(inode)->i_disksize) { /* We need to mark inode dirty even if * new_i_size is less than inode->i_size - * bu greater than i_disksize.(hint delalloc) + * but greater than i_disksize. (hint delalloc) */ ext4_update_i_disksize(inode, (pos + copied)); i_size_changed = 1; @@ -1135,87 +1156,15 @@ static int ext4_generic_write_end(struct file *file, if (i_size_changed) ext4_mark_inode_dirty(handle, inode); - return copied; -} - -/* - * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). - * - * ext4 never places buffers on inode->i_mapping->private_list. metadata - * buffers are managed internally.
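The ext4_inode_csum() hunk at the top of this file's changes fixes sparse-reported endianness bugs: the saved checksum halves are now held as CPU-order values and converted back with cpu_to_le16() on restore. The underlying save/zero/checksum/restore pattern, in a hedged userspace model (a toy XOR stands in for the kernel's crc32c):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct raw_inode { uint32_t size; uint16_t checksum_lo; uint16_t mode; };

static uint16_t toy_csum(const void *buf, size_t len)
{
    const uint8_t *p = buf;
    uint16_t c = 0;

    while (len--)
        c ^= *p++;          /* stand-in for crc32c */
    return c;
}

static uint16_t inode_csum(struct raw_inode *raw)
{
    /* Zero the stored checksum so it doesn't checksum itself, then
     * restore the saved value afterwards. */
    uint16_t saved = raw->checksum_lo;
    uint16_t c;

    raw->checksum_lo = 0;
    c = toy_csum(raw, sizeof(*raw));
    raw->checksum_lo = saved;
    return c;
}

int main(void)
{
    struct raw_inode r = { 4096, 0, 0644 };

    r.checksum_lo = inode_csum(&r);
    printf("csum=%u verifies=%d\n", (unsigned)r.checksum_lo,
           inode_csum(&r) == r.checksum_lo);   /* verifies=1 */
    return 0;
}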
- */ -static int ext4_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_ordered_write_end(inode, pos, len, copied); - ret = ext4_jbd2_file_inode(handle, inode); - - if (ret == 0) { - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (pos + len > inode->i_size && ext4_can_truncate(inode)) - /* if we have allocated more blocks and copied - * less. We will have blocks allocated outside - * inode->i_size. So truncate them - */ - ext4_orphan_add(handle, inode); - if (ret2 < 0) - ret = ret2; - } else { - unlock_page(page); - page_cache_release(page); - } - - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - - if (pos + len > inode->i_size) { - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might still be - * on the orphan list; we need to make sure the inode - * is removed from the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - - return ret ? ret : copied; -} - -static int ext4_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_writeback_write_end(inode, pos, len, copied); - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; + if (copied < 0) + ret = copied; if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); - - if (ret2 < 0) - ret = ret2; - +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1538,7 +1487,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, struct ext4_io_submit io_submit; BUG_ON(mpd->next_page <= mpd->first_page); - memset(&io_submit, 0, sizeof(io_submit)); + ext4_io_submit_init(&io_submit, mpd->wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) + return -ENOMEM; /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. 
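With the ordered and writeback variants removed above, the single ext4_write_end() earlier in this hunk serves both journaling modes; the only ordered-specific step left is filing the inode onto the journal's ordered list when EXT4_STATE_ORDERED_MODE is set. The resulting control flow, as stubs:

#include <stdbool.h>
#include <stdio.h>

struct inode_model { bool ordered_mode; };

static int file_inode_to_journal(void)
{
    puts("add inode to the transaction's ordered list");
    return 0;
}

/* One write_end for both modes: the ordered case does one extra step
 * before the shared i_size/dirty bookkeeping. */
static long write_end_model(struct inode_model *inode, long pos, long copied)
{
    if (inode->ordered_mode) {
        int ret = file_inode_to_journal();
        if (ret)
            return ret;
    }
    printf("update i_size to %ld\n", pos + copied);
    return copied;
}

int main(void)
{
    struct inode_model ordered = { true }, writeback = { false };

    write_end_model(&ordered, 0, 4096);
    write_end_model(&writeback, 0, 4096);
    return 0;
}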
@@ -1626,6 +1578,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, pagevec_release(&pvec); } ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); return ret; } @@ -1670,22 +1624,25 @@ static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), - ext4_count_free_clusters(inode->i_sb))); + ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", - EXT4_I(inode)->i_reserved_data_blocks); + ei->i_reserved_data_blocks); ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", - EXT4_I(inode)->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks); + ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", + ei->i_allocated_meta_blocks); return; } @@ -1740,12 +1697,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) */ map.m_lblk = next; map.m_len = max_blocks; - get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + /* + * We're in delalloc path and it is possible that we're going to + * need more metadata blocks than previously reserved. However + * we must not fail because we're in writeback and there is + * nothing we can do about it so it might result in data loss. + * So use reserved blocks to allocate metadata if possible. + */ + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL; if (ext4_should_dioread_nolock(mpd->inode)) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (mpd->b_state & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); if (blks < 0) { struct super_block *sb = mpd->inode->i_sb; @@ -2272,9 +2238,16 @@ static int ext4_writepage(struct page *page, */ return __ext4_journalled_writepage(page, len); - memset(&io_submit, 0, sizeof(io_submit)); + ext4_io_submit_init(&io_submit, wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) { + redirty_page_for_writepage(wbc, page); + return -ENOMEM; + } ret = ext4_bio_write_page(&io_submit, page, len, wbc); ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); return ret; } @@ -2661,7 +2634,7 @@ out_writepages: static int ext4_nonda_switch(struct super_block *sb) { - s64 free_blocks, dirty_blocks; + s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* @@ -2672,17 +2645,18 @@ static int ext4_nonda_switch(struct super_block *sb) * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. 
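The check completed just below backs off from delayed allocation once free clusters fall under 150% of dirty clusters, or under dirty plus a fixed watermark. As a self-contained predicate (the watermark constant is illustrative, not EXT4_FREECLUSTERS_WATERMARK's real value):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FREECLUSTERS_WATERMARK 1024   /* illustrative only */

static bool nonda_switch(int64_t free_clusters, int64_t dirty_clusters)
{
    /* free < 150% of dirty, or free below dirty + watermark */
    return 2 * free_clusters < 3 * dirty_clusters ||
           free_clusters < dirty_clusters + FREECLUSTERS_WATERMARK;
}

int main(void)
{
    printf("%d\n", nonda_switch(10000, 100));   /* 0: plenty of headroom */
    printf("%d\n", nonda_switch(1200, 1000));   /* 1: under 150% of dirty */
    return 0;
}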
*/ - free_blocks = EXT4_C2B(sbi, - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + free_clusters = + percpu_counter_read_positive(&sbi->s_freeclusters_counter); + dirty_clusters = + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ - if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) + if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); - if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { + if (2 * free_clusters < 3 * dirty_clusters || + free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark @@ -2818,18 +2792,9 @@ static int ext4_da_write_end(struct file *file, unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; - if (write_mode == FALL_BACK_TO_NONDELALLOC) { - switch (ext4_inode_journal_mode(inode)) { - case EXT4_INODE_ORDERED_DATA_MODE: - return ext4_ordered_write_end(file, mapping, pos, - len, copied, page, fsdata); - case EXT4_INODE_WRITEBACK_DATA_MODE: - return ext4_writeback_write_end(file, mapping, pos, - len, copied, page, fsdata); - default: - BUG(); - } - } + if (write_mode == FALL_BACK_TO_NONDELALLOC) + return ext4_write_end(file, mapping, pos, + len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); @@ -3113,9 +3078,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - goto out; + /* if not async direct IO just return */ + if (!io_end) { + inode_dio_done(inode); + if (is_async) + aio_complete(iocb, ret, 0); + return; + } ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3123,25 +3092,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); iocb->private = NULL; - - /* if not aio dio with unwritten extents, just free io and return */ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); -out: - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); - return; - } - io_end->offset = offset; io_end->size = size; if (is_async) { io_end->iocb = iocb; io_end->result = ret; } - - ext4_add_complete_io(io_end); + ext4_put_io_end_defer(io_end); } /* @@ -3175,6 +3132,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; + ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (rw != WRITE || final_size > inode->i_size) @@ -3213,13 +3171,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); + io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) { ret = -ENOMEM; goto retake_lock; } io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; + /* + * Grab reference for DIO. 
Will be dropped in ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); /* * we save the io structure for current async direct * IO, so that later ext4_map_blocks() could flag the @@ -3243,26 +3204,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, NULL, dio_flags); - if (iocb->private) - ext4_inode_aio_set(inode, NULL); /* - * The io_end structure takes a reference to the inode, that - * structure needs to be destroyed and the reference to the - * inode need to be dropped, when IO is complete, even with 0 - * byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will - * be destroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since VFS - * direct IO won't invoke the end_io call back function, we - * need to free the end_io structure here. + * Put our reference to io_end. This can free the io_end structure e.g. + * in sync IO case or in case of error. It can even perform extent + * conversion if all bios we submitted finished before we got here. + * Note that in that case iocb->private can be already set to NULL + * here. */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); - iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + if (io_end) { + ext4_inode_aio_set(inode, NULL); + ext4_put_io_end(io_end); + /* + * In case of error or no write ext4_end_io_dio() was not + * called so we have to put iocb's reference. + */ + if (ret <= 0 && ret != -EIOCBQUEUED) { + WARN_ON(iocb->private != io_end); + ext4_put_io_end(io_end); + iocb->private = NULL; + } + } + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; /* @@ -3334,27 +3296,12 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static const struct address_space_operations ext4_ordered_aops = { +static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext4_writeback_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writepage, - .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, + .write_end = ext4_write_end, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3399,23 +3346,21 @@ void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_ordered_aops; + ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_WRITEBACK_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_writeback_aops; + ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; - 
break; + return; default: BUG(); } + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_aops; } @@ -3646,20 +3591,190 @@ int ext4_can_truncate(struct inode *inode) int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + ext4_lblk_t first_block, stop_block; + struct address_space *mapping = inode->i_mapping; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + handle_t *handle; + unsigned int credits; + int ret = 0; + if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ind_punch_hole(file, offset, length); - - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { + if (EXT4_SB(sb)->s_cluster_ratio > 1) { /* TODO: Add support for bigalloc file systems */ return -EOPNOTSUPP; } trace_ext4_punch_hole(inode, offset, length); - return ext4_ext_punch_hole(file, offset, length); + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + length - 1); + if (ret) + return ret; + } + + mutex_lock(&inode->i_mutex); + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + ret = -EPERM; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_mutex; + } + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + goto out_mutex; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_pagecache_range(inode, first_page_offset, + last_page_offset - 1); + } + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + ret = ext4_flush_unwritten_io(inode); + if (ret) + goto out_dio; + inode_dio_wait(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(sb, ret); + goto out_dio; + } + + /* + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the + * buffer heads for the block aligned regions of the page that + * were completely zeroed. 
+ */ + if (first_page > last_page) { + /* + * If the file space being truncated is contained + * within a page just zero out and unmap the middle of + * that page + */ + ret = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + + if (ret) + goto out_stop; + } else { + /* + * zero out and unmap the partial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (ret) + goto out_stop; + } + + /* + * zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (ret) + goto out_stop; + } + } + + /* + * If i_size is contained in the last page, we need to + * unmap and zero the partial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (ret) + goto out_stop; + } + } + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* If there are no blocks to remove, return now */ + if (first_block >= stop_block) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_free_hole_blocks(handle, inode, first_block, + stop_block); + + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* @@ -3692,6 +3807,19 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) */ void ext4_truncate(struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int credits; + handle_t *handle; + struct address_space *mapping = inode->i_mapping; + loff_t page_len; + + /* + * There is a possibility that we're either freeing the inode + * or it completely new indode. In those cases we might not + * have i_mutex locked because it's not necessary. 
+ */ + if (!(inode->i_state & (I_NEW|I_FREEING))) + WARN_ON(!mutex_is_locked(&inode->i_mutex)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -3710,10 +3838,72 @@ void ext4_truncate(struct inode *inode) return; } + /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_unwritten_io(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + return; + } + + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0)) + goto out_stop; + } + + /* + * We add the inode to the orphan list, so that if this + * truncate spans multiple transactions, and we crash, we will + * resume the truncate when the filesystem recovers. It also + * marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + + ext4_discard_preallocations(inode); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(inode); + ext4_ext_truncate(handle, inode); else - ext4_ind_truncate(inode); + ext4_ind_truncate(handle, inode); + + up_write(&ei->i_data_sem); + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); } @@ -3821,13 +4011,14 @@ make_io: if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ - b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; - end = b + EXT4_SB(sb)->s_inode_readahead_blks; + end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); @@ -4024,8 +4215,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if (inode->i_mode == 0 || - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + if ((inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted */ ret = -ESTALE; goto bad_inode; @@ -4033,7 +4225,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete - * the process of deleting those. */ + * the process of deleting those. 
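The orphan-list steps threaded through the new ext4_truncate() form a general crash-safety protocol, roughly (pseudo-C sketching the ordering, not a callable API):

    handle = journal_start(...);
    orphan_add(handle, inode);   /* crash here or later: recovery resumes the truncate */
    free_blocks(handle, inode);  /* may span several committed transactions */
    if (inode->i_nlink)
        orphan_del(handle, inode);   /* plain ftruncate(): the inode lives on */
    /* unlink path: leave the orphan record for inode eviction to clear */
    journal_stop(handle);

The invariant, as the comment above puts it, is that the file must be in a sane, consistent, truncatable state at every transaction boundary.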
+ * OR it is the EXT4_BOOT_LOADER_INO which is + * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); @@ -4153,6 +4347,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else if (ino == EXT4_BOOT_LOADER_INO) { + make_bad_inode(inode); } else { ret = -EIO; EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 721f4d33e148..9491ac0590f7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -17,9 +17,201 @@ #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" +#include "ext4_extents.h" #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) +/** + * Swap memory between @a and @b for @len bytes. + * + * @a: pointer to first memory area + * @b: pointer to second memory area + * @len: number of bytes to swap + * + */ +static void memswap(void *a, void *b, size_t len) +{ + unsigned char *ap, *bp; + unsigned char tmp; + + ap = (unsigned char *)a; + bp = (unsigned char *)b; + while (len-- > 0) { + tmp = *ap; + *ap = *bp; + *bp = tmp; + ap++; + bp++; + } +} + +/** + * Swap i_data and associated attributes between @inode1 and @inode2. + * This function is used for the primary swap between inode1 and inode2 + * and also to revert this primary swap in case of errors. + * + * Therefore you have to make sure, that calling this method twice + * will revert all changes. + * + * @inode1: pointer to first inode + * @inode2: pointer to second inode + */ +static void swap_inode_data(struct inode *inode1, struct inode *inode2) +{ + loff_t isize; + struct ext4_inode_info *ei1; + struct ext4_inode_info *ei2; + + ei1 = EXT4_I(inode1); + ei2 = EXT4_I(inode2); + + memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); + memswap(&inode1->i_version, &inode2->i_version, + sizeof(inode1->i_version)); + memswap(&inode1->i_blocks, &inode2->i_blocks, + sizeof(inode1->i_blocks)); + memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); + memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); + memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); + memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); + memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); + i_size_write(inode2, isize); +} + +/** + * Swap the information from the given @inode and the inode + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other + * important fields of the inodes. 
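Because memswap() is a plain byte-for-byte exchange, it is its own inverse, and that property is exactly what the error path further down leans on when it reverts a failed swap by simply swapping again:

    int a = 1, b = 2;
    memswap(&a, &b, sizeof(a));  /* a == 2, b == 1 */
    memswap(&a, &b, sizeof(a));  /* a == 1, b == 2: fully restored */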
+ * + * @sb: the super block of the filesystem + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO + * + */ +static long swap_inode_boot_loader(struct super_block *sb, + struct inode *inode) +{ + handle_t *handle; + int err; + struct inode *inode_bl; + struct ext4_inode_info *ei; + struct ext4_inode_info *ei_bl; + struct ext4_sb_info *sbi; + + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { + err = -EINVAL; + goto swap_boot_out; + } + + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto swap_boot_out; + } + + sbi = EXT4_SB(sb); + ei = EXT4_I(inode); + + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + if (IS_ERR(inode_bl)) { + err = PTR_ERR(inode_bl); + goto swap_boot_out; + } + ei_bl = EXT4_I(inode_bl); + + filemap_flush(inode->i_mapping); + filemap_flush(inode_bl->i_mapping); + + /* Protect orig inodes against a truncate and make sure, + * that only 1 swap_inode_boot_loader is running. */ + ext4_inode_double_lock(inode, inode_bl); + + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + ext4_inode_block_unlocked_dio(inode_bl); + inode_dio_wait(inode); + inode_dio_wait(inode_bl); + + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); + if (IS_ERR(handle)) { + err = -EINVAL; + goto swap_boot_out; + } + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(inode, inode_bl); + + if (inode_bl->i_nlink == 0) { + /* this inode has never been used as a BOOT_LOADER */ + set_nlink(inode_bl, 1); + i_uid_write(inode_bl, 0); + i_gid_write(inode_bl, 0); + inode_bl->i_flags = 0; + ei_bl->i_flags = 0; + inode_bl->i_version = 1; + i_size_write(inode_bl, 0); + inode_bl->i_mode = S_IFREG; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode_bl); + } else + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); + } + + swap_inode_data(inode, inode_bl); + + inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + inode_bl->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_discard_preallocations(inode); + + err = ext4_mark_inode_dirty(handle, inode); + if (err < 0) { + ext4_warning(inode->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + } else { + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + } + } + + ext4_journal_stop(handle); + + ext4_double_up_write_data_sem(inode, inode_bl); + + ext4_inode_resume_unlocked_dio(inode); + ext4_inode_resume_unlocked_dio(inode_bl); + + ext4_inode_double_unlock(inode, inode_bl); + + iput(inode_bl); + +swap_boot_out: + return err; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } - if (oldflags & EXT4_EXTENTS_FL) { - /* We don't support clearning extent flags */ - if (!(flags & EXT4_EXTENTS_FL)) { - err = 
-EOPNOTSUPP; - goto flags_out; - } - } else if (flags & EXT4_EXTENTS_FL) { - /* migrate the file */ + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; - flags &= ~EXT4_EXTENTS_FL; - } if (flags & EXT4_EOFBLOCKS_FL) { /* we don't support adding EOFBLOCKS flag */ @@ -137,8 +320,13 @@ flags_err: err = ext4_change_inode_journal_flag(inode, jflag); if (err) goto flags_out; - if (migrate) - err = ext4_ext_migrate(inode); + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + flags_out: mutex_unlock(&inode->i_mutex); mnt_drop_write_file(filp); @@ -357,9 +545,13 @@ group_add_out: return err; } + case EXT4_IOC_SWAP_BOOT: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + return swap_inode_boot_loader(sb, inode); + case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ee6614bdb639..b1ed9e07434b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr) ext4_clear_bit(bit, addr); } +static inline int mb_test_and_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_and_clear_bit(bit, addr); +} + static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, spin_unlock(&EXT4_SB(sb)->s_bal_lock); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) { + ext4_set_bits(buddy, 0, count); + } + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. 
The information involve @@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { - int group; - group = (first_block + i) >> 1; if (group >= ngroups) break; @@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) struct page *page; int ret = 0; + might_sleep(); mb_debug(1, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); /* @@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; + might_sleep(); mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; @@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len) } } +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len) } } +/* + * _________________________________________________________________ */ + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) +{ + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. + * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coaleasced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. + * Then shift range to [0; 2], go up and do the same. 
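The diagram above collapses into a short per-order loop. Its shape, as a sketch (find_buddy(), adjust_border() and counters[] stand in for the real helpers and bb_counters bookkeeping, and the final clear of the surviving run at the top order is elided):

    for (order = 1; first <= last; order++, first >>= 1, last >>= 1) {
        buddy = find_buddy(e4b, order);  /* bitmap for this order */
        if (first & 1)    /* left edge unaligned: settle the border */
            counters[order] += adjust_border(&first, buddy, -1);
        if (!(last & 1))  /* right edge unaligned */
            counters[order] += adjust_border(&last, buddy, +1);
        /* the interior [first, last] merges upward to the next order */
    }

Each border step either absorbs a free neighbour so the pair can merge upward (extending the range, and dropping this order's free count by one) or marks the edge chunk itself free at this order because its neighbour is busy (shrinking the range, and raising the count by one), which is why adjust_border() returns +1 or -1.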
+ */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) + int first, int count) { - int block = 0; - int max = 0; - int order; - void *buddy; - void *buddy2; + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; struct super_block *sb = e4b->bd_sb; - BUG_ON(first + count > (sb->s_blocksize << 3)); + BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; - /* let's maintain fragments counter */ + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ if (first != 0) - block = !mb_test_bit(first - 1, e4b->bd_bitmap); - if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, e4b->bd_bitmap); - if (block && max) - e4b->bd_info->bb_fragments--; - else if (!block && !max) - e4b->bd_info->bb_fragments++; + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); - /* let's maintain buddy itself */ - while (count-- > 0) { - block = first++; - order = 0; + if (unlikely(block != -1)) { + ext4_fsblk_t blocknr; - if (!mb_test_bit(block, e4b->bd_bitmap)) { - ext4_fsblk_t blocknr; - - blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += EXT4_C2B(EXT4_SB(sb), block); - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block " - "(bit %u)", block); - } - mb_clear_bit(block, e4b->bd_bitmap); - e4b->bd_info->bb_counters[order]++; - - /* start of the buddy */ - buddy = mb_find_buddy(e4b, order, &max); - - do { - block &= ~1UL; - if (mb_test_bit(block, buddy) || - mb_test_bit(block + 1, buddy)) - break; - - /* both the buddies are free, try to coalesce them */ - buddy2 = mb_find_buddy(e4b, order + 1, &max); + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), block); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u)", block); + mb_regenerate_buddy(e4b); + goto done; + } - if (!buddy2) - break; + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) + e4b->bd_info->bb_fragments--; + else if (!left_is_free && !right_is_free) + e4b->bd_info->bb_fragments++; - if (order > 0) { - /* for special purposes, we don't set - * free bits in bitmap */ - mb_set_bit(block, buddy); - mb_set_bit(block + 1, buddy); - } - e4b->bd_info->bb_counters[order]--; - e4b->bd_info->bb_counters[order]--; + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. 
+ * Check if neighbours are to be coaleasced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; + } + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; + } - block = block >> 1; - order++; - e4b->bd_info->bb_counters[order]++; + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); - mb_clear_bit(block, buddy2); - buddy = buddy2; - } while (1); - } +done: mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); } @@ -2149,7 +2260,7 @@ static const struct seq_operations ext4_mb_seq_groups_ops = { static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) { - struct super_block *sb = PDE(inode)->data; + struct super_block *sb = PDE_DATA(inode); int rc; rc = seq_open(file, &ext4_mb_seq_groups_ops); @@ -3342,7 +3453,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, if (pa->pa_type == MB_GROUP_PA) grp_blk--; - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); + grp = ext4_get_group_number(sb, grp_blk); /* * possible race: @@ -3807,7 +3918,7 @@ repeat: list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { @@ -4069,7 +4180,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); if (ext4_mb_load_buddy(sb, group, &e4b)) { ext4_error(sb, "Error loading buddy information for %u", group); @@ -4217,6 +4328,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); @@ -4420,11 +4532,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry)) { + if (can_merge(entry, new_entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { new_entry->efd_start_cluster = entry->efd_start_cluster; new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); kmem_cache_free(ext4_free_data_cachep, entry); } } @@ -4432,10 +4544,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry)) { + if (can_merge(new_entry, entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); kmem_cache_free(ext4_free_data_cachep, entry); } } @@ -4470,6 +4582,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; + might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 480acf4a085f..49e8bdff9163 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) return retval; } return retval; - } int ext4_ext_migrate(struct inode *inode) @@ 
-606,3 +605,64 @@ out: return retval; } + +/* + * Migrate a simple extent-based inode to use the i_blocks[] array + */ +int ext4_ind_migrate(struct inode *inode) +{ + struct ext4_extent_header *eh; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; + ext4_fsblk_t blk; + handle_t *handle; + int ret; + + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return -EOPNOTSUPP; + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); + if (ret) + goto errout; + + eh = ext_inode_hdr(inode); + ex = EXT_FIRST_EXTENT(eh); + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { + ret = -EOPNOTSUPP; + goto errout; + } + if (eh->eh_entries == 0) + blk = len = 0; + else { + len = le16_to_cpu(ex->ee_len); + blk = ext4_ext_pblock(ex); + if (len > EXT4_NDIR_BLOCKS) { + ret = -EOPNOTSUPP; + goto errout; + } + } + + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memset(ei->i_data, 0, sizeof(ei->i_data)); + for (i=0; i < len; i++) + ei->i_data[i] = cpu_to_le32(blk++); + ext4_mark_inode_dirty(handle, inode); +errout: + ext4_journal_stop(handle); + up_write(&EXT4_I(inode)->i_data_sem); + return ret; +} diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f9b551561d2c..214461e42a05 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -7,7 +7,7 @@ #include "ext4.h" /* Checksumming functions */ -static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct mmp_struct, mmp_checksum); @@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE_SYNC, bh); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC, *bh); + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { brelse(*bh); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 33e1c086858b..3dcbf364022f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** - * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * ext4_double_down_write_data_sem - Acquire two inodes' write lock + * of i_data_sem * * Acquire write lock of i_data_sem of the two inodes */ -static void -double_down_write_data_sem(struct inode *first, struct inode *second) +void +ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); @@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second) } /** - * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * ext4_double_up_write_data_sem 
- Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second * Release write lock of i_data_sem of two inodes (orig and donor). */ -static void -double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +void +ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); @@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, mext_insert_inside_block(o_start, o_end, start_ext, new_ext, end_ext, eh, range_to_move); - if (depth) { - ret = ext4_handle_dirty_metadata(handle, orig_inode, - orig_path->p_bh); - if (ret) - return ret; - } else { - ret = ext4_mark_inode_dirty(handle, orig_inode); - if (ret < 0) - return ret; - } - - return 0; + return ext4_ext_dirty(handle, orig_inode, orig_path); } /** @@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, donor_off += dext_alen; orig_off += dext_alen; + BUG_ON(replaced_count > count); /* Already moved the expected blocks */ if (replaced_count >= count) break; @@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, page_cache_release(page[0]); return -ENOMEM; } - + /* + * grab_cache_page_write_begin() may not wait on page's writeback if + * BDI not demand that. But it is reasonable to be very conservative + * here and explicitly wait on page's writeback + */ + wait_on_page_writeback(page[0]); + wait_on_page_writeback(page[1]); if (inode1 > inode2) { struct page *tmp; tmp = page[0]; @@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { - int err = 0; err = ext4_get_block(inode, block, bh, 0); if (err) { SetPageError(page); @@ -976,7 +973,7 @@ again: * necessary, just swap data blocks between orig and donor. */ if (uninit) { - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ uninit = mext_check_coverage(orig_inode, orig_blk_offset, @@ -990,7 +987,7 @@ again: goto drop_data_sem; if (!uninit) { - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } if ((page_has_private(pagep[0]) && @@ -1004,7 +1001,7 @@ again: donor_inode, orig_blk_offset, block_len_in_page, err); drop_data_sem: - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_pages; } data_copy: @@ -1033,7 +1030,7 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - *err = __block_write_begin(pagep[0], from, from + replaced_size, + *err = __block_write_begin(pagep[0], from, replaced_size, ext4_get_block); if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); @@ -1065,11 +1062,11 @@ repair_branches: * Extents are swapped already, but we are not able to copy data. 
* Try to swap extents to it's original places */ - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, orig_blk_offset, block_len_in_page, &err2); - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), "Unable to copy data block," @@ -1209,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode, } /** - * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 + * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure * * Lock two inodes' i_mutex */ -static void -mext_inode_double_lock(struct inode *inode1, struct inode *inode2) +void +ext4_inode_double_lock(struct inode *inode1, struct inode *inode2) { BUG_ON(inode1 == inode2); if (inode1 < inode2) { @@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) } /** - * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 + * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 * * @inode1: the inode that is released first * @inode2: the inode that is released second * */ -static void -mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) +void +ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2) { mutex_unlock(&inode1->i_mutex); mutex_unlock(&inode2->i_mutex); @@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } /* Protect orig and donor inodes against a truncate */ - mext_inode_double_lock(orig_inode, donor_inode); + ext4_inode_double_lock(orig_inode, donor_inode); /* Wait for all existing dio workers */ ext4_inode_block_unlocked_dio(orig_inode); @@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len); @@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, * b. 
racing with ->readpage, ->write_begin, and ext4_get_block * in move_extent_per_page */ - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { @@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; @@ -1538,10 +1535,10 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); ext4_inode_resume_unlocked_dio(orig_inode); ext4_inode_resume_unlocked_dio(donor_inode); - mext_inode_double_unlock(orig_inode, donor_inode); + ext4_inode_double_unlock(orig_inode, donor_inode); return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3825d6aa8336..6653fc35ecb7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - __u32 csum, old_csum; + __u32 csum; + __le32 save_csum; int size; size = count_offset + (count * sizeof(struct dx_entry)); - old_csum = t->dt_checksum; + save_csum = t->dt_checksum; t->dt_checksum = 0; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); - t->dt_checksum = old_csum; + t->dt_checksum = save_csum; return cpu_to_le32(csum); } @@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + count = htree_inlinedir_to_tree(dir_file, dir, 0, + &hinfo, start_hash, + start_minor_hash, + &has_inline_data); + if (has_inline_data) { + *next_hash = ~0; + return count; + } + } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; @@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child) return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); } -#define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, -}; - -static inline void ext4_set_de_type(struct super_block *sb, - struct ext4_dir_entry_2 *de, - umode_t mode) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -} - /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. 
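Both pairwise-lock helpers exported here avoid ABBA deadlocks the same way: the pair is always acquired in a fixed order, by inode address, whether it is i_mutex or i_data_sem being taken. The generic shape, as a sketch rather than the ext4 functions themselves:

    static void lock_two(struct mutex *a, struct mutex *b)
    {
        /* a fixed acquisition order means two tasks locking the
         * same pair can never deadlock against each other */
        if (a > b)
            swap(a, b);
        mutex_lock(a);
        mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
    }

The nested annotation only informs lockdep; the correctness comes from the ordering.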
@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, &dentry->d_name, @@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir, * quota blocks, sb is already counted in previous macros). */ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } retry: inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 047a6de04a0a..5929cd0baa20 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -29,25 +29,19 @@ #include "xattr.h" #include "acl.h" -static struct kmem_cache *io_page_cachep, *io_end_cachep; +static struct kmem_cache *io_end_cachep; int __init ext4_init_pageio(void) { - io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); - if (io_page_cachep == NULL) - return -ENOMEM; io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); - if (io_end_cachep == NULL) { - kmem_cache_destroy(io_page_cachep); + if (io_end_cachep == NULL) return -ENOMEM; - } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); - kmem_cache_destroy(io_page_cachep); } /* @@ -67,29 +61,28 @@ void ext4_ioend_shutdown(struct inode *inode) cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } -static void put_io_page(struct ext4_io_page *io_page) +static void ext4_release_io_end(ext4_io_end_t *io_end) { - if (atomic_dec_and_test(&io_page->p_count)) { - end_page_writeback(io_page->p_page); - put_page(io_page->p_page); - kmem_cache_free(io_page_cachep, io_page); - } + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io_end->inode)); + if (io_end->flag & EXT4_IO_END_DIRECT) + inode_dio_done(io_end->inode); + if (io_end->iocb) + aio_complete(io_end->iocb, io_end->result, 0); + kmem_cache_free(io_end_cachep, io_end); } -void ext4_free_io_end(ext4_io_end_t *io) +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { - int i; - - BUG_ON(!io); - BUG_ON(!list_empty(&io->list)); - BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); + struct inode *inode = io_end->inode; - for (i = 0; i < io->num_io_pages; i++) - put_io_page(io->pages[i]); - io->num_io_pages = 0; - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io->inode)); - kmem_cache_free(io_end_cachep, io); + io_end->flag &= 
~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); } /* check a range of space and convert unwritten extents to written. */ @@ -112,13 +105,8 @@ static int ext4_end_io(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); - if (io->iocb) - aio_complete(io->iocb, io->result, 0); + ext4_clear_io_unwritten_flag(io); + ext4_release_io_end(io); return ret; } @@ -149,7 +137,7 @@ static void dump_completed_IO(struct inode *inode) } /* Add the io_end to per-inode completed end_io list. */ -void ext4_add_complete_io(ext4_io_end_t *io_end) +static void ext4_add_complete_io(ext4_io_end_t *io_end) { struct ext4_inode_info *ei = EXT4_I(io_end->inode); struct workqueue_struct *wq; @@ -186,8 +174,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode) err = ext4_end_io(io); if (unlikely(!ret && err)) ret = err; - io->flag &= ~EXT4_IO_END_UNWRITTEN; - ext4_free_io_end(io); } return ret; } @@ -219,10 +205,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); + atomic_set(&io->count, 1); } return io; } +void ext4_put_io_end_defer(ext4_io_end_t *io_end) +{ + if (atomic_dec_and_test(&io_end->count)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + ext4_release_io_end(io_end); + return; + } + ext4_add_complete_io(io_end); + } +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + int err = 0; + + if (atomic_dec_and_test(&io_end->count)) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + err = ext4_convert_unwritten_extents(io_end->inode, + io_end->offset, io_end->size); + ext4_clear_io_unwritten_flag(io_end); + } + ext4_release_io_end(io_end); + } + return err; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + atomic_inc(&io_end->count); + return io_end; +} + /* * Print an buffer I/O error compatible with the fs/buffer.c. 
This * provides compatibility with dmesg scrapers that look for a specific @@ -243,45 +262,56 @@ static void ext4_end_bio(struct bio *bio, int error) ext4_io_end_t *io_end = bio->bi_private; struct inode *inode; int i; + int blocksize; sector_t bi_sector = bio->bi_sector; BUG_ON(!io_end); + inode = io_end->inode; + blocksize = 1 << inode->i_blkbits; bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - bio_put(bio); - - for (i = 0; i < io_end->num_io_pages; i++) { - struct page *page = io_end->pages[i]->p_page; + for (i = 0; i < bio->bi_vcnt; i++) { + struct bio_vec *bvec = &bio->bi_io_vec[i]; + struct page *page = bvec->bv_page; struct buffer_head *bh, *head; - loff_t offset; - loff_t io_end_offset; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; + + if (!page) + continue; if (error) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); - head = page_buffers(page); - BUG_ON(!head); - - io_end_offset = io_end->offset + io_end->size; - - offset = (sector_t) page->index << PAGE_CACHE_SHIFT; - bh = head; - do { - if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) - buffer_io_error(bh); - - offset += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); } - - put_io_page(io_end->pages[i]); + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + blocksize > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if (!under_io) + end_page_writeback(page); } - io_end->num_io_pages = 0; - inode = io_end->inode; + bio_put(bio); if (error) { io_end->flag |= EXT4_IO_END_ERROR; @@ -294,12 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error) bi_sector >> (inode->i_blkbits - 9)); } - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); - return; - } - - ext4_add_complete_io(io_end); + ext4_put_io_end_defer(io_end); } void ext4_io_submit(struct ext4_io_submit *io) @@ -313,76 +338,59 @@ void ext4_io_submit(struct ext4_io_submit *io) bio_put(io->io_bio); } io->io_bio = NULL; - io->io_op = 0; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); + io->io_bio = NULL; io->io_end = NULL; } -static int io_submit_init(struct ext4_io_submit *io, - struct inode *inode, - struct writeback_control *wbc, - struct buffer_head *bh) +static int io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { - ext4_io_end_t *io_end; - struct page *page = bh->b_page; int nvecs = bio_get_nr_vecs(bh->b_bdev); struct bio *bio; - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) - return -ENOMEM; bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); - + bio->bi_private = ext4_get_io_end(io->io_end); + if (!io->io_end->size) + io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } static int io_submit_add_bh(struct ext4_io_submit *io, - struct ext4_io_page *io_page, struct inode *inode, - struct writeback_control *wbc, struct buffer_head *bh) { ext4_io_end_t *io_end; int ret; - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); - } - if (io->io_bio && bh->b_blocknr != io->io_next_block) { submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init(io, inode, wbc, bh); + ret = io_submit_init_bio(io, bh); if (ret) return ret; } - io_end = io->io_end; - if ((io_end->num_io_pages >= MAX_IO_PAGES) && - (io_end->pages[io_end->num_io_pages-1] != io_page)) - goto submit_and_retry; - if (buffer_uninit(bh)) - ext4_set_io_unwritten_flag(inode, io_end); - io->io_end->size += bh->b_size; - io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - if ((io_end->num_io_pages == 0) || - (io_end->pages[io_end->num_io_pages-1] != io_page)) { - io_end->pages[io_end->num_io_pages++] = io_page; - atomic_inc(&io_page->p_count); - } + io_end = io->io_end; + if (test_clear_buffer_uninit(bh)) + ext4_set_io_unwritten_flag(inode, io_end); + io_end->size += bh->b_size; + io->io_next_block++; return 0; } @@ -392,33 +400,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - unsigned block_start, block_end, blocksize; - struct ext4_io_page *io_page; + unsigned block_start, blocksize; struct buffer_head *bh, *head; int ret = 0; + int nr_submitted = 0; blocksize = 1 << inode->i_blkbits; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); - if (!io_page) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return -ENOMEM; - } - io_page->p_page = page; - atomic_set(&io_page->p_count, 1); - get_page(page); set_page_writeback(page); ClearPageError(page); - for (bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start = block_end, bh = bh->b_this_page) { - - block_end = block_start + blocksize; + /* + * In the first loop we prepare and mark buffers to submit. We have to + * mark all buffers in the page before submitting so that + * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * on the first buffer finishes and we are still working on submitting + * the second buffer. 
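The two-pass structure here is the heart of the change: every buffer is marked async_write before any bio is submitted, so a fast completion of the first bio cannot call end_page_writeback() while later buffers are still being queued. Schematically (a sketch of the pattern, with a hypothetical submit_buffer() in place of io_submit_add_bh()):

    /* pass 1: mark everything this writeout will cover */
    bh = head = page_buffers(page);
    do {
        set_buffer_async_write(bh);
    } while ((bh = bh->b_this_page) != head);

    /* pass 2: only now build and submit bios; writeback on the page
     * ends when the completion path clears the last async_write bit */
    do {
        if (buffer_async_write(bh))
            submit_buffer(io, bh);
    } while ((bh = bh->b_this_page) != head);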
+ */ + bh = head = page_buffers(page); + do { + block_start = bh_offset(bh); if (block_start >= len) { /* * Comments copied from block_write_full_page_endio: @@ -431,7 +435,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * mapped, and writes to that region are not written * out to the file." */ - zero_user_segment(page, block_start, block_end); + zero_user_segment(page, block_start, + block_start + blocksize); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; @@ -445,7 +450,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ext4_io_submit(io); continue; } - ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + set_buffer_async_write(bh); + } while ((bh = bh->b_this_page) != head); + + /* Now submit buffers to write */ + bh = head = page_buffers(page); + do { + if (!buffer_async_write(bh)) + continue; + ret = io_submit_add_bh(io, inode, bh); if (ret) { /* * We only get here on ENOMEM. Not much else @@ -455,17 +472,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, redirty_page_for_writepage(wbc, page); break; } + nr_submitted++; clear_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + + /* Error stopped previous loop? Clean up buffers... */ + if (ret) { + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); } unlock_page(page); - /* - * If the page was truncated before we could do the writeback, - * or we had a memory allocation error while trying to write - * the first buffer head, we won't have submitted any pages for - * I/O. In that case we need to make sure we've cleared the - * PageWriteback bit from the page to prevent the system from - * wedging later on. - */ - put_io_page(io_page); + /* Nothing submitted - we have to end page writeback */ + if (!nr_submitted) + end_page_writeback(page); return ret; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c169477a62c9..b27c96d01965 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -272,7 +272,7 @@ next_group: if (start_blk >= last_blk) goto next_group; group_data[bb_index].block_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; if (flexbg_size > 1) @@ -284,7 +284,7 @@ next_group: if (start_blk >= last_blk) goto next_group; group_data[ib_index].inode_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; if (flexbg_size > 1) @@ -296,7 +296,7 @@ next_group: if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; - ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group = ext4_get_group_number(sb, start_blk); group -= group_data[0].group; group_data[group].free_blocks_count -= EXT4_SB(sb)->s_itb_per_group; @@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, ext4_group_t group; int err; - ext4_get_group_no_and_offset(sb, block, &group, NULL); + group = ext4_get_group_number(sb, block); start = ext4_group_first_block_no(sb, group); group -= flex_gd->groups[0].group; @@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb, /* Update the global fs size fields */ sbi->s_groups_count += flex_gd->count; + 
sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is * active. */ @@ -1879,7 +1881,11 @@ retry: /* Nothing need to do */ return 0; - ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + n_group = ext4_get_group_number(sb, n_blocks_count - 1); + if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + ext4_warning(sb, "resize would cause inodes_count overflow"); + return -EINVAL; + } ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = num_desc_blocks(sb, n_group + 1); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5d6d53578124..24a146bde742 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { @@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce, *tmp; + struct ext4_journal_cb_entry *jce; + BUG_ON(txn->t_state == T_FINISHED); spin_lock(&sbi->s_md_lock); - list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { + while (!list_empty(&txn->t_private_list)) { + jce = list_entry(txn->t_private_list.next, + struct ext4_journal_cb_entry, jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); jce->jce_func(sb, jce, error); @@ -1802,7 +1806,7 @@ static int options_seq_show(struct seq_file *seq, void *offset) static int options_open_fs(struct inode *inode, struct file *file) { - return single_open(file, options_seq_show, PDE(inode)->data); + return single_open(file, options_seq_show, PDE_DATA(inode)); } static const struct file_operations ext4_seq_options_fops = { @@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, if ((sbi->s_es->s_feature_ro_compat & cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { /* Use new metadata_csum algorithm */ - __u16 old_csum; + __le16 save_csum; __u32 csum32; - old_csum = gdp->bg_checksum; + save_csum = gdp->bg_checksum; gdp->bg_checksum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, sbi->s_desc_size); - gdp->bg_checksum = old_csum; + gdp->bg_checksum = save_csum; crc = csum32 & 0xFFFF; goto out; @@ -2379,17 +2383,15 @@ struct ext4_attr { int offset; }; -static int parse_strtoul(const char *buf, - unsigned long max, unsigned long *value) +static int parse_strtoull(const char *buf, + unsigned long long max, unsigned long long *value) { - char *endp; - - *value = simple_strtoul(skip_spaces(buf), &endp, 0); - endp = skip_spaces(endp); - if (*endp || *value > max) - return -EINVAL; + int ret; - return 0; + ret = kstrtoull(skip_spaces(buf), 0, value); + if (!ret && *value > max) + ret = -EINVAL; + return ret; } static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, @@ -2431,11 +2433,13 @@ static ssize_t 
inode_readahead_blks_store(struct ext4_attr *a, const char *buf, size_t count) { unsigned long t; + int ret; - if (parse_strtoul(buf, 0x40000000, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; - if (t && !is_power_of_2(t)) + if (t && (!is_power_of_2(t) || t > 0x40000000)) return -EINVAL; sbi->s_inode_readahead_blks = t; @@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, { unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); unsigned long t; + int ret; - if (parse_strtoul(buf, 0xffffffff, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; *ui = t; return count; } +static ssize_t reserved_clusters_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); +} + +static ssize_t reserved_clusters_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + int ret; + + if (parse_strtoull(buf, -1ULL, &val)) + return -EINVAL; + ret = ext4_reserve_clusters(sbi, val); + + return ret ? ret : count; +} + static ssize_t trigger_test_error(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) @@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RW_ATTR(reserved_clusters); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } + +static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +{ + ext4_fsblk_t resv_clusters; + + /* + * By default we reserve 2% or 4096 clusters, whichever is smaller. + * This should cover the situations where we can not afford to run + * out of space like for example punch hole, or converting + * uninitialized extents in delalloc path. In most cases such + * allocation would require 1, or 2 blocks, higher numbers are + * very rare. + */ + resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + + do_div(resv_clusters, 50); + resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); + + return resv_clusters; +} + + +static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) +{ + ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits; + + if (count >= clusters) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, count); + return 0; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { char *orig_data = kstrdup(data, GFP_KERNEL); @@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + /* Do we have standard group size of blocksize * 8 blocks ? 
*/ + if (sbi->s_blocks_per_group == blocksize << 3) + set_opt2(sb, STD_GROUP_SIZE); + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; @@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.data = (unsigned long) sb; + /* Register extent status tree shrinker */ + ext4_es_register_shrinker(sb); + err = percpu_counter_init(&sbi->s_freeclusters_counter, ext4_count_free_clusters(sb)); if (!err) { @@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_writeback_mb_bump = 128; sbi->s_extent_max_zeroout_kb = 32; - /* Register extent status tree shrinker */ - ext4_es_register_shrinker(sb); - /* * set up enough so that it can read an inode */ @@ -3911,6 +3978,13 @@ no_journal: "available"); } + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " + "reserved pool", ext4_calculate_resv_clusters(sbi)); + goto failed_mount4a; + } + err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " @@ -4010,6 +4084,7 @@ failed_mount_wq: sbi->s_journal = NULL; } failed_mount3: + ext4_es_unregister_shrinker(sb); del_timer(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); @@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { ext4_msg(sb, KERN_ERR, "I/O error on journal device"); @@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - ext4_fsblk_t overhead = 0; + ext4_fsblk_t overhead = 0, resv_blocks; u64 fsid; s64 bfree; + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; @@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); - buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); - if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = buf->f_bfree - + (ext4_r_blocks_count(es) + resv_blocks); + if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); @@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, return PTR_ERR(qf_inode); } + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; err = dquot_enable(qf_inode, type, format_id, flags); iput(qf_inode); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a120b277240..c081e34f717f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u32 csum, old; + __u32 csum; + 
__le32 save_csum; + __le64 dsk_block_nr = cpu_to_le64(block_nr); - old = hdr->h_checksum; + save_csum = hdr->h_checksum; hdr->h_checksum = 0; - block_nr = cpu_to_le64(block_nr); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, - sizeof(block_nr)); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); csum = ext4_chksum(sbi, csum, (__u8 *)hdr, EXT4_BLOCK_SIZE(inode->i_sb)); - hdr->h_checksum = old; + hdr->h_checksum = save_csum; return cpu_to_le32(csum); } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index aa25deb5c6cd..c767dbdd7fc4 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -22,6 +22,7 @@ #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SYSTEM 7 +#define EXT4_XATTR_INDEX_RICHACL 8 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 137af4255da6..44abc2f286e0 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -299,7 +299,7 @@ int f2fs_acl_chmod(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct posix_acl *acl; int error; - mode_t mode = get_inode_mode(inode); + umode_t mode = get_inode_mode(inode); if (!test_opt(sbi, POSIX_ACL)) return 0; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a1f38443ecee..1be948768e2f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -60,7 +60,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cc2213afdcc7..201c8d3b0f86 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -137,7 +137,7 @@ struct extent_info { rwlock_t ext_lock; /* rwlock for consistency */ unsigned int fofs; /* start offset in a file */ u32 blk_addr; /* start block address of the extent */ - unsigned int len; /* lenth of the extent */ + unsigned int len; /* length of the extent */ }; /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 958a46da19ae..db626282d424 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -590,7 +590,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { unsigned int oldflags; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -627,7 +627,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } default: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 94b8a0c48453..2e3eb2d4fc30 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -222,7 +222,7 @@ static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, } /* - * This function is called from two pathes. + * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. * When it is called during GC, it just gets a victim segment * and it does not remove it from dirty seglist. 
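
The ext4_bio_write_page() rework above splits page submission into two passes: every buffer is first marked with the new async_write bit, and only then are bios submitted, so that ext4_end_bio() calls end_page_writeback() exactly once, after the last marked buffer completes. Below is a minimal userspace sketch of that invariant, not kernel code: the names are hypothetical and single-threaded calls to end_buffer_io() stand in for real bio end_io callbacks.

#include <stdbool.h>
#include <stdio.h>

#define NBUF 4

static bool async_write[NBUF];     /* models buffer_async_write(bh) */
static bool page_writeback = true; /* models PageWriteback(page)    */

/* Models ext4_end_bio() finishing I/O on one buffer. */
static void end_buffer_io(int i)
{
	int j, under_io = 0;

	async_write[i] = false;        /* clear_buffer_async_write() */
	for (j = 0; j < NBUF; j++)     /* scan remaining buffers     */
		if (async_write[j])
			under_io++;
	if (!under_io) {               /* last completion ends writeback */
		page_writeback = false;
		printf("end_page_writeback()\n");
	}
}

int main(void)
{
	int i;

	/* Pass 1: mark all buffers before any I/O is submitted. */
	for (i = 0; i < NBUF; i++)
		async_write[i] = true;

	/* Pass 2: "submit"; completions may now arrive in any order
	 * without ending page writeback early. */
	for (i = 0; i < NBUF; i++)
		end_buffer_io(i);

	return page_writeback ? 1 : 0;
}

If buffers were instead marked one at a time during submission, the completion for the first buffer could observe no other marked buffers and end page writeback while later buffers were still being prepared, which is the race the two-pass structure closes.
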
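The ext4_calculate_resv_clusters() helper added above reserves min(2% of the filesystem's clusters, 4096): do_div(resv_clusters, 50) is the 2%, and the min_t() applies the cap. A self-contained sketch of the same policy (function name hypothetical); for 2^28 clusters, i.e. 1 TiB at a 4 KiB cluster size, 2% would be 5368709 clusters, so the 4096 cap wins:

#include <stdint.h>
#include <stdio.h>

static uint64_t resv_clusters(uint64_t clusters)
{
	uint64_t resv = clusters / 50;     /* 2%, as do_div(..., 50)   */

	return resv < 4096 ? resv : 4096;  /* min_t(..., resv, 4096)   */
}

int main(void)
{
	printf("%llu\n", (unsigned long long)resv_clusters(1ULL << 28));
	return 0;
}
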
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fea6e582a2ed..62e017743af6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -82,7 +82,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - /* Initilize f2fs-specific inode info */ + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; atomic_set(&fi->dirty_dents, 0); fi->i_current_depth = 1; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 165012ef363a..7a6f02caf286 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -964,6 +964,29 @@ int fat_scan(struct inode *dir, const unsigned char *name, } EXPORT_SYMBOL_GPL(fat_scan); +/* + * Scans a directory for a given logstart. + * Returns an error code or zero. + */ +int fat_scan_logstart(struct inode *dir, int i_logstart, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + + sinfo->slot_off = 0; + sinfo->bh = NULL; + while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh, + &sinfo->de) >= 0) { + if (fat_get_start(MSDOS_SB(sb), sinfo->de) == i_logstart) { + sinfo->slot_off -= sizeof(*sinfo->de); + sinfo->nr_slots = 1; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + return 0; + } + } + return -ENOENT; +} + static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) { struct super_block *sb = dir->i_sb; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e9cc3f0d58e2..21664fcf3616 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -23,6 +23,9 @@ #define FAT_ERRORS_PANIC 2 /* panic on error */ #define FAT_ERRORS_RO 3 /* remount r/o on error */ +#define FAT_NFS_STALE_RW 1 /* NFS RW support, can cause ESTALE */ +#define FAT_NFS_NOSTALE_RO 2 /* NFS RO support, no ESTALE issue */ + struct fat_mount_options { kuid_t fs_uid; kgid_t fs_gid; @@ -34,6 +37,7 @@ struct fat_mount_options { unsigned short shortname; /* flags for shortname display/create rule */ unsigned char name_check; /* r = relaxed, n = normal, s = strict */ unsigned char errors; /* On error: continue, panic, remount-ro */ + unsigned char nfs; /* NFS support: nostale_ro, stale_rw */ unsigned short allow_utime;/* permission for setting the [am]time */ unsigned quiet:1, /* set = fake successful chmods and chowns */ showexec:1, /* set = only set x bit for com/exe/bat */ @@ -48,8 +52,7 @@ struct fat_mount_options { usefree:1, /* Use free_clusters for FAT32 */ tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ - discard:1, /* Issue discard requests on deletions */ - nfs:1; /* Do extra work needed for NFS export */ + discard:1; /* Issue discard requests on deletions */ }; #define FAT_HASH_BITS 8 @@ -72,6 +75,7 @@ struct msdos_sb_info { unsigned long root_cluster; /* first cluster of the root directory */ unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ struct mutex fat_lock; + struct mutex nfs_build_inode_lock; struct mutex s_lock; unsigned int prev_free; /* previously allocated cluster number */ unsigned int free_clusters; /* -1 if undefined */ @@ -215,6 +219,27 @@ static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus) + sbi->data_start; } +static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi, + loff_t i_pos, sector_t *blknr, int *offset) +{ + *blknr = i_pos >> sbi->dir_per_block_bits; + *offset = i_pos & (sbi->dir_per_block - 1); +} + +static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, + struct inode *inode) +{ + loff_t i_pos; +#if BITS_PER_LONG == 32 + spin_lock(&sbi->inode_hash_lock); +#endif + i_pos = MSDOS_I(inode)->i_pos; +#if 
BITS_PER_LONG == 32 + spin_unlock(&sbi->inode_hash_lock); +#endif + return i_pos; +} + static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len) { #ifdef __BIG_ENDIAN @@ -271,6 +296,8 @@ extern int fat_dir_empty(struct inode *dir); extern int fat_subdirs(struct inode *dir); extern int fat_scan(struct inode *dir, const unsigned char *name, struct fat_slot_info *sinfo); +extern int fat_scan_logstart(struct inode *dir, int i_logstart, + struct fat_slot_info *sinfo); extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, struct msdos_dir_entry **de); extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); @@ -348,6 +375,7 @@ extern struct inode *fat_build_inode(struct super_block *sb, extern int fat_sync_inode(struct inode *inode); extern int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, void (*setup)(struct super_block *)); +extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); @@ -382,12 +410,8 @@ int fat_cache_init(void); void fat_cache_destroy(void); /* fat/nfs.c */ -struct fid; -extern struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type); -extern struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type); -extern struct dentry *fat_get_parent(struct dentry *child_dir); +extern const struct export_operations fat_export_ops; +extern const struct export_operations fat_export_ops_nostale; /* helper for printk */ typedef unsigned long long llu; diff --git a/fs/fat/file.c b/fs/fat/file.c index 3978f8ca1823..b0b632e50ddb 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -306,6 +306,11 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; generic_fillattr(inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + + if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + /* Use i_pos for ino. This is used as fileid of nfs. 
*/ + stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode); + } return 0; } EXPORT_SYMBOL_GPL(fat_getattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index acf6e479b443..4ff901632b26 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,7 +18,6 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/buffer_head.h> -#include <linux/exportfs.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/parser.h> @@ -385,7 +384,7 @@ static int fat_calc_dir_size(struct inode *inode) } /* doesn't deal with root inode */ -static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) +int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; @@ -444,12 +443,25 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) return 0; } +static inline void fat_lock_build_inode(struct msdos_sb_info *sbi) +{ + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) + mutex_lock(&sbi->nfs_build_inode_lock); +} + +static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi) +{ + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) + mutex_unlock(&sbi->nfs_build_inode_lock); +} + struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos) { struct inode *inode; int err; + fat_lock_build_inode(MSDOS_SB(sb)); inode = fat_iget(sb, i_pos); if (inode) goto out; @@ -469,6 +481,7 @@ struct inode *fat_build_inode(struct super_block *sb, fat_attach(inode, i_pos); insert_inode_hash(inode); out: + fat_unlock_build_inode(MSDOS_SB(sb)); return inode; } @@ -655,20 +668,6 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, - struct inode *inode) -{ - loff_t i_pos; -#if BITS_PER_LONG == 32 - spin_lock(&sbi->inode_hash_lock); -#endif - i_pos = MSDOS_I(inode)->i_pos; -#if BITS_PER_LONG == 32 - spin_unlock(&sbi->inode_hash_lock); -#endif - return i_pos; -} - static int __fat_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; @@ -676,7 +675,8 @@ static int __fat_write_inode(struct inode *inode, int wait) struct buffer_head *bh; struct msdos_dir_entry *raw_entry; loff_t i_pos; - int err; + sector_t blocknr; + int err, offset; if (inode->i_ino == MSDOS_ROOT_INO) return 0; @@ -686,7 +686,8 @@ retry: if (!i_pos) return 0; - bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); + fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset); + bh = sb_bread(sb, blocknr); if (!bh) { fat_msg(sb, KERN_ERR, "unable to read inode block " "for updating (i_pos %lld)", i_pos); @@ -699,8 +700,7 @@ retry: goto retry; } - raw_entry = &((struct msdos_dir_entry *) (bh->b_data)) - [i_pos & (sbi->dir_per_block - 1)]; + raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset]; if (S_ISDIR(inode->i_mode)) raw_entry->size = 0; else @@ -761,12 +761,6 @@ static const struct super_operations fat_sops = { .show_options = fat_show_options, }; -static const struct export_operations fat_export_ops = { - .fh_to_dentry = fat_fh_to_dentry, - .fh_to_parent = fat_fh_to_parent, - .get_parent = fat_get_parent, -}; - static int fat_show_options(struct seq_file *m, struct dentry *root) { struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb); @@ -814,8 +808,6 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",usefree"); if (opts->quiet) seq_puts(m, ",quiet"); - if (opts->nfs) - seq_puts(m, ",nfs"); if (opts->showexec) seq_puts(m, ",showexec"); if 
(opts->sys_immutable) @@ -849,6 +841,10 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",errors=panic"); else seq_puts(m, ",errors=remount-ro"); + if (opts->nfs == FAT_NFS_NOSTALE_RO) + seq_puts(m, ",nfs=nostale_ro"); + else if (opts->nfs) + seq_puts(m, ",nfs=stale_rw"); if (opts->discard) seq_puts(m, ",discard"); @@ -865,7 +861,7 @@ enum { Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, - Opt_err, + Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, }; static const match_table_t fat_tokens = { @@ -895,7 +891,9 @@ static const match_table_t fat_tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_discard, "discard"}, - {Opt_nfs, "nfs"}, + {Opt_nfs_stale_rw, "nfs"}, + {Opt_nfs_stale_rw, "nfs=stale_rw"}, + {Opt_nfs_nostale_ro, "nfs=nostale_ro"}, {Opt_obsolete, "conv=binary"}, {Opt_obsolete, "conv=text"}, {Opt_obsolete, "conv=auto"}, @@ -1092,6 +1090,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_err_ro: opts->errors = FAT_ERRORS_RO; break; + case Opt_nfs_stale_rw: + opts->nfs = FAT_NFS_STALE_RW; + break; + case Opt_nfs_nostale_ro: + opts->nfs = FAT_NFS_NOSTALE_RO; + break; /* msdos specific */ case Opt_dots: @@ -1150,9 +1154,6 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_discard: opts->discard = 1; break; - case Opt_nfs: - opts->nfs = 1; - break; /* obsolete mount options */ case Opt_obsolete: @@ -1183,6 +1184,10 @@ out: opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); if (opts->unicode_xlate) opts->utf8 = 0; + if (opts->nfs == FAT_NFS_NOSTALE_RO) { + sb->s_flags |= MS_RDONLY; + sb->s_export_op = &fat_export_ops_nostale; + } return 0; } @@ -1193,7 +1198,7 @@ static int fat_read_root(struct inode *inode) struct msdos_sb_info *sbi = MSDOS_SB(sb); int error; - MSDOS_I(inode)->i_pos = 0; + MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode->i_version++; @@ -1256,6 +1261,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; + mutex_init(&sbi->nfs_build_inode_lock); ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index 499c10438ca2..93e14933dcb6 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -14,6 +14,18 @@ #include <linux/exportfs.h> #include "fat.h" +struct fat_fid { + u32 i_gen; + u32 i_pos_low; + u16 i_pos_hi; + u16 parent_i_pos_hi; + u32 parent_i_pos_low; + u32 parent_i_gen; +}; + +#define FAT_FID_SIZE_WITHOUT_PARENT 3 +#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) + /** * Look up a directory inode given its starting cluster. 
*/ @@ -38,63 +50,252 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart) return inode; } -static struct inode *fat_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) +static struct inode *fat_ilookup(struct super_block *sb, u64 ino, loff_t i_pos) { - struct inode *inode; + if (MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) + return fat_iget(sb, i_pos); - if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) - return NULL; + else { + if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) + return NULL; + return ilookup(sb, ino); + } +} + +static struct inode *__fat_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation, loff_t i_pos) +{ + struct inode *inode = fat_ilookup(sb, ino, i_pos); - inode = ilookup(sb, ino); if (inode && generation && (inode->i_generation != generation)) { iput(inode); inode = NULL; } + if (inode == NULL && MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + struct buffer_head *bh = NULL; + struct msdos_dir_entry *de ; + sector_t blocknr; + int offset; + fat_get_blknr_offset(MSDOS_SB(sb), i_pos, &blocknr, &offset); + bh = sb_bread(sb, blocknr); + if (!bh) { + fat_msg(sb, KERN_ERR, + "unable to read block(%llu) for building NFS inode", + (llu)blocknr); + return inode; + } + de = (struct msdos_dir_entry *)bh->b_data; + /* If a file is deleted on server and client is not updated + * yet, we must not build the inode upon a lookup call. + */ + if (IS_FREE(de[offset].name)) + inode = NULL; + else + inode = fat_build_inode(sb, &de[offset], i_pos); + brelse(bh); + } return inode; } +static struct inode *fat_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + + return __fat_nfs_get_inode(sb, ino, generation, 0); +} + +static int +fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, + struct inode *parent) +{ + int len = *lenp; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct fat_fid *fid = (struct fat_fid *) fh; + loff_t i_pos; + int type = FILEID_FAT_WITHOUT_PARENT; + + if (parent) { + if (len < FAT_FID_SIZE_WITH_PARENT) { + *lenp = FAT_FID_SIZE_WITH_PARENT; + return FILEID_INVALID; + } + } else { + if (len < FAT_FID_SIZE_WITHOUT_PARENT) { + *lenp = FAT_FID_SIZE_WITHOUT_PARENT; + return FILEID_INVALID; + } + } + + i_pos = fat_i_pos_read(sbi, inode); + *lenp = FAT_FID_SIZE_WITHOUT_PARENT; + fid->i_gen = inode->i_generation; + fid->i_pos_low = i_pos & 0xFFFFFFFF; + fid->i_pos_hi = (i_pos >> 32) & 0xFFFF; + if (parent) { + i_pos = fat_i_pos_read(sbi, parent); + fid->parent_i_pos_hi = (i_pos >> 32) & 0xFFFF; + fid->parent_i_pos_low = i_pos & 0xFFFFFFFF; + fid->parent_i_gen = parent->i_generation; + type = FILEID_FAT_WITH_PARENT; + *lenp = FAT_FID_SIZE_WITH_PARENT; + } + + return type; +} + /** * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. 
*/ -struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, +static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } +static struct dentry *fat_fh_to_dentry_nostale(struct super_block *sb, + struct fid *fh, int fh_len, + int fh_type) +{ + struct inode *inode = NULL; + struct fat_fid *fid = (struct fat_fid *)fh; + loff_t i_pos; + + switch (fh_type) { + case FILEID_FAT_WITHOUT_PARENT: + if (fh_len < FAT_FID_SIZE_WITHOUT_PARENT) + return NULL; + break; + case FILEID_FAT_WITH_PARENT: + if (fh_len < FAT_FID_SIZE_WITH_PARENT) + return NULL; + break; + default: + return NULL; + } + i_pos = fid->i_pos_hi; + i_pos = (i_pos << 32) | (fid->i_pos_low); + inode = __fat_nfs_get_inode(sb, 0, fid->i_gen, i_pos); + + return d_obtain_alias(inode); +} + /* * Find the parent for a file specified by NFS handle. * This requires that the handle contain the i_ino of the parent. */ -struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, +static struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } +static struct dentry *fat_fh_to_parent_nostale(struct super_block *sb, + struct fid *fh, int fh_len, + int fh_type) +{ + struct inode *inode = NULL; + struct fat_fid *fid = (struct fat_fid *)fh; + loff_t i_pos; + + if (fh_len < FAT_FID_SIZE_WITH_PARENT) + return NULL; + + switch (fh_type) { + case FILEID_FAT_WITH_PARENT: + i_pos = fid->parent_i_pos_hi; + i_pos = (i_pos << 32) | (fid->parent_i_pos_low); + inode = __fat_nfs_get_inode(sb, 0, fid->parent_i_gen, i_pos); + break; + } + + return d_obtain_alias(inode); +} + +/* + * Rebuild the parent for a directory that is not connected + * to the filesystem root + */ +static +struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart) +{ + int search_clus, clus_to_match; + struct msdos_dir_entry *de; + struct inode *parent = NULL; + struct inode *dummy_grand_parent = NULL; + struct fat_slot_info sinfo; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + sector_t blknr = fat_clus_to_blknr(sbi, parent_logstart); + struct buffer_head *parent_bh = sb_bread(sb, blknr); + if (!parent_bh) { + fat_msg(sb, KERN_ERR, + "unable to read cluster of parent directory"); + return NULL; + } + + de = (struct msdos_dir_entry *) parent_bh->b_data; + clus_to_match = fat_get_start(sbi, &de[0]); + search_clus = fat_get_start(sbi, &de[1]); + + dummy_grand_parent = fat_dget(sb, search_clus); + if (!dummy_grand_parent) { + dummy_grand_parent = new_inode(sb); + if (!dummy_grand_parent) { + brelse(parent_bh); + return parent; + } + + dummy_grand_parent->i_ino = iunique(sb, MSDOS_ROOT_INO); + fat_fill_inode(dummy_grand_parent, &de[1]); + MSDOS_I(dummy_grand_parent)->i_pos = -1; + } + + if (!fat_scan_logstart(dummy_grand_parent, clus_to_match, &sinfo)) + parent = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + + brelse(parent_bh); + iput(dummy_grand_parent); + + return parent; +} + /* * Find the parent for a directory that is not currently connected to * the filesystem root. * * On entry, the caller holds child_dir->d_inode->i_mutex. 
*/ -struct dentry *fat_get_parent(struct dentry *child_dir) +static struct dentry *fat_get_parent(struct dentry *child_dir) { struct super_block *sb = child_dir->d_sb; struct buffer_head *bh = NULL; struct msdos_dir_entry *de; struct inode *parent_inode = NULL; + struct msdos_sb_info *sbi = MSDOS_SB(sb); if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) { - int parent_logstart = fat_get_start(MSDOS_SB(sb), de); + int parent_logstart = fat_get_start(sbi, de); parent_inode = fat_dget(sb, parent_logstart); + if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO) + parent_inode = fat_rebuild_parent(sb, parent_logstart); } brelse(bh); return d_obtain_alias(parent_inode); } + +const struct export_operations fat_export_ops = { + .fh_to_dentry = fat_fh_to_dentry, + .fh_to_parent = fat_fh_to_parent, + .get_parent = fat_get_parent, +}; + +const struct export_operations fat_export_ops_nostale = { + .encode_fh = fat_encode_fh_nostale, + .fh_to_dentry = fat_fh_to_dentry_nostale, + .fh_to_parent = fat_fh_to_parent_nostale, + .get_parent = fat_get_parent, +}; diff --git a/fs/fifo.c b/fs/fifo.c deleted file mode 100644 index cf6f4345ceb0..000000000000 --- a/fs/fifo.c +++ /dev/null @@ -1,153 +0,0 @@ -/* - * linux/fs/fifo.c - * - * written by Paul H. Hargrove - * - * Fixes: - * 10-06-1999, AV: fixed OOM handling in fifo_open(), moved - * initialization there, switched to external - * allocation of pipe_inode_info. - */ - -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/pipe_fs_i.h> - -static int wait_for_partner(struct inode* inode, unsigned int *cnt) -{ - int cur = *cnt; - - while (cur == *cnt) { - pipe_wait(inode->i_pipe); - if (signal_pending(current)) - break; - } - return cur == *cnt ? -ERESTARTSYS : 0; -} - -static void wake_up_partner(struct inode* inode) -{ - wake_up_interruptible(&inode->i_pipe->wait); -} - -static int fifo_open(struct inode *inode, struct file *filp) -{ - struct pipe_inode_info *pipe; - int ret; - - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; - if (!pipe) { - ret = -ENOMEM; - pipe = alloc_pipe_info(inode); - if (!pipe) - goto err_nocleanup; - inode->i_pipe = pipe; - } - filp->f_version = 0; - - /* We can only do regular read/write on fifos */ - filp->f_mode &= (FMODE_READ | FMODE_WRITE); - - switch (filp->f_mode) { - case FMODE_READ: - /* - * O_RDONLY - * POSIX.1 says that O_NONBLOCK means return with the FIFO - * opened, even when there is no process writing the FIFO. - */ - filp->f_op = &read_pipefifo_fops; - pipe->r_counter++; - if (pipe->readers++ == 0) - wake_up_partner(inode); - - if (!pipe->writers) { - if ((filp->f_flags & O_NONBLOCK)) { - /* suppress POLLHUP until we have - * seen a writer */ - filp->f_version = pipe->w_counter; - } else { - if (wait_for_partner(inode, &pipe->w_counter)) - goto err_rd; - } - } - break; - - case FMODE_WRITE: - /* - * O_WRONLY - * POSIX.1 says that O_NONBLOCK means return -1 with - * errno=ENXIO when there is no process reading the FIFO. - */ - ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) - goto err; - - filp->f_op = &write_pipefifo_fops; - pipe->w_counter++; - if (!pipe->writers++) - wake_up_partner(inode); - - if (!pipe->readers) { - if (wait_for_partner(inode, &pipe->r_counter)) - goto err_wr; - } - break; - - case FMODE_READ | FMODE_WRITE: - /* - * O_RDWR - * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. - * This implementation will NEVER block on a O_RDWR open, since - * the process can at least talk to itself. 
- */ - filp->f_op = &rdwr_pipefifo_fops; - - pipe->readers++; - pipe->writers++; - pipe->r_counter++; - pipe->w_counter++; - if (pipe->readers == 1 || pipe->writers == 1) - wake_up_partner(inode); - break; - - default: - ret = -EINVAL; - goto err; - } - - /* Ok! */ - mutex_unlock(&inode->i_mutex); - return 0; - -err_rd: - if (!--pipe->readers) - wake_up_interruptible(&pipe->wait); - ret = -ERESTARTSYS; - goto err; - -err_wr: - if (!--pipe->writers) - wake_up_interruptible(&pipe->wait); - ret = -ERESTARTSYS; - goto err; - -err: - if (!pipe->readers && !pipe->writers) - free_pipe_info(inode); - -err_nocleanup: - mutex_unlock(&inode->i_mutex); - return ret; -} - -/* - * Dummy default file-operations: the only thing this does - * is contain the open that then fills in the correct operations - * depending on the access mode of the file... - */ -const struct file_operations def_fifo_fops = { - .open = fifo_open, /* will set read_ or write_pipefifo_fops */ - .llseek = noop_llseek, -}; diff --git a/fs/file.c b/fs/file.c index 3906d9577a18..4a78f981557a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -23,24 +23,10 @@ #include <linux/rcupdate.h> #include <linux/workqueue.h> -struct fdtable_defer { - spinlock_t lock; - struct work_struct wq; - struct fdtable *next; -}; - int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; int sysctl_nr_open_max = 1024 * 1024; /* raised later */ -/* - * We use this list to defer free fdtables that have vmalloced - * sets/arrays. By keeping a per-cpu list, we avoid having to embed - * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in - * this per-task structure. - */ -static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); - static void *alloc_fdmem(size_t size) { /* @@ -67,46 +53,9 @@ static void __free_fdtable(struct fdtable *fdt) kfree(fdt); } -static void free_fdtable_work(struct work_struct *work) -{ - struct fdtable_defer *f = - container_of(work, struct fdtable_defer, wq); - struct fdtable *fdt; - - spin_lock_bh(&f->lock); - fdt = f->next; - f->next = NULL; - spin_unlock_bh(&f->lock); - while(fdt) { - struct fdtable *next = fdt->next; - - __free_fdtable(fdt); - fdt = next; - } -} - static void free_fdtable_rcu(struct rcu_head *rcu) { - struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); - struct fdtable_defer *fddef; - - BUG_ON(!fdt); - BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT); - - if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { - kfree(fdt->fd); - kfree(fdt->open_fds); - kfree(fdt); - } else { - fddef = &get_cpu_var(fdtable_defer_list); - spin_lock(&fddef->lock); - fdt->next = fddef->next; - fddef->next = fdt; - /* vmallocs are handled from the workqueue context */ - schedule_work(&fddef->wq); - spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); - } + __free_fdtable(container_of(rcu, struct fdtable, rcu)); } /* @@ -174,7 +123,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; - fdt->next = NULL; return fdt; @@ -221,7 +169,7 @@ static int expand_fdtable(struct files_struct *files, int nr) /* Continue as planned */ copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt->max_fds > NR_OPEN_DEFAULT) + if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); } else { /* Somebody else expanded, so undo our attempt */ @@ -316,7 +264,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->close_on_exec = 
newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; - new_fdt->next = NULL; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); @@ -490,19 +437,8 @@ void exit_files(struct task_struct *tsk) } } -static void fdtable_defer_list_init(int cpu) -{ - struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); - spin_lock_init(&fddef->lock); - INIT_WORK(&fddef->wq, free_fdtable_work); - fddef->next = NULL; -} - void __init files_defer_init(void) { - int i; - for_each_possible_cpu(i) - fdtable_defer_list_init(i); sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 21f46fb3a101..798d4458a4d3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1028,6 +1028,7 @@ int bdi_writeback_thread(void *data) struct backing_dev_info *bdi = wb->bdi; long pages_written; + set_worker_desc("flush-%s", dev_name(bdi->dev)); current->flags |= PF_SWAPWRITE; set_freezable(); wb->last_active = jiffies; diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 8179e8bc4a3d..40d13c70ef51 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -287,5 +287,5 @@ const struct file_operations fscache_stats_fops = { .open = fscache_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 11dfa0c3fb46..9bfd1a3214e6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1319,7 +1319,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, page_nr++; ret += buf->len; - if (pipe->inode) + if (pipe->files) do_wakeup = 1; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 34b80ba95bad..d15c6f21c17f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -971,7 +971,6 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ @@ -1030,7 +1029,6 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, out: current->backing_dev_info = NULL; mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); return written ? 
written : err; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 24f414f0ce61..9883694f1e7c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1055,7 +1055,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) if (atomic_read(&bh->b_count)) goto cannot_release; bd = bh->b_private; - if (bd && bd->bd_ail) + if (bd && bd->bd_tr) goto cannot_release; if (buffer_pinned(bh) || buffer_dirty(bh)) goto not_possible; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5e83657f046e..1dc9a13ce6bb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -787,7 +787,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, goto out_rlist; if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ - gfs2_rs_deltree(ip, ip->i_res); + gfs2_rs_deltree(ip->i_res); error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + RES_STATFS + RES_QUOTA, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 019f45e45097..d79c2dadc536 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -923,8 +923,11 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) cmd = F_SETLK; fl->fl_type = F_UNLCK; } - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + if (fl->fl_type == F_UNLCK) + posix_lock_file_wait(file, fl); return -EIO; + } if (IS_GETLK(cmd)) return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); else if (fl->fl_type == F_UNLCK) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index cf3515546739..9435384562a2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -912,7 +912,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh) */ static void handle_callback(struct gfs2_glock *gl, unsigned int state, - unsigned long delay) + unsigned long delay, bool remote) { int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; @@ -925,8 +925,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, gl->gl_demote_state = LM_ST_UNLOCKED; } if (gl->gl_ops->go_callback) - gl->gl_ops->go_callback(gl); - trace_gfs2_demote_rq(gl); + gl->gl_ops->go_callback(gl, remote); + trace_gfs2_demote_rq(gl, remote); } void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 
@@ -1017,11 +1017,11 @@ do_cancel: return; trap_recursive: - print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip); + printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip); printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); printk(KERN_ERR "lock type: %d req lock state : %d\n", gh2->gh_gl->gl_name.ln_type, gh2->gh_state); - print_symbol(KERN_ERR "new: %s\n", gh->gh_ip); + printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip); printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); printk(KERN_ERR "lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); @@ -1091,7 +1091,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); list_del_init(&gh->gh_list); if (find_first_holder(gl) == NULL) { @@ -1279,19 +1279,6 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) gfs2_glock_dq(&ghs[num_gh]); } -/** - * gfs2_glock_dq_uninit_m - release multiple glocks - * @num_gh: the number of structures - * @ghs: an array of struct gfs2_holder structures - * - */ - -void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) -{ - while (num_gh--) - gfs2_glock_dq_uninit(&ghs[num_gh]); -} - void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { unsigned long delay = 0; @@ -1309,7 +1296,7 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) } spin_lock(&gl->gl_spin); - handle_callback(gl, state, delay); + handle_callback(gl, state, delay, true); spin_unlock(&gl->gl_spin); if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); @@ -1422,7 +1409,7 @@ __acquires(&lru_lock) spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); if (demote_ok(gl)) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); smp_mb__after_clear_bit(); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -1547,7 +1534,7 @@ static void clear_glock(struct gfs2_glock *gl) spin_lock(&gl->gl_spin); if (gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); spin_unlock(&gl->gl_spin); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -1590,6 +1577,7 @@ static void dump_glock_func(struct gfs2_glock *gl) void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); + flush_workqueue(glock_workqueue); glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index fd580b7861d5..69f66e3d22bf 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -201,7 +201,6 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 444b6503ebc4..c66e99c97571 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -515,12 +515,12 @@ static int 
trans_go_demote_ok(const struct gfs2_glock *gl) * * gl_spin lock is held while calling this */ -static void iopen_go_callback(struct gfs2_glock *gl) +static void iopen_go_callback(struct gfs2_glock *gl, bool remote) { struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; struct gfs2_sbd *sdp = gl->gl_sbd; - if (sdp->sd_vfs->s_flags & MS_RDONLY) + if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) return; if (gl->gl_demote_state == LM_ST_UNLOCKED && diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 156e42ec84ea..26aabd7caba7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -31,7 +31,6 @@ struct gfs2_holder; struct gfs2_glock; struct gfs2_quota_data; struct gfs2_trans; -struct gfs2_ail; struct gfs2_jdesc; struct gfs2_sbd; struct lm_lockops; @@ -53,7 +52,7 @@ struct gfs2_log_header_host { struct gfs2_log_operations { void (*lo_before_commit) (struct gfs2_sbd *sdp); - void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_before_scan) (struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass); int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, @@ -139,7 +138,7 @@ struct gfs2_bufdata { struct list_head bd_list; const struct gfs2_log_operations *bd_ops; - struct gfs2_ail *bd_ail; + struct gfs2_trans *bd_tr; struct list_head bd_ail_st_list; struct list_head bd_ail_gl_list; }; @@ -211,7 +210,7 @@ struct gfs2_glock_operations { int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); - void (*go_callback) (struct gfs2_glock *gl); + void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; #define GLOF_ASPACE 1 @@ -433,6 +432,7 @@ struct gfs2_trans { struct gfs2_holder tr_t_gh; int tr_touched; + int tr_attached; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; @@ -440,14 +440,12 @@ struct gfs2_trans { unsigned int tr_num_databuf_rm; unsigned int tr_num_revoke; unsigned int tr_num_revoke_rm; -}; -struct gfs2_ail { - struct list_head ai_list; + struct list_head tr_list; - unsigned int ai_first; - struct list_head ai_ail1_list; - struct list_head ai_ail2_list; + unsigned int tr_first; + struct list_head tr_ail1_list; + struct list_head tr_ail2_list; }; struct gfs2_journal_extent { @@ -588,6 +586,7 @@ struct lm_lockstruct { struct dlm_lksb ls_control_lksb; /* control_lock */ char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + char *ls_lvb_bits; spinlock_t ls_recover_spin; /* protects following fields */ unsigned long ls_recover_flags; /* DFL_ */ @@ -709,6 +708,7 @@ struct gfs2_sbd { spinlock_t sd_log_lock; + struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; unsigned int sd_log_commited_buf; unsigned int sd_log_commited_databuf; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index cc00bd1d1f87..8833a4f264e3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -392,11 +392,15 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags) int error; int dblocks = 1; - error = gfs2_inplace_reserve(ip, RES_DINODE, flags); + error = gfs2_quota_lock_check(ip); if (error) goto out; - error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0); + error = gfs2_inplace_reserve(ip, RES_DINODE, flags); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0); if (error) goto 
out_ipreserv; @@ -409,6 +413,8 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags) out_ipreserv: gfs2_inplace_release(ip); +out_quota: + gfs2_quota_unlock(ip); out: return error; } @@ -440,59 +446,27 @@ static void gfs2_init_dir(struct buffer_head *dibh, */ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, - const char *symname, struct buffer_head **bhp) + const char *symname) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_dinode *di; struct buffer_head *dibh; - struct timespec tv = CURRENT_TIME; dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); di = (struct gfs2_dinode *)dibh->b_data; + gfs2_dinode_out(ip, di); - di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); - di->di_num.no_addr = cpu_to_be64(ip->i_no_addr); - di->di_mode = cpu_to_be32(ip->i_inode.i_mode); - di->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); - di->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); - di->di_nlink = 0; - di->di_size = cpu_to_be64(ip->i_inode.i_size); - di->di_blocks = cpu_to_be64(1); - di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); - di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr); - di->di_generation = cpu_to_be64(ip->i_generation); - di->di_flags = 0; di->__pad1 = 0; - di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0); - di->di_height = 0; di->__pad2 = 0; di->__pad3 = 0; - di->di_depth = 0; - di->di_entries = 0; memset(&di->__pad4, 0, sizeof(di->__pad4)); - di->di_eattr = 0; - di->di_atime_nsec = cpu_to_be32(tv.tv_nsec); - di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); - di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); memset(&di->di_reserved, 0, sizeof(di->di_reserved)); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); switch(ip->i_inode.i_mode & S_IFMT) { - case S_IFREG: - if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || - gfs2_tune_get(sdp, gt_new_files_jdata)) - di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - break; case S_IFDIR: - di->di_flags |= cpu_to_be32(dip->i_diskflags & - GFS2_DIF_INHERIT_JDATA); - di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); - di->di_entries = cpu_to_be32(2); gfs2_init_dir(dibh, dip); break; case S_IFLNK: @@ -501,63 +475,17 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, } set_buffer_uptodate(dibh); - - *bhp = dibh; -} - -static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, - const char *symname, struct buffer_head **bhp) -{ - struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - int error; - - error = gfs2_rindex_update(sdp); - if (error) - return error; - - error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid); - if (error) - return error; - - error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid); - if (error) - goto out_quota; - - error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0); - if (error) - goto out_quota; - - init_dinode(dip, ip, symname, bhp); - gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid); - gfs2_trans_end(sdp); - -out_quota: - gfs2_quota_unlock(dip); - return error; + brelse(dibh); } static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) + struct 
gfs2_inode *ip, int arq) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - int alloc_required; - struct buffer_head *dibh; int error; - error = gfs2_rindex_update(sdp); - if (error) - return error; - - error = gfs2_quota_lock(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); - if (error) - goto fail; - - error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); - if (alloc_required < 0) - goto fail_quota_locks; - if (alloc_required) { - error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); + if (arq) { + error = gfs2_quota_lock_check(dip); if (error) goto fail_quota_locks; @@ -581,26 +509,12 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_end_trans; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; - set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - return 0; - fail_end_trans: gfs2_trans_end(sdp); - fail_ipreserv: - if (alloc_required) - gfs2_inplace_release(dip); - + gfs2_inplace_release(dip); fail_quota_locks: gfs2_quota_unlock(dip); - -fail: return error; } @@ -650,8 +564,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; int error; - struct buffer_head *bh = NULL; u32 aflags = 0; + int arq; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -660,6 +574,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) return error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) goto fail; @@ -674,22 +592,48 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock; + arq = error = gfs2_diradd_alloc_required(dir, name); + if (error < 0) + goto fail_gunlock; + inode = new_inode(sdp->sd_vfs); - if (!inode) { - gfs2_glock_dq_uninit(ghs); - return -ENOMEM; - } + error = -ENOMEM; + if (!inode) + goto fail_gunlock; + ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) goto fail_free_inode; - set_bit(GIF_INVALID, &ip->i_flags); inode->i_mode = mode; + set_nlink(inode, S_ISDIR(mode) ? 
2 : 1); inode->i_rdev = dev; inode->i_size = size; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); ip->i_goal = dip->i_goal; + ip->i_diskflags = 0; + ip->i_eattr = 0; + ip->i_height = 0; + ip->i_depth = 0; + ip->i_entries = 0; + + switch(mode & S_IFMT) { + case S_IFREG: + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + ip->i_diskflags |= GFS2_DIF_JDATA; + gfs2_set_aops(inode); + break; + case S_IFDIR: + ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA); + ip->i_diskflags |= GFS2_DIF_JDATA; + ip->i_entries = 2; + break; + } + gfs2_set_inode_flags(inode); if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || (dip->i_diskflags & GFS2_DIF_TOPDIR)) @@ -708,10 +652,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; - error = make_dinode(dip, ip, symname, &bh); + error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) goto fail_gunlock2; + init_dinode(dip, ip, symname); + gfs2_trans_end(sdp); + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) goto fail_gunlock2; @@ -725,10 +672,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_set_iop(inode); insert_inode_hash(inode); - error = gfs2_inode_refresh(ip); - if (error) - goto fail_gunlock3; - error = gfs2_acl_create(dip, inode); if (error) goto fail_gunlock3; @@ -737,18 +680,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - error = link_dinode(dip, name, ip); + error = link_dinode(dip, name, ip, arq); if (error) goto fail_gunlock3; - if (bh) - brelse(bh); - - gfs2_trans_end(sdp); - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); mark_inode_dirty(inode); - gfs2_glock_dq_uninit_m(2, ghs); + gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(ghs + 1); d_instantiate(dentry, inode); return 0; @@ -769,12 +707,12 @@ fail_free_inode: fail_gunlock: gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { + clear_nlink(inode); + mark_inode_dirty(inode); set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); iput(inode); } fail: - if (bh) - brelse(bh); return error; } @@ -1151,7 +1089,9 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0); + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0); } /** diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 9802de0f85e6..c8423d6de6c3 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -483,12 +483,8 @@ static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, static int all_jid_bits_clear(char *lvb) { - int i; - for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) { - if (lvb[i]) - return 0; - } - return 1; + return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0, + GDLM_LVB_SIZE - JID_BITMAP_OFFSET); } static void sync_wait_cb(void *arg) @@ -580,7 +576,6 @@ static void gfs2_control_func(struct work_struct *work) { struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); struct lm_lockstruct *ls = &sdp->sd_lockstruct; - char lvb_bits[GDLM_LVB_SIZE]; uint32_t block_gen, start_gen, lvb_gen, flags; int recover_set = 0; int write_lvb = 0; @@ -634,7 +629,7 @@ 
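
The all_jid_bits_clear() cleanup a few hunks above leans on memchr_inv(), which returns a pointer to the first byte in a range that differs from the given value, or NULL when every byte matches. A minimal userspace sketch of those semantics, with memchr_inv_sketch standing in for the kernel helper:

    #include <stddef.h>
    #include <stdio.h>

    /* Stand-in for the kernel's memchr_inv(): find the first byte in the
     * n-byte range at start that is NOT equal to c; NULL means all match. */
    static const void *memchr_inv_sketch(const void *start, int c, size_t n)
    {
            const unsigned char *p = start;

            for (; n; n--, p++)
                    if (*p != (unsigned char)c)
                            return p;
            return NULL;
    }

    int main(void)
    {
            char lvb[16] = { 0 };

            /* All-zero jid bitmap: no journal recovery pending. */
            printf("clear: %d\n", memchr_inv_sketch(lvb, 0, sizeof(lvb)) == NULL);
            lvb[5] = 1;     /* one jid bit set */
            printf("clear: %d\n", memchr_inv_sketch(lvb, 0, sizeof(lvb)) == NULL);
            return 0;
    }
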
static void gfs2_control_func(struct work_struct *work) return; } - control_lvb_read(ls, &lvb_gen, lvb_bits); + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); spin_lock(&ls->ls_recover_spin); if (block_gen != ls->ls_recover_block || @@ -664,10 +659,10 @@ static void gfs2_control_func(struct work_struct *work) ls->ls_recover_result[i] = 0; - if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) + if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) continue; - __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + __clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); write_lvb = 1; } } @@ -691,7 +686,7 @@ static void gfs2_control_func(struct work_struct *work) continue; if (ls->ls_recover_submit[i] < start_gen) { ls->ls_recover_submit[i] = 0; - __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + __set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); } } /* even if there are no bits to set, we need to write the @@ -705,7 +700,7 @@ static void gfs2_control_func(struct work_struct *work) spin_unlock(&ls->ls_recover_spin); if (write_lvb) { - control_lvb_write(ls, start_gen, lvb_bits); + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK; } else { flags = DLM_LKF_CONVERT; @@ -725,7 +720,7 @@ static void gfs2_control_func(struct work_struct *work) */ for (i = 0; i < recover_size; i++) { - if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) { + if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) { fs_info(sdp, "recover generation %u jid %d\n", start_gen, i); gfs2_recover_set(sdp, i); @@ -758,7 +753,6 @@ static void gfs2_control_func(struct work_struct *work) static int control_mount(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - char lvb_bits[GDLM_LVB_SIZE]; uint32_t start_gen, block_gen, mount_gen, lvb_gen; int mounted_mode; int retries = 0; @@ -857,7 +851,7 @@ locks_done: * lvb_gen will be non-zero. */ - control_lvb_read(ls, &lvb_gen, lvb_bits); + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); if (lvb_gen == 0xFFFFFFFF) { /* special value to force mount attempts to fail */ @@ -887,7 +881,7 @@ locks_done: * and all lvb bits to be clear (no pending journal recoveries.) 
*/ - if (!all_jid_bits_clear(lvb_bits)) { + if (!all_jid_bits_clear(ls->ls_lvb_bits)) { /* journals need recovery, wait until all are clear */ fs_info(sdp, "control_mount wait for journal recovery\n"); goto restart; @@ -949,7 +943,6 @@ static int dlm_recovery_wait(void *word) static int control_first_done(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - char lvb_bits[GDLM_LVB_SIZE]; uint32_t start_gen, block_gen; int error; @@ -991,8 +984,8 @@ restart: memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); spin_unlock(&ls->ls_recover_spin); - memset(lvb_bits, 0, sizeof(lvb_bits)); - control_lvb_write(ls, start_gen, lvb_bits); + memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE); + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); if (error) @@ -1022,6 +1015,12 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, uint32_t old_size, new_size; int i, max_jid; + if (!ls->ls_lvb_bits) { + ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); + if (!ls->ls_lvb_bits) + return -ENOMEM; + } + max_jid = 0; for (i = 0; i < num_slots; i++) { if (max_jid < slots[i].slot - 1) @@ -1057,6 +1056,7 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, static void free_recover_size(struct lm_lockstruct *ls) { + kfree(ls->ls_lvb_bits); kfree(ls->ls_recover_submit); kfree(ls->ls_recover_result); ls->ls_recover_submit = NULL; @@ -1205,6 +1205,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) ls->ls_recover_size = 0; ls->ls_recover_submit = NULL; ls->ls_recover_result = NULL; + ls->ls_lvb_bits = NULL; error = set_recover_size(sdp, NULL, 0); if (error) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9a2ca8be7647..b404f4853034 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -73,7 +73,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { - bd->bd_ail = NULL; + bd->bd_tr = NULL; list_del_init(&bd->bd_ail_st_list); list_del_init(&bd->bd_ail_gl_list); atomic_dec(&bd->bd_gl->gl_ail_count); @@ -90,7 +90,7 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct writeback_control *wbc, - struct gfs2_ail *ai) + struct gfs2_trans *tr) __releases(&sdp->sd_ail_lock) __acquires(&sdp->sd_ail_lock) { @@ -99,15 +99,15 @@ __acquires(&sdp->sd_ail_lock) struct gfs2_bufdata *bd, *s; struct buffer_head *bh; - list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) { + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); if (!buffer_busy(bh)) { if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); continue; } @@ -116,7 +116,7 @@ __acquires(&sdp->sd_ail_lock) if (gl == bd->bd_gl) continue; gl = bd->bd_gl; - list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); mapping = bh->b_page->mapping; if (!mapping) continue; @@ -144,15 +144,15 @@ __acquires(&sdp->sd_ail_lock) void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) { struct list_head *head = &sdp->sd_ail1_list; - struct gfs2_ail *ai; + struct gfs2_trans *tr; trace_gfs2_ail_flush(sdp, wbc, 1); spin_lock(&sdp->sd_ail_lock); restart: - list_for_each_entry_reverse(ai, head, ai_list) { + 
list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - if (gfs2_ail1_start_one(sdp, wbc, ai)) + if (gfs2_ail1_start_one(sdp, wbc, tr)) goto restart; } spin_unlock(&sdp->sd_ail_lock); @@ -183,20 +183,20 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) * */ -static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; - list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); if (buffer_busy(bh)) continue; if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } } @@ -210,14 +210,14 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) static int gfs2_ail1_empty(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai, *s; + struct gfs2_trans *tr, *s; int ret; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { - gfs2_ail1_empty_one(sdp, ai); - if (list_empty(&ai->ai_ail1_list)) - list_move(&ai->ai_list, &sdp->sd_ail2_list); + list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { + gfs2_ail1_empty_one(sdp, tr); + if (list_empty(&tr->tr_ail1_list)) + list_move(&tr->tr_list, &sdp->sd_ail2_list); else break; } @@ -229,13 +229,13 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp) static void gfs2_ail1_wait(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; struct gfs2_bufdata *bd; struct buffer_head *bh; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) { - list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) { + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; if (!buffer_locked(bh)) continue; @@ -256,40 +256,40 @@ static void gfs2_ail1_wait(struct gfs2_sbd *sdp) * */ -static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &ai->ai_ail2_list; + struct list_head *head = &tr->tr_ail2_list; struct gfs2_bufdata *bd; while (!list_empty(head)) { bd = list_entry(head->prev, struct gfs2_bufdata, bd_ail_st_list); - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); gfs2_remove_from_ail(bd); } } static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) { - struct gfs2_ail *ai, *safe; + struct gfs2_trans *tr, *safe; unsigned int old_tail = sdp->sd_log_tail; int wrap = (new_tail < old_tail); int a, b, rm; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { - a = (old_tail <= ai->ai_first); - b = (ai->ai_first < new_tail); + list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) { + a = (old_tail <= tr->tr_first); + b = (tr->tr_first < new_tail); rm = (wrap) ? 
(a || b) : (a && b); if (!rm) continue; - gfs2_ail2_empty_one(sdp, ai); - list_del(&ai->ai_list); - gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list)); - gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list)); - kfree(ai); + gfs2_ail2_empty_one(sdp, tr); + list_del(&tr->tr_list); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + kfree(tr); } spin_unlock(&sdp->sd_ail_lock); @@ -435,7 +435,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp) static unsigned int current_tail(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; unsigned int tail; spin_lock(&sdp->sd_ail_lock); @@ -443,8 +443,9 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) if (list_empty(&sdp->sd_ail1_list)) { tail = sdp->sd_log_head; } else { - ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list); - tail = ai->ai_first; + tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans, + tr_list); + tail = tr->tr_first; } spin_unlock(&sdp->sd_ail_lock); @@ -600,7 +601,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; down_write(&sdp->sd_log_flush_lock); @@ -611,9 +612,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } trace_gfs2_log_flush(sdp, 1); - ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); - INIT_LIST_HEAD(&ai->ai_ail1_list); - INIT_LIST_HEAD(&ai->ai_ail2_list); + tr = sdp->sd_log_tr; + if (tr) { + sdp->sd_log_tr = NULL; + INIT_LIST_HEAD(&tr->tr_ail1_list); + INIT_LIST_HEAD(&tr->tr_ail2_list); + } if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) { printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf, @@ -630,7 +634,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) sdp->sd_log_flush_head = sdp->sd_log_head; sdp->sd_log_flush_wrapped = 0; - ai->ai_first = sdp->sd_log_flush_head; + if (tr) + tr->tr_first = sdp->sd_log_flush_head; gfs2_ordered_write(sdp); lops_before_commit(sdp); @@ -643,7 +648,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, 0); } - lops_after_commit(sdp, ai); + lops_after_commit(sdp, tr); gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; @@ -653,16 +658,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) sdp->sd_log_commited_revoke = 0; spin_lock(&sdp->sd_ail_lock); - if (!list_empty(&ai->ai_ail1_list)) { - list_add(&ai->ai_list, &sdp->sd_ail1_list); - ai = NULL; + if (tr && !list_empty(&tr->tr_ail1_list)) { + list_add(&tr->tr_list, &sdp->sd_ail1_list); + tr = NULL; } spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); trace_gfs2_log_flush(sdp, 0); up_write(&sdp->sd_log_flush_lock); - kfree(ai); + kfree(tr); } static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) @@ -687,6 +692,12 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) sdp->sd_jdesc->jd_blocks); sdp->sd_log_blks_reserved = reserved; + if (sdp->sd_log_tr == NULL && + (tr->tr_num_buf_new || tr->tr_num_databuf_new)) { + gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl); + sdp->sd_log_tr = tr; + tr->tr_attached = 1; + } gfs2_log_unlock(sdp); } @@ -708,7 +719,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { log_refund(sdp, tr); - up_read(&sdp->sd_log_flush_lock); if (atomic_read(&sdp->sd_log_pinned) > 
atomic_read(&sdp->sd_log_thresh1) || ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a5055977a214..7318abf9d0fb 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -53,8 +53,8 @@ void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) * to in-place disk block, remove it from the AIL. */ spin_lock(&sdp->sd_ail_lock); - if (bd->bd_ail) - list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + if (bd->bd_tr) + list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list); spin_unlock(&sdp->sd_ail_lock); get_bh(bh); atomic_inc(&sdp->sd_log_pinned); @@ -94,7 +94,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) */ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, - struct gfs2_ail *ai) + struct gfs2_trans *tr) { struct gfs2_bufdata *bd = bh->b_private; @@ -109,7 +109,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, maybe_release_space(bd); spin_lock(&sdp->sd_ail_lock); - if (bd->bd_ail) { + if (bd->bd_tr) { list_del(&bd->bd_ail_st_list); brelse(bh); } else { @@ -117,8 +117,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); atomic_inc(&gl->gl_ail_count); } - bd->bd_ail = ai; - list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); + bd->bd_tr = tr; + list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list); spin_unlock(&sdp->sd_ail_lock); clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); @@ -480,17 +480,22 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) &sdp->sd_log_le_buf, 0); } -static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_buf; struct gfs2_bufdata *bd; + if (tr == NULL) { + gfs2_assert(sdp, list_empty(head)); + return; + } + while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); sdp->sd_log_num_buf--; - gfs2_unpin(sdp, bd->bd_bh, ai); + gfs2_unpin(sdp, bd->bd_bh, tr); } gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); } @@ -613,7 +618,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_write_page(sdp, page); } -static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_revoke; struct gfs2_bufdata *bd; @@ -791,16 +796,21 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); } -static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_databuf; struct gfs2_bufdata *bd; + if (tr == NULL) { + gfs2_assert(sdp, list_empty(head)); + return; + } + while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); sdp->sd_log_num_databuf--; - gfs2_unpin(sdp, bd->bd_bh, ai); + gfs2_unpin(sdp, bd->bd_bh, tr); } gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index ba77b7da8325..87e062e05c92 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -55,12 +55,13 @@ static inline void lops_before_commit(struct gfs2_sbd *sdp) gfs2_log_ops[x]->lo_before_commit(sdp); } -static inline void lops_after_commit(struct gfs2_sbd *sdp, struct 
gfs2_ail *ai) +static inline void lops_after_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) { int x; for (x = 0; gfs2_log_ops[x]; x++) if (gfs2_log_ops[x]->lo_after_commit) - gfs2_log_ops[x]->lo_after_commit(sdp, ai); + gfs2_log_ops[x]->lo_after_commit(sdp, tr); } static inline void lops_before_scan(struct gfs2_jdesc *jd, diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index b059bbb5059e..1a89afb68472 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -295,7 +295,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int } if (bd) { spin_lock(&sdp->sd_ail_lock); - if (bd->bd_ail) { + if (bd->bd_tr) { gfs2_remove_from_ail(bd); bh->b_private = NULL; bd->bd_bh = NULL; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index d1f51fd73f86..0c5a575b513e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -576,7 +576,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) RB_CLEAR_NODE(&ip->i_res->rs_node); out: up_write(&ip->i_rw_mutex); - return 0; + return error; } static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) @@ -592,7 +592,7 @@ static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) * @rs: The reservation to remove * */ -static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) +static void __rs_deltree(struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; @@ -605,7 +605,7 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) RB_CLEAR_NODE(&rs->rs_node); if (rs->rs_free) { - /* return reserved blocks to the rgrp and the ip */ + /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; rs->rs_free = 0; @@ -619,14 +619,14 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) * @rs: The reservation to remove * */ -void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) +void gfs2_rs_deltree(struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; rgd = rs->rs_rbm.rgd; if (rgd) { spin_lock(&rgd->rd_rsspin); - __rs_deltree(ip, rs); + __rs_deltree(rs); spin_unlock(&rgd->rd_rsspin); } } @@ -640,7 +640,7 @@ void gfs2_rs_delete(struct gfs2_inode *ip) { down_write(&ip->i_rw_mutex); if (ip->i_res) { - gfs2_rs_deltree(ip, ip->i_res); + gfs2_rs_deltree(ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); ip->i_res = NULL; @@ -664,7 +664,7 @@ static void return_all_reservations(struct gfs2_rgrpd *rgd) spin_lock(&rgd->rd_rsspin); while ((n = rb_first(&rgd->rd_rstree))) { rs = rb_entry(n, struct gfs2_blkreserv, rs_node); - __rs_deltree(NULL, rs); + __rs_deltree(rs); } spin_unlock(&rgd->rd_rsspin); } @@ -1181,12 +1181,9 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed) { struct super_block *sb = sdp->sd_vfs; - struct block_device *bdev = sb->s_bdev; - const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / - bdev_logical_block_size(sb->s_bdev); u64 blk; sector_t start = 0; - sector_t nr_sects = 0; + sector_t nr_blks = 0; int rv; unsigned int x; u32 trimmed = 0; @@ -1206,35 +1203,34 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, if (diff == 0) continue; blk = offset + ((bi->bi_start + x) * GFS2_NBBY); - blk *= sects_per_blk; /* convert to sectors */ while(diff) { if (diff & 1) { - if (nr_sects == 0) + if (nr_blks == 0) goto start_new_extent; - if ((start + nr_sects) != blk) { - if (nr_sects >= minlen) { - rv = blkdev_issue_discard(bdev, - start, nr_sects, + 
if ((start + nr_blks) != blk) { + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, + start, nr_blks, GFP_NOFS, 0); if (rv) goto fail; - trimmed += nr_sects; + trimmed += nr_blks; } - nr_sects = 0; + nr_blks = 0; start_new_extent: start = blk; } - nr_sects += sects_per_blk; + nr_blks++; } diff >>= 2; - blk += sects_per_blk; + blk++; } } - if (nr_sects >= minlen) { - rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0); + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0); if (rv) goto fail; - trimmed += nr_sects; + trimmed += nr_blks; } if (ptrimmed) *ptrimmed = trimmed; @@ -1878,7 +1874,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) /* Drop reservation, if we couldn't use reserved rgrp */ if (gfs2_rs_active(rs)) - gfs2_rs_deltree(ip, rs); + gfs2_rs_deltree(rs); check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) @@ -2091,7 +2087,7 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, if (rs->rs_free && !ret) goto out; } - __rs_deltree(ip, rs); + __rs_deltree(rs); } out: spin_unlock(&rgd->rd_rsspin); @@ -2184,13 +2180,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, if (dinode) gfs2_trans_add_unrevoke(sdp, block, 1); - /* - * This needs reviewing to see why we cannot do the quota change - * at this point in the dinode case. - */ - if (ndata) - gfs2_quota_change(ip, ndata, ip->i_inode.i_uid, - ip->i_inode.i_gid); + gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); rbm.rgd->rd_free_clone -= *nblocks; trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 842185853f6b..5b3f4a896e6c 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -47,7 +47,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); extern int gfs2_rs_alloc(struct gfs2_inode *ip); -extern void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs); +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index cab77b8ba84f..917c8e1eb4ae 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1512,7 +1512,7 @@ out_truncate: out_unlock: /* Error path for case 1 */ if (gfs2_rs_active(ip->i_res)) - gfs2_rs_deltree(ip, ip->i_res); + gfs2_rs_deltree(ip->i_res); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) gfs2_glock_dq(&ip->i_iopen_gh); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 2ee13e841e9f..20c007d747ab 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -159,9 +159,9 @@ TRACE_EVENT(gfs2_glock_put, /* Callback (local or remote) requesting lock demotion */ TRACE_EVENT(gfs2_demote_rq, - TP_PROTO(const struct gfs2_glock *gl), + TP_PROTO(const struct gfs2_glock *gl, bool remote), - TP_ARGS(gl), + TP_ARGS(gl, remote), TP_STRUCT__entry( __field( dev_t, dev ) @@ -170,6 +170,7 @@ TRACE_EVENT(gfs2_demote_rq, __field( u8, cur_state ) __field( u8, dmt_state ) __field( unsigned long, flags ) + __field( bool, remote ) ), TP_fast_assign( @@ -179,14 +180,16 @@ TRACE_EVENT(gfs2_demote_rq, __entry->cur_state = glock_trace_state(gl->gl_state); __entry->dmt_state = glock_trace_state(gl->gl_demote_state); __entry->flags = gl->gl_flags | 
(gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + __entry->remote = remote; ), - TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s", + TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, (unsigned long long)__entry->glnum, glock_trace_name(__entry->cur_state), glock_trace_name(__entry->dmt_state), - show_glock_flags(__entry->flags)) + show_glock_flags(__entry->flags), + __entry->remote ? "remote" : "local") ); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 88162fae27a5..7374907742a8 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -96,7 +96,8 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) static void gfs2_print_trans(const struct gfs2_trans *tr) { - print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n", + (void *)tr->tr_ip); printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", @@ -135,8 +136,10 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (tr->tr_t_gh.gh_gl) { gfs2_glock_dq(&tr->tr_t_gh); gfs2_holder_uninit(&tr->tr_t_gh); - kfree(tr); + if (!tr->tr_attached) + kfree(tr); } + up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) gfs2_log_flush(sdp, NULL); diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 571abe97b42a..de69d8a24f6d 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); + hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + tree->cnid, __builtin_return_address(0)); mutex_lock(&tree->tree_lock); return 0; } @@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); + hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } @@ -135,8 +137,8 @@ int hfs_brec_find(struct hfs_find_data *fd) return res; invalid: - printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", - height, bnode->height, bnode->type, nidx, parent); + pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: hfs_bnode_put(bnode); diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index c6e97366e8ac..28307bc9ec1e 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) } } - dprint(DBG_BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: @@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if (!count) return 0; - dprint(DBG_BITMAP, "clear_bits: %u,%u\n", start, count); + hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); /* are all of the bits in range? 
*/ if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index cdb41a1f6a64..f3b1a15ccd59 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -100,7 +100,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_btree *tree; struct page *src_page, *dst_page; - dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; tree = src_node->tree; @@ -120,7 +120,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page *page; void *ptr; - dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); if (!len) return; src += node->page_offset; @@ -138,16 +138,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - dprint(DBG_BNODE_MOD, " %d", key_off); + hfs_dbg_cont(BNODE_MOD, " %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -155,17 +155,18 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; - dprint(DBG_BNODE_MOD, " (%d,%d", tmp, hfs_bnode_read_u8(node, key_off)); + hfs_dbg_cont(BNODE_MOD, " (%d,%d", + tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); - dprint(DBG_BNODE_MOD, " (%d)", tmp); + hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); } } - dprint(DBG_BNODE_MOD, "\n"); + hfs_dbg_cont(BNODE_MOD, "\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -220,7 +221,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) struct hfs_bnode *node; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } @@ -243,7 +244,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) loff_t off; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } @@ -257,8 +258,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", - node->tree->cnid, node->this); + hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); node2 = hfs_bnode_findhash(tree, cnid); @@ -301,7 +302,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); 
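
These dprint()-to-hfs_dbg() conversions all target the mask-gated macro this patch adds to hfs_fs.h further down; the DBG_##flg token paste turns the first argument into a compile-time filter. A small userspace sketch of the idea (the mask values here are illustrative, not the ones from hfs_fs.h):

    #include <stdio.h>

    #define DBG_BNODE_REFS  0x00000001
    #define DBG_EXTENT      0x00000020
    #define DBG_MASK        (DBG_BNODE_REFS)        /* compile-time selection */

    #define hfs_dbg(flg, fmt, ...)                          \
    do {                                                    \
            if (DBG_##flg & DBG_MASK)                       \
                    printf("hfs: " fmt, ##__VA_ARGS__);     \
    } while (0)

    int main(void)
    {
            /* Printed: BNODE_REFS is in DBG_MASK. */
            hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", 4, 17, 2);
            /* Folded away by the compiler: EXTENT is not in the mask. */
            hfs_dbg(EXTENT, "extend %lu: %u,%u\n", 42UL, 0u, 8u);
            return 0;
    }
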
for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -443,8 +444,9 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); } } @@ -455,8 +457,9 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 92fb358ce824..9f4ee7f52026 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -47,15 +47,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) if (node->tree->attributes & HFS_TREE_BIGKEYS) { retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { - printk(KERN_ERR "hfs: keylen %d too large\n", - retval); + pr_err("keylen %d too large\n", retval); retval = 0; } } else { retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; if (retval > node->tree->max_key_len + 1) { - printk(KERN_ERR "hfs: keylen %d too large\n", - retval); + pr_err("keylen %d too large\n", retval); retval = 0; } } @@ -94,7 +92,8 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); + hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) panic("not enough room!\n"); @@ -190,7 +189,8 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); + hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); if (!node->parent) @@ -240,7 +240,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -374,7 +374,8 @@ again: newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1; else fd->keylength = newkeylen = tree->max_key_len + 1; - dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); + hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; @@ -385,7 +386,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - printk(KERN_DEBUG "hfs: splitting index node...\n"); + printk(KERN_DEBUG "splitting index node...\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 1cbdeea1db44..1ab19e660e69 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -48,7 +48,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp 
ke mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); if (HFS_I(tree->inode)->alloc_blocks > HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records\n"); + pr_err("invalid btree extent records\n"); unlock_new_inode(tree->inode); goto free_inode; } @@ -60,8 +60,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); if (!HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records " - "(0 size).\n"); + pr_err("invalid btree extent records (0 size)\n"); unlock_new_inode(tree->inode); goto free_inode; } @@ -100,15 +99,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke switch (id) { case HFS_EXT_CNID: if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) { - printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", - tree->max_key_len); + pr_err("invalid extent max_key_len %d\n", + tree->max_key_len); goto fail_page; } break; case HFS_CAT_CNID: if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) { - printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", - tree->max_key_len); + pr_err("invalid catalog max_key_len %d\n", + tree->max_key_len); goto fail_page; } break; @@ -146,8 +145,9 @@ void hfs_btree_close(struct hfs_btree *tree) while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) - printk(KERN_ERR "hfs: node %d:%d still has %d user(s)!\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + pr_err("node %d:%d still has %d user(s)!\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); hfs_bnode_free(node); tree->node_hash_cnt--; } @@ -290,7 +290,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap(*pagep); nidx = node->next; if (!nidx) { - printk(KERN_DEBUG "hfs: create new bmap node...\n"); + printk(KERN_DEBUG "create new bmap node...\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -316,7 +316,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); @@ -331,7 +331,8 @@ void hfs_bmap_free(struct hfs_bnode *node) hfs_bnode_put(node); if (!i) { /* panic */; - printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this); + pr_crit("unable to free bnode %u. bmap not found!\n", + node->this); return; } node = hfs_bnode_find(tree, i); @@ -339,7 +340,8 @@ void hfs_bmap_free(struct hfs_bnode *node) return; if (node->type != HFS_NODE_MAP) { /* panic */; - printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type); + pr_crit("invalid bmap found! 
(%u,%d)\n", + node->this, node->type); hfs_bnode_put(node); return; } @@ -352,7 +354,8 @@ void hfs_bmap_free(struct hfs_bnode *node) m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { - printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type); + pr_crit("trying to free free bnode %u(%d)\n", + node->this, node->type); kunmap(page); hfs_bnode_put(node); return; diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 424b0337f524..ff0316b925a5 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -87,12 +87,15 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode * int entry_size; int err; - dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); + hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + str->name, cnid, inode->i_nlink); if (dir->i_size >= HFS_MAX_VALENCE) return -ENOSPC; sb = dir->i_sb; - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfs_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ? @@ -184,14 +187,14 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, type = rec.type; if (type != HFS_CDR_THD && type != HFS_CDR_FTH) { - printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + pr_err("found bad thread record in catalog\n"); return -EIO; } fd->search_key->cat.ParID = rec.thread.ParID; len = fd->search_key->cat.CName.len = rec.thread.CName.len; if (len > HFS_NAMELEN) { - printk(KERN_ERR "hfs: bad catalog namelength\n"); + pr_err("bad catalog namelength\n"); return -EIO; } memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); @@ -212,9 +215,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) struct list_head *pos; int res, type; - dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? 
str->name : NULL, cnid); sb = dir->i_sb; - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (res) + return res; hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); res = hfs_brec_find(&fd); @@ -278,10 +283,13 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, int entry_size, type; int err; - dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, + hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; - hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); + err = hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); + if (err) + return err; dst_fd = src_fd; /* find the old dir entry and read the data */ diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 5f7f1abd5f6d..17c22a8fd40a 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -25,7 +25,9 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; int res; - hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + if (res) + return ERR_PTR(res); hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (res) { @@ -63,7 +65,9 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filp->f_pos >= inode->i_size) return 0; - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) @@ -84,12 +88,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { - printk(KERN_ERR "hfs: bad catalog folder thread\n"); + pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } //if (fd.entrylength < HFS_MIN_THREAD_SZ) { - // printk(KERN_ERR "hfs: truncated catalog thread\n"); + // pr_err("truncated catalog thread\n"); // err = -EIO; // goto out; //} @@ -108,7 +112,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) for (;;) { if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { - printk(KERN_ERR "hfs: walked past end of dir\n"); + pr_err("walked past end of dir\n"); err = -EIO; goto out; } @@ -123,7 +127,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); if (type == HFS_CDR_DIR) { if (fd.entrylength < sizeof(struct hfs_cat_dir)) { - printk(KERN_ERR "hfs: small dir entry\n"); + pr_err("small dir entry\n"); err = -EIO; goto out; } @@ -132,7 +136,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) break; } else if (type == HFS_CDR_FIL) { if (fd.entrylength < sizeof(struct hfs_cat_file)) { - printk(KERN_ERR "hfs: small file entry\n"); + pr_err("small file entry\n"); err = -EIO; goto out; } @@ -140,7 +144,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) be32_to_cpu(entry.file.FlNum), DT_REG)) break; } else { - printk(KERN_ERR "hfs: bad catalog entry type %d\n", type); + pr_err("bad catalog entry type %d\n", type); err = -EIO; goto out; } diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index a67955a0c36f..e33a0d36a93e 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -107,7 +107,7 @@ static u16 hfs_ext_lastblock(struct hfs_extent *ext) return 
be16_to_cpu(ext->block) + be16_to_cpu(ext->count); } -static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) +static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { int res; @@ -116,26 +116,31 @@ static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd res = hfs_brec_find(fd); if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { if (res != -ENOENT) - return; + return res; hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } else { if (res) - return; + return res; hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength); HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY; } + return 0; } -void hfs_ext_write_extent(struct inode *inode) +int hfs_ext_write_extent(struct inode *inode) { struct hfs_find_data fd; + int res = 0; if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { - hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); - __hfs_ext_write_extent(inode, &fd); + res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + if (res) + return res; + res = __hfs_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } + return res; } static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent, @@ -161,8 +166,11 @@ static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode { int res; - if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) - __hfs_ext_write_extent(inode, fd); + if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { + res = __hfs_ext_write_extent(inode, fd); + if (res) + return res; + } res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino, block, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA); @@ -185,9 +193,11 @@ static int hfs_ext_read_extent(struct inode *inode, u16 block) block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks) return 0; - hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); - res = __hfs_ext_cache_extent(&fd, inode, block); - hfs_find_exit(&fd); + res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + if (!res) { + res = __hfs_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + } return res; } @@ -195,11 +205,12 @@ static void hfs_dump_extent(struct hfs_extent *extent) { int i; - dprint(DBG_EXTENT, " "); + hfs_dbg(EXTENT, " "); for (i = 0; i < 3; i++) - dprint(DBG_EXTENT, " %u:%u", be16_to_cpu(extent[i].block), - be16_to_cpu(extent[i].count)); - dprint(DBG_EXTENT, "\n"); + hfs_dbg_cont(EXTENT, " %u:%u", + be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + hfs_dbg_cont(EXTENT, "\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, @@ -298,7 +309,9 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) if (total_blocks == blocks) return 0; - hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + if (res) + return res; do { res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type); if (res) @@ -392,10 +405,10 @@ int hfs_extend_file(struct inode *inode) goto out; } - dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { - dprint(DBG_EXTENT, "first extents\n"); + hfs_dbg(EXTENT, "first extents\n"); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); 
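
The extent.c changes above follow a pattern repeated across these hfs hunks: helpers that used to return void now return an errno-style int, and every caller checks hfs_find_init() before touching the find data. A compact sketch of that shape, with hypothetical names and types in place of the hfs ones:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct find_data {
            void *search_key;       /* resource owned while the find is live */
    };

    /* Was effectively "void find_init(...)": now reports allocation failure
     * instead of letting callers run with an uninitialized find_data. */
    static int find_init(struct find_data *fd)
    {
            fd->search_key = malloc(64);
            return fd->search_key ? 0 : -ENOMEM;
    }

    static void find_exit(struct find_data *fd)
    {
            free(fd->search_key);
            fd->search_key = NULL;
    }

    /* Caller mirrors the new hfs_ext_write_extent(): propagate the init
     * error and only pair find_exit() with a successful find_init(). */
    static int write_extent(void)
    {
            struct find_data fd;
            int res = find_init(&fd);

            if (res)
                    return res;
            /* ... perform the btree update with fd here ... */
            find_exit(&fd);
            return 0;
    }

    int main(void)
    {
            printf("write_extent: %d\n", write_extent());
            return 0;
    }
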
@@ -437,8 +450,10 @@ out: return res; insert_extent: - dprint(DBG_EXTENT, "insert new extent\n"); - hfs_ext_write_extent(inode); + hfs_dbg(EXTENT, "insert new extent\n"); + res = hfs_ext_write_extent(inode); + if (res) + goto out; memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start); @@ -460,13 +475,13 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, - (long long)HFS_I(inode)->phys_size, inode->i_size); + hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", + inode->i_ino, (long long)HFS_I(inode)->phys_size, + inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; void *fsdata; struct page *page; - int res; /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; @@ -488,7 +503,12 @@ void hfs_file_truncate(struct inode *inode) goto out; mutex_lock(&HFS_I(inode)->extents_lock); - hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + if (res) { + mutex_unlock(&HFS_I(inode)->extents_lock); + /* XXX: We lack error handling of hfs_file_truncate() */ + return; + } while (1) { if (alloc_cnt == HFS_I(inode)->first_blocks) { hfs_free_extents(sb, HFS_I(inode)->first_extents, diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 693df9fe52b2..a73b11839a41 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -9,6 +9,12 @@ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/types.h> #include <linux/mutex.h> @@ -34,8 +40,18 @@ //#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) #define DBG_MASK (0) -#define dprint(flg, fmt, args...) \ - if (flg & DBG_MASK) printk(fmt , ## args) +#define hfs_dbg(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +} while (0) + +#define hfs_dbg_cont(flg, fmt, ...) 
\ +do { \ + if (DBG_##flg & DBG_MASK) \ + pr_cont(fmt, ##__VA_ARGS__); \ +} while (0) + /* * struct hfs_inode_info @@ -174,7 +190,7 @@ extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ extern int hfs_ext_keycmp(const btree_key *, const btree_key *); extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int); -extern void hfs_ext_write_extent(struct inode *); +extern int hfs_ext_write_extent(struct inode *); extern int hfs_extend_file(struct inode *); extern void hfs_file_truncate(struct inode *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 3031dfdd2358..716e1aafb2e2 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -237,7 +237,7 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino); + hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { HFS_SB(sb)->folder_count--; if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) @@ -416,9 +416,12 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct inode *main_inode = inode; struct hfs_find_data fd; hfs_cat_rec rec; + int res; - dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino); - hfs_ext_write_extent(inode); + hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); + res = hfs_ext_write_extent(inode); + if (res) + return res; if (inode->i_ino < HFS_FIRSTUSER_CNID) { switch (inode->i_ino) { @@ -515,7 +518,11 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); - hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + if (res) { + iput(inode); + return ERR_PTR(res); + } fd.search_key->cat = HFS_I(dir)->cat_key; res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (!res) { diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index b7ec224910c5..aa3f0d6d043c 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -48,7 +48,7 @@ static int hfs_get_last_session(struct super_block *sb, *start = (sector_t)te.cdte_addr.lba << 2; return 0; } - printk(KERN_ERR "hfs: invalid session number or type of track\n"); + pr_err("invalid session number or type of track\n"); return -EINVAL; } ms_info.addr_format = CDROM_LBA; @@ -101,7 +101,7 @@ int hfs_mdb_get(struct super_block *sb) HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); if (!size || (size & (HFS_SECTOR_SIZE - 1))) { - printk(KERN_ERR "hfs: bad allocation block size %d\n", size); + pr_err("bad allocation block size %d\n", size); goto out_bh; } @@ -118,7 +118,7 @@ int hfs_mdb_get(struct super_block *sb) size >>= 1; brelse(bh); if (!sb_set_blocksize(sb, size)) { - printk(KERN_ERR "hfs: unable to set blocksize to %u\n", size); + pr_err("unable to set blocksize to %u\n", size); goto out; } @@ -162,8 +162,8 @@ int hfs_mdb_get(struct super_block *sb) } if (!HFS_SB(sb)->alt_mdb) { - printk(KERN_WARNING "hfs: unable to locate alternate MDB\n"); - printk(KERN_WARNING "hfs: continuing without an alternate MDB\n"); + pr_warn("unable to locate alternate MDB\n"); + pr_warn("continuing without an alternate MDB\n"); } HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 
1 : 0); @@ -178,7 +178,7 @@ int hfs_mdb_get(struct super_block *sb) while (size) { bh = sb_bread(sb, off >> sb->s_blocksize_bits); if (!bh) { - printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + pr_err("unable to read volume bitmap\n"); goto out; } off2 = off & (sb->s_blocksize - 1); @@ -192,23 +192,22 @@ int hfs_mdb_get(struct super_block *sb) HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); if (!HFS_SB(sb)->ext_tree) { - printk(KERN_ERR "hfs: unable to open extent tree\n"); + pr_err("unable to open extent tree\n"); goto out; } HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); if (!HFS_SB(sb)->cat_tree) { - printk(KERN_ERR "hfs: unable to open catalog tree\n"); + pr_err("unable to open catalog tree\n"); goto out; } attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " - "running fsck.hfs is recommended. mounting read-only.\n"); + pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { - printk(KERN_WARNING "hfs: filesystem is marked locked, mounting read-only.\n"); + pr_warn("filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } if (!(sb->s_flags & MS_RDONLY)) { @@ -312,7 +311,7 @@ void hfs_mdb_commit(struct super_block *sb) while (size) { bh = sb_bread(sb, block); if (!bh) { - printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + pr_err("unable to read volume bitmap\n"); break; } len = min((int)sb->s_blocksize - off, size); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index bbaaa8a4ee64..2d2039e754cd 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -117,12 +117,11 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data) return 0; if (!(*flags & MS_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " - "running fsck.hfs is recommended. leaving read-only.\n"); + pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. 
leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { - printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); + pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } @@ -253,29 +252,29 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) switch (token) { case opt_uid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: uid requires an argument\n"); + pr_err("uid requires an argument\n"); return 0; } hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp); if (!uid_valid(hsb->s_uid)) { - printk(KERN_ERR "hfs: invalid uid %d\n", tmp); + pr_err("invalid uid %d\n", tmp); return 0; } break; case opt_gid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: gid requires an argument\n"); + pr_err("gid requires an argument\n"); return 0; } hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp); if (!gid_valid(hsb->s_gid)) { - printk(KERN_ERR "hfs: invalid gid %d\n", tmp); + pr_err("invalid gid %d\n", tmp); return 0; } break; case opt_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: umask requires a value\n"); + pr_err("umask requires a value\n"); return 0; } hsb->s_file_umask = (umode_t)tmp; @@ -283,39 +282,39 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) break; case opt_file_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: file_umask requires a value\n"); + pr_err("file_umask requires a value\n"); return 0; } hsb->s_file_umask = (umode_t)tmp; break; case opt_dir_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: dir_umask requires a value\n"); + pr_err("dir_umask requires a value\n"); return 0; } hsb->s_dir_umask = (umode_t)tmp; break; case opt_part: if (match_int(&args[0], &hsb->part)) { - printk(KERN_ERR "hfs: part requires an argument\n"); + pr_err("part requires an argument\n"); return 0; } break; case opt_session: if (match_int(&args[0], &hsb->session)) { - printk(KERN_ERR "hfs: session requires an argument\n"); + pr_err("session requires an argument\n"); return 0; } break; case opt_type: if (match_fourchar(&args[0], &hsb->s_type)) { - printk(KERN_ERR "hfs: type requires a 4 character value\n"); + pr_err("type requires a 4 character value\n"); return 0; } break; case opt_creator: if (match_fourchar(&args[0], &hsb->s_creator)) { - printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + pr_err("creator requires a 4 character value\n"); return 0; } break; @@ -324,14 +323,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) break; case opt_codepage: if (hsb->nls_disk) { - printk(KERN_ERR "hfs: unable to change codepage\n"); + pr_err("unable to change codepage\n"); return 0; } p = match_strdup(&args[0]); if (p) hsb->nls_disk = load_nls(p); if (!hsb->nls_disk) { - printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p); + pr_err("unable to load codepage \"%s\"\n", p); kfree(p); return 0; } @@ -339,14 +338,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) break; case opt_iocharset: if (hsb->nls_io) { - printk(KERN_ERR "hfs: unable to change iocharset\n"); + pr_err("unable to change iocharset\n"); return 0; } p = match_strdup(&args[0]); if (p) hsb->nls_io = load_nls(p); if (!hsb->nls_io) { - printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p); + pr_err("unable to load iocharset \"%s\"\n", p); kfree(p); return 0; } @@ -360,7 +359,7 @@ static int parse_options(char *options, struct 
hfs_sb_info *hsb) if (hsb->nls_disk && !hsb->nls_io) { hsb->nls_io = load_nls_default(); if (!hsb->nls_io) { - printk(KERN_ERR "hfs: unable to load default iocharset\n"); + pr_err("unable to load default iocharset\n"); return 0; } } @@ -400,7 +399,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) res = -EINVAL; if (!parse_options((char *)data, sbi)) { - printk(KERN_ERR "hfs: unable to parse mount options.\n"); + pr_err("unable to parse mount options\n"); goto bail; } @@ -411,14 +410,16 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) res = hfs_mdb_get(sb); if (res) { if (!silent) - printk(KERN_WARNING "hfs: can't find a HFS filesystem on dev %s.\n", + pr_warn("can't find a HFS filesystem on dev %s\n", hfs_mdb_name(sb)); res = -EINVAL; goto bail; } /* try to get the root inode */ - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (res) + goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { @@ -447,7 +448,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) return 0; bail_no_root: - printk(KERN_ERR "hfs: get root inode failed.\n"); + pr_err("get root inode failed\n"); bail: hfs_mdb_put(sb); return res; diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 8d691f124714..0f47890299c4 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -56,7 +56,7 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, if (name) { len = strlen(name); if (len > HFSPLUS_ATTR_MAX_STRLEN) { - printk(KERN_ERR "hfs: invalid xattr name's length\n"); + pr_err("invalid xattr name's length\n"); return -EINVAL; } hfsplus_asc2uni(sb, @@ -166,10 +166,10 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid, { int err = 0; - dprint(DBG_ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); if (!HFSPLUS_SB(sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -228,11 +228,11 @@ int hfsplus_create_attr(struct inode *inode, int entry_size; int err; - dprint(DBG_ATTR_MOD, "create_attr: %s,%ld\n", + hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -307,10 +307,10 @@ static int __hfsplus_delete_attr(struct inode *inode, u32 cnid, break; case HFSPLUS_ATTR_FORK_DATA: case HFSPLUS_ATTR_EXTENTS: - printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + pr_err("only inline data xattr are supported\n"); return -EOPNOTSUPP; default: - printk(KERN_ERR "hfs: invalid extended attribute record\n"); + pr_err("invalid extended attribute record\n"); return -ENOENT; } @@ -328,11 +328,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - dprint(DBG_ATTR_MOD, "delete_attr: %s,%ld\n", + hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", name ? 
name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -346,7 +346,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) if (err) goto out; } else { - printk(KERN_ERR "hfs: invalid extended attribute name\n"); + pr_err("invalid extended attribute name\n"); err = -EINVAL; goto out; } @@ -369,10 +369,10 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) int err = 0; struct hfs_find_data fd; - dprint(DBG_ATTR_MOD, "delete_all_attrs: %d\n", cnid); + hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -384,7 +384,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd); if (err) { if (err != -ENOENT) - printk(KERN_ERR "hfs: xattr search failed.\n"); + pr_err("xattr search failed\n"); goto end_delete_all; } diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index d73c98d1ee99..c1422d91cd36 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -22,7 +22,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFSPLUS_CAT_CNID: @@ -44,7 +44,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; @@ -56,7 +56,8 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, int *end, int *cur_rec) { - __be32 cur_cnid, search_cnid; + __be32 cur_cnid; + __be32 search_cnid; if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { cur_cnid = fd->key->ext.cnid; @@ -67,8 +68,11 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { cur_cnid = fd->key->attr.cnid; search_cnid = fd->search_key->attr.cnid; - } else + } else { + cur_cnid = 0; /* used-uninitialized warning */ + search_cnid = 0; BUG(); + } if (cur_cnid == search_cnid) { (*end) = (*cur_rec); @@ -204,7 +208,7 @@ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) return res; invalid: - printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index 6feefc0cb48a..d2954451519e 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -30,7 +30,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, if (!len) return size; - dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); mutex_lock(&sbi->alloc_mutex); mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -89,14 +89,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, else end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; } - dprint(DBG_BITMAP, "bitmap full\n"); + hfs_dbg(BITMAP, "bitmap full\n"); start = size; 
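/* Note: "start = size" is this allocator's out-of-space convention: a
 * return value at or past the requested size means no free run was
 * found, and the caller (hfsplus_file_extend) eventually maps that
 * case to -ENOSPC rather than treating it as a block number. */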
goto out; found: start = offset + (curr - pptr) * 32 + i; if (start >= size) { - dprint(DBG_BITMAP, "bitmap full\n"); + hfs_dbg(BITMAP, "bitmap full\n"); goto out; } /* do any partial u32 at the start */ @@ -154,7 +154,7 @@ done: *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); - dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); + hfs_dbg(BITMAP, "-> %u,%u\n", start, *max); out: mutex_unlock(&sbi->alloc_mutex); return start; @@ -173,7 +173,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) return 0; - dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); + hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count); /* are all of the bits in range? */ if ((offset + count) > sbi->total_blocks) return -ENOENT; @@ -238,8 +238,7 @@ out: return 0; kaboom: - printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n", - PTR_ERR(page)); + pr_crit("unable to mark blocks free: error %ld\n", PTR_ERR(page)); mutex_unlock(&sbi->alloc_mutex); return -EIO; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index f31ac6f404f1..11c860204520 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -130,7 +130,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct page **src_page, **dst_page; int l; - dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; tree = src_node->tree; @@ -188,7 +188,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page **src_page, **dst_page; int l; - dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); if (!len) return; src += node->page_offset; @@ -302,16 +302,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - dprint(DBG_BNODE_MOD, " %d", key_off); + hfs_dbg(BNODE_MOD, " %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -320,17 +320,17 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; - dprint(DBG_BNODE_MOD, " (%d", tmp); + hfs_dbg_cont(BNODE_MOD, " (%d", tmp); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u16(node, key_off); - dprint(DBG_BNODE_MOD, " (%d)", tmp); + hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); } } - dprint(DBG_BNODE_MOD, "\n"); + hfs_dbg_cont(BNODE_MOD, "\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -366,7 +366,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node) /* move down? 
*/ if (!node->prev && !node->next) - dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n"); + hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; @@ -386,7 +386,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) struct hfs_bnode *node; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node " + pr_err("request for non-existent node " "%d in B*Tree\n", cnid); return NULL; @@ -409,7 +409,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) loff_t off; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node " + pr_err("request for non-existent node " "%d in B*Tree\n", cnid); return NULL; @@ -425,8 +425,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", - node->tree->cnid, node->this); + hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); node2 = hfs_bnode_findhash(tree, cnid); @@ -470,7 +470,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -588,7 +588,7 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); if (node) { - printk(KERN_CRIT "new node %u already hashed?\n", num); + pr_crit("new node %u already hashed?\n", num); WARN_ON(1); return node; } @@ -620,7 +620,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -633,7 +633,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 298d4e45604b..6e560d56094b 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -45,13 +45,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) if (!recoff) return 0; if (recoff > node->tree->node_size - 2) { - printk(KERN_ERR "hfs: recoff %d too large\n", recoff); + pr_err("recoff %d too large\n", recoff); return 0; } retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { - printk(KERN_ERR "hfs: keylen %d too large\n", + pr_err("keylen %d too large\n", retval); retval = 0; } @@ -90,7 +90,7 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -191,7 +191,7 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + 
fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -244,7 +244,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -379,7 +379,7 @@ again: newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; - dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; @@ -391,7 +391,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n"); + hfs_dbg(BNODE_MOD, "splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index efb689c21a95..0c6540c91167 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -40,8 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) tree->inode = inode; if (!HFSPLUS_I(tree->inode)->first_blocks) { - printk(KERN_ERR - "hfs: invalid btree extent records (0 size).\n"); + pr_err("invalid btree extent records (0 size)\n"); goto free_inode; } @@ -68,12 +67,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) switch (id) { case HFSPLUS_EXT_CNID: if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { - printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", + pr_err("invalid extent max_key_len %d\n", tree->max_key_len); goto fail_page; } if (tree->attributes & HFS_TREE_VARIDXKEYS) { - printk(KERN_ERR "hfs: invalid extent btree flag\n"); + pr_err("invalid extent btree flag\n"); goto fail_page; } @@ -81,12 +80,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) break; case HFSPLUS_CAT_CNID: if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { - printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", + pr_err("invalid catalog max_key_len %d\n", tree->max_key_len); goto fail_page; } if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { - printk(KERN_ERR "hfs: invalid catalog btree flag\n"); + pr_err("invalid catalog btree flag\n"); goto fail_page; } @@ -100,19 +99,19 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) break; case HFSPLUS_ATTR_CNID: if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) { - printk(KERN_ERR "hfs: invalid attributes max_key_len %d\n", + pr_err("invalid attributes max_key_len %d\n", tree->max_key_len); goto fail_page; } tree->keycmp = hfsplus_attr_bin_cmp_key; break; default: - printk(KERN_ERR "hfs: unknown B*Tree requested\n"); + pr_err("unknown B*Tree requested\n"); goto fail_page; } if (!(tree->attributes & HFS_TREE_BIGKEYS)) { - printk(KERN_ERR "hfs: invalid btree flag\n"); + pr_err("invalid btree flag\n"); goto fail_page; } @@ -155,7 +154,7 @@ void hfs_btree_close(struct hfs_btree *tree) while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) - printk(KERN_CRIT "hfs: node %d:%d " + pr_crit("node %d:%d " "still has %d user(s)!\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); @@ -303,7 +302,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap(*pagep); nidx = node->next; if (!nidx) { - dprint(DBG_BNODE_MOD, "hfs: create new bmap 
node.\n"); + hfs_dbg(BNODE_MOD, "create new bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -329,7 +328,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); BUG_ON(!node->this); tree = node->tree; nidx = node->this; @@ -345,7 +344,7 @@ void hfs_bmap_free(struct hfs_bnode *node) hfs_bnode_put(node); if (!i) { /* panic */; - printk(KERN_CRIT "hfs: unable to free bnode %u. " + pr_crit("unable to free bnode %u. " "bmap not found!\n", node->this); return; @@ -355,7 +354,7 @@ void hfs_bmap_free(struct hfs_bnode *node) return; if (node->type != HFS_NODE_MAP) { /* panic */; - printk(KERN_CRIT "hfs: invalid bmap found! " + pr_crit("invalid bmap found! " "(%u,%d)\n", node->this, node->type); hfs_bnode_put(node); @@ -370,7 +369,7 @@ void hfs_bmap_free(struct hfs_bnode *node) m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { - printk(KERN_CRIT "hfs: trying to free free bnode " + pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); kunmap(page); diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 840d71edd193..968ce411db53 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -188,12 +188,12 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, type = be16_to_cpu(tmp.type); if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) { - printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + pr_err("found bad thread record in catalog\n"); return -EIO; } if (be16_to_cpu(tmp.thread.nodeName.length) > 255) { - printk(KERN_ERR "hfs: catalog name length corrupted\n"); + pr_err("catalog name length corrupted\n"); return -EIO; } @@ -212,7 +212,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, int entry_size; int err; - dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) @@ -271,8 +271,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) int err, off; u16 type; - dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", - str ? str->name : NULL, cnid); + hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? 
str->name : NULL, cnid); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; @@ -361,7 +360,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 031c24e50521..a37ac934732f 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -103,7 +103,7 @@ again: } else if (!dentry->d_fsdata) dentry->d_fsdata = (void *)(unsigned long)cnid; } else { - printk(KERN_ERR "hfs: invalid catalog entry type in lookup\n"); + pr_err("invalid catalog entry type in lookup\n"); err = -EIO; goto fail; } @@ -159,12 +159,12 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { - printk(KERN_ERR "hfs: bad catalog folder thread\n"); + pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) { - printk(KERN_ERR "hfs: truncated catalog thread\n"); + pr_err("truncated catalog thread\n"); err = -EIO; goto out; } @@ -183,7 +183,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) for (;;) { if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { - printk(KERN_ERR "hfs: walked past end of dir\n"); + pr_err("walked past end of dir\n"); err = -EIO; goto out; } @@ -203,7 +203,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (type == HFSPLUS_FOLDER) { if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { - printk(KERN_ERR "hfs: small dir entry\n"); + pr_err("small dir entry\n"); err = -EIO; goto out; } @@ -216,7 +216,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) break; } else if (type == HFSPLUS_FILE) { if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { - printk(KERN_ERR "hfs: small file entry\n"); + pr_err("small file entry\n"); err = -EIO; goto out; } @@ -224,7 +224,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) be32_to_cpu(entry.file.id), DT_REG)) break; } else { - printk(KERN_ERR "hfs: bad catalog entry type\n"); + pr_err("bad catalog entry type\n"); err = -EIO; goto out; } diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index a94f0f779d5e..fbb212fbb1ef 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -83,7 +83,7 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); } -static void __hfsplus_ext_write_extent(struct inode *inode, +static int __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { struct hfsplus_inode_info *hip = HFSPLUS_I(inode); @@ -98,13 +98,13 @@ static void __hfsplus_ext_write_extent(struct inode *inode, res = hfs_brec_find(fd, hfs_find_rec_by_key); if (hip->extent_state & HFSPLUS_EXT_NEW) { if (res != -ENOENT) - return; + return res; hfs_brec_insert(fd, hip->cached_extents, sizeof(hfsplus_extent_rec)); hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); } else { if (res) - return; + return res; hfs_bnode_write(fd->bnode, hip->cached_extents, fd->entryoffset, fd->entrylength); hip->extent_state &= ~HFSPLUS_EXT_DIRTY; @@ -117,11 +117,13 @@ static void __hfsplus_ext_write_extent(struct inode *inode, * to 
explicitly mark the inode dirty, too. */ set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); + + return 0; } static int hfsplus_ext_write_extent_locked(struct inode *inode) { - int res; + int res = 0; if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { struct hfs_find_data fd; @@ -129,10 +131,10 @@ static int hfsplus_ext_write_extent_locked(struct inode *inode) res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); if (res) return res; - __hfsplus_ext_write_extent(inode, &fd); + res = __hfsplus_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } - return 0; + return res; } int hfsplus_ext_write_extent(struct inode *inode) @@ -175,8 +177,11 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, WARN_ON(!mutex_is_locked(&hip->extents_lock)); - if (hip->extent_state & HFSPLUS_EXT_DIRTY) - __hfsplus_ext_write_extent(inode, fd); + if (hip->extent_state & HFSPLUS_EXT_DIRTY) { + res = __hfsplus_ext_write_extent(inode, fd); + if (res) + return res; + } res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, block, HFSPLUS_IS_RSRC(inode) ? @@ -265,7 +270,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", + hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -288,11 +293,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent) { int i; - dprint(DBG_EXTENT, " "); + hfs_dbg(EXTENT, " "); for (i = 0; i < 8; i++) - dprint(DBG_EXTENT, " %u:%u", be32_to_cpu(extent[i].start_block), - be32_to_cpu(extent[i].block_count)); - dprint(DBG_EXTENT, "\n"); + hfs_dbg_cont(EXTENT, " %u:%u", + be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + hfs_dbg_cont(EXTENT, "\n"); } static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, @@ -348,8 +354,8 @@ found: if (count <= block_nr) { err = hfsplus_block_free(sb, start, count); if (err) { - printk(KERN_ERR "hfs: can't free extent\n"); - dprint(DBG_EXTENT, " start: %u count: %u\n", + pr_err("can't free extent\n"); + hfs_dbg(EXTENT, " start: %u count: %u\n", start, count); } extent->block_count = 0; @@ -359,8 +365,8 @@ found: count -= block_nr; err = hfsplus_block_free(sb, start + count, block_nr); if (err) { - printk(KERN_ERR "hfs: can't free extent\n"); - dprint(DBG_EXTENT, " start: %u count: %u\n", + pr_err("can't free extent\n"); + hfs_dbg(EXTENT, " start: %u count: %u\n", start, count); } extent->block_count = cpu_to_be32(count); @@ -432,7 +438,7 @@ int hfsplus_file_extend(struct inode *inode) if (sbi->alloc_file->i_size * 8 < sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ - printk(KERN_ERR "hfs: extend alloc file! " + pr_err("extend alloc file! 
" "(%llu,%u,%u)\n", sbi->alloc_file->i_size * 8, sbi->total_blocks, sbi->free_blocks); @@ -459,11 +465,11 @@ int hfsplus_file_extend(struct inode *inode) } } - dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { - dprint(DBG_EXTENT, "first extents\n"); + hfs_dbg(EXTENT, "first extents\n"); /* no extents yet */ hip->first_extents[0].start_block = cpu_to_be32(start); hip->first_extents[0].block_count = cpu_to_be32(len); @@ -500,7 +506,7 @@ out: return res; insert_extent: - dprint(DBG_EXTENT, "insert new extent\n"); + hfs_dbg(EXTENT, "insert new extent\n"); res = hfsplus_ext_write_extent_locked(inode); if (res) goto out; @@ -525,15 +531,14 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n", - inode->i_ino, (long long)hip->phys_size, - inode->i_size); + hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n", + inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - u32 size = inode->i_size; + loff_t size = inode->i_size; res = pagecache_write_begin(NULL, mapping, size, 0, AOP_FLAG_UNINTERRUPTIBLE, diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 05b11f36024c..60b0a3388b26 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -10,6 +10,12 @@ #ifndef _LINUX_HFSPLUS_FS_H #define _LINUX_HFSPLUS_FS_H +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/mutex.h> #include <linux/buffer_head.h> @@ -32,9 +38,17 @@ #endif #define DBG_MASK (0) -#define dprint(flg, fmt, args...) \ - if (flg & DBG_MASK) \ - printk(fmt , ## args) +#define hfs_dbg(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +} while (0) + +#define hfs_dbg_cont(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + pr_cont(fmt, ##__VA_ARGS__); \ +} while (0) /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' 
*/ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 160ccc9cdb4b..7faaa964968e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -357,7 +357,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, if (!error) error = error2; } else { - printk(KERN_ERR "hfs: sync non-existent attributes tree\n"); + pr_err("sync non-existent attributes tree\n"); } } @@ -573,7 +573,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); HFSPLUS_I(inode)->create_date = file->create_date; } else { - printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); + pr_err("bad catalog entry used to create inode\n"); res = -EIO; } return res; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index ed257c671615..968eab5bc1f5 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -113,67 +113,67 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) switch (token) { case opt_creator: if (match_fourchar(&args[0], &sbi->creator)) { - printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + pr_err("creator requires a 4 character value\n"); return 0; } break; case opt_type: if (match_fourchar(&args[0], &sbi->type)) { - printk(KERN_ERR "hfs: type requires a 4 character value\n"); + pr_err("type requires a 4 character value\n"); return 0; } break; case opt_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: umask requires a value\n"); + pr_err("umask requires a value\n"); return 0; } sbi->umask = (umode_t)tmp; break; case opt_uid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: uid requires an argument\n"); + pr_err("uid requires an argument\n"); return 0; } sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp); if (!uid_valid(sbi->uid)) { - printk(KERN_ERR "hfs: invalid uid specified\n"); + pr_err("invalid uid specified\n"); return 0; } break; case opt_gid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: gid requires an argument\n"); + pr_err("gid requires an argument\n"); return 0; } sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp); if (!gid_valid(sbi->gid)) { - printk(KERN_ERR "hfs: invalid gid specified\n"); + pr_err("invalid gid specified\n"); return 0; } break; case opt_part: if (match_int(&args[0], &sbi->part)) { - printk(KERN_ERR "hfs: part requires an argument\n"); + pr_err("part requires an argument\n"); return 0; } break; case opt_session: if (match_int(&args[0], &sbi->session)) { - printk(KERN_ERR "hfs: session requires an argument\n"); + pr_err("session requires an argument\n"); return 0; } break; case opt_nls: if (sbi->nls) { - printk(KERN_ERR "hfs: unable to change nls mapping\n"); + pr_err("unable to change nls mapping\n"); return 0; } p = match_strdup(&args[0]); if (p) sbi->nls = load_nls(p); if (!sbi->nls) { - printk(KERN_ERR "hfs: unable to load " + pr_err("unable to load " "nls mapping \"%s\"\n", p); kfree(p); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7b87284e46dc..4c4d142cf890 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -132,7 +132,7 @@ static int hfsplus_system_write_inode(struct inode *inode) if (tree) { int err = hfs_btree_write(tree); if (err) { - printk(KERN_ERR "hfs: b-tree write err: %d, ino %lu\n", + pr_err("b-tree write err: %d, ino %lu\n", err, inode->i_ino); return err; } @@ -145,7 +145,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", 
inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) @@ -160,7 +160,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { @@ -179,7 +179,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - dprint(DBG_SUPER, "hfsplus_sync_fs\n"); + hfs_dbg(SUPER, "hfsplus_sync_fs\n"); /* * Explicitly write out the special metadata inodes. @@ -251,7 +251,7 @@ static void delayed_sync_fs(struct work_struct *work) err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); if (err) - printk(KERN_ERR "hfs: delayed sync fs err %d\n", err); + pr_err("delayed sync fs err %d\n", err); } void hfsplus_mark_mdb_dirty(struct super_block *sb) @@ -275,7 +275,7 @@ static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); - dprint(DBG_SUPER, "hfsplus_put_super\n"); + hfs_dbg(SUPER, "hfsplus_put_super\n"); cancel_delayed_work_sync(&sbi->sync_work); @@ -333,25 +333,19 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was " - "not cleanly unmounted, " - "running fsck.hfsplus is recommended. " - "leaving read-only.\n"); + pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } else if (force) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { - printk(KERN_WARNING "hfs: filesystem is marked locked, " - "leaving read-only.\n"); + pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { - printk(KERN_WARNING "hfs: filesystem is " - "marked journaled, " - "leaving read-only.\n"); + pr_warn("filesystem is marked journaled, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } @@ -397,7 +391,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; if (!hfsplus_parse_options(data, sbi)) { - printk(KERN_ERR "hfs: unable to parse mount options\n"); + pr_err("unable to parse mount options\n"); goto out_unload_nls; } @@ -405,14 +399,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) nls = sbi->nls; sbi->nls = load_nls("utf8"); if (!sbi->nls) { - printk(KERN_ERR "hfs: unable to load nls for utf8\n"); + pr_err("unable to load nls for utf8\n"); goto out_unload_nls; } /* Grab the volume header */ if (hfsplus_read_wrapper(sb)) { if (!silent) - printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); + pr_warn("unable to find HFS+ superblock\n"); goto out_unload_nls; } vhdr = sbi->s_vhdr; @@ -421,7 +415,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = HFSPLUS_VOLHEAD_SIG; if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { - printk(KERN_ERR "hfs: wrong filesystem version\n"); + pr_err("wrong filesystem version\n"); goto out_free_vhdr; } sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); @@ -445,7 +439,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if ((last_fs_block 
> (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { - printk(KERN_ERR "hfs: filesystem size too large.\n"); + pr_err("filesystem size too large\n"); goto out_free_vhdr; } @@ -454,22 +448,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { - printk(KERN_WARNING "hfs: Filesystem was " - "not cleanly unmounted, " - "running fsck.hfsplus is recommended. " - "mounting read-only.\n"); + pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { - printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); + pr_warn("Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING "hfs: write access to " - "a journaled filesystem is not supported, " - "use the force option at your own risk, " - "mounting read-only.\n"); + pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } @@ -478,18 +466,18 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) /* Load metadata objects (B*Trees) */ sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); if (!sbi->ext_tree) { - printk(KERN_ERR "hfs: failed to load extents file\n"); + pr_err("failed to load extents file\n"); goto out_free_vhdr; } sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); if (!sbi->cat_tree) { - printk(KERN_ERR "hfs: failed to load catalog file\n"); + pr_err("failed to load catalog file\n"); goto out_close_ext_tree; } if (vhdr->attr_file.total_blocks != 0) { sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); if (!sbi->attr_tree) { - printk(KERN_ERR "hfs: failed to load attributes file\n"); + pr_err("failed to load attributes file\n"); goto out_close_cat_tree; } } @@ -497,7 +485,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); if (IS_ERR(inode)) { - printk(KERN_ERR "hfs: failed to load allocation file\n"); + pr_err("failed to load allocation file\n"); err = PTR_ERR(inode); goto out_close_attr_tree; } @@ -506,7 +494,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) /* Load the root directory */ root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); if (IS_ERR(root)) { - printk(KERN_ERR "hfs: failed to load root directory\n"); + pr_err("failed to load root directory\n"); err = PTR_ERR(root); goto out_put_alloc_file; } diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 90effcccca9a..b51a6079108d 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -156,7 +156,7 @@ static int hfsplus_get_last_session(struct super_block *sb, *start = (sector_t)te.cdte_addr.lba << 2; return 0; } - printk(KERN_ERR "hfs: invalid session number or type of track\n"); + pr_err("invalid session number or type of track\n"); return -EINVAL; } ms_info.addr_format = CDROM_LBA; @@ -234,8 +234,7 @@ reread: error = -EINVAL; if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) { - printk(KERN_WARNING - "hfs: invalid secondary volume header\n"); + pr_warn("invalid secondary 
volume header\n"); goto out_free_backup_vhdr; } @@ -259,8 +258,7 @@ reread: blocksize >>= 1; if (sb_set_blocksize(sb, blocksize) != blocksize) { - printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", - blocksize); + pr_err("unable to set blocksize to %u!\n", blocksize); goto out_free_backup_vhdr; } diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e8a4b0815c61..f66346155df5 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -107,19 +107,19 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return err; } err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); if (err) { - printk(KERN_ERR "hfs: catalog searching failed\n"); + pr_err("catalog searching failed\n"); goto end_setxattr; } if (!strcmp_xattr_finder_info(name)) { if (flags & XATTR_CREATE) { - printk(KERN_ERR "hfs: xattr exists yet\n"); + pr_err("xattr exists yet\n"); err = -EOPNOTSUPP; goto end_setxattr; } @@ -165,7 +165,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, if (hfsplus_attr_exists(inode, name)) { if (flags & XATTR_CREATE) { - printk(KERN_ERR "hfs: xattr exists yet\n"); + pr_err("xattr exists yet\n"); err = -EOPNOTSUPP; goto end_setxattr; } @@ -177,7 +177,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, goto end_setxattr; } else { if (flags & XATTR_REPLACE) { - printk(KERN_ERR "hfs: cannot replace xattr\n"); + pr_err("cannot replace xattr\n"); err = -EOPNOTSUPP; goto end_setxattr; } @@ -210,7 +210,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, cat_entry_flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { - printk(KERN_ERR "hfs: invalid catalog entry type\n"); + pr_err("invalid catalog entry type\n"); err = -EIO; goto end_setxattr; } @@ -269,7 +269,7 @@ static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, if (size >= record_len) { res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return res; } res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); @@ -340,13 +340,13 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, entry = hfsplus_alloc_attr_entry(); if (!entry) { - printk(KERN_ERR "hfs: can't allocate xattr entry\n"); + pr_err("can't allocate xattr entry\n"); return -ENOMEM; } res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); if (res) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); goto failed_getxattr_init; } @@ -355,7 +355,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, if (res == -ENOENT) res = -ENODATA; else - printk(KERN_ERR "hfs: xattr searching failed\n"); + pr_err("xattr searching failed\n"); goto out; } @@ -368,17 +368,17 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, offsetof(struct hfsplus_attr_inline_data, length)); if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) { - printk(KERN_ERR "hfs: invalid xattr record size\n"); + pr_err("invalid xattr record size\n"); res = -EIO; goto out; } } else if (record_type == HFSPLUS_ATTR_FORK_DATA || record_type == HFSPLUS_ATTR_EXTENTS) { - printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + pr_err("only inline data xattr are supported\n"); res = -EOPNOTSUPP; goto out; } else { - printk(KERN_ERR "hfs: 
invalid xattr record\n"); + pr_err("invalid xattr record\n"); res = -EIO; goto out; } @@ -427,7 +427,7 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return res; } @@ -506,7 +506,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); if (err) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return err; } @@ -525,8 +525,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) for (;;) { key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); if (key_len == 0 || key_len > fd.tree->max_key_len) { - printk(KERN_ERR "hfs: invalid xattr key length: %d\n", - key_len); + pr_err("invalid xattr key length: %d\n", key_len); res = -EIO; goto end_listxattr; } @@ -541,7 +540,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) if (hfsplus_uni2asc(inode->i_sb, (const struct hfsplus_unistr *)&fd.key->attr.key_name, strbuf, &xattr_name_len)) { - printk(KERN_ERR "hfs: unicode conversion failed\n"); + pr_err("unicode conversion failed\n"); res = -EIO; goto end_listxattr; } @@ -598,13 +597,13 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name) err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return err; } err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); if (err) { - printk(KERN_ERR "hfs: catalog searching failed\n"); + pr_err("catalog searching failed\n"); goto end_removexattr; } @@ -643,7 +642,7 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name) flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { - printk(KERN_ERR "hfs: invalid catalog entry type\n"); + pr_err("invalid catalog entry type\n"); err = -EIO; goto end_removexattr; } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 9f9dbeceeee7..3027f4dbbab5 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -131,6 +131,24 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, return ret; } +static int hpfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pagep, void *fsdata) +{ + struct inode *inode = mapping->host; + int err; + err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + if (err < len) + hpfs_write_failed(mapping, pos + len); + if (!(err < 0)) { + /* make sure we write it on close, if not earlier */ + hpfs_lock(inode->i_sb); + hpfs_i(inode)->i_dirty = 1; + hpfs_unlock(inode->i_sb); + } + return err; +} + static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,hpfs_get_block); @@ -140,30 +158,16 @@ const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, .write_begin = hpfs_write_begin, - .write_end = generic_write_end, + .write_end = hpfs_write_end, .bmap = _hpfs_bmap }; -static ssize_t hpfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - ssize_t retval; - - retval = do_sync_write(file, buf, count, ppos); - if (retval > 0) { - hpfs_lock(file->f_path.dentry->d_sb); - hpfs_i(file_inode(file))->i_dirty = 1; - 
hpfs_unlock(file->f_path.dentry->d_sb); - } - return retval; -} - const struct file_operations hpfs_file_ops = { .llseek = generic_file_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, - .write = hpfs_file_write, + .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .release = hpfs_file_release, diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 126d3c2e2dee..cd3e38972c86 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -436,7 +436,6 @@ static int hppfs_open(struct inode *inode, struct file *file) path.mnt = inode->i_sb->s_fs_info; path.dentry = HPPFS_I(inode)->proc_dentry; - /* XXX This isn't closed anywhere */ data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); err = PTR_ERR(data->proc_file); if (IS_ERR(data->proc_file)) @@ -523,12 +522,23 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where) return default_llseek(file, off, where); } +static int hppfs_release(struct inode *inode, struct file *file) +{ + struct hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + if (proc_file) + fput(proc_file); + kfree(data); + return 0; +} + static const struct file_operations hppfs_file_fops = { .owner = NULL, .llseek = hppfs_llseek, .read = hppfs_read, .write = hppfs_write, .open = hppfs_open, + .release = hppfs_release, }; struct hppfs_dirent { @@ -570,18 +580,12 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) return err; } -static int hppfs_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - return filemap_write_and_wait_range(file->f_mapping, start, end); -} - static const struct file_operations hppfs_dir_fops = { .owner = NULL, .readdir = hppfs_readdir, .open = hppfs_dir_open, - .fsync = hppfs_fsync, .llseek = default_llseek, + .release = hppfs_release, }; static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 84e3d856e91d..523464e62849 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) diff --git a/fs/inode.c b/fs/inode.c index f5f7c06c36fb..00d5fc3b86e1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -725,7 +725,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) * inode to the back of the list so we don't spin on it. 
*/ if (!spin_trylock(&inode->i_lock)) { - list_move_tail(&inode->i_lru, &sb->s_inode_lru); + list_move(&inode->i_lru, &sb->s_inode_lru); continue; } @@ -1803,7 +1803,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; } else if (S_ISFIFO(mode)) - inode->i_fop = &def_fifo_fops; + inode->i_fop = &pipefifo_fops; else if (S_ISSOCK(mode)) inode->i_fop = &bad_sock_fops; else diff --git a/fs/internal.h b/fs/internal.h index 4be78237d896..eaa75f75b625 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -130,3 +130,8 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); * read_write.c */ extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); + +/* + * pipe.c + */ +extern const struct file_operations pipefifo_fops; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 86b39b167c23..11bb11f48b3a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -162,8 +162,17 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, for (i = 0; i < bufs; i++) { wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(write_op, wbuf[i]); + /* + * Here we write back pagecache data that may be mmaped. Since + * we cannot afford to clean the page and set PageWriteback + * here due to lock ordering (page lock ranks above transaction + * start), the data can change while IO is in flight. Tell the + * block layer it should bounce the bio pages if stable data + * during write is required. + * + * We use up our safety reference in submit_bh(). + */ + _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); } } @@ -667,7 +676,17 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(write_op, bh); + /* + * In data=journal mode, here we can end up + * writing pagecache data that might be + * mmapped. Since we can't afford to clean the + * page and set PageWriteback (see the comment + * near the other use of _submit_bh()), the + * data can change while the write is in + * flight. Tell the block layer to bounce the + * bio pages if stable pages are required. 
+ */ + _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); } cond_resched(); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 726a4432cbb1..6510d6355729 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -310,8 +310,6 @@ int journal_write_metadata_buffer(transaction_t *transaction, new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); atomic_set(&new_bh->b_count, 1); new_jh = journal_add_journal_head(new_bh); /* This sleeps */ diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 750c70148eff..0f53946f13c1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int space_left = 0; int first_tag = 0; int tag_flag; - int i, to_free = 0; + int i; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -1134,7 +1134,7 @@ restart_loop: journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); - commit_transaction->t_state = T_FINISHED; + commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; @@ -1149,38 +1149,44 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; + write_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { - __jbd2_journal_drop_transaction(journal, commit_transaction); - to_free = 1; + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; } else { - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = commit_transaction; - } } spin_unlock(&journal->j_list_lock); - + /* Drop all spin_locks because commit_callback may block. 
+ * __journal_remove_checkpoint() cannot destroy the transaction + * under us because it is not marked as T_FINISHED yet. */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); - if (to_free) - jbd2_journal_free_transaction(commit_transaction); + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + commit_transaction->t_state = T_FINISHED; + /* Recheck checkpoint lists after j_list_lock was dropped */ + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + jbd2_journal_free_transaction(commit_transaction); + } + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ed10991ab006..95457576e434 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -367,8 +367,6 @@ retry_alloc: } /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); atomic_set(&new_bh->b_count, 1); new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ @@ -710,6 +708,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } /* + * When this function returns, the transaction corresponding to tid + * will be completed. If the transaction is currently running, start + * committing that transaction before waiting for it to complete. If + * the transaction id is stale, it is by definition already completed, + * so just return SUCCESS. + */ +int jbd2_complete_transaction(journal_t *journal, tid_t tid) +{ + int need_to_wait = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) { + if (journal->j_commit_request != tid) { + /* transaction not yet started, so request it */ + read_unlock(&journal->j_state_lock); + jbd2_log_start_commit(journal, tid); + goto wait_commit; + } + } else if (!(journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid)) + need_to_wait = 0; + read_unlock(&journal->j_state_lock); + if (!need_to_wait) + return 0; +wait_commit: + return jbd2_log_wait_commit(journal, tid); +} +EXPORT_SYMBOL(jbd2_complete_transaction); + +/* * Log buffer allocation routines: */ @@ -950,7 +979,7 @@ static const struct seq_operations jbd2_seq_info_ops = { static int jbd2_seq_info_open(struct inode *inode, struct file *file) { - journal_t *journal = PDE(inode)->data; + journal_t *journal = PDE_DATA(inode); struct jbd2_stats_proc_session *s; int rc, size; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 325bc019ed88..10f524c59ea8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; @@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int error; char *frozen_buffer = NULL; int need_copy = 0; + unsigned long start_lock, time_lock; if (is_handle_aborted(handle)) return -EROFS; @@ -655,9 +655,16 @@ repeat: /* @@@ Need to check for errors here at some point.
*/ + start_lock = jiffies; lock_buffer(bh); jbd_lock_bh_state(bh); + /* If it takes too long to lock the buffer, trace it */ + time_lock = jbd2_time_diff(start_lock, jiffies); + if (time_lock > HZ/10) + trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, + jiffies_to_msecs(time_lock)); + /* We now hold the buffer lock so it is safe to query the buffer * state. Is the buffer dirty? * diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 0796c45d0d4d..01bfe7662751 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -144,6 +144,9 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) timeout); if (ret < 0) return -ERESTARTSYS; + /* Reset the lock status after a server reboot so we resend */ + if (block->b_status == nlm_lck_denied_grace_period) + block->b_status = nlm_lck_blocked; req->a_res.status = block->b_status; return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 7e529c3c45c0..9760ecb9b60f 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -550,9 +550,6 @@ again: status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); if (status < 0) break; - /* Resend the blocking lock request after a server reboot */ - if (resp->status == nlm_lck_denied_grace_period) - continue; if (resp->status != nlm_lck_blocked) break; } diff --git a/fs/mount.h b/fs/mount.h index cd5007980400..64a858143ff9 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -18,6 +18,12 @@ struct mnt_pcp { int mnt_writers; }; +struct mountpoint { + struct list_head m_hash; + struct dentry *m_dentry; + int m_count; +}; + struct mount { struct list_head mnt_hash; struct mount *mnt_parent; @@ -40,6 +46,7 @@ struct mount { struct list_head mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ + struct mountpoint *mnt_mp; /* where is it mounted */ #ifdef CONFIG_FSNOTIFY struct hlist_head mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb45..b4f96a5230a3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -21,7 +21,8 @@ #include <linux/fs_struct.h> /* get_fs_root et.al. 
*/ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/uaccess.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/magic.h> #include "pnode.h" #include "internal.h" @@ -36,6 +37,7 @@ static int mnt_id_start = 0; static int mnt_group_start = 1; static struct list_head *mount_hashtable __read_mostly; +static struct list_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static struct rw_semaphore namespace_sem; @@ -605,6 +607,51 @@ struct vfsmount *lookup_mnt(struct path *path) } } +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry); + struct mountpoint *mp; + + list_for_each_entry(mp, chain, m_hash) { + if (mp->m_dentry == dentry) { + /* might be worth a WARN_ON() */ + if (d_unlinked(dentry)) + return ERR_PTR(-ENOENT); + mp->m_count++; + return mp; + } + } + + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!mp) + return ERR_PTR(-ENOMEM); + + spin_lock(&dentry->d_lock); + if (d_unlinked(dentry)) { + spin_unlock(&dentry->d_lock); + kfree(mp); + return ERR_PTR(-ENOENT); + } + dentry->d_flags |= DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + mp->m_dentry = dentry; + mp->m_count = 1; + list_add(&mp->m_hash, chain); + return mp; +} + +static void put_mountpoint(struct mountpoint *mp) +{ + if (!--mp->m_count) { + struct dentry *dentry = mp->m_dentry; + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + list_del(&mp->m_hash); + kfree(mp); + } +} + static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; @@ -633,27 +680,6 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } /* - * Clear dentry's mounted state if it has no remaining mounts. - * vfsmount_lock must be held for write. 
- */ -static void dentry_reset_mounted(struct dentry *dentry) -{ - unsigned u; - - for (u = 0; u < HASH_SIZE; u++) { - struct mount *p; - - list_for_each_entry(p, &mount_hashtable[u], mnt_hash) { - if (p->mnt_mountpoint == dentry) - return; - } - } - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_MOUNTED; - spin_unlock(&dentry->d_lock); -} - -/* * vfsmount lock must be held for write */ static void detach_mnt(struct mount *mnt, struct path *old_path) @@ -664,32 +690,35 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_hash); - dentry_reset_mounted(old_path->dentry); + put_mountpoint(mnt->mnt_mp); + mnt->mnt_mp = NULL; } /* * vfsmount lock must be held for write */ -void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry, +void mnt_set_mountpoint(struct mount *mnt, + struct mountpoint *mp, struct mount *child_mnt) { + mp->m_count++; mnt_add_count(mnt, 1); /* essentially, that's mntget */ - child_mnt->mnt_mountpoint = dget(dentry); + child_mnt->mnt_mountpoint = dget(mp->m_dentry); child_mnt->mnt_parent = mnt; - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_MOUNTED; - spin_unlock(&dentry->d_lock); + child_mnt->mnt_mp = mp; } /* * vfsmount lock must be held for write */ -static void attach_mnt(struct mount *mnt, struct path *path) +static void attach_mnt(struct mount *mnt, + struct mount *parent, + struct mountpoint *mp) { - mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt); + mnt_set_mountpoint(parent, mp, mnt); list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(path->mnt, path->dentry)); - list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts); + hash(&parent->mnt, mp->m_dentry)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /* @@ -798,6 +827,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + /* Don't allow unprivileged users to change mount flags */ + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -1091,11 +1124,23 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); -void release_mounts(struct list_head *head) +static LIST_HEAD(unmounted); /* protected by namespace_sem */ + +static void namespace_unlock(void) { struct mount *mnt; - while (!list_empty(head)) { - mnt = list_first_entry(head, struct mount, mnt_hash); + LIST_HEAD(head); + + if (likely(list_empty(&unmounted))) { + up_write(&namespace_sem); + return; + } + + list_splice_init(&unmounted, &head); + up_write(&namespace_sem); + + while (!list_empty(&head)) { + mnt = list_first_entry(&head, struct mount, mnt_hash); list_del_init(&mnt->mnt_hash); if (mnt_has_parent(mnt)) { struct dentry *dentry; @@ -1115,11 +1160,16 @@ void release_mounts(struct list_head *head) } } +static inline void namespace_lock(void) +{ + down_write(&namespace_sem); +} + /* * vfsmount lock must be held for write * namespace_sem must be held for write */ -void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) +void umount_tree(struct mount *mnt, int propagate) { LIST_HEAD(tmp_list); struct mount *p; @@ -1138,20 +1188,20 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { p->mnt_parent->mnt_ghosts++; - 
dentry_reset_mounted(p->mnt_mountpoint); + put_mountpoint(p->mnt_mp); + p->mnt_mp = NULL; } change_mnt_propagation(p, MS_PRIVATE); } - list_splice(&tmp_list, kill); + list_splice(&tmp_list, &unmounted); } -static void shrink_submounts(struct mount *mnt, struct list_head *umounts); +static void shrink_submounts(struct mount *mnt); static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; int retval; - LIST_HEAD(umount_list); retval = security_sb_umount(&mnt->mnt, flags); if (retval) @@ -1218,22 +1268,21 @@ static int do_umount(struct mount *mnt, int flags) return retval; } - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); event++; if (!(flags & MNT_DETACH)) - shrink_submounts(mnt, &umount_list); + shrink_submounts(mnt); retval = -EBUSY; if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1, &umount_list); + umount_tree(mnt, 1); retval = 0; } br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_unlock(); return retval; } @@ -1306,13 +1355,13 @@ static bool mnt_ns_loop(struct path *path) * mount namespace loop? */ struct inode *inode = path->dentry->d_inode; - struct proc_inode *ei; + struct proc_ns *ei; struct mnt_namespace *mnt_ns; if (!proc_ns_inode(inode)) return false; - ei = PROC_I(inode); + ei = get_proc_ns(inode); if (ei->ns_ops != &mntns_operations) return false; @@ -1323,8 +1372,7 @@ static bool mnt_ns_loop(struct path *path) struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { - struct mount *res, *p, *q, *r; - struct path path; + struct mount *res, *p, *q, *r, *parent; if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) return ERR_PTR(-EINVAL); @@ -1351,25 +1399,22 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, q = q->mnt_parent; } p = s; - path.mnt = &q->mnt; - path.dentry = p->mnt_mountpoint; + parent = q; q = clone_mnt(p, p->mnt.mnt_root, flag); if (IS_ERR(q)) goto out; br_write_lock(&vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, &path); + attach_mnt(q, parent, p->mnt_mp); br_write_unlock(&vfsmount_lock); } } return res; out: if (res) { - LIST_HEAD(umount_list); br_write_lock(&vfsmount_lock); - umount_tree(res, 0, &umount_list); + umount_tree(res, 0); br_write_unlock(&vfsmount_lock); - release_mounts(&umount_list); } return q; } @@ -1379,10 +1424,10 @@ out: struct vfsmount *collect_mounts(struct path *path) { struct mount *tree; - down_write(&namespace_sem); + namespace_lock(); tree = copy_tree(real_mount(path->mnt), path->dentry, CL_COPY_ALL | CL_PRIVATE); - up_write(&namespace_sem); + namespace_unlock(); if (IS_ERR(tree)) return NULL; return &tree->mnt; @@ -1390,13 +1435,11 @@ struct vfsmount *collect_mounts(struct path *path) void drop_collected_mounts(struct vfsmount *mnt) { - LIST_HEAD(umount_list); - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); - umount_tree(real_mount(mnt), 0, &umount_list); + umount_tree(real_mount(mnt), 0); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_unlock(); } int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, @@ -1505,11 +1548,11 @@ static int invent_group_ids(struct mount *mnt, bool recurse) * in allocations. 
*/ static int attach_recursive_mnt(struct mount *source_mnt, - struct path *path, struct path *parent_path) + struct mount *dest_mnt, + struct mountpoint *dest_mp, + struct path *parent_path) { LIST_HEAD(tree_list); - struct mount *dest_mnt = real_mount(path->mnt); - struct dentry *dest_dentry = path->dentry; struct mount *child, *p; int err; @@ -1518,7 +1561,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (err) goto out; } - err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); if (err) goto out_cleanup_ids; @@ -1530,10 +1573,10 @@ static int attach_recursive_mnt(struct mount *source_mnt, } if (parent_path) { detach_mnt(source_mnt, parent_path); - attach_mnt(source_mnt, path); + attach_mnt(source_mnt, dest_mnt, dest_mp); touch_mnt_namespace(source_mnt->mnt_ns); } else { - mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } @@ -1552,46 +1595,53 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } -static int lock_mount(struct path *path) +static struct mountpoint *lock_mount(struct path *path) { struct vfsmount *mnt; + struct dentry *dentry = path->dentry; retry: - mutex_lock(&path->dentry->d_inode->i_mutex); - if (unlikely(cant_mount(path->dentry))) { - mutex_unlock(&path->dentry->d_inode->i_mutex); - return -ENOENT; + mutex_lock(&dentry->d_inode->i_mutex); + if (unlikely(cant_mount(dentry))) { + mutex_unlock(&dentry->d_inode->i_mutex); + return ERR_PTR(-ENOENT); } - down_write(&namespace_sem); + namespace_lock(); mnt = lookup_mnt(path); - if (likely(!mnt)) - return 0; - up_write(&namespace_sem); + if (likely(!mnt)) { + struct mountpoint *mp = new_mountpoint(dentry); + if (IS_ERR(mp)) { + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); + return mp; + } + return mp; + } + namespace_unlock(); mutex_unlock(&path->dentry->d_inode->i_mutex); path_put(path); path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); + dentry = path->dentry = dget(mnt->mnt_root); goto retry; } -static void unlock_mount(struct path *path) +static void unlock_mount(struct mountpoint *where) { - up_write(&namespace_sem); - mutex_unlock(&path->dentry->d_inode->i_mutex); + struct dentry *dentry = where->m_dentry; + put_mountpoint(where); + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); } -static int graft_tree(struct mount *mnt, struct path *path) +static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) return -EINVAL; - if (S_ISDIR(path->dentry->d_inode->i_mode) != + if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) return -ENOTDIR; - if (d_unlinked(path->dentry)) - return -ENOENT; - - return attach_recursive_mnt(mnt, path, NULL); + return attach_recursive_mnt(mnt, p, mp, NULL); } /* @@ -1629,7 +1679,7 @@ static int do_change_type(struct path *path, int flag) if (!type) return -EINVAL; - down_write(&namespace_sem); + namespace_lock(); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) @@ -1642,7 +1692,7 @@ static int do_change_type(struct path *path, int flag) br_write_unlock(&vfsmount_lock); out_unlock: - up_write(&namespace_sem); + namespace_unlock(); return err; } @@ -1652,9 +1702,9 @@ static int do_change_type(struct path *path, int flag) static int do_loopback(struct path *path, const char *old_name, int recurse) { - LIST_HEAD(umount_list); struct path 
old_path; - struct mount *mnt = NULL, *old; + struct mount *mnt = NULL, *old, *parent; + struct mountpoint *mp; int err; if (!old_name || !*old_name) return -EINVAL; @@ -1666,17 +1716,19 @@ static int do_loopback(struct path *path, const char *old_name, if (mnt_ns_loop(&old_path)) goto out; - err = lock_mount(path); - if (err) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; old = real_mount(old_path.mnt); + parent = real_mount(path->mnt); err = -EINVAL; if (IS_MNT_UNBINDABLE(old)) goto out2; - if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old)) + if (!check_mnt(parent) || !check_mnt(old)) goto out2; if (recurse) @@ -1686,18 +1738,17 @@ static int do_loopback(struct path *path, const char *old_name, if (IS_ERR(mnt)) { err = PTR_ERR(mnt); - goto out; + goto out2; } - err = graft_tree(mnt, path); + err = graft_tree(mnt, parent, mp); if (err) { br_write_lock(&vfsmount_lock); - umount_tree(mnt, 0, &umount_list); + umount_tree(mnt, 0); br_write_unlock(&vfsmount_lock); } out2: - unlock_mount(path); - release_mounts(&umount_list); + unlock_mount(mp); out: path_put(&old_path); return err; @@ -1713,6 +1764,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else @@ -1779,6 +1833,7 @@ static int do_move_mount(struct path *path, const char *old_name) struct path old_path, parent_path; struct mount *p; struct mount *old; + struct mountpoint *mp; int err; if (!old_name || !*old_name) return -EINVAL; @@ -1786,8 +1841,9 @@ static int do_move_mount(struct path *path, const char *old_name) if (err) return err; - err = lock_mount(path); - if (err < 0) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; old = real_mount(old_path.mnt); @@ -1797,9 +1853,6 @@ static int do_move_mount(struct path *path, const char *old_name) if (!check_mnt(p) || !check_mnt(old)) goto out1; - if (d_unlinked(path->dentry)) - goto out1; - err = -EINVAL; if (old_path.dentry != old_path.mnt->mnt_root) goto out1; @@ -1826,7 +1879,7 @@ static int do_move_mount(struct path *path, const char *old_name) if (p == old) goto out1; - err = attach_recursive_mnt(old, path, &parent_path); + err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); if (err) goto out1; @@ -1834,7 +1887,7 @@ static int do_move_mount(struct path *path, const char *old_name) * automatically */ list_del_init(&old->mnt_expire); out1: - unlock_mount(path); + unlock_mount(mp); out: if (!err) path_put(&parent_path); @@ -1870,21 +1923,24 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) */ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) { + struct mountpoint *mp; + struct mount *parent; int err; mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); - err = lock_mount(path); - if (err) - return err; + mp = lock_mount(path); + if (IS_ERR(mp)) + return PTR_ERR(mp); + parent = real_mount(path->mnt); err = -EINVAL; - if (unlikely(!check_mnt(real_mount(path->mnt)))) { + if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) goto unlock; /* ... 
and for those we'd better have mountpoint still alive */ - if (!real_mount(path->mnt)->mnt_ns) + if (!parent->mnt_ns) goto unlock; } @@ -1899,10 +1955,10 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) goto unlock; newmnt->mnt.mnt_flags = mnt_flags; - err = graft_tree(newmnt, path); + err = graft_tree(newmnt, parent, mp); unlock: - unlock_mount(path); + unlock_mount(mp); return err; } @@ -1975,11 +2031,11 @@ int finish_automount(struct vfsmount *m, struct path *path) fail: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); list_del_init(&mnt->mnt_expire); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); + namespace_unlock(); } mntput(m); mntput(m); @@ -1993,13 +2049,13 @@ fail: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); + namespace_unlock(); } EXPORT_SYMBOL(mnt_set_expiry); @@ -2012,12 +2068,11 @@ void mark_mounts_for_expiry(struct list_head *mounts) { struct mount *mnt, *next; LIST_HEAD(graveyard); - LIST_HEAD(umounts); if (list_empty(mounts)) return; - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); /* extract from the expiration list every vfsmount that matches the @@ -2035,12 +2090,10 @@ void mark_mounts_for_expiry(struct list_head *mounts) while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1, &umounts); + umount_tree(mnt, 1); } br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - - release_mounts(&umounts); + namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); @@ -2097,7 +2150,7 @@ resume: * * vfsmount_lock must be held for write */ -static void shrink_submounts(struct mount *mnt, struct list_head *umounts) +static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); struct mount *m; @@ -2108,7 +2161,7 @@ static void shrink_submounts(struct mount *mnt, struct list_head *umounts) m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); - umount_tree(m, 1, umounts); + umount_tree(m, 1); } } } @@ -2335,14 +2388,14 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, if (IS_ERR(new_ns)) return new_ns; - down_write(&namespace_sem); + namespace_lock(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_ALL | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { - up_write(&namespace_sem); + namespace_unlock(); free_mnt_ns(new_ns); return ERR_CAST(new); } @@ -2373,7 +2426,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, p = next_mnt(p, old); q = next_mnt(q, new); } - up_write(&namespace_sem); + namespace_unlock(); if (rootmnt) mntput(rootmnt); @@ -2543,7 +2596,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { struct path new, old, parent_path, root_parent, root; - struct mount *new_mnt, *root_mnt; + struct mount *new_mnt, *root_mnt, *old_mnt; + struct mountpoint *old_mp, *root_mp; int error; if (!may_mount()) @@ -2562,14 +2616,16 @@ 
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out2; get_fs_root(current->fs, &root); - error = lock_mount(&old); - if (error) + old_mp = lock_mount(&old); + error = PTR_ERR(old_mp); + if (IS_ERR(old_mp)) goto out3; error = -EINVAL; new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); - if (IS_MNT_SHARED(real_mount(old.mnt)) || + old_mnt = real_mount(old.mnt); + if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(new_mnt->mnt_parent) || IS_MNT_SHARED(root_mnt->mnt_parent)) goto out4; @@ -2578,37 +2634,37 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = -ENOENT; if (d_unlinked(new.dentry)) goto out4; - if (d_unlinked(old.dentry)) - goto out4; error = -EBUSY; - if (new.mnt == root.mnt || - old.mnt == root.mnt) + if (new_mnt == root_mnt || old_mnt == root_mnt) goto out4; /* loop, on the same file system */ error = -EINVAL; if (root.mnt->mnt_root != root.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ + root_mp = root_mnt->mnt_mp; if (new.mnt->mnt_root != new.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ - if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new)) + if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; + root_mp->m_count++; /* pin it so it won't go away */ br_write_lock(&vfsmount_lock); detach_mnt(new_mnt, &parent_path); detach_mnt(root_mnt, &root_parent); /* mount old root on put_old */ - attach_mnt(root_mnt, &old); + attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ - attach_mnt(new_mnt, &root_parent); + attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); touch_mnt_namespace(current->nsproxy->mnt_ns); br_write_unlock(&vfsmount_lock); chroot_fs_refs(&root, &new); + put_mountpoint(root_mp); error = 0; out4: - unlock_mount(&old); + unlock_mount(old_mp); if (!error) { path_put(&root_parent); path_put(&parent_path); @@ -2663,14 +2719,17 @@ void __init mnt_init(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); - if (!mount_hashtable) + if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE); for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mount_hashtable[u]); + for (u = 0; u < HASH_SIZE; u++) + INIT_LIST_HEAD(&mountpoint_hashtable[u]); br_lock_init(&vfsmount_lock); @@ -2687,16 +2746,13 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - LIST_HEAD(umount_list); - if (!atomic_dec_and_test(&ns->count)) return; - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); - umount_tree(ns->root, 0, &umount_list); + umount_tree(ns->root, 0); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_unlock(); free_mnt_ns(ns); } @@ -2732,6 +2788,51 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, 
&fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 5088b57b078a..cff089a412c7 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -125,6 +125,9 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2960512792c2..a13d26ede254 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -500,7 +500,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, &args->craa_type_mask)) pnfs_recall_all_layouts(cps->clp); if (flags) - nfs_expire_all_delegation_types(cps->clp, flags); + nfs_expire_unused_delegation_types(cps->clp, flags); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 84d8eae203a7..c513b0cc835f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -593,6 +593,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, args.flags |= RPC_CLNT_CREATE_DISCRTRY; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; if (!IS_ERR(clp->cl_rpcclient)) return 0; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 6390a4b5fee7..57db3244f4d9 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -64,17 +64,15 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags) return ret; } -static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) +static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { struct inode *inode = state->inode; struct file_lock *fl; int status = 0; if (inode->i_flock == NULL) - return 0; - - if (inode->i_flock == NULL) goto out; + /* Protect inode->i_flock using the file locks lock */ lock_flocks(); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { @@ -83,7 +81,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ if (nfs_file_open_context(fl->fl_file) != ctx) continue; unlock_flocks(); - status = nfs4_lock_delegation_recall(state, fl); + status = nfs4_lock_delegation_recall(fl, state, stateid); if (status < 0) goto out; lock_flocks(); @@ -120,7 +118,7 @@ again: seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); if (!err) - err = nfs_delegation_claim_locks(ctx, state); + err = nfs_delegation_claim_locks(ctx, state, stateid); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) err = -EAGAIN; mutex_unlock(&sp->so_delegreturn_mutex); 
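The delegation-claim hunk just above guards the recall work with the owner's so_reclaim_seqcount: it samples the counter with raw_seqcount_begin(), runs the open/lock recall, and converts the result to -EAGAIN when read_seqcount_retry() reports that the state manager ran concurrently. As a minimal, self-contained sketch of that sequence-counter retry pattern (userspace C11 atomics rather than the kernel seqlock API; seq, shared_state, seq_writer_update and seq_reader_snapshot are illustrative names, not kernel symbols):

/* Sketch only: userspace analogue of the seqcount retry idiom.
 * A writer makes the counter odd while it updates shared state;
 * a reader retries if it sampled an odd count or the count moved. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;		/* even = stable, odd = write in progress */
static int shared_state;	/* stands in for the recovered lock state */

static void seq_writer_update(int v)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_acq_rel);	/* -> odd */
	shared_state = v;
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* -> even */
}

static int seq_reader_snapshot(void)
{
	unsigned int start;
	int v;

	do {
		start = atomic_load_explicit(&seq, memory_order_acquire);
		v = shared_state;
	} while ((start & 1) ||
		 atomic_load_explicit(&seq, memory_order_acquire) != start);
	return v;
}

int main(void)
{
	seq_writer_update(42);
	printf("snapshot: %d\n", seq_reader_snapshot());
	return 0;
}

The kernel's raw_seqcount_begin() differs in that it clears the low bit of the sample, so the read side proceeds even while a write is in flight and is simply forced to retry afterwards; that is what lets the claim path above tolerate a concurrent state-manager pass instead of spinning on it.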
@@ -389,6 +387,24 @@ out: return err; } +static bool nfs_delegation_need_return(struct nfs_delegation *delegation) +{ + bool ret = false; + + if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) + ret = true; + if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { + struct inode *inode; + + spin_lock(&delegation->lock); + inode = delegation->inode; + if (inode && list_empty(&NFS_I(inode)->open_files)) + ret = true; + spin_unlock(&delegation->lock); + } + return ret; +} + /** * nfs_client_return_marked_delegations - return previously marked delegations * @clp: nfs_client to process @@ -411,8 +427,7 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - if (!test_and_clear_bit(NFS_DELEGATION_RETURN, - &delegation->flags)) + if (!nfs_delegation_need_return(delegation)) continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) @@ -471,6 +486,13 @@ int nfs4_inode_return_delegation(struct inode *inode) return err; } +static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + static void nfs_mark_return_delegation(struct nfs_server *server, struct nfs_delegation *delegation) { @@ -478,6 +500,45 @@ static void nfs_mark_return_delegation(struct nfs_server *server, set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } +static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + bool ret = false; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + nfs_mark_return_delegation(server, delegation); + ret = true; + } + return ret; +} + +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_server_mark_return_all_delegations(server); + rcu_read_unlock(); +} + +static void nfs_delegation_run_state_manager(struct nfs_client *clp) +{ + if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) + nfs4_schedule_state_manager(clp); +} + +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegations(clp); + nfs_delegation_run_state_manager(clp); +} + /** * nfs_super_return_all_delegations - return delegations for one superblock * @sb: sb to process @@ -486,24 +547,22 @@ static void nfs_mark_return_delegation(struct nfs_server *server, void nfs_server_return_all_delegations(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; - struct nfs_delegation *delegation; + bool need_wait; if (clp == NULL) return; rcu_read_lock(); - list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - spin_lock(&delegation->lock); - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - spin_unlock(&delegation->lock); - } + need_wait = nfs_server_mark_return_all_delegations(server); rcu_read_unlock(); - if (nfs_client_return_marked_delegations(clp) != 0) + if (need_wait) { nfs4_schedule_state_manager(clp); + nfs4_wait_clnt_recover(clp); + } } -static void nfs_mark_return_all_delegation_types(struct nfs_server *server, +static void nfs_mark_return_unused_delegation_types(struct nfs_server *server, fmode_t 
flags) { struct nfs_delegation *delegation; @@ -512,27 +571,21 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server, if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) continue; if (delegation->type & flags) - nfs_mark_return_delegation(server, delegation); + nfs_mark_return_if_closed_delegation(server, delegation); } } -static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, +static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp, fmode_t flags) { struct nfs_server *server; rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) - nfs_mark_return_all_delegation_types(server, flags); + nfs_mark_return_unused_delegation_types(server, flags); rcu_read_unlock(); } -static void nfs_delegation_run_state_manager(struct nfs_client *clp) -{ - if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) - nfs4_schedule_state_manager(clp); -} - void nfs_remove_bad_delegation(struct inode *inode) { struct nfs_delegation *delegation; @@ -546,27 +599,17 @@ void nfs_remove_bad_delegation(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); /** - * nfs_expire_all_delegation_types + * nfs_expire_unused_delegation_types * @clp: client to process * @flags: delegation types to expire * */ -void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags) { - nfs_client_mark_return_all_delegation_types(clp, flags); + nfs_client_mark_return_unused_delegation_types(clp, flags); nfs_delegation_run_state_manager(clp); } -/** - * nfs_expire_all_delegations - * @clp: client to process - * - */ -void nfs_expire_all_delegations(struct nfs_client *clp) -{ - nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); -} - static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; @@ -574,7 +617,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - nfs_mark_return_delegation(server, delegation); + nfs_mark_return_if_closed_delegation(server, delegation); } } diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index d54d4fca6793..9a79c7a99d6d 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -28,6 +28,7 @@ struct nfs_delegation { enum { NFS_DELEGATION_NEED_RECLAIM = 0, NFS_DELEGATION_RETURN, + NFS_DELEGATION_RETURN_IF_CLOSED, NFS_DELEGATION_REFERENCED, NFS_DELEGATION_RETURNING, }; @@ -41,7 +42,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); void nfs_server_return_all_delegations(struct nfs_server *); void nfs_expire_all_delegations(struct nfs_client *clp); -void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); @@ -53,7 +54,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int 
nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f23f455be42b..e093e73178b7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1486,6 +1486,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto no_open; if (d_mountpoint(dentry)) goto no_open; + if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) + goto no_open; inode = dentry->d_inode; parent = dget_parent(dentry); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 29f4a48a0ee6..a87a44f84113 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -744,6 +744,7 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; + struct nfs_lock_context *l_ctx; int status; /* @@ -752,6 +753,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) */ nfs_sync_mapping(filp->f_mapping); + l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); + if (!IS_ERR(l_ctx)) { + status = nfs_iocounter_wait(&l_ctx->io_count); + nfs_put_lock_context(l_ctx); + if (status < 0) + return status; + } + /* NOTE: special case * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1f941674b089..c1c7a9d78722 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -561,20 +561,22 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) l_ctx->lockowner.l_owner = current->files; l_ctx->lockowner.l_pid = current->tgid; INIT_LIST_HEAD(&l_ctx->list); + nfs_iocounter_init(&l_ctx->io_count); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { - struct nfs_lock_context *pos; + struct nfs_lock_context *head = &ctx->lock_context; + struct nfs_lock_context *pos = head; - list_for_each_entry(pos, &ctx->lock_context.list, list) { + do { if (pos->lockowner.l_owner != current->files) continue; if (pos->lockowner.l_pid != current->tgid) continue; atomic_inc(&pos->count); return pos; - } + } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); return NULL; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 541c9ebdbc5a..91e59a39fc08 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -229,6 +229,13 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); +int nfs_iocounter_wait(struct nfs_io_counter *c); + +static inline void nfs_iocounter_init(struct nfs_io_counter *c) +{ + c->flags = 0; + atomic_set(&c->io_count, 0); +} /* nfs2xdr.c */ extern struct rpc_procinfo nfs_procedures[]; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 944c9a5c1039..553a83cc4106 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -36,6 +36,7 @@ enum nfs4_client_state { struct nfs4_minor_version_ops { u32 minor_version; + unsigned init_caps; int (*call_sync)(struct rpc_clnt *clnt, struct nfs_server *server, @@ -143,12 +144,14 @@ struct nfs4_lock_state { enum { LK_STATE_IN_USE, 
NFS_DELEGATED_STATE, /* Current stateid is delegation */ + NFS_OPEN_STATE, /* OPEN stateid is set */ NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ + NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ }; struct nfs4_state { @@ -233,6 +236,10 @@ extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); extern int nfs4_release_lockowner(struct nfs4_lock_state *); extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode); #if defined(CONFIG_NFS_V4_1) static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) @@ -347,13 +354,13 @@ extern int nfs4_wait_clnt_recover(struct nfs_client *clp); extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); -extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, +extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, fmode_t, const struct nfs_lockowner *); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); @@ -412,6 +419,11 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei return memcmp(dst, src, sizeof(*dst)) == 0; } +static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) +{ + return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; +} + #else #define nfs4_close_state(a, b) do { } while (0) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index ac4fc9a8fdbc..947b0c908aa9 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -198,8 +198,12 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; + if (clp->cl_minorversion != 0) + __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, authflavour); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + if (error == -EINVAL) + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_NULL); if (error < 0) goto error; @@ -300,7 +304,7 @@ int nfs40_walk_client_list(struct nfs_client *new, struct rpc_cred *cred) { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); - struct nfs_client *pos, *n, *prev = NULL; + struct nfs_client *pos, *prev = NULL; struct nfs4_setclientid_res clid = { .clientid = 
new->cl_clientid, .confirm = new->cl_confirm, @@ -308,10 +312,23 @@ int nfs40_walk_client_list(struct nfs_client *new, int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); - list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos" */ - if (pos->cl_cons_state < NFS_CS_READY) + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + prev = pos; + + status = nfs_wait_client_init_complete(pos); + spin_lock(&nn->nfs_client_lock); + if (status < 0) + continue; + } + if (pos->cl_cons_state != NFS_CS_READY) continue; if (pos->rpc_ops != new->rpc_ops) @@ -423,16 +440,16 @@ int nfs41_walk_client_list(struct nfs_client *new, struct rpc_cred *cred) { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); - struct nfs_client *pos, *n, *prev = NULL; + struct nfs_client *pos, *prev = NULL; int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); - list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos", especially the client * ID and serverowner fields. Wait for CREATE_SESSION * to finish. */ - if (pos->cl_cons_state < NFS_CS_READY) { + if (pos->cl_cons_state > NFS_CS_READY) { atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); @@ -440,18 +457,17 @@ int nfs41_walk_client_list(struct nfs_client *new, nfs_put_client(prev); prev = pos; - nfs4_schedule_lease_recovery(pos); status = nfs_wait_client_init_complete(pos); - if (status < 0) { - nfs_put_client(pos); - spin_lock(&nn->nfs_client_lock); - continue; + if (status == 0) { + nfs4_schedule_lease_recovery(pos); + status = nfs4_wait_clnt_recover(pos); } - status = pos->cl_cons_state; spin_lock(&nn->nfs_client_lock); if (status < 0) continue; } + if (pos->cl_cons_state != NFS_CS_READY) + continue; if (pos->rpc_ops != new->rpc_ops) continue; @@ -469,17 +485,18 @@ int nfs41_walk_client_list(struct nfs_client *new, continue; atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); + *result = pos; + status = 0; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); - - *result = pos; - return 0; + break; } /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s status = %d\n", __func__, status); + if (prev) + nfs_put_client(prev); return status; } #endif /* CONFIG_NFS_V4_1 */ @@ -717,6 +734,19 @@ static int nfs4_server_common_setup(struct nfs_server *server, if (error < 0) goto out; + /* Set the basic capabilities */ + server->caps |= server->nfs_client->cl_mvops->init_caps; + if (server->flags & NFS_MOUNT_NORDIRPLUS) + server->caps &= ~NFS_CAP_READDIRPLUS; + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. 
+ */ + if (nfs4_disable_idmapping && + server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + + /* Probe the root fh to retrieve its FSID and filehandle */ error = nfs4_get_rootfh(server, mntfh); if (error < 0) @@ -760,9 +790,6 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; - if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) - server->caps |= NFS_CAP_READDIRPLUS; server->options = data->options; /* Get a client record */ @@ -779,13 +806,6 @@ static int nfs4_init_server(struct nfs_server *server, if (error < 0) goto error; - /* - * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower - * authentication. - */ - if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) - server->caps |= NFS_CAP_UIDGID_NOMAP; - if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); if (data->wsize) @@ -863,7 +883,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; /* Get a client representation. * Note: NFSv4 always uses TCP, */ diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 4fb234d3aefb..22d10623f5ee 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -158,11 +158,14 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_schedule_stateid_recovery(mds_server, state); + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; goto wait_on_recovery; case -NFS4ERR_EXPIRED: - if (state != NULL) - nfs4_schedule_stateid_recovery(mds_server, state); + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + } nfs4_schedule_lease_recovery(mds_client); goto wait_on_recovery; /* DS session errors */ @@ -226,6 +229,9 @@ reset: out: task->tk_status = 0; return -EAGAIN; +out_bad_stateid: + task->tk_status = -EIO; + return 0; wait_on_recovery: rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) @@ -299,6 +305,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) { struct nfs_read_data *rdata = data; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } if (filelayout_reset_to_mds(rdata->header->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); filelayout_reset_read(rdata); @@ -307,10 +317,13 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) } rdata->read_done_cb = filelayout_read_done_cb; - nfs41_setup_sequence(rdata->ds_clp->cl_session, + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, &rdata->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, + rdata->args.lock_context, FMODE_READ); } static void filelayout_read_call_done(struct rpc_task *task, void *data) @@ -401,16 +414,23 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) { struct nfs_write_data *wdata = data; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } if 
(filelayout_reset_to_mds(wdata->header->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); filelayout_reset_write(wdata); rpc_exit(task, 0); return; } - nfs41_setup_sequence(wdata->ds_clp->cl_session, + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, &wdata->args.seq_args, &wdata->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, + wdata->args.lock_context, FMODE_WRITE); } static void filelayout_write_call_done(struct rpc_task *task, void *data) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 0dd766079e1c..cdb0b41a4810 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -134,33 +134,38 @@ static size_t nfs_parse_server_name(char *string, size_t len, return ret; } +/** + * nfs_find_best_sec - Find a security mechanism supported locally + * @flavors: List of security tuples returned by SECINFO procedure + * + * Return the pseudoflavor of the first security mechanism in + * "flavors" that is locally supported. Return RPC_AUTH_UNIX if + * no matching flavor is found in the array. The "flavors" array + * is searched in the order returned from the server, per RFC 3530 + * recommendation. + */ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { - struct gss_api_mech *mech; - struct xdr_netobj oid; - int i; - rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + rpc_authflavor_t pseudoflavor; + struct nfs4_secinfo4 *secinfo; + unsigned int i; for (i = 0; i < flavors->num_flavors; i++) { - struct nfs4_secinfo_flavor *flavor; - flavor = &flavors->flavors[i]; - - if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { - pseudoflavor = flavor->flavor; - break; - } else if (flavor->flavor == RPC_AUTH_GSS) { - oid.len = flavor->gss.sec_oid4.len; - oid.data = flavor->gss.sec_oid4.data; - mech = gss_mech_get_by_OID(&oid); - if (!mech) - continue; - pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); - gss_mech_put(mech); + secinfo = &flavors->flavors[i]; + + switch (secinfo->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + case RPC_AUTH_GSS: + pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + &secinfo->flavor_info); + if (pseudoflavor != RPC_AUTH_MAXFLAVOR) + return pseudoflavor; break; } } - return pseudoflavor; + return RPC_AUTH_UNIX; } static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 26431cf62ddb..9da4bd55eb30 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -107,6 +107,8 @@ static int nfs4_map_errors(int err) return -EPROTONOSUPPORT; case -NFS4ERR_ACCESS: return -EACCES; + case -NFS4ERR_FILE_OPEN: + return -EBUSY; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -295,19 +297,30 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc } if (state == NULL) break; - nfs4_schedule_stateid_recovery(server, state); + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; goto wait_on_recovery; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: + if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) { + nfs_remove_bad_delegation(inode); + exception->retry = 1; + break; + } if (state == NULL) break; - nfs_remove_bad_delegation(state->inode); - nfs4_schedule_stateid_recovery(server, state); + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; goto wait_on_recovery; case 
-NFS4ERR_EXPIRED: - if (state != NULL) - nfs4_schedule_stateid_recovery(server, state); + if (state != NULL) { + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; + } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -756,10 +769,40 @@ struct nfs4_opendata { struct iattr attrs; unsigned long timestamp; unsigned int rpc_done : 1; + unsigned int is_recover : 1; int rpc_status; int cancelled; }; +static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + if (err != -EINVAL) + return false; + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) + return false; + server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1; + exception->retry = 1; + return true; +} + +static enum open_claim_type4 +nfs4_map_atomic_open_claim(struct nfs_server *server, + enum open_claim_type4 claim) +{ + if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) + return claim; + switch (claim) { + default: + return claim; + case NFS4_OPEN_CLAIM_FH: + return NFS4_OPEN_CLAIM_NULL; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + return NFS4_OPEN_CLAIM_DELEGATE_CUR; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + return NFS4_OPEN_CLAIM_DELEGATE_PREV; + } +} static void nfs4_init_opendata_res(struct nfs4_opendata *p) { @@ -775,6 +818,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct nfs4_state_owner *sp, fmode_t fmode, int flags, const struct iattr *attrs, + enum open_claim_type4 claim, gfp_t gfp_mask) { struct dentry *parent = dget_parent(dentry); @@ -793,7 +837,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->dir = parent; p->owner = sp; atomic_inc(&sp->so_count); - p->o_arg.fh = NFS_FH(dir); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS @@ -811,7 +854,19 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; + p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); + switch (p->o_arg.claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + p->o_arg.fh = NFS_FH(dir); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + p->o_arg.fh = NFS_FH(dentry->d_inode); + } if (attrs != NULL && attrs->ia_valid != 0) { __be32 verf[2]; @@ -924,6 +979,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + set_bit(NFS_OPEN_STATE, &state->flags); switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -1046,9 +1102,12 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) /* Save the delegation */ nfs4_stateid_copy(&stateid, &delegation->stateid); rcu_read_unlock(); - ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); - if (ret != 0) - goto out; + nfs_release_seqid(opendata->o_arg.seqid); + if (!opendata->is_recover) { + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) + goto out; + } ret = -EAGAIN; /* Try to update the stateid using 
the delegation */ @@ -1193,11 +1252,13 @@ static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state * return ERR_PTR(-ENOENT); } -static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state) +static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, + struct nfs4_state *state, enum open_claim_type4 claim) { struct nfs4_opendata *opendata; - opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, NULL, GFP_NOFS); + opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, + NULL, claim, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -1233,6 +1294,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * /* memory barrier prior to reading state->n_* */ clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { clear_bit(NFS_O_RDWR_STATE, &state->flags); @@ -1283,11 +1345,10 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state fmode_t delegation_type = 0; int status; - opendata = nfs4_open_recoverdata_alloc(ctx, state); + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_PREVIOUS); if (IS_ERR(opendata)) return PTR_ERR(opendata); - opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; - opendata->o_arg.fh = NFS_FH(state->inode); rcu_read_lock(); delegation = rcu_dereference(NFS_I(state->inode)->delegation); if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) @@ -1306,6 +1367,8 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -1320,71 +1383,72 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) - return PTR_ERR(ctx); + return -EAGAIN; ret = nfs4_do_open_reclaim(ctx, state); put_nfs_open_context(ctx); return ret; } -static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) { - struct nfs4_opendata *opendata; - int ret; - - opendata = nfs4_open_recoverdata_alloc(ctx, state); - if (IS_ERR(opendata)) - return PTR_ERR(opendata); - opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; - nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); - ret = nfs4_open_recover(opendata, state); - nfs4_opendata_put(opendata); - return ret; + switch (err) { + default: + printk(KERN_ERR "NFS: %s: unhandled error " + "%d.\n", __func__, err); + case 0: + case -ENOENT: + case -ESTALE: + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); + nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + return -EAGAIN; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); + case -NFS4ERR_EXPIRED: + /* Don't recall a delegation if it was lost */ + nfs4_schedule_lease_recovery(server->nfs_client); + return -EAGAIN; + case 
-NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + nfs_inode_find_state_and_recover(state->inode, + stateid); + nfs4_schedule_stateid_recovery(server, state); + return 0; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + set_bit(NFS_DELEGATED_STATE, &state->flags); + ssleep(1); + return -EAGAIN; + case -ENOMEM: + case -NFS4ERR_DENIED: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + return 0; + } + return err; } int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { - struct nfs4_exception exception = { }; struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_opendata *opendata; int err; - do { - err = _nfs4_open_delegation_recall(ctx, state, stateid); - switch (err) { - case 0: - case -ENOENT: - case -ESTALE: - goto out; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_DEADSESSION: - set_bit(NFS_DELEGATED_STATE, &state->flags); - nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); - err = -EAGAIN; - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: - /* Don't recall a delegation if it was lost */ - nfs4_schedule_lease_recovery(server->nfs_client); - err = -EAGAIN; - goto out; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - nfs_inode_find_state_and_recover(state->inode, - stateid); - nfs4_schedule_stateid_recovery(server, state); - case -ENOMEM: - err = 0; - goto out; - } - set_bit(NFS_DELEGATED_STATE, &state->flags); - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); -out: - return err; + + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_DELEG_CUR_FH); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); + err = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return nfs4_handle_delegation_recall_error(server, state, stateid, err); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) @@ -1467,6 +1531,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; struct nfs4_state_owner *sp = data->owner; + struct nfs_client *clp = sp->so_server->nfs_client; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) goto out_wait; @@ -1482,15 +1547,20 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && + data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH && can_open_delegated(delegation, data->o_arg.fmode)) goto unlock_no_action; rcu_read_unlock(); } /* Update client id. 
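The open-recovery hunks above fold the old inline error switch into nfs4_handle_delegation_recall_error(), giving every delegation-recall path one contract: 0 means the error was absorbed (possibly after scheduling stateid recovery), -EAGAIN means lease or session recovery was kicked off and the recall should be retried, and anything else is a hard failure. A hedged sketch of a caller consuming that contract; recall_once() is a hypothetical stand-in for a single recall attempt:

#include <errno.h>

/* Hypothetical single recall attempt obeying the consolidated policy:
 * 0 = absorbed, -EAGAIN = recovery scheduled, other = hard error. */
static int recall_once(int attempt)
{
        return attempt == 0 ? -EAGAIN : 0;
}

static int drive_recall(void)
{
        int attempt, err = 0;

        for (attempt = 0; attempt < 3; attempt++) {
                err = recall_once(attempt);
                if (err != -EAGAIN)
                        break;          /* done, or a hard error */
                /* -EAGAIN: lease/session recovery was scheduled;
                 * back off and try the recall again. */
        }
        return err;
}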
*/ - data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; - if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { - task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + data->o_arg.clientid = clp->cl_clientid; + switch (data->o_arg.claim) { + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; + case NFS4_OPEN_CLAIM_FH: + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; @@ -1499,6 +1569,16 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) &data->o_res.seq_res, task) != 0) nfs_release_seqid(data->o_arg.seqid); + + /* Set the create mode (note dependency on the session type) */ + data->o_arg.createmode = NFS4_CREATE_UNCHECKED; + if (data->o_arg.open_flags & O_EXCL) { + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE; + if (nfs4_has_persistent_session(clp)) + data->o_arg.createmode = NFS4_CREATE_GUARDED; + else if (clp->cl_mvops->minor_version > 0) + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1; + } return; unlock_no_action: rcu_read_unlock(); @@ -1594,8 +1674,11 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) data->rpc_done = 0; data->rpc_status = 0; data->cancelled = 0; - if (isrecover) + data->is_recover = 0; + if (isrecover) { nfs4_set_sequence_privileged(&o_arg->seq_args); + data->is_recover = 1; + } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -1720,7 +1803,8 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s struct nfs4_opendata *opendata; int ret; - opendata = nfs4_open_recoverdata_alloc(ctx, state); + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); ret = nfs4_open_recover(opendata, state); @@ -1738,6 +1822,8 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state do { err = _nfs4_open_expired(ctx, state); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; switch (err) { default: goto out; @@ -1758,7 +1844,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) - return PTR_ERR(ctx); + return -EAGAIN; ret = nfs4_do_open_expired(ctx, state); put_nfs_open_context(ctx); return ret; @@ -1820,6 +1906,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); } return status; } @@ -1880,10 +1967,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (ret != 0) goto out; - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) nfs4_schedule_stateid_recovery(server, state); - nfs4_wait_clnt_recover(server->nfs_client); - } *res = state; out: return ret; @@ -1905,6 +1990,7 @@ static int _nfs4_do_open(struct inode *dir, struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *opendata; + enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; int status; /* Protect against reboot recovery conflicts */ @@ -1920,7 +2006,10 @@ static int _nfs4_do_open(struct inode *dir, if (dentry->d_inode != NULL) 
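With the createmode decision moved from the XDR encoder into nfs4_open_prepare() (above), the policy reads directly: UNCHECKED unless O_EXCL is set; for exclusive creates, GUARDED on a persistent session (the create is replayable without a verifier), EXCLUSIVE4_1 on other v4.1+ sessions, and EXCLUSIVE on v4.0. Restated as a decision table, with the session predicates reduced to plain parameters for illustration:

enum createmode { UNCHECKED, GUARDED, EXCLUSIVE, EXCLUSIVE4_1 };

/* Sketch: the three inputs mirror what nfs4_open_prepare() consults
 * (O_EXCL in the open flags, session persistence, minor version). */
static enum createmode choose_createmode(int o_excl,
                                         int persistent_session,
                                         unsigned int minorversion)
{
        if (!o_excl)
                return UNCHECKED;
        if (persistent_session)
                return GUARDED;         /* replay-safe: no verifier needed */
        if (minorversion > 0)
                return EXCLUSIVE4_1;    /* verifier plus settable attrs */
        return EXCLUSIVE;               /* classic v4.0 exclusive create */
}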
nfs4_return_incompatible_delegation(dentry->d_inode, fmode); status = -ENOMEM; - opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, GFP_KERNEL); + if (dentry->d_inode) + claim = NFS4_OPEN_CLAIM_FH; + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, + claim, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; @@ -1937,7 +2026,8 @@ static int _nfs4_do_open(struct inode *dir, if (status != 0) goto err_opendata_put; - if (opendata->o_arg.open_flags & O_EXCL) { + if ((opendata->o_arg.open_flags & O_EXCL) && + (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { nfs4_exclusive_attrset(opendata, sattr); nfs_fattr_init(opendata->o_res.f_attr); @@ -1978,6 +2068,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct rpc_cred *cred, struct nfs4_threshold **ctx_th) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_state *res; int status; @@ -2021,7 +2112,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, exception.retry = 1; continue; } - res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + if (nfs4_clear_cap_atomic_open_v1(server, status, &exception)) + continue; + res = ERR_PTR(nfs4_handle_exception(server, status, &exception)); } while (exception.retry); return res; @@ -2049,20 +2142,25 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, .rpc_cred = cred, }; unsigned long timestamp = jiffies; + fmode_t fmode; + bool truncate; int status; nfs_fattr_init(fattr); - if (state != NULL) { + /* Servers should only apply open mode checks for file size changes */ + truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; + fmode = truncate ? FMODE_WRITE : FMODE_READ; + + if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { + /* Use that stateid */ + } else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) { struct nfs_lockowner lockowner = { .l_owner = current->files, .l_pid = current->tgid, }; nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, &lockowner); - } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode, - FMODE_WRITE)) { - /* Use that stateid */ } else nfs4_stateid_copy(&arg.stateid, &zero_stateid); @@ -2086,6 +2184,13 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, err = _nfs4_do_setattr(inode, cred, fattr, sattr, state); switch (err) { case -NFS4ERR_OPENMODE: + if (!(sattr->ia_valid & ATTR_SIZE)) { + pr_warn_once("NFSv4: server %s is incorrectly " + "applying open mode checks to " + "a SETATTR that is not " + "changing file size.\n", + server->nfs_client->cl_hostname); + } if (state && !(state->state & FMODE_WRITE)) { err = -EBADF; if (sattr->ia_valid & ATTR_OPEN) @@ -2129,11 +2234,19 @@ static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, fmode_t fmode) { spin_lock(&state->owner->so_lock); - if (!(fmode & FMODE_READ)) + clear_bit(NFS_O_RDWR_STATE, &state->flags); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_WRITE: clear_bit(NFS_O_RDONLY_STATE, &state->flags); - if (!(fmode & FMODE_WRITE)) + break; + case FMODE_READ: clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_O_RDWR_STATE, &state->flags); + break; + case 0: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } spin_unlock(&state->owner->so_lock); } @@ -2201,6 +2314,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) calldata->arg.fmode &= ~FMODE_WRITE; } } + if 
(!nfs4_valid_open_stateid(state)) + call_close = 0; spin_unlock(&state->owner->so_lock); if (!call_close) { @@ -2211,8 +2326,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc && - pnfs_roc_drain(inode, &calldata->roc_barrier, task)) + pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { + nfs_release_seqid(calldata->arg.seqid); goto out_wait; + } } nfs_fattr_init(calldata->res.fattr); @@ -2443,7 +2560,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl auth = rpcauth_create(flavor, server->client); if (IS_ERR(auth)) { - ret = -EIO; + ret = -EACCES; goto out; } ret = nfs4_lookup_root(server, fhandle, info); @@ -2451,27 +2568,36 @@ out: return ret; } +/* + * Retry pseudoroot lookup with various security flavors. We do this when: + * + * NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC + * NFSv4.1: the server does not support the SECINFO_NO_NAME operation + * + * Returns zero on success, or a negative NFS4ERR value, or a + * negative errno value. + */ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - int i, len, status = 0; - rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS]; - - len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array)); - if (len < 0) - return len; - - for (i = 0; i < len; i++) { - /* AUTH_UNIX is the default flavor if none was specified, - * thus has already been tried. */ - if (flav_array[i] == RPC_AUTH_UNIX) - continue; + /* Per 3530bis 15.33.5 */ + static const rpc_authflavor_t flav_array[] = { + RPC_AUTH_GSS_KRB5P, + RPC_AUTH_GSS_KRB5I, + RPC_AUTH_GSS_KRB5, + RPC_AUTH_UNIX, /* courtesy */ + RPC_AUTH_NULL, + }; + int status = -EPERM; + size_t i; + for (i = 0; i < ARRAY_SIZE(flav_array); i++) { status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; break; } + /* * -EACCESS could mean that the user doesn't have correct permissions * to access the mount. It could also mean that we tried to mount @@ -2484,24 +2610,36 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -/* - * get the file handle for the "/" directory on the server +static int nfs4_do_find_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fsinfo *info) +{ + int mv = server->nfs_client->cl_minorversion; + return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); +} + +/** + * nfs4_proc_get_rootfh - get file handle for server's pseudoroot + * @server: initialized nfs_server handle + * @fhandle: we fill in the pseudo-fs root file handle + * @info: we fill in an FSINFO struct + * + * Returns zero on success, or a negative errno. */ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - int minor_version = server->nfs_client->cl_minorversion; - int status = nfs4_lookup_root(server, fhandle, info); - if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) - /* - * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM - * by nfs4_map_errors() as this function exits. 
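The rewritten nfs4_find_root_sec() above stops probing whatever flavors rpcauth_list_flavors() happens to return and instead walks a fixed, strongest-first list (krb5p, krb5i, krb5, then AUTH_UNIX as a courtesy, AUTH_NULL last), accepting the first flavor the server does not refuse with NFS4ERR_WRONGSEC or EACCES. The same ordered-probe shape in miniature; try_flavor() is a hypothetical stand-in for nfs4_lookup_root_sec() and the pseudoflavor numbers are only illustrative:

#include <errno.h>
#include <stddef.h>

#define NELEMS(a)  (sizeof(a) / sizeof((a)[0]))
#define EWRONGSEC  10016        /* stand-in for NFS4ERR_WRONGSEC */

/* Hypothetical probe: pretend only AUTH_UNIX (1) works on this export. */
static int try_flavor(int flavor)
{
        return flavor == 1 ? 0 : -EACCES;
}

static int find_root_sec(void)
{
        /* Strongest first, per RFC 3530bis 15.33.5: krb5p, krb5i,
         * krb5, then AUTH_UNIX as a courtesy, AUTH_NULL last. */
        static const int flavors[] = { 390005, 390004, 390003, 1, 0 };
        int status = -EPERM;
        size_t i;

        for (i = 0; i < NELEMS(flavors); i++) {
                status = try_flavor(flavors[i]);
                if (status == -EWRONGSEC || status == -EACCES)
                        continue;       /* flavor refused; try the next */
                break;                  /* success or a hard error */
        }
        return status;
}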
- */ - status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info); + int status; + + status = nfs4_lookup_root(server, fhandle, info); + if ((status == -NFS4ERR_WRONGSEC) && + !(server->flags & NFS_MOUNT_SECFLAVOUR)) + status = nfs4_do_find_root_sec(server, fhandle, info); + if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) status = nfs4_do_fsinfo(server, fhandle, info); + return nfs4_map_errors(status); } @@ -3380,12 +3518,21 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) { struct nfs4_exception exception = { }; + unsigned long now = jiffies; int err; do { - err = nfs4_handle_exception(server, - _nfs4_do_fsinfo(server, fhandle, fsinfo), - &exception); + err = _nfs4_do_fsinfo(server, fhandle, fsinfo); + if (err == 0) { + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo->lease_time * HZ; + clp->cl_last_renewal = now; + spin_unlock(&clp->cl_lock); + break; + } + err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; } @@ -3445,6 +3592,46 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } +int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + const struct nfs_lockowner *lockowner = NULL; + + if (l_ctx != NULL) + lockowner = &l_ctx->lockowner; + return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); +} +EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); + +static bool nfs4_stateid_is_current(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + nfs4_stateid current_stateid; + + if (nfs4_set_rw_stateid(¤t_stateid, ctx, l_ctx, fmode)) + return false; + return nfs4_stateid_match(stateid, ¤t_stateid); +} + +static bool nfs4_error_stateid_expired(int err) +{ + switch (err) { + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_OPENMODE: + case -NFS4ERR_EXPIRED: + return true; + } + return false; +} + void __nfs4_read_done_cb(struct nfs_read_data *data) { nfs_invalidate_atime(data->header->inode); @@ -3465,6 +3652,20 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) return 0; } +static bool nfs4_read_stateid_changed(struct rpc_task *task, + struct nfs_readargs *args) +{ + + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_READ)) + return false; + rpc_restart_call_prepare(task); + return true; +} + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) { @@ -3472,7 +3673,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; - + if (nfs4_read_stateid_changed(task, &data->args)) + return -EAGAIN; return data->read_done_cb ? 
data->read_done_cb(task, data) : nfs4_read_done_cb(task, data); } @@ -3487,10 +3689,13 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->header->inode), + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_READ); } static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) @@ -3508,10 +3713,26 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data return 0; } +static bool nfs4_write_stateid_changed(struct rpc_task *task, + struct nfs_writeargs *args) +{ + + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_WRITE)) + return false; + rpc_restart_call_prepare(task); + return true; +} + static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; + if (nfs4_write_stateid_changed(task, &data->args)) + return -EAGAIN; return data->write_done_cb ? data->write_done_cb(task, data) : nfs4_write_done_cb(task, data); } @@ -3551,10 +3772,13 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->header->inode), + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_WRITE); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) @@ -3656,7 +3880,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, return -ENOMEM; data->client = clp; data->timestamp = jiffies; - return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT, &nfs4_renew_ops, data); } @@ -3670,7 +3894,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) unsigned long now = jiffies; int status; - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status < 0) return status; do_renew_lease(clp, now); @@ -3980,11 +4204,14 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_schedule_stateid_recovery(server, state); + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto stateid_invalid; goto wait_on_recovery; case -NFS4ERR_EXPIRED: - if (state != NULL) - nfs4_schedule_stateid_recovery(server, state); + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto stateid_invalid; + } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -4016,6 +4243,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } task->tk_status = nfs4_map_errors(task->tk_status); return 0; +stateid_invalid: + task->tk_status = -EIO; + return 0; wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) 
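nfs4_read_stateid_changed() and nfs4_write_stateid_changed() above encode a quiet-restart rule: if the server reported one of the stateid-expiry errors but the stateid the request carried is no longer the context's current one, the failure is stale by construction, so the RPC is restarted rather than surfaced. The decision in isolation; error values are per RFC 3530/5661, and the kernel derives the second argument via nfs4_stateid_is_current():

#include <stdbool.h>

/* Mirrors nfs4_error_stateid_expired(): errors that say the stateid
 * the request carried is no longer valid. */
static bool stateid_expired_err(int err)
{
        switch (err) {
        case -10011:    /* NFS4ERR_EXPIRED */
        case -10024:    /* NFS4ERR_OLD_STATEID */
        case -10025:    /* NFS4ERR_BAD_STATEID */
                return true;
        }
        return false;
}

/* Restart quietly only when the error is a stateid-expiry error and
 * the stateid we sent has since been replaced under us. */
static bool should_restart(int err, bool sent_stateid_is_current)
{
        return stateid_expired_err(err) && !sent_stateid_is_current;
}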
@@ -4143,27 +4373,17 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct nfs4_setclientid_res *arg, struct rpc_cred *cred) { - struct nfs_fsinfo fsinfo; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], .rpc_argp = arg, - .rpc_resp = &fsinfo, .rpc_cred = cred, }; - unsigned long now; int status; dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_clientid); - now = jiffies; status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - if (status == 0) { - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = now; - spin_unlock(&clp->cl_lock); - } dprintk("NFS reply setclientid_confirm: %d\n", status); return status; } @@ -4627,17 +4847,23 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { goto out_release_lock_seqid; } - data->arg.open_stateid = &state->stateid; + data->arg.open_stateid = &state->open_stateid; data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; } else data->arg.new_lock_owner = 0; + if (!nfs4_valid_open_stateid(state)) { + data->rpc_status = -EBADF; + task->tk_action = NULL; + goto out_release_open_seqid; + } data->timestamp = jiffies; if (nfs4_setup_sequence(data->server, &data->arg.seq_args, &data->res.seq_res, task) == 0) return; +out_release_open_seqid: nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); @@ -4983,58 +5209,16 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return status; } -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; int err; err = nfs4_set_lock_state(state, fl); if (err != 0) - goto out; - do { - err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - switch (err) { - default: - printk(KERN_ERR "NFS: %s: unhandled error " - "%d.\n", __func__, err); - case 0: - case -ESTALE: - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: - nfs4_schedule_lease_recovery(server->nfs_client); - err = -EAGAIN; - goto out; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_DEADSESSION: - set_bit(NFS_DELEGATED_STATE, &state->flags); - nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); - err = -EAGAIN; - goto out; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OPENMODE: - nfs4_schedule_stateid_recovery(server, state); - err = 0; - goto out; - case -ENOMEM: - case -NFS4ERR_DENIED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ - err = 0; - goto out; - } - set_bit(NFS_DELEGATED_STATE, &state->flags); - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); -out: - return err; + return err; + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + return nfs4_handle_delegation_recall_error(server, state, stateid, err); } struct nfs_release_lockowner_data { @@ -5848,7 +6032,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = 
&msg, .callback_ops = &nfs41_sequence_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; if (!atomic_inc_not_zero(&clp->cl_count)) @@ -6725,6 +6909,10 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK, .call_sync = _nfs4_call_sync, .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, @@ -6736,6 +6924,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { #if defined(CONFIG_NFS_V4_1) static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41 + | NFS_CAP_ATOMIC_OPEN_V1, .call_sync = nfs4_call_sync_sequence, .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6ace365c6334..0b32f9483b7a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -154,18 +154,6 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) return cred; } -static void nfs4_clear_machine_cred(struct nfs_client *clp) -{ - struct rpc_cred *cred; - - spin_lock(&clp->cl_lock); - cred = clp->cl_machine_cred; - clp->cl_machine_cred = NULL; - spin_unlock(&clp->cl_lock); - if (cred != NULL) - put_rpccred(cred); -} - static struct rpc_cred * nfs4_get_renew_cred_server_locked(struct nfs_server *server) { @@ -699,6 +687,8 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) list_for_each_entry(state, &nfsi->open_states, inode_states) { if (state->owner != owner) continue; + if (!nfs4_valid_open_stateid(state)) + continue; if (atomic_inc_not_zero(&state->count)) return state; } @@ -987,13 +977,14 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) return 0; } -static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, +static int nfs4_copy_lock_stateid(nfs4_stateid *dst, + struct nfs4_state *state, const struct nfs_lockowner *lockowner) { struct nfs4_lock_state *lsp; fl_owner_t fl_owner; pid_t fl_pid; - bool ret = false; + int ret = -ENOENT; if (lockowner == NULL) @@ -1008,7 +999,10 @@ static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { nfs4_stateid_copy(dst, &lsp->ls_stateid); - ret = true; + ret = 0; + smp_rmb(); + if (!list_empty(&lsp->ls_seqid.list)) + ret = -EWOULDBLOCK; } spin_unlock(&state->state_lock); nfs4_put_lock_state(lsp); @@ -1016,28 +1010,44 @@ out: return ret; } -static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { + const nfs4_stateid *src; + int ret; int seq; do { + src = &zero_stateid; seq = read_seqbegin(&state->seqlock); - nfs4_stateid_copy(dst, &state->stateid); + if (test_bit(NFS_OPEN_STATE, &state->flags)) + src = &state->open_stateid; + nfs4_stateid_copy(dst, src); + ret = 0; + smp_rmb(); + if (!list_empty(&state->owner->so_seqid.list)) + ret = -EWOULDBLOCK; } while (read_seqretry(&state->seqlock, seq)); + return ret; } /* * Byte-range lock aware utility to initialize the stateid of read/write * 
requests. */ -void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, +int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, fmode_t fmode, const struct nfs_lockowner *lockowner) { + int ret = 0; if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) - return; - if (nfs4_copy_lock_stateid(dst, state, lockowner)) - return; - nfs4_copy_open_stateid(dst, state); + goto out; + ret = nfs4_copy_lock_stateid(dst, state, lockowner); + if (ret != -ENOENT) + goto out; + ret = nfs4_copy_open_stateid(dst, state); +out: + if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) + dst->seqid = 0; + return ret; } struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) @@ -1286,14 +1296,17 @@ static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_s return 1; } -void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) { struct nfs_client *clp = server->nfs_client; + if (!nfs4_valid_open_stateid(state)) + return -EBADF; nfs4_state_mark_reclaim_nograce(clp, state); dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); + return 0; } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); @@ -1323,6 +1336,27 @@ void nfs_inode_find_state_and_recover(struct inode *inode, nfs4_schedule_state_manager(clp); } +static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + set_bit(NFS_CONTEXT_BAD, &ctx->flags); + } + spin_unlock(&inode->i_lock); +} + +static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) +{ + set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags); + nfs4_state_mark_open_context_bad(state); +} + static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { @@ -1398,6 +1432,8 @@ restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) continue; + if (!nfs4_valid_open_stateid(state)) + continue; if (state->state == 0) continue; atomic_inc(&state->count); @@ -1430,11 +1466,10 @@ restart: * Open state on this file cannot be recovered * All we can do is revert to using the zero stateid. 
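nfs4_select_rw_stateid() now returns an int, and -EWOULDBLOCK specifically means an OPEN or LOCK seqid operation is still in flight for the stateid that was chosen; on NFS_CAP_STATEID_NFSV41 servers the seqid field is also zeroed so the server checks the current stateid. A sketch of a caller honouring the new return code; both helpers here are hypothetical stand-ins:

#include <errno.h>

static int pick_attempts;

/* Stand-in: may report -EWOULDBLOCK while an OPEN or LOCK seqid
 * operation still owns the stateid being selected. */
static int select_stateid(void)
{
        return pick_attempts++ == 0 ? -EWOULDBLOCK : 0;
}

static void wait_for_stateid_settle(void)
{
        /* kernel: wait for the seqid queue to drain; here: a no-op */
}

static int prepare_rw_request(void)
{
        int ret;

        for (;;) {
                ret = select_stateid();
                if (ret != -EWOULDBLOCK)
                        break;
                wait_for_stateid_settle();      /* then re-pick */
        }
        return ret;
}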
*/ - memset(&state->stateid, 0, - sizeof(state->stateid)); - /* Mark the file as being 'closed' */ - state->state = 0; + nfs4_state_mark_recovery_failed(state, status); break; + case -EAGAIN: + ssleep(1); case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: @@ -1696,6 +1731,10 @@ static int nfs4_check_lease(struct nfs_client *clp) } status = ops->renew_lease(clp, cred); put_rpccred(cred); + if (status == -ETIMEDOUT) { + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + return 0; + } out: return nfs4_recovery_handle_error(clp, status); } @@ -1725,10 +1764,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); return -EPERM; case -EACCES: - if (clp->cl_machine_cred == NULL) - return -EACCES; - /* Handle case where the user hasn't set up machine creds */ - nfs4_clear_machine_cred(clp); case -NFS4ERR_DELAY: case -ETIMEDOUT: case -EAGAIN: @@ -1823,31 +1858,18 @@ int nfs4_discover_server_trunking(struct nfs_client *clp, { const struct nfs4_state_recovery_ops *ops = clp->cl_mvops->reboot_recovery_ops; - rpc_authflavor_t *flavors, flav, save; struct rpc_clnt *clnt; struct rpc_cred *cred; - int i, len, status; + int i, status; dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname); - len = NFS_MAX_SECFLAVORS; - flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL); - if (flavors == NULL) { - status = -ENOMEM; - goto out; - } - len = rpcauth_list_flavors(flavors, len); - if (len < 0) { - status = len; - goto out_free; - } clnt = clp->cl_rpcclient; - save = clnt->cl_auth->au_flavor; i = 0; mutex_lock(&nfs_clid_init_mutex); - status = -ENOENT; again: + status = -ENOENT; cred = ops->get_clid_cred(clp); if (cred == NULL) goto out_unlock; @@ -1857,12 +1879,6 @@ again: switch (status) { case 0: break; - - case -EACCES: - if (clp->cl_machine_cred == NULL) - break; - /* Handle case where the user hasn't set up machine creds */ - nfs4_clear_machine_cred(clp); case -NFS4ERR_DELAY: case -ETIMEDOUT: case -EAGAIN: @@ -1871,22 +1887,23 @@ again: dprintk("NFS: %s after status %d, retrying\n", __func__, status); goto again; - + case -EACCES: + if (i++) + break; case -NFS4ERR_CLID_INUSE: case -NFS4ERR_WRONGSEC: - status = -EPERM; - if (i >= len) - break; - - flav = flavors[i++]; - if (flav == save) - flav = flavors[i++]; - clnt = rpc_clone_client_set_auth(clnt, flav); + clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX); if (IS_ERR(clnt)) { status = PTR_ERR(clnt); break; } - clp->cl_rpcclient = clnt; + /* Note: this is safe because we haven't yet marked the + * client as ready, so we are the only user of + * clp->cl_rpcclient + */ + clnt = xchg(&clp->cl_rpcclient, clnt); + rpc_shutdown_client(clnt); + clnt = clp->cl_rpcclient; goto again; case -NFS4ERR_MINOR_VERS_MISMATCH: @@ -1897,13 +1914,15 @@ again: case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ status = -EKEYEXPIRED; + break; + default: + pr_warn("NFS: %s unhandled error %d. 
Exiting with error EIO\n", + __func__, status); + status = -EIO; } out_unlock: mutex_unlock(&nfs_clid_init_mutex); -out_free: - kfree(flavors); -out: dprintk("NFS: %s: status = %d\n", __func__, status); return status; } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 569b166cc050..a5e1a3026d48 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -252,6 +252,8 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name, dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + if (data->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) + data->auth_flavors[0] = RPC_AUTH_UNIX; export_path = data->nfs_server.export_path; data->nfs_server.export_path = "/"; root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e3edda554ac7..3c79c5878c6d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -530,14 +530,10 @@ static int nfs4_stat_to_errno(int); decode_setclientid_maxsz) #define NFS4_enc_setclientid_confirm_sz \ (compound_encode_hdr_maxsz + \ - encode_setclientid_confirm_maxsz + \ - encode_putrootfh_maxsz + \ - encode_fsinfo_maxsz) + encode_setclientid_confirm_maxsz) #define NFS4_dec_setclientid_confirm_sz \ (compound_decode_hdr_maxsz + \ - decode_setclientid_confirm_maxsz + \ - decode_putrootfh_maxsz + \ - decode_fsinfo_maxsz) + decode_setclientid_confirm_maxsz) #define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -1058,8 +1054,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(iap->ia_atime.tv_sec); + p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); } else if (iap->ia_valid & ATTR_ATIME) { @@ -1069,8 +1064,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_MTIME_SET) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_MTIME) { @@ -1366,33 +1360,28 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { + struct iattr dummy; __be32 *p; - struct nfs_client *clp; p = reserve_space(xdr, 4); - switch(arg->open_flags & O_EXCL) { - case 0: + switch(arg->createmode) { + case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); encode_attrs(xdr, arg->u.attrs, arg->server); break; - default: - clp = arg->server->nfs_client; - if (clp->cl_mvops->minor_version > 0) { - if (nfs4_has_persistent_session(clp)) { - *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->server); - } else { - struct iattr dummy; - - *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); - encode_nfs4_verifier(xdr, &arg->u.verifier); - dummy.ia_valid = 0; - encode_attrs(xdr, &dummy, arg->server); - } - } else { - *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); - encode_nfs4_verifier(xdr, &arg->u.verifier); - } + case NFS4_CREATE_GUARDED: + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); + break; + case NFS4_CREATE_EXCLUSIVE: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, 
&arg->u.verifier); + break; + case NFS4_CREATE_EXCLUSIVE4_1: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); + encode_nfs4_verifier(xdr, &arg->u.verifier); + dummy.ia_valid = 0; + encode_attrs(xdr, &dummy, arg->server); } } @@ -1459,6 +1448,23 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc encode_string(xdr, name->len, name->name); } +static inline void encode_claim_fh(struct xdr_stream *xdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_FH); +} + +static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH); + encode_nfs4_stateid(xdr, stateid); +} + static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) { encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr); @@ -1474,6 +1480,12 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, case NFS4_OPEN_CLAIM_DELEGATE_CUR: encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); break; + case NFS4_OPEN_CLAIM_FH: + encode_claim_fh(xdr); + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + encode_claim_delegate_cur_fh(xdr, &arg->u.delegation); + break; default: BUG(); } @@ -1506,35 +1518,12 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static void encode_open_stateid(struct xdr_stream *xdr, - const struct nfs_open_context *ctx, - const struct nfs_lock_context *l_ctx, - fmode_t fmode, - int zero_seqid) -{ - nfs4_stateid stateid; - - if (ctx->state != NULL) { - const struct nfs_lockowner *lockowner = NULL; - - if (l_ctx != NULL) - lockowner = &l_ctx->lockowner; - nfs4_select_rw_stateid(&stateid, ctx->state, - fmode, lockowner); - if (zero_seqid) - stateid.seqid = 0; - encode_nfs4_stateid(xdr, &stateid); - } else - encode_nfs4_stateid(xdr, &zero_stateid); -} - static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr); - encode_open_stateid(xdr, args->context, args->lock_context, - FMODE_READ, hdr->minorversion); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1670,8 +1659,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg __be32 *p; encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr); - encode_open_stateid(xdr, args->context, args->lock_context, - FMODE_WRITE, hdr->minorversion); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); @@ -2609,12 +2597,9 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, struct compound_hdr hdr = { .nops = 0, }; - const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; encode_compound_hdr(xdr, req, &hdr); encode_setclientid_confirm(xdr, arg, &hdr); - encode_putrootfh(xdr, &hdr); - encode_fsinfo(xdr, lease_bitmap, &hdr); encode_nops(&hdr); } @@ -3497,8 +3482,11 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) if (n == 0) goto root_path; dprintk("pathname4: "); - path->ncomponents = 0; - while (path->ncomponents < n) { + if (n > NFS4_PATHNAME_MAXCOMPONENTS) { + dprintk("cannot parse %d components in path\n", n); + goto out_eio; + } + for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) { struct 
nfs4_string *component = &path->components[path->ncomponents]; status = decode_opaque_inline(xdr, &component->len, &component->data); if (unlikely(status != 0)) @@ -3507,12 +3495,6 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) pr_cont("%s%.*s ", (path->ncomponents != n ? "/ " : ""), component->len, component->data); - if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) - path->ncomponents++; - else { - dprintk("cannot parse %d components in path\n", n); - goto out_eio; - } } out: return status; @@ -3557,27 +3539,23 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st n = be32_to_cpup(p); if (n <= 0) goto out_eio; - res->nlocations = 0; - while (res->nlocations < n) { + for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; - struct nfs4_fs_location *loc = &res->locations[res->nlocations]; + struct nfs4_fs_location *loc; + if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES) + break; + loc = &res->locations[res->nlocations]; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; m = be32_to_cpup(p); - loc->nservers = 0; dprintk("%s: servers:\n", __func__); - while (loc->nservers < m) { - struct nfs4_string *server = &loc->servers[loc->nservers]; - status = decode_opaque_inline(xdr, &server->len, &server->data); - if (unlikely(status != 0)) - goto out_eio; - dprintk("%s ", server->data); - if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) - loc->nservers++; - else { + for (loc->nservers = 0; loc->nservers < m; loc->nservers++) { + struct nfs4_string *server; + + if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) { unsigned int i; dprintk("%s: using first %u of %u servers " "returned for location %u\n", @@ -3591,13 +3569,17 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(status != 0)) goto out_eio; } + break; } + server = &loc->servers[loc->nservers]; + status = decode_opaque_inline(xdr, &server->len, &server->data); + if (unlikely(status != 0)) + goto out_eio; + dprintk("%s ", server->data); } status = decode_pathname(xdr, &loc->rootpath); if (unlikely(status != 0)) goto out_eio; - if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) - res->nlocations++; } if (res->nlocations != 0) status = NFS_ATTR_FATTR_V4_LOCATIONS; @@ -5209,27 +5191,30 @@ static int decode_delegreturn(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_DELEGRETURN); } -static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor) +static int decode_secinfo_gss(struct xdr_stream *xdr, + struct nfs4_secinfo4 *flavor) { + u32 oid_len; __be32 *p; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - flavor->gss.sec_oid4.len = be32_to_cpup(p); - if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN) + oid_len = be32_to_cpup(p); + if (oid_len > GSS_OID_MAX_LEN) goto out_err; - p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len); + p = xdr_inline_decode(xdr, oid_len); if (unlikely(!p)) goto out_overflow; - memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len); + memcpy(flavor->flavor_info.oid.data, p, oid_len); + flavor->flavor_info.oid.len = oid_len; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - flavor->gss.qop4 = be32_to_cpup(p++); - flavor->gss.service = be32_to_cpup(p); + flavor->flavor_info.qop = be32_to_cpup(p++); + flavor->flavor_info.service = be32_to_cpup(p); return 0; @@ -5242,10 +5227,10 @@ out_err: static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) { - struct 
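The XDR decode hunks above share one hardening pattern: validate the on-the-wire count or length against the fixed-size destination before committing anything, then loop to exactly that bound. decode_pathname() now rejects oversized component counts up front, and decode_secinfo_gss() bounds-checks the OID length in a local before copying. The pattern in isolation, as a sketch with illustrative types:

#include <errno.h>
#include <string.h>

#define GSS_OID_MAX_LEN 32

struct oid {
        unsigned int len;
        unsigned char data[GSS_OID_MAX_LEN];
};

/* Check the wire length against the buffer before touching the
 * result, and commit the length only after the copy succeeds. */
static int decode_oid(const unsigned char *wire, unsigned int wire_len,
                      struct oid *out)
{
        if (wire_len > GSS_OID_MAX_LEN)
                return -EIO;            /* reject before copying */
        memcpy(out->data, wire, wire_len);
        out->len = wire_len;            /* commit length last */
        return 0;
}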
nfs4_secinfo_flavor *sec_flavor; + struct nfs4_secinfo4 *sec_flavor; + unsigned int i, num_flavors; int status; __be32 *p; - int i, num_flavors; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) @@ -6648,8 +6633,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, * Decode SETCLIENTID_CONFIRM response */ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, - struct xdr_stream *xdr, - struct nfs_fsinfo *fsinfo) + struct xdr_stream *xdr) { struct compound_hdr hdr; int status; @@ -6657,10 +6641,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, status = decode_compound_hdr(xdr, &hdr); if (!status) status = decode_setclientid_confirm(xdr); - if (!status) - status = decode_putrootfh(xdr); - if (!status) - status = decode_fsinfo(xdr, fsinfo); return status; } diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 88f9611a945c..5457745dd4f1 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -234,7 +234,7 @@ static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, lseg = kzalloc(lseg_size, gfp_flags); if (unlikely(!lseg)) { - dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, + dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, numdevs, lseg_size); return -ENOMEM; } diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 880ba086be94..87aa1dec6120 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -114,7 +114,7 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -/* objio_free_result will free these @oir structs recieved from +/* objio_free_result will free these @oir structs received from * objlayout_{read,write}_done */ extern void objio_free_result(struct objlayout_io_res *oir); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e56e846e9d2d..29cfb7ade121 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -84,6 +84,55 @@ nfs_page_free(struct nfs_page *p) kmem_cache_free(nfs_page_cachep, p); } +static void +nfs_iocounter_inc(struct nfs_io_counter *c) +{ + atomic_inc(&c->io_count); +} + +static void +nfs_iocounter_dec(struct nfs_io_counter *c) +{ + if (atomic_dec_and_test(&c->io_count)) { + clear_bit(NFS_IO_INPROGRESS, &c->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&c->flags, NFS_IO_INPROGRESS); + } +} + +static int +__nfs_iocounter_wait(struct nfs_io_counter *c) +{ + wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); + DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); + int ret = 0; + + do { + prepare_to_wait(wq, &q.wait, TASK_KILLABLE); + set_bit(NFS_IO_INPROGRESS, &c->flags); + if (atomic_read(&c->io_count) == 0) + break; + ret = nfs_wait_bit_killable(&c->flags); + } while (atomic_read(&c->io_count) != 0); + finish_wait(wq, &q.wait); + return ret; +} + +/** + * nfs_iocounter_wait - wait for i/o to complete + * @c: nfs_io_counter to use + * + * returns -ERESTARTSYS if interrupted by a fatal signal. + * Otherwise returns 0 once the io_count hits 0. + */ +int +nfs_iocounter_wait(struct nfs_io_counter *c) +{ + if (atomic_read(&c->io_count) == 0) + return 0; + return __nfs_iocounter_wait(c); +} + /** * nfs_create_request - Create an NFS read/write request. 
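The new nfs_io_counter above pairs an atomic in-flight count with a waitqueue bit: nfs_iocounter_dec() clears NFS_IO_INPROGRESS and wakes waiters when the last I/O completes, and nfs_iocounter_wait() parks killably until the count drains. The same drain-wait shape sketched in userspace, with a mutex/condvar standing in for the kernel's atomic_t and wait-on-bit machinery:

#include <pthread.h>

struct io_counter {
        pthread_mutex_t lock;
        pthread_cond_t  drained;
        int             io_count;
};

static void io_get(struct io_counter *c)
{
        pthread_mutex_lock(&c->lock);
        c->io_count++;
        pthread_mutex_unlock(&c->lock);
}

static void io_put(struct io_counter *c)
{
        pthread_mutex_lock(&c->lock);
        if (--c->io_count == 0)
                pthread_cond_broadcast(&c->drained);  /* last I/O wakes waiters */
        pthread_mutex_unlock(&c->lock);
}

static void io_wait(struct io_counter *c)
{
        pthread_mutex_lock(&c->lock);
        while (c->io_count != 0)
                pthread_cond_wait(&c->drained, &c->lock);
        pthread_mutex_unlock(&c->lock);
}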
* @ctx: open context to use @@ -104,6 +153,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, struct nfs_page *req; struct nfs_lock_context *l_ctx; + if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) + return ERR_PTR(-EBADF); /* try to allocate the request struct */ req = nfs_page_alloc(); if (req == NULL) @@ -116,6 +167,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, return ERR_CAST(l_ctx); } req->wb_lock_context = l_ctx; + nfs_iocounter_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in @@ -175,6 +227,7 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { + nfs_iocounter_dec(&l_ctx->io_count); nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4bdffe0ba025..c5bd758e5637 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -718,6 +718,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, spin_lock(&lo->plh_inode->i_lock); if (pnfs_layoutgets_blocked(lo, 1)) { status = -EAGAIN; + } else if (!nfs4_valid_open_stateid(open_state)) { + status = -EBADF; } else if (list_empty(&lo->plh_segs)) { int seq; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a5e5d9899d56..70a26c651f09 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -514,6 +514,8 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + rpc_exit(task, -EIO); } static const struct rpc_call_ops nfs_read_common_ops = { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f8a29db0f1b..1bb071dca9ab 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -920,7 +920,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; data->nfs_server.protocol = XPRT_TRANSPORT_TCP; - data->auth_flavors[0] = RPC_AUTH_UNIX; + data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR; data->auth_flavor_len = 1; data->minorversion = 0; data->need_mount = true; @@ -1608,49 +1608,57 @@ out_security_failure: } /* - * Match the requested auth flavors with the list returned by - * the server. Returns zero and sets the mount's authentication - * flavor on success; returns -EACCES if server does not support - * the requested flavor. + * Select a security flavor for this mount. The selected flavor + * is planted in args->auth_flavors[0]. */ -static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, - struct nfs_mount_request *request) +static void nfs_select_flavor(struct nfs_parsed_mount_data *args, + struct nfs_mount_request *request) { - unsigned int i, j, server_authlist_len = *(request->auth_flav_len); + unsigned int i, count = *(request->auth_flav_len); + rpc_authflavor_t flavor; + + if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) + goto out; + + /* + * The NFSv2 MNT operation does not return a flavor list. + */ + if (args->mount_server.version != NFS_MNT3_VERSION) + goto out_default; /* * Certain releases of Linux's mountd return an empty - * flavor list. To prevent behavioral regression with - * these servers (ie. rejecting mounts that used to - * succeed), revert to pre-2.6.32 behavior (no checking) - * if the returned flavor list is empty. + * flavor list in some cases. 
*/ - if (server_authlist_len == 0) - return 0; + if (count == 0) + goto out_default; /* - * We avoid sophisticated negotiating here, as there are - * plenty of cases where we can get it wrong, providing - * either too little or too much security. - * * RFC 2623, section 2.7 suggests we SHOULD prefer the * flavor listed first. However, some servers list - * AUTH_NULL first. Our caller plants AUTH_SYS, the - * preferred default, in args->auth_flavors[0] if user - * didn't specify sec= mount option. + * AUTH_NULL first. Avoid ever choosing AUTH_NULL. */ - for (i = 0; i < args->auth_flavor_len; i++) - for (j = 0; j < server_authlist_len; j++) - if (args->auth_flavors[i] == request->auth_flavs[j]) { - dfprintk(MOUNT, "NFS: using auth flavor %d\n", - request->auth_flavs[j]); - args->auth_flavors[0] = request->auth_flavs[j]; - return 0; - } + for (i = 0; i < count; i++) { + struct rpcsec_gss_info info; + + flavor = request->auth_flavs[i]; + switch (flavor) { + case RPC_AUTH_UNIX: + goto out_set; + case RPC_AUTH_NULL: + continue; + default: + if (rpcauth_get_gssinfo(flavor, &info) == 0) + goto out_set; + } + } - dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); - nfs_umount(request); - return -EACCES; +out_default: + flavor = RPC_AUTH_UNIX; +out_set: + args->auth_flavors[0] = flavor; +out: + dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]); } /* @@ -1713,12 +1721,8 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args, return status; } - /* - * MNTv1 (NFSv2) does not support auth flavor negotiation. - */ - if (args->mount_server.version != NFS_MNT3_VERSION) - return 0; - return nfs_walk_authlist(args, &request); + nfs_select_flavor(args, &request); + return 0; } struct dentry *nfs_try_mount(int flags, const char *dev_name, @@ -2381,10 +2385,9 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { /* clone any lsm security options from the parent to the new sb */ - security_sb_clone_mnt_opts(mount_info->cloned->sb, s); if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; - return 0; + return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c483cc50b82e..a2c7c28049d5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1251,6 +1251,8 @@ void nfs_write_prepare(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + rpc_exit(task, -EIO); } void nfs_commit_prepare(struct rpc_task *task, void *calldata) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2e27430b9070..417c84877742 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -234,7 +234,6 @@ static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { struct idr *stateids = &cl->cl_stateids; - static int min_stateid = 0; struct nfs4_stid *stid; int new_id; @@ -242,7 +241,7 @@ kmem_cache *slab) if (!stid) return NULL; - new_id = idr_alloc(stateids, stid, min_stateid, 0, GFP_KERNEL); + new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL); if (new_id < 0) goto out_free; stid->sc_client = cl; @@ -261,10 +260,6 @@ kmem_cache *slab) * amount of time until an id is reused, by ensuring they always * "increase" (mod INT_MAX): */ - - min_stateid = new_id+1; - if (min_stateid == INT_MAX) - 
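nfs_select_flavor() above replaces the list-matching nfs_walk_authlist() and only runs when the user gave no sec= option: it takes the first AUTH_UNIX the server lists, never chooses AUTH_NULL, accepts a GSS flavor only if rpcauth_get_gssinfo() says the kernel can instantiate it, and falls back to AUTH_UNIX for empty lists and NFSv2 MNT replies. In miniature; gss_supported() is a stand-in for the rpcauth probe and the krb5 pseudoflavor range is illustrative:

#include <stdbool.h>

enum { AUTH_NULL = 0, AUTH_UNIX = 1 };

/* Stand-in for rpcauth_get_gssinfo(): can this kernel instantiate
 * the GSS pseudoflavor?  Pretend only the krb5 family is built in. */
static bool gss_supported(int flavor)
{
        return flavor >= 390003 && flavor <= 390005;
}

static int select_flavor(const int *list, unsigned int count)
{
        unsigned int i;

        for (i = 0; i < count; i++) {
                if (list[i] == AUTH_UNIX)
                        return AUTH_UNIX;       /* always acceptable */
                if (list[i] == AUTH_NULL)
                        continue;               /* never pick AUTH_NULL */
                if (gss_supported(list[i]))
                        return list[i];
        }
        return AUTH_UNIX;   /* empty list, NFSv2 MNT, or no usable entry */
}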
min_stateid = 0; return stid; out_free: kfree(stid); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 01168865dd37..2502951714b1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -264,7 +264,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { - int nace; + u32 nace; struct nfs4_ace *ace; READ_BUF(4); len += 4; @@ -3138,10 +3138,9 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, - __be32 nfserr,struct svc_export *exp) + __be32 nfserr, struct svc_export *exp) { - int i = 0; - u32 nflavs; + u32 i, nflavs; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; __be32 *p; @@ -3172,30 +3171,29 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, WRITE32(nflavs); ADJUST_ARGS(); for (i = 0; i < nflavs; i++) { - u32 flav = flavs[i].pseudoflavor; - struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav); + struct rpcsec_gss_info info; - if (gm) { + if (rpcauth_get_gssinfo(flavs[i].pseudoflavor, &info) == 0) { RESERVE_SPACE(4); WRITE32(RPC_AUTH_GSS); ADJUST_ARGS(); - RESERVE_SPACE(4 + gm->gm_oid.len); - WRITE32(gm->gm_oid.len); - WRITEMEM(gm->gm_oid.data, gm->gm_oid.len); + RESERVE_SPACE(4 + info.oid.len); + WRITE32(info.oid.len); + WRITEMEM(info.oid.data, info.oid.len); ADJUST_ARGS(); RESERVE_SPACE(4); - WRITE32(0); /* qop */ + WRITE32(info.qop); ADJUST_ARGS(); RESERVE_SPACE(4); - WRITE32(gss_pseudoflavor_to_service(gm, flav)); + WRITE32(info.service); ADJUST_ARGS(); - gss_mech_put(gm); } else { RESERVE_SPACE(4); - WRITE32(flav); + WRITE32(flavs[i].pseudoflavor); ADJUST_ARGS(); } } + out: if (exp) exp_put(exp); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f33455b4d957..5bee0313dffd 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -177,7 +177,7 @@ static int export_features_open(struct inode *inode, struct file *file) return single_open(file, export_features_show, NULL); } -static struct file_operations export_features_operations = { +static const struct file_operations export_features_operations = { .open = export_features_open, .read = seq_read, .llseek = seq_lseek, @@ -196,7 +196,7 @@ static int supported_enctypes_open(struct inode *inode, struct file *file) return single_open(file, supported_enctypes_show, NULL); } -static struct file_operations supported_enctypes_ops = { +static const struct file_operations supported_enctypes_ops = { .open = supported_enctypes_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6b49f14eac8c..cf02f5530713 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -175,6 +175,11 @@ static int nilfs_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int err = 0; + if (inode->i_sb->s_flags & MS_RDONLY) { + nilfs_clear_dirty_pages(mapping, false); + return -EROFS; + } + if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_dsync_segment(inode->i_sb, inode, wbc->range_start, @@ -187,6 +192,18 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; int err; + if (inode->i_sb->s_flags & MS_RDONLY) { + /* + * It means that filesystem was remounted in read-only + * mode because of error or metadata corruption. But we + * have dirty pages that try to be flushed in background. + * So, here we simply discard this dirty page. 
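Both nilfs2 writeback entry points now begin with the same guard: once the filesystem has been remounted read-only after an error, dirty pages can never be flushed, so they are discarded instead of redirtied. A minimal sketch of the pattern (example_writepage is an illustrative name):

    static int example_writepage(struct page *page,
                                 struct writeback_control *wbc)
    {
        struct inode *inode = page->mapping->host;

        if (inode->i_sb->s_flags & MS_RDONLY) {
            /*
             * The filesystem went read-only because of an error or
             * metadata corruption; writing this page back can never
             * succeed, so drop it (non-silently, to leave a trace).
             */
            nilfs_clear_dirty_page(page, false);
            unlock_page(page);
            return -EROFS;
        }
        /* ... normal writeback path ... */
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
        return 0;
    }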
+ */ + nilfs_clear_dirty_page(page, false); + unlock_page(page); + return -EROFS; + } + redirty_page_for_writepage(wbc, page); unlock_page(page); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index f9897d09c693..c4dcd1db57ee 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -375,14 +375,25 @@ int nilfs_mdt_fetch_dirty(struct inode *inode) static int nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode; + struct inode *inode = page->mapping->host; struct super_block *sb; int err = 0; + if (inode && (inode->i_sb->s_flags & MS_RDONLY)) { + /* + * It means that filesystem was remounted in read-only + * mode because of error or metadata corruption. But we + * have dirty pages that try to be flushed in background. + * So, here we simply discard this dirty page. + */ + nilfs_clear_dirty_page(page, false); + unlock_page(page); + return -EROFS; + } + redirty_page_for_writepage(wbc, page); unlock_page(page); - inode = page->mapping->host; if (!inode) return 0; @@ -561,10 +572,10 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode) if (mi->mi_palloc_cache) nilfs_palloc_clear_cache(inode); - nilfs_clear_dirty_pages(inode->i_mapping); + nilfs_clear_dirty_pages(inode->i_mapping, true); nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data); - nilfs_clear_dirty_pages(&ii->i_btnode_cache); + nilfs_clear_dirty_pages(&ii->i_btnode_cache, true); nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes); nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 07f76db04ec7..0ba679866e50 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -370,7 +370,12 @@ repeat: goto repeat; } -void nilfs_clear_dirty_pages(struct address_space *mapping) +/** + * nilfs_clear_dirty_pages - discard dirty pages in address space + * @mapping: address space with dirty pages for discarding + * @silent: suppress [true] or print [false] warning messages + */ +void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) { struct pagevec pvec; unsigned int i; @@ -382,25 +387,9 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; lock_page(page); - ClearPageUptodate(page); - ClearPageMappedToDisk(page); - bh = head = page_buffers(page); - do { - lock_buffer(bh); - clear_buffer_dirty(bh); - clear_buffer_nilfs_volatile(bh); - clear_buffer_nilfs_checked(bh); - clear_buffer_nilfs_redirected(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - unlock_buffer(bh); - bh = bh->b_this_page; - } while (bh != head); - - __nilfs_clear_page_dirty(page); + nilfs_clear_dirty_page(page, silent); unlock_page(page); } pagevec_release(&pvec); @@ -408,6 +397,51 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) } } +/** + * nilfs_clear_dirty_page - discard dirty page + * @page: dirty page that will be discarded + * @silent: suppress [true] or print [false] warning messages + */ +void nilfs_clear_dirty_page(struct page *page, bool silent) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + + BUG_ON(!PageLocked(page)); + + if (!silent) { + nilfs_warning(sb, __func__, + "discard page: offset %lld, ino %lu", + page_offset(page), inode->i_ino); + } + + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + 
lock_buffer(bh); + if (!silent) { + nilfs_warning(sb, __func__, + "discard block %llu, size %zu", + (u64)bh->b_blocknr, bh->b_size); + } + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); + clear_buffer_nilfs_redirected(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + unlock_buffer(bh); + } while (bh = bh->b_this_page, bh != head); + } + + __nilfs_clear_page_dirty(page); +} + unsigned nilfs_page_count_clean_buffers(struct page *page, unsigned from, unsigned to) { diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index fb7de71605a0..ef30c5c2426f 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -55,7 +55,8 @@ void nilfs_page_bug(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); -void nilfs_clear_dirty_pages(struct address_space *); +void nilfs_clear_dirty_page(struct page *, bool); +void nilfs_clear_dirty_pages(struct address_space *, bool); void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, struct backing_dev_info *bdi); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 5d8444268a16..d0be29fa94cf 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -755,9 +755,9 @@ out_destroy_group: return fd; } -SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, - __u64 mask, int dfd, - const char __user * pathname) +SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, + __u64, mask, int, dfd, + const char __user *, pathname) { struct inode *inode = NULL; struct vfsmount *mnt = NULL; @@ -857,17 +857,6 @@ fput_and_out: return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask, - long dfd, long pathname) -{ - return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags, - mask, (int) dfd, - (const char __user *) pathname); -} -SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark); -#endif - /* * fanotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. 
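fanotify_mark above is one of several syscalls in this series converted from the untyped SYSCALL_DEFINE(name)(...) form, which required a hand-written SyS_* alias under CONFIG_HAVE_SYSCALL_WRAPPERS, to the typed SYSCALL_DEFINEn macros, which generate that glue themselves. Roughly, as a simplified sketch of the expansion rather than the exact kernel macros (demo is a hypothetical syscall):

    /* SYSCALL_DEFINE2(demo, int, fd, unsigned int, flags) produces
     * approximately the following: */

    static inline long SYSC_demo(int fd, unsigned int flags);

    asmlinkage long SyS_demo(long fd, long flags)
    {
        /* the narrowing casts the old wrappers did by hand */
        return SYSC_demo((int)fd, (unsigned int)flags);
    }

    static inline long SYSC_demo(int fd, unsigned int flags)
    {
        /* the body the author wrote inside SYSCALL_DEFINE2(...) */
        return 0;
    }

Arguments wider than long, such as the __u64 mask here, are kept at their full width by the macros, which is exactly the bookkeeping the deleted #ifdef block used to do manually.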
So an (unlikely) failure here diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e0f7c1241a6a..959815c1e017 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -287,9 +287,6 @@ static int inotify_release(struct inode *ignored, struct file *file) pr_debug("%s: group=%p\n", __func__, group); - if (file->f_flags & FASYNC) - fsnotify_fasync(-1, file, 0); - /* free this group, matching get was inotify_init->fsnotify_obtain_group */ fsnotify_destroy_group(group); @@ -359,7 +356,6 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns } static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, - int *last_wd, struct inotify_inode_mark *i_mark) { int ret; @@ -367,11 +363,10 @@ static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, idr_preload(GFP_KERNEL); spin_lock(idr_lock); - ret = idr_alloc(idr, i_mark, *last_wd + 1, 0, GFP_NOWAIT); + ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT); if (ret >= 0) { /* we added the mark to the idr, take a reference */ i_mark->wd = ret; - *last_wd = i_mark->wd; fsnotify_get_mark(&i_mark->fsn_mark); } @@ -572,7 +567,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, int add = (arg & IN_MASK_ADD); int ret; - /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); fsn_mark = fsnotify_find_inode_mark(group, inode); @@ -623,7 +617,6 @@ static int inotify_new_watch(struct fsnotify_group *group, struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; - /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); @@ -638,8 +631,7 @@ static int inotify_new_watch(struct fsnotify_group *group, if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) goto out_err; - ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd, - tmp_i_mark); + ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; @@ -697,7 +689,6 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 0; group->inotify_data.user = get_current_user(); if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > @@ -751,6 +742,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, int ret; unsigned flags = 0; + /* don't allow invalid bits: we don't want flags set */ + if (unlikely(!(mask & ALL_INOTIFY_BITS))) + return -EINVAL; + f = fdget(fd); if (unlikely(!f.file)) return -EBADF; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5b2d4f0853ac..1da4b81e6f76 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2129,7 +2129,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2138,7 +2137,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0) ret = err; } - sb_end_write(inode->i_sb); return ret; } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index eeac97bb3bfa..b3fdd1a323d6 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1498,10 +1498,8 @@ leave: 
dlm_put(dlm); if (ret < 0) { - if (buf) - kfree(buf); - if (item) - kfree(item); + kfree(buf); + kfree(item); mlog_errno(ret); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6474cb44004d..8a7509f9e6f5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2248,8 +2248,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (iocb->ki_left == 0) return 0; - sb_start_write(inode->i_sb); - appending = file->f_flags & O_APPEND ? 1 : 0; direct_io = file->f_flags & O_DIRECT ? 1 : 0; @@ -2423,7 +2421,6 @@ out_sems: ocfs2_iocb_clear_sem_locked(iocb); mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); if (written) ret = written; @@ -2468,8 +2465,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name, len); - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); + pipe_lock(pipe); splice_from_pipe_begin(&sd); do { @@ -2489,8 +2485,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, } while (ret > 0); splice_from_pipe_end(pipe, &sd); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 752f0b26221d..0c60ef2d8056 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -101,13 +101,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if (!S_ISDIR(inode->i_mode)) flags &= ~OCFS2_DIRSYNC_FL; - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto bail_unlock; - } - oldflags = ocfs2_inode->ip_attr; flags = flags & mask; flags |= oldflags & ~mask; @@ -120,7 +113,14 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) - goto bail_commit; + goto bail_unlock; + } + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; } ocfs2_inode->ip_attr = flags; @@ -130,8 +130,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if (status < 0) mlog_errno(status); -bail_commit: ocfs2_commit_trans(osb, handle); + bail_unlock: ocfs2_inode_unlock(inode, 1); bail: @@ -706,8 +706,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode, o2info_set_request_filled(&oiff->iff_req); - if (o2info_to_user(*oiff, req)) + if (o2info_to_user(*oiff, req)) { + status = -EFAULT; goto bail; + } status = 0; bail: diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 9f8dcadd9a50..f1fc172175b6 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -471,7 +471,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, int ret, goal_bit = 0; struct buffer_head *gd_bh = NULL; - struct ocfs2_group_desc *bg = NULL; + struct ocfs2_group_desc *bg; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int c_to_b = 1 << (osb->s_clustersize_bits - inode->i_sb->s_blocksize_bits); @@ -482,13 +482,6 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, range->me_goal); /* - * moving goal is not allowd to start with a group desc blok(#0 blk) - * let's compromise to the latter cluster. 
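The ocfs2_set_inode_attr() hunk above is a pure reordering with a practical payoff: the journal handle used to be started before the CAP_LINUX_IMMUTABLE check, so a mere permission failure cost a start/commit cycle. Moving ocfs2_start_trans() below the last check that can fail keeps the failure path transaction-free. Schematically (needs_immutable_cap is a hypothetical stand-in for the flag comparison):

    /* 1. cluster lock taken, inode read          (can fail: bail) */
    /* 2. validate the requested flag change      (can fail: bail) */
    if (needs_immutable_cap(oldflags, flags) && !capable(CAP_LINUX_IMMUTABLE))
        goto bail_unlock;               /* nothing journaled yet */

    /* 3. only now open a transaction */
    handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    if (IS_ERR(handle)) {
        status = PTR_ERR(handle);
        goto bail_unlock;
    }
    /* 4. modify, journal, commit */
    ocfs2_commit_trans(osb, handle);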
- */ - if (range->me_goal == le64_to_cpu(bg->bg_blkno)) - range->me_goal += c_to_b; - - /* * validate goal sits within global_bitmap, and return the victim * group desc */ @@ -502,6 +495,13 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; /* + * moving goal is not allowd to start with a group desc blok(#0 blk) + * let's compromise to the latter cluster. + */ + if (range->me_goal == le64_to_cpu(bg->bg_blkno)) + range->me_goal += c_to_b; + + /* * movement is not gonna cross two groups. */ if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < @@ -1057,42 +1057,40 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) struct inode *inode = file_inode(filp); struct ocfs2_move_extents range; - struct ocfs2_move_extents_context *context = NULL; + struct ocfs2_move_extents_context *context; + + if (!argp) + return -EINVAL; status = mnt_want_write_file(filp); if (status) return status; if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) - goto out; + goto out_drop; if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { status = -EPERM; - goto out; + goto out_drop; } context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); if (!context) { status = -ENOMEM; mlog_errno(status); - goto out; + goto out_drop; } context->inode = inode; context->file = filp; - if (argp) { - if (copy_from_user(&range, argp, sizeof(range))) { - status = -EFAULT; - goto out; - } - } else { - status = -EINVAL; - goto out; + if (copy_from_user(&range, argp, sizeof(range))) { + status = -EFAULT; + goto out_free; } if (range.me_start > i_size_read(inode)) - goto out; + goto out_free; if (range.me_start + range.me_len > i_size_read(inode)) range.me_len = i_size_read(inode) - range.me_start; @@ -1124,25 +1122,24 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) status = ocfs2_validate_and_adjust_move_goal(inode, &range); if (status) - goto out; + goto out_copy; } status = ocfs2_move_extents(context); if (status) mlog_errno(status); -out: +out_copy: /* * movement/defragmentation may end up being partially completed, * that's the reason why we need to return userspace the finished * length and new_offset even if failure happens somewhere. 
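ocfs2_ioctl_move_extents() now checks argp before taking any resources and unwinds through ordered labels, releasing exactly what was acquired; note that the finished length is copied back to userspace even when the move only partially succeeded. The shape of the ladder, as a generic sketch (struct ctx and do_the_work are hypothetical):

    long example_ioctl(struct file *filp, void __user *argp)
    {
        struct ctx *c;
        long status;

        if (!argp)
            return -EINVAL;
        status = mnt_want_write_file(filp);
        if (status)
            return status;

        c = kzalloc(sizeof(*c), GFP_NOFS);
        if (!c) {
            status = -ENOMEM;
            goto out_drop;
        }
        if (copy_from_user(&c->range, argp, sizeof(c->range))) {
            status = -EFAULT;
            goto out_free;
        }

        status = do_the_work(c);
        /* report progress even if the work failed partway */
        if (copy_to_user(argp, &c->range, sizeof(c->range)))
            status = -EFAULT;
    out_free:
        kfree(c);
    out_drop:
        mnt_drop_write_file(filp);
        return status;
    }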
*/ - if (argp) { - if (copy_to_user(argp, &range, sizeof(range))) - status = -EFAULT; - } + if (copy_to_user(argp, &range, sizeof(range))) + status = -EFAULT; +out_free: kfree(context); - +out_drop: mnt_drop_write_file(filp); return status; diff --git a/fs/open.c b/fs/open.c index 68354466879f..8c741002f947 100644 --- a/fs/open.c +++ b/fs/open.c @@ -197,10 +197,7 @@ out: SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) { - long ret = do_sys_ftruncate(fd, length, 1); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; + return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT @@ -212,32 +209,15 @@ COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 -SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) +SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_truncate64(long path, loff_t length) -{ - return SYSC_truncate64((const char __user *) path, length); -} -SYSCALL_ALIAS(sys_truncate64, SyS_truncate64); -#endif -SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length) +SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { - long ret = do_sys_ftruncate(fd, length, 0); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; -} -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_ftruncate64(long fd, loff_t length) -{ - return SYSC_ftruncate64((unsigned int) fd, length); + return do_sys_ftruncate(fd, length, 0); } -SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64); -#endif #endif /* BITS_PER_LONG == 32 */ @@ -299,7 +279,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return ret; } -SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) +SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { struct fd f = fdget(fd); int error = -EBADF; @@ -311,14 +291,6 @@ SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) return error; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) -{ - return SYSC_fallocate((int)fd, (int)mode, offset, len); -} -SYSCALL_ALIAS(sys_fallocate, SyS_fallocate); -#endif - /* * access() needs to use the real uid/gid, not the effective uid/gid. 
* We do this by temporarily clearing all FS-related capabilities and @@ -983,29 +955,19 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(AT_FDCWD, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, flags, mode); - return ret; + return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(dfd, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, dfd, filename, flags, mode); - return ret; + return do_sys_open(dfd, filename, flags, mode); } #ifndef __alpha__ diff --git a/fs/pipe.c b/fs/pipe.c index 2234f3f61f8d..a029a14bacf1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -25,6 +25,8 @@ #include <asm/uaccess.h> #include <asm/ioctls.h> +#include "internal.h" + /* * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size @@ -53,8 +55,8 @@ unsigned int pipe_min_size = PAGE_SIZE; static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) { - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, subclass); + if (pipe->files) + mutex_lock_nested(&pipe->mutex, subclass); } void pipe_lock(struct pipe_inode_info *pipe) @@ -68,11 +70,21 @@ EXPORT_SYMBOL(pipe_lock); void pipe_unlock(struct pipe_inode_info *pipe) { - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + if (pipe->files) + mutex_unlock(&pipe->mutex); } EXPORT_SYMBOL(pipe_unlock); +static inline void __pipe_lock(struct pipe_inode_info *pipe) +{ + mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); +} + +static inline void __pipe_unlock(struct pipe_inode_info *pipe) +{ + mutex_unlock(&pipe->mutex); +} + void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { @@ -361,8 +373,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = filp->private_data; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -375,8 +386,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; + __pipe_lock(pipe); for (;;) { int bufs = pipe->nrbufs; if (bufs) { @@ -464,7 +474,7 @@ redo: } pipe_wait(pipe); } - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); /* Signal writers asynchronously that there is more room. 
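The pattern of the pipe conversion above: the pipe_inode_info is stored in filp->private_data at open time and carries its own mutex, so the hot paths no longer reach through the inode at all. Every former mutex_lock(&inode->i_mutex) around pipe state becomes this (a minimal sketch; example_pipe_op is illustrative):

    static ssize_t example_pipe_op(struct file *filp)
    {
        /* set at open time; no file_inode() dereference needed */
        struct pipe_inode_info *pipe = filp->private_data;
        ssize_t ret = 0;

        __pipe_lock(pipe);              /* pipe->mutex, not i_mutex */
        /* ... operate on pipe->bufs, pipe->curbuf, pipe->nrbufs ... */
        __pipe_unlock(pipe);
        return ret;
    }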
*/ if (do_wakeup) { @@ -486,8 +496,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t ppos) { struct file *filp = iocb->ki_filp; - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = filp->private_data; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -501,8 +510,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; + __pipe_lock(pipe); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -649,7 +657,7 @@ redo2: pipe->waiting_writers--; } out: - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); @@ -662,29 +670,14 @@ out: return ret; } -static ssize_t -bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) -{ - return -EBADF; -} - -static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, - loff_t *ppos) -{ - return -EBADF; -} - static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = filp->private_data; int count, buf, nrbufs; switch (cmd) { case FIONREAD: - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; + __pipe_lock(pipe); count = 0; buf = pipe->curbuf; nrbufs = pipe->nrbufs; @@ -692,7 +685,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count += pipe->bufs[buf].len; buf = (buf+1) & (pipe->buffers - 1); } - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); return put_user(count, (int __user *)arg); default: @@ -705,8 +698,7 @@ static unsigned int pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe = inode->i_pipe; + struct pipe_inode_info *pipe = filp->private_data; int nrbufs; poll_wait(filp, &pipe->wait, wait); @@ -734,197 +726,56 @@ pipe_poll(struct file *filp, poll_table *wait) } static int -pipe_release(struct inode *inode, int decr, int decw) +pipe_release(struct inode *inode, struct file *file) { - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = inode->i_pipe; + int kill = 0; - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; - pipe->readers -= decr; - pipe->writers -= decw; + __pipe_lock(pipe); + if (file->f_mode & FMODE_READ) + pipe->readers--; + if (file->f_mode & FMODE_WRITE) + pipe->writers--; - if (!pipe->readers && !pipe->writers) { - free_pipe_info(inode); - } else { + if (pipe->readers || pipe->writers) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(&inode->i_mutex); - - return 0; -} - -static int -pipe_read_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = file_inode(filp); - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); - mutex_unlock(&inode->i_mutex); - - return retval; -} - - -static int -pipe_write_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = file_inode(filp); - int retval; + spin_lock(&inode->i_lock); + if (!--pipe->files) { + inode->i_pipe = NULL; + kill = 1; + } + spin_unlock(&inode->i_lock); + 
__pipe_unlock(pipe); - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); - mutex_unlock(&inode->i_mutex); + if (kill) + free_pipe_info(pipe); - return retval; + return 0; } - static int -pipe_rdwr_fasync(int fd, struct file *filp, int on) +pipe_fasync(int fd, struct file *filp, int on) { - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe = inode->i_pipe; - int retval; + struct pipe_inode_info *pipe = filp->private_data; + int retval = 0; - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); - if (retval >= 0) { + __pipe_lock(pipe); + if (filp->f_mode & FMODE_READ) + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); + if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - if (retval < 0) /* this can happen only if on == T */ + if (retval < 0 && (filp->f_mode & FMODE_READ)) + /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); return retval; } - -static int -pipe_read_release(struct inode *inode, struct file *filp) -{ - return pipe_release(inode, 1, 0); -} - -static int -pipe_write_release(struct inode *inode, struct file *filp) -{ - return pipe_release(inode, 0, 1); -} - -static int -pipe_rdwr_release(struct inode *inode, struct file *filp) -{ - int decr, decw; - - decr = (filp->f_mode & FMODE_READ) != 0; - decw = (filp->f_mode & FMODE_WRITE) != 0; - return pipe_release(inode, decr, decw); -} - -static int -pipe_read_open(struct inode *inode, struct file *filp) -{ - int ret = -ENOENT; - - mutex_lock(&inode->i_mutex); - - if (inode->i_pipe) { - ret = 0; - inode->i_pipe->readers++; - } - - mutex_unlock(&inode->i_mutex); - - return ret; -} - -static int -pipe_write_open(struct inode *inode, struct file *filp) -{ - int ret = -ENOENT; - - mutex_lock(&inode->i_mutex); - - if (inode->i_pipe) { - ret = 0; - inode->i_pipe->writers++; - } - - mutex_unlock(&inode->i_mutex); - - return ret; -} - -static int -pipe_rdwr_open(struct inode *inode, struct file *filp) -{ - int ret = -ENOENT; - - if (!(filp->f_mode & (FMODE_READ|FMODE_WRITE))) - return -EINVAL; - - mutex_lock(&inode->i_mutex); - - if (inode->i_pipe) { - ret = 0; - if (filp->f_mode & FMODE_READ) - inode->i_pipe->readers++; - if (filp->f_mode & FMODE_WRITE) - inode->i_pipe->writers++; - } - - mutex_unlock(&inode->i_mutex); - - return ret; -} - -/* - * The file_operations structs are not static because they - * are also used in linux/fs/fifo.c to do operations on FIFOs. - * - * Pipes reuse fifos' file_operations structs. 
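The unified pipe_fasync() above replaces three per-open-mode helpers by dispatching on filp->f_mode, and the trio of file_operations structs deleted next collapses into a single pipefifo_fops for the same reason. Restated with comments, the dispatch idiom is:

    static int example_fasync(int fd, struct file *filp, int on)
    {
        struct pipe_inode_info *pipe = filp->private_data;
        int retval = 0;

        __pipe_lock(pipe);
        if (filp->f_mode & FMODE_READ)
            retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
        if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
            retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
            /* writer half failed after reader half succeeded: undo it */
            if (retval < 0 && (filp->f_mode & FMODE_READ))
                fasync_helper(-1, filp, 0, &pipe->fasync_readers);
        }
        __pipe_unlock(pipe);
        return retval;
    }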
- */ -const struct file_operations read_pipefifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = bad_pipe_w, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_read_open, - .release = pipe_read_release, - .fasync = pipe_read_fasync, -}; - -const struct file_operations write_pipefifo_fops = { - .llseek = no_llseek, - .read = bad_pipe_r, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_write_open, - .release = pipe_write_release, - .fasync = pipe_write_fasync, -}; - -const struct file_operations rdwr_pipefifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_rdwr_open, - .release = pipe_rdwr_release, - .fasync = pipe_rdwr_fasync, -}; - -struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; @@ -934,8 +785,8 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->inode = inode; pipe->buffers = PIPE_DEF_BUFFERS; + mutex_init(&pipe->mutex); return pipe; } kfree(pipe); @@ -944,7 +795,7 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) return NULL; } -void __free_pipe_info(struct pipe_inode_info *pipe) +void free_pipe_info(struct pipe_inode_info *pipe) { int i; @@ -959,12 +810,6 @@ void __free_pipe_info(struct pipe_inode_info *pipe) kfree(pipe); } -void free_pipe_info(struct inode *inode) -{ - __free_pipe_info(inode->i_pipe); - inode->i_pipe = NULL; -} - static struct vfsmount *pipe_mnt __read_mostly; /* @@ -990,13 +835,14 @@ static struct inode * get_pipe_inode(void) inode->i_ino = get_next_ino(); - pipe = alloc_pipe_info(inode); + pipe = alloc_pipe_info(); if (!pipe) goto fail_iput; - inode->i_pipe = pipe; + inode->i_pipe = pipe; + pipe->files = 2; pipe->readers = pipe->writers = 1; - inode->i_fop = &rdwr_pipefifo_fops; + inode->i_fop = &pipefifo_fops; /* * Mark the inode dirty from the very beginning, @@ -1039,17 +885,19 @@ int create_pipe_files(struct file **res, int flags) d_instantiate(path.dentry, inode); err = -ENFILE; - f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); + f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops); if (IS_ERR(f)) goto err_dentry; f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); + f->private_data = inode->i_pipe; - res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); + res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops); if (IS_ERR(res[0])) goto err_file; path_get(&path); + res[0]->private_data = inode->i_pipe; res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); res[1] = f; return 0; @@ -1057,12 +905,12 @@ int create_pipe_files(struct file **res, int flags) err_file: put_filp(f); err_dentry: - free_pipe_info(inode); + free_pipe_info(inode->i_pipe); path_put(&path); return err; err_inode: - free_pipe_info(inode); + free_pipe_info(inode->i_pipe); iput(inode); return err; } @@ -1144,6 +992,168 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) return sys_pipe2(fildes, 0); } +static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) +{ + int cur = *cnt; + + while (cur == *cnt) { + pipe_wait(pipe); + if (signal_pending(current)) + break; + } + return cur == *cnt ? 
-ERESTARTSYS : 0; +} + +static void wake_up_partner(struct pipe_inode_info *pipe) +{ + wake_up_interruptible(&pipe->wait); +} + +static int fifo_open(struct inode *inode, struct file *filp) +{ + struct pipe_inode_info *pipe; + bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; + int kill = 0; + int ret; + + filp->f_version = 0; + + spin_lock(&inode->i_lock); + if (inode->i_pipe) { + pipe = inode->i_pipe; + pipe->files++; + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); + pipe = alloc_pipe_info(); + if (!pipe) + return -ENOMEM; + pipe->files = 1; + spin_lock(&inode->i_lock); + if (unlikely(inode->i_pipe)) { + inode->i_pipe->files++; + spin_unlock(&inode->i_lock); + free_pipe_info(pipe); + pipe = inode->i_pipe; + } else { + inode->i_pipe = pipe; + spin_unlock(&inode->i_lock); + } + } + filp->private_data = pipe; + /* OK, we have a pipe and it's pinned down */ + + __pipe_lock(pipe); + + /* We can only do regular read/write on fifos */ + filp->f_mode &= (FMODE_READ | FMODE_WRITE); + + switch (filp->f_mode) { + case FMODE_READ: + /* + * O_RDONLY + * POSIX.1 says that O_NONBLOCK means return with the FIFO + * opened, even when there is no process writing the FIFO. + */ + pipe->r_counter++; + if (pipe->readers++ == 0) + wake_up_partner(pipe); + + if (!is_pipe && !pipe->writers) { + if ((filp->f_flags & O_NONBLOCK)) { + /* suppress POLLHUP until we have + * seen a writer */ + filp->f_version = pipe->w_counter; + } else { + if (wait_for_partner(pipe, &pipe->w_counter)) + goto err_rd; + } + } + break; + + case FMODE_WRITE: + /* + * O_WRONLY + * POSIX.1 says that O_NONBLOCK means return -1 with + * errno=ENXIO when there is no process reading the FIFO. + */ + ret = -ENXIO; + if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) + goto err; + + pipe->w_counter++; + if (!pipe->writers++) + wake_up_partner(pipe); + + if (!is_pipe && !pipe->readers) { + if (wait_for_partner(pipe, &pipe->r_counter)) + goto err_wr; + } + break; + + case FMODE_READ | FMODE_WRITE: + /* + * O_RDWR + * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. + * This implementation will NEVER block on a O_RDWR open, since + * the process can at least talk to itself. + */ + + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) + wake_up_partner(pipe); + break; + + default: + ret = -EINVAL; + goto err; + } + + /* Ok! */ + __pipe_unlock(pipe); + return 0; + +err_rd: + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err_wr: + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err: + spin_lock(&inode->i_lock); + if (!--pipe->files) { + inode->i_pipe = NULL; + kill = 1; + } + spin_unlock(&inode->i_lock); + __pipe_unlock(pipe); + if (kill) + free_pipe_info(pipe); + return ret; +} + +const struct file_operations pipefifo_fops = { + .open = fifo_open, + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = pipe_read, + .write = do_sync_write, + .aio_write = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, +}; + /* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. 
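fifo_open() above pins inode->i_pipe with a classic optimistic-allocation dance: the spinlock cannot be held across a sleeping allocation, so the pipe is allocated unlocked and the inode is re-checked before publishing. Annotated:

    spin_lock(&inode->i_lock);
    if (inode->i_pipe) {
        pipe = inode->i_pipe;           /* fast path: already attached */
        pipe->files++;
        spin_unlock(&inode->i_lock);
    } else {
        spin_unlock(&inode->i_lock);
        pipe = alloc_pipe_info();       /* may sleep: no locks held */
        if (!pipe)
            return -ENOMEM;
        pipe->files = 1;
        spin_lock(&inode->i_lock);
        if (unlikely(inode->i_pipe)) {  /* someone beat us to it */
            inode->i_pipe->files++;
            spin_unlock(&inode->i_lock);
            free_pipe_info(pipe);       /* discard our spare copy */
            pipe = inode->i_pipe;
        } else {
            inode->i_pipe = pipe;       /* we won: publish */
            spin_unlock(&inode->i_lock);
        }
    }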
@@ -1229,9 +1239,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, */ struct pipe_inode_info *get_pipe_info(struct file *file) { - struct inode *i = file_inode(file); - - return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; + return file->f_op == &pipefifo_fops ? file->private_data : NULL; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1243,7 +1251,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!pipe) return -EBADF; - mutex_lock(&pipe->inode->i_mutex); + __pipe_lock(pipe); switch (cmd) { case F_SETPIPE_SZ: { @@ -1272,7 +1280,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) } out: - mutex_unlock(&pipe->inode->i_mutex); + __pipe_unlock(pipe); return ret; } diff --git a/fs/pnode.c b/fs/pnode.c index 3e000a51ac0d..3d2a7141b87a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -9,6 +9,7 @@ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> +#include <linux/nsproxy.h> #include "internal.h" #include "pnode.h" @@ -217,15 +218,15 @@ static struct mount *get_source(struct mount *dest, * @source_mnt: source mount. * @tree_list : list of heads of trees to be attached. */ -int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, +int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct list_head *tree_list) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *m, *child; int ret = 0; struct mount *prev_dest_mnt = dest_mnt; struct mount *prev_src_mnt = source_mnt; LIST_HEAD(tmp_list); - LIST_HEAD(umount_list); for (m = propagation_next(dest_mnt, dest_mnt); m; m = propagation_next(m, dest_mnt)) { @@ -237,6 +238,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(source, source->mnt.mnt_root, type); if (IS_ERR(child)) { ret = PTR_ERR(child); @@ -244,8 +249,8 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, goto out; } - if (is_subdir(dest_dentry, m->mnt.mnt_root)) { - mnt_set_mountpoint(m, dest_dentry, child); + if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { + mnt_set_mountpoint(m, dest_mp, child); list_add_tail(&child->mnt_hash, tree_list); } else { /* @@ -261,10 +266,9 @@ out: br_write_lock(&vfsmount_lock); while (!list_empty(&tmp_list)) { child = list_first_entry(&tmp_list, struct mount, mnt_hash); - umount_tree(child, 0, &umount_list); + umount_tree(child, 0); } br_write_unlock(&vfsmount_lock); - release_mounts(&umount_list); return ret; } diff --git a/fs/pnode.h b/fs/pnode.h index 19b853a3445c..b091445c1c4a 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -23,6 +23,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 +#define CL_UNPRIVILEGED 0x40 static inline void set_mnt_shared(struct mount *mnt) { @@ -31,17 +32,16 @@ static inline void set_mnt_shared(struct mount *mnt) } void change_mnt_propagation(struct mount *, int); -int propagate_mnt(struct mount *, struct dentry *, struct mount *, +int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct list_head *); int propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int 
mnt_get_count(struct mount *mnt); -void mnt_set_mountpoint(struct mount *, struct dentry *, +void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); -void release_mounts(struct list_head *); -void umount_tree(struct mount *, int, struct list_head *); +void umount_tree(struct mount *, int); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 712f24db9600..ab30716584f5 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -5,7 +5,7 @@ obj-y += proc.o proc-y := nommu.o task_nommu.o -proc-$(CONFIG_MMU) := mmu.o task_mmu.o +proc-$(CONFIG_MMU) := task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ fd.o diff --git a/fs/proc/array.c b/fs/proc/array.c index f7ed9ee46eb9..cbd0f1b324b9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -143,6 +143,7 @@ static const char * const task_state_array[] = { "x (dead)", /* 64 */ "K (wakekill)", /* 128 */ "W (waking)", /* 256 */ + "P (parked)", /* 512 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/fs/proc/base.c b/fs/proc/base.c index 69078c7cef1f..dd51e50001fe 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/flex_array.h> +#include <linux/posix-timers.h> #ifdef CONFIG_HARDWALL #include <asm/hardwall.h> #endif @@ -404,6 +405,37 @@ static const struct file_operations proc_lstats_operations = { #endif +#ifdef CONFIG_CGROUPS +static int cgroup_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cgroup_show, pid); +} + +static const struct file_operations proc_cgroup_operations = { + .open = cgroup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#ifdef CONFIG_PROC_PID_CPUSET + +static int cpuset_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cpuset_show, pid); +} + +static const struct file_operations proc_cpuset_operations = { + .open = cpuset_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int proc_oom_score(struct task_struct *task, char *buffer) { unsigned long totalpages = totalram_pages + total_swap_pages; @@ -1347,11 +1379,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf, struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN]; + const size_t maxlen = sizeof(buffer) - 1; memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) + if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; p = get_proc_task(inode); @@ -1621,6 +1652,15 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) return 0; } +int pid_delete_dentry(const struct dentry *dentry) +{ + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. 
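The cgroup and cpuset proc files above are rewired through single_open(), the seq_file helper for files whose entire content comes from one show callback; its third argument lands in m->private. The complete recipe for such a file (demo_* names are illustrative):

    static int demo_show(struct seq_file *m, void *v)
    {
        struct pid *pid = m->private;   /* stashed by single_open() */

        seq_printf(m, "pid: %d\n", pid_nr(pid));
        return 0;
    }

    static int demo_open(struct inode *inode, struct file *file)
    {
        return single_open(file, demo_show, PROC_I(inode)->pid);
    }

    static const struct file_operations demo_operations = {
        .open       = demo_open,
        .read       = seq_read,
        .llseek     = seq_lseek,
        .release    = single_release,
    };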
+ */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, @@ -2013,6 +2053,102 @@ static const struct file_operations proc_map_files_operations = { .llseek = default_llseek, }; +struct timers_private { + struct pid *pid; + struct task_struct *task; + struct sighand_struct *sighand; + struct pid_namespace *ns; + unsigned long flags; +}; + +static void *timers_start(struct seq_file *m, loff_t *pos) +{ + struct timers_private *tp = m->private; + + tp->task = get_pid_task(tp->pid, PIDTYPE_PID); + if (!tp->task) + return ERR_PTR(-ESRCH); + + tp->sighand = lock_task_sighand(tp->task, &tp->flags); + if (!tp->sighand) + return ERR_PTR(-ESRCH); + + return seq_list_start(&tp->task->signal->posix_timers, *pos); +} + +static void *timers_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct timers_private *tp = m->private; + return seq_list_next(v, &tp->task->signal->posix_timers, pos); +} + +static void timers_stop(struct seq_file *m, void *v) +{ + struct timers_private *tp = m->private; + + if (tp->sighand) { + unlock_task_sighand(tp->task, &tp->flags); + tp->sighand = NULL; + } + + if (tp->task) { + put_task_struct(tp->task); + tp->task = NULL; + } +} + +static int show_timer(struct seq_file *m, void *v) +{ + struct k_itimer *timer; + struct timers_private *tp = m->private; + int notify; + static char *nstr[] = { + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", + }; + + timer = list_entry((struct list_head *)v, struct k_itimer, list); + notify = timer->it_sigev_notify; + + seq_printf(m, "ID: %d\n", timer->it_id); + seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo, + timer->sigq->info.si_value.sival_ptr); + seq_printf(m, "notify: %s/%s.%d\n", + nstr[notify & ~SIGEV_THREAD_ID], + (notify & SIGEV_THREAD_ID) ? 
"tid" : "pid", + pid_nr_ns(timer->it_pid, tp->ns)); + + return 0; +} + +static const struct seq_operations proc_timers_seq_ops = { + .start = timers_start, + .next = timers_next, + .stop = timers_stop, + .show = show_timer, +}; + +static int proc_timers_open(struct inode *inode, struct file *file) +{ + struct timers_private *tp; + + tp = __seq_open_private(file, &proc_timers_seq_ops, + sizeof(struct timers_private)); + if (!tp) + return -ENOMEM; + + tp->pid = proc_pid(inode); + tp->ns = inode->i_sb->s_fs_info; + return 0; +} + +static const struct file_operations proc_timers_operations = { + .open = proc_timers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; #endif /* CONFIG_CHECKPOINT_RESTORE */ static struct dentry *proc_pident_instantiate(struct inode *dir, @@ -2583,6 +2719,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), #endif +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("timers", S_IRUGO, proc_timers_operations), +#endif }; static int proc_tgid_base_readdir(struct file * filp, @@ -2794,7 +2933,7 @@ retry: return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct tgid_iter iter) @@ -2817,13 +2956,21 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct tgid_iter iter; struct pid_namespace *ns; filldir_t __filldir; + loff_t pos = filp->f_pos; - if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) + if (pos >= PID_MAX_LIMIT + TGID_OFFSET) goto out; - ns = filp->f_dentry->d_sb->s_fs_info; + if (pos == TGID_OFFSET - 1) { + if (proc_fill_cache(filp, dirent, filldir, "self", 4, + NULL, NULL, NULL) < 0) + goto out; + iter.tgid = 0; + } else { + iter.tgid = pos - TGID_OFFSET; + } iter.task = NULL; - iter.tgid = filp->f_pos - TGID_OFFSET; + ns = filp->f_dentry->d_sb->s_fs_info; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { diff --git a/fs/proc/fd.h b/fs/proc/fd.h index cbb1d47deda8..7c047f256ae2 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -11,4 +11,9 @@ extern const struct inode_operations proc_fdinfo_inode_operations; extern int proc_fd_permission(struct inode *inode, int mask); +static inline int proc_fd(struct inode *inode) +{ + return PROC_I(inode)->fd; +} + #endif /* __PROCFS_FD_H__ */ diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4b3b3ffb52f1..a2596afffae6 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -36,212 +36,6 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry return !memcmp(name, de->name, len); } -/* buffer size is one page but our output routines use some slack for overruns */ -#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) - -static ssize_t -__proc_file_read(struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct inode * inode = file_inode(file); - char *page; - ssize_t retval=0; - int eof=0; - ssize_t n, count; - char *start; - struct proc_dir_entry * dp; - unsigned long long pos; - - /* - * Gaah, please just use "seq_file" instead. The legacy /proc - * interfaces cut loff_t down to off_t for reads, and ignore - * the offset entirely for writes.. 
- */ - pos = *ppos; - if (pos > MAX_NON_LFS) - return 0; - if (nbytes > MAX_NON_LFS - pos) - nbytes = MAX_NON_LFS - pos; - - dp = PDE(inode); - if (!(page = (char*) __get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - while ((nbytes > 0) && !eof) { - count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); - - start = NULL; - if (dp->read_proc) { - /* - * How to be a proc read function - * ------------------------------ - * Prototype: - * int f(char *buffer, char **start, off_t offset, - * int count, int *peof, void *dat) - * - * Assume that the buffer is "count" bytes in size. - * - * If you know you have supplied all the data you - * have, set *peof. - * - * You have three ways to return data: - * 0) Leave *start = NULL. (This is the default.) - * Put the data of the requested offset at that - * offset within the buffer. Return the number (n) - * of bytes there are from the beginning of the - * buffer up to the last byte of data. If the - * number of supplied bytes (= n - offset) is - * greater than zero and you didn't signal eof - * and the reader is prepared to take more data - * you will be called again with the requested - * offset advanced by the number of bytes - * absorbed. This interface is useful for files - * no larger than the buffer. - * 1) Set *start = an unsigned long value less than - * the buffer address but greater than zero. - * Put the data of the requested offset at the - * beginning of the buffer. Return the number of - * bytes of data placed there. If this number is - * greater than zero and you didn't signal eof - * and the reader is prepared to take more data - * you will be called again with the requested - * offset advanced by *start. This interface is - * useful when you have a large file consisting - * of a series of blocks which you want to count - * and return as wholes. - * (Hack by Paul.Russell@rustcorp.com.au) - * 2) Set *start = an address within the buffer. - * Put the data of the requested offset at *start. - * Return the number of bytes of data placed there. - * If this number is greater than zero and you - * didn't signal eof and the reader is prepared to - * take more data you will be called again with the - * requested offset advanced by the number of bytes - * absorbed. - */ - n = dp->read_proc(page, &start, *ppos, - count, &eof, dp->data); - } else - break; - - if (n == 0) /* end of file */ - break; - if (n < 0) { /* error */ - if (retval == 0) - retval = n; - break; - } - - if (start == NULL) { - if (n > PAGE_SIZE) /* Apparent buffer overflow */ - n = PAGE_SIZE; - n -= *ppos; - if (n <= 0) - break; - if (n > count) - n = count; - start = page + *ppos; - } else if (start < page) { - if (n > PAGE_SIZE) /* Apparent buffer overflow */ - n = PAGE_SIZE; - if (n > count) { - /* - * Don't reduce n because doing so might - * cut off part of a data block. - */ - pr_warn("proc_file_read: count exceeded\n"); - } - } else /* start >= page */ { - unsigned long startoff = (unsigned long)(start - page); - if (n > (PAGE_SIZE - startoff)) /* buffer overflow? */ - n = PAGE_SIZE - startoff; - if (n > count) - n = count; - } - - n -= copy_to_user(buf, start < page ? page : start, n); - if (n == 0) { - if (retval == 0) - retval = -EFAULT; - break; - } - - *ppos += start < page ? 
(unsigned long)start : n; - nbytes -= n; - buf += n; - retval += n; - } - free_page((unsigned long) page); - return retval; -} - -static ssize_t -proc_file_read(struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct proc_dir_entry *pde = PDE(file_inode(file)); - ssize_t rv = -EIO; - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); - - rv = __proc_file_read(file, buf, nbytes, ppos); - - pde_users_dec(pde); - return rv; -} - -static ssize_t -proc_file_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct proc_dir_entry *pde = PDE(file_inode(file)); - ssize_t rv = -EIO; - - if (pde->write_proc) { - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); - - /* FIXME: does this routine need ppos? probably... */ - rv = pde->write_proc(file, buffer, count, pde->data); - pde_users_dec(pde); - } - return rv; -} - - -static loff_t -proc_file_lseek(struct file *file, loff_t offset, int orig) -{ - loff_t retval = -EINVAL; - switch (orig) { - case 1: - offset += file->f_pos; - /* fallthrough */ - case 0: - if (offset < 0 || offset > MAX_NON_LFS) - break; - file->f_pos = retval = offset; - } - return retval; -} - -static const struct file_operations proc_file_operations = { - .llseek = proc_file_lseek, - .read = proc_file_read, - .write = proc_file_write, -}; - static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; @@ -371,7 +165,7 @@ void proc_free_inum(unsigned int inum) static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, PDE(dentry->d_inode)->data); + nd_set_link(nd, __PDE_DATA(dentry->d_inode)); return NULL; } @@ -541,19 +335,17 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp return ret; if (S_ISDIR(dp->mode)) { - if (dp->proc_iops == NULL) { - dp->proc_fops = &proc_dir_operations; - dp->proc_iops = &proc_dir_inode_operations; - } + dp->proc_fops = &proc_dir_operations; + dp->proc_iops = &proc_dir_inode_operations; dir->nlink++; } else if (S_ISLNK(dp->mode)) { - if (dp->proc_iops == NULL) - dp->proc_iops = &proc_link_inode_operations; + dp->proc_iops = &proc_link_inode_operations; } else if (S_ISREG(dp->mode)) { - if (dp->proc_fops == NULL) - dp->proc_fops = &proc_file_operations; - if (dp->proc_iops == NULL) - dp->proc_iops = &proc_file_inode_operations; + BUG_ON(dp->proc_fops == NULL); + dp->proc_iops = &proc_file_inode_operations; + } else { + WARN_ON(1); + return -EINVAL; } spin_lock(&proc_subdir_lock); @@ -636,13 +428,17 @@ struct proc_dir_entry *proc_symlink(const char *name, } EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, - struct proc_dir_entry *parent) +struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data) { struct proc_dir_entry *ent; + if (mode == 0) + mode = S_IRUGO | S_IXUGO; + ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { + ent->data = data; if (proc_register(parent, ent) < 0) { kfree(ent); ent = NULL; @@ -650,82 +446,39 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, } return ent; } -EXPORT_SYMBOL(proc_mkdir_mode); +EXPORT_SYMBOL_GPL(proc_mkdir_data); -struct proc_dir_entry 
*proc_net_mkdir(struct net *net, const char *name, - struct proc_dir_entry *parent) +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, + struct proc_dir_entry *parent) { - struct proc_dir_entry *ent; - - ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2); - if (ent) { - ent->data = net; - if (proc_register(parent, ent) < 0) { - kfree(ent); - ent = NULL; - } - } - return ent; + return proc_mkdir_data(name, mode, parent, NULL); } -EXPORT_SYMBOL_GPL(proc_net_mkdir); +EXPORT_SYMBOL(proc_mkdir_mode); struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) { - return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); + return proc_mkdir_data(name, 0, parent, NULL); } EXPORT_SYMBOL(proc_mkdir); -struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode, - struct proc_dir_entry *parent) -{ - struct proc_dir_entry *ent; - nlink_t nlink; - - if (S_ISDIR(mode)) { - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO | S_IXUGO; - nlink = 2; - } else { - if ((mode & S_IFMT) == 0) - mode |= S_IFREG; - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - nlink = 1; - } - - ent = __proc_create(&parent, name, mode, nlink); - if (ent) { - if (proc_register(parent, ent) < 0) { - kfree(ent); - ent = NULL; - } - } - return ent; -} -EXPORT_SYMBOL(create_proc_entry); - struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops, void *data) { struct proc_dir_entry *pde; - nlink_t nlink; + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; - if (S_ISDIR(mode)) { - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO | S_IXUGO; - nlink = 2; - } else { - if ((mode & S_IFMT) == 0) - mode |= S_IFREG; - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - nlink = 1; + if (!S_ISREG(mode)) { + WARN_ON(1); /* use proc_mkdir() */ + return NULL; } - pde = __proc_create(&parent, name, mode, nlink); + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + pde = __proc_create(&parent, name, mode, 1); if (!pde) goto out; pde->proc_fops = proc_fops; @@ -739,6 +492,19 @@ out: return NULL; } EXPORT_SYMBOL(proc_create_data); + +void proc_set_size(struct proc_dir_entry *de, loff_t size) +{ + de->size = size; +} +EXPORT_SYMBOL(proc_set_size); + +void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) +{ + de->uid = uid; + de->gid = gid; +} +EXPORT_SYMBOL(proc_set_user); static void free_proc_entry(struct proc_dir_entry *de) { @@ -786,37 +552,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) return; } - spin_lock(&de->pde_unload_lock); - /* - * Stop accepting new callers into module. If you're - * dynamically allocating ->proc_fops, save a pointer somewhere. - */ - de->proc_fops = NULL; - /* Wait until all existing callers into module are done. 
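With create_proc_entry() deleted and proc_register() now refusing a regular file that lacks ->proc_fops, the only way to create a /proc file is proc_create_data() with a real file_operations; the private pointer is later fetched with PDE_DATA(). A minimal end-to-end example (demo names are illustrative):

    static int demo_show(struct seq_file *m, void *v)
    {
        seq_printf(m, "value: %s\n", (char *)m->private);
        return 0;
    }

    static int demo_open(struct inode *inode, struct file *file)
    {
        return single_open(file, demo_show, PDE_DATA(inode));
    }

    static const struct file_operations demo_fops = {
        .open       = demo_open,
        .read       = seq_read,
        .llseek     = seq_lseek,
        .release    = single_release,
    };

    static int __init demo_init(void)
    {
        proc_create_data("demo", 0444, NULL, &demo_fops, "hello");
        return 0;
    }
    /* teardown uses the new proc_remove()/remove_proc_subtree() helpers */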
*/ - if (de->pde_users > 0) { - DECLARE_COMPLETION_ONSTACK(c); - - if (!de->pde_unload_completion) - de->pde_unload_completion = &c; - - spin_unlock(&de->pde_unload_lock); - - wait_for_completion(de->pde_unload_completion); - - spin_lock(&de->pde_unload_lock); - } - - while (!list_empty(&de->pde_openers)) { - struct pde_opener *pdeo; - - pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); - list_del(&pdeo->lh); - spin_unlock(&de->pde_unload_lock); - pdeo->release(pdeo->inode, pdeo->file); - kfree(pdeo); - spin_lock(&de->pde_unload_lock); - } - spin_unlock(&de->pde_unload_lock); + proc_entry_rundown(de); if (S_ISDIR(de->mode)) parent->nlink--; @@ -827,3 +563,77 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); + +int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry **p; + struct proc_dir_entry *root = NULL, *de, *next; + const char *fn = name; + unsigned int len; + + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + len = strlen(fn); + + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (proc_match(len, fn, *p)) { + root = *p; + *p = root->next; + root->next = NULL; + break; + } + } + if (!root) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + de = root; + while (1) { + next = de->subdir; + if (next) { + de->subdir = next->next; + next->next = NULL; + de = next; + continue; + } + spin_unlock(&proc_subdir_lock); + + proc_entry_rundown(de); + next = de->parent; + if (S_ISDIR(de->mode)) + next->nlink--; + de->nlink = 0; + if (de == root) + break; + pde_put(de); + + spin_lock(&proc_subdir_lock); + de = next; + } + pde_put(root); + return 0; +} +EXPORT_SYMBOL(remove_proc_subtree); + +void *proc_get_parent_data(const struct inode *inode) +{ + struct proc_dir_entry *de = PDE(inode); + return de->parent->data; +} +EXPORT_SYMBOL_GPL(proc_get_parent_data); + +void proc_remove(struct proc_dir_entry *de) +{ + if (de) + remove_proc_subtree(de->name, de->parent); +} +EXPORT_SYMBOL(proc_remove); + +void *PDE_DATA(const struct inode *inode) +{ + return __PDE_DATA(inode); +} +EXPORT_SYMBOL(PDE_DATA); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 869116c2afbe..073aea60cf8f 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -22,6 +22,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> +#include <linux/magic.h> #include <asm/uaccess.h> @@ -50,8 +51,8 @@ static void proc_evict_inode(struct inode *inode) sysctl_head_put(head); } /* Release any associated namespace */ - ns_ops = PROC_I(inode)->ns_ops; - ns = PROC_I(inode)->ns; + ns_ops = PROC_I(inode)->ns.ns_ops; + ns = PROC_I(inode)->ns.ns; if (ns_ops && ns) ns_ops->put(ns); } @@ -72,8 +73,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; - ei->ns = NULL; - ei->ns_ops = NULL; + ei->ns.ns = NULL; + ei->ns.ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; @@ -129,96 +130,100 @@ static const struct super_operations proc_sops = { .show_options = proc_show_options, }; -static void __pde_users_dec(struct proc_dir_entry *pde) +enum {BIAS = -1U<<31}; + +static inline int use_pde(struct proc_dir_entry *pde) +{ + return atomic_inc_unless_negative(&pde->in_use); +} + +static void unuse_pde(struct proc_dir_entry *pde) { - pde->pde_users--; - if 
(pde->pde_unload_completion && pde->pde_users == 0) + if (atomic_dec_return(&pde->in_use) == BIAS) complete(pde->pde_unload_completion); } -void pde_users_dec(struct proc_dir_entry *pde) +/* pde is locked */ +static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { - spin_lock(&pde->pde_unload_lock); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); + if (pdeo->closing) { + /* somebody else is doing that, just wait */ + DECLARE_COMPLETION_ONSTACK(c); + pdeo->c = &c; + spin_unlock(&pde->pde_unload_lock); + wait_for_completion(&c); + spin_lock(&pde->pde_unload_lock); + } else { + struct file *file; + pdeo->closing = 1; + spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; + pde->proc_fops->release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); + list_del_init(&pdeo->lh); + if (pdeo->c) + complete(pdeo->c); + kfree(pdeo); + } +} + +void proc_entry_rundown(struct proc_dir_entry *de) +{ + DECLARE_COMPLETION_ONSTACK(c); + /* Wait until all existing callers into module are done. */ + de->pde_unload_completion = &c; + if (atomic_add_return(BIAS, &de->in_use) != BIAS) + wait_for_completion(&c); + + spin_lock(&de->pde_unload_lock); + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + close_pdeo(de, pdeo); + } + spin_unlock(&de->pde_unload_lock); } static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; - loff_t (*llseek)(struct file *, loff_t, int); - - spin_lock(&pde->pde_unload_lock); - /* - * remove_proc_entry() is going to delete PDE (as part of module - * cleanup sequence). No new callers into module allowed. - */ - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + loff_t (*llseek)(struct file *, loff_t, int); + llseek = pde->proc_fops->llseek; + if (!llseek) + llseek = default_llseek; + rv = llseek(file, offset, whence); + unuse_pde(pde); } - /* - * Bump refcount so that remove_proc_entry will wail for ->llseek to - * complete. - */ - pde->pde_users++; - /* - * Save function pointer under lock, to protect against ->proc_fops - * NULL'ifying right after ->pde_unload_lock is dropped. 
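
The scheme above replaces the pde_users counter guarded by pde_unload_lock with a biased atomic count: callers enter through atomic_inc_unless_negative(), so they are refused as soon as rundown publishes the large negative BIAS, and whichever side brings the count back to exactly BIAS signals the completion. A user-space restatement of that pattern, using C11 atomics and a POSIX semaphore in place of atomic_t and struct completion (all names hypothetical):

#include <limits.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int in_use;      /* 0 = idle, >0 = active users, <0 = dying */
static sem_t rundown_done;

static bool use_entry(void)    /* cf. use_pde() */
{
	int v = atomic_load(&in_use);
	/* Take a reference only while the count is still non-negative. */
	while (v >= 0)
		if (atomic_compare_exchange_weak(&in_use, &v, v + 1))
			return true;
	return false;          /* rundown already published the bias */
}

static void unuse_entry(void)  /* cf. unuse_pde() */
{
	/* Whoever decrements back to the bare bias is the last user out. */
	if (atomic_fetch_sub(&in_use, 1) == INT_MIN + 1)
		sem_post(&rundown_done);
}

static void rundown(void)      /* cf. proc_entry_rundown() */
{
	/* Publish the bias; if anyone was still inside, wait them out. */
	if (atomic_fetch_add(&in_use, INT_MIN) != 0)
		sem_wait(&rundown_done);
}

int main(void)
{
	sem_init(&rundown_done, 0, 0);
	if (use_entry())
		unuse_entry();                  /* a user comes and goes */
	rundown();
	printf("use after rundown: %d\n", use_entry());  /* prints 0 */
	return 0;
}

The "closing" flag and per-opener completion in close_pdeo() extend the same idea to individual openers, so remove_proc_entry() and a concurrent release agree on who runs ->release exactly once.
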
- */ - llseek = pde->proc_fops->llseek; - spin_unlock(&pde->pde_unload_lock); - - if (!llseek) - llseek = default_llseek; - rv = llseek(file, offset, whence); - - pde_users_dec(pde); return rv; } static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + read = pde->proc_fops->read; + if (read) + rv = read(file, buf, count, ppos); + unuse_pde(pde); } - pde->pde_users++; - read = pde->proc_fops->read; - spin_unlock(&pde->pde_unload_lock); - - if (read) - rv = read(file, buf, count, ppos); - - pde_users_dec(pde); return rv; } static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + write = pde->proc_fops->write; + if (write) + rv = write(file, buf, count, ppos); + unuse_pde(pde); } - pde->pde_users++; - write = pde->proc_fops->write; - spin_unlock(&pde->pde_unload_lock); - - if (write) - rv = write(file, buf, count, ppos); - - pde_users_dec(pde); return rv; } @@ -227,20 +232,12 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned int rv = DEFAULT_POLLMASK; unsigned int (*poll)(struct file *, struct poll_table_struct *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + poll = pde->proc_fops->poll; + if (poll) + rv = poll(file, pts); + unuse_pde(pde); } - pde->pde_users++; - poll = pde->proc_fops->poll; - spin_unlock(&pde->pde_unload_lock); - - if (poll) - rv = poll(file, pts); - - pde_users_dec(pde); return rv; } @@ -249,20 +246,12 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*ioctl)(struct file *, unsigned int, unsigned long); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + ioctl = pde->proc_fops->unlocked_ioctl; + if (ioctl) + rv = ioctl(file, cmd, arg); + unuse_pde(pde); } - pde->pde_users++; - ioctl = pde->proc_fops->unlocked_ioctl; - spin_unlock(&pde->pde_unload_lock); - - if (ioctl) - rv = ioctl(file, cmd, arg); - - pde_users_dec(pde); return rv; } @@ -272,20 +261,12 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*compat_ioctl)(struct file *, unsigned int, unsigned long); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + compat_ioctl = pde->proc_fops->compat_ioctl; + if (compat_ioctl) + rv = compat_ioctl(file, cmd, arg); + unuse_pde(pde); } - pde->pde_users++; - compat_ioctl = pde->proc_fops->compat_ioctl; - 
spin_unlock(&pde->pde_unload_lock); - - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); - - pde_users_dec(pde); return rv; } #endif @@ -295,20 +276,12 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; int (*mmap)(struct file *, struct vm_area_struct *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + mmap = pde->proc_fops->mmap; + if (mmap) + rv = mmap(file, vma); + unuse_pde(pde); } - pde->pde_users++; - mmap = pde->proc_fops->mmap; - spin_unlock(&pde->pde_unload_lock); - - if (mmap) - rv = mmap(file, vma); - - pde_users_dec(pde); return rv; } @@ -330,91 +303,47 @@ static int proc_reg_open(struct inode *inode, struct file *file) * by hand in remove_proc_entry(). For this, save opener's credentials * for later. */ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) return -ENOMEM; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + if (!use_pde(pde)) { kfree(pdeo); return -ENOENT; } - pde->pde_users++; open = pde->proc_fops->open; release = pde->proc_fops->release; - spin_unlock(&pde->pde_unload_lock); if (open) rv = open(inode, file); - spin_lock(&pde->pde_unload_lock); if (rv == 0 && release) { /* To know what to release. */ - pdeo->inode = inode; pdeo->file = file; /* Strictly for "too late" ->release in proc_reg_release(). */ - pdeo->release = release; + spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); } else kfree(pdeo); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); - return rv; -} - -static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, - struct inode *inode, struct file *file) -{ - struct pde_opener *pdeo; - list_for_each_entry(pdeo, &pde->pde_openers, lh) { - if (pdeo->inode == inode && pdeo->file == file) - return pdeo; - } - return NULL; + unuse_pde(pde); + return rv; } static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); - int rv = 0; - int (*release)(struct inode *, struct file *); struct pde_opener *pdeo; - spin_lock(&pde->pde_unload_lock); - pdeo = find_pde_opener(pde, inode, file); - if (!pde->proc_fops) { - /* - * Can't simply exit, __fput() will think that everything is OK, - * and move on to freeing struct file. remove_proc_entry() will - * find slacker in opener's list and will try to do non-trivial - * things with struct file. Therefore, remove opener from list. - * - * But if opener is removed from list, who will ->release it? 
- */ - if (pdeo) { - list_del(&pdeo->lh); - spin_unlock(&pde->pde_unload_lock); - rv = pdeo->release(inode, file); - kfree(pdeo); - } else - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - release = pde->proc_fops->release; - if (pdeo) { - list_del(&pdeo->lh); - kfree(pdeo); + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->file == file) { + close_pdeo(pde, pdeo); + break; + } } spin_unlock(&pde->pde_unload_lock); - - if (release) - rv = release(inode, file); - - pde_users_dec(pde); - return rv; + return 0; } static const struct file_operations proc_reg_file_ops = { @@ -462,8 +391,8 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); - if (de->proc_iops) - inode->i_op = de->proc_iops; + WARN_ON(!de->proc_iops); + inode->i_op = de->proc_iops; if (de->proc_fops) { if (S_ISREG(inode->i_mode)) { #ifdef CONFIG_COMPAT @@ -506,5 +435,5 @@ int proc_fill_super(struct super_block *s) return -ENOMEM; } - return 0; + return proc_setup_self(s); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 85ff3a4598b3..d600fb098b6a 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -1,4 +1,4 @@ -/* internal.h: internal procfs definitions +/* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -9,80 +9,83 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/sched.h> #include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> #include <linux/binfmts.h> -struct ctl_table_header; -struct mempolicy; -extern struct proc_dir_entry proc_root; -extern void proc_self_init(void); -#ifdef CONFIG_PROC_SYSCTL -extern int proc_sys_init(void); -extern void sysctl_head_put(struct ctl_table_header *head); -#else -static inline void proc_sys_init(void) { } -static inline void sysctl_head_put(struct ctl_table_header *head) { } -#endif -#ifdef CONFIG_NET -extern int proc_net_init(void); -#else -static inline int proc_net_init(void) { return 0; } -#endif +struct ctl_table_header; +struct mempolicy; -struct vmalloc_info { - unsigned long used; - unsigned long largest_chunk; +/* + * This is not completely implemented yet. The idea is to + * create an in-memory tree (like the actual /proc filesystem + * tree) of these proc_dir_entries, so that we can dynamically + * add new files to /proc. + * + * The "next" pointer creates a linked list of one /proc directory, + * while parent/subdir create the directory structure (every + * /proc file has a parent, but "subdir" is NULL for all + * non-directory entries). 
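
The comment above reads most easily next to a toy walk of those three links. A self-contained miniature with only the tree fields (name[] flattened to a pointer for brevity; the lookup loop is modeled on the proc_match() checks used earlier in this patch):

#include <stdio.h>
#include <string.h>

struct pde {
	struct pde *next, *parent, *subdir;
	unsigned char namelen;
	const char *name;
};

static struct pde *find_child(struct pde *dir, const char *name, size_t len)
{
	/* dir->subdir heads the child list, ->next chains the siblings,
	 * and every child's ->parent points back at dir. */
	for (struct pde *de = dir->subdir; de; de = de->next)
		if (de->namelen == len && !memcmp(de->name, name, len))
			return de;
	return NULL;
}

int main(void)
{
	struct pde root = { .name = "/proc" };
	struct pde sys  = { .parent = &root, .namelen = 3, .name = "sys" };
	struct pde net  = { .parent = &root, .namelen = 3, .name = "net",
			    .next = &sys };
	root.subdir = &net;        /* new entries are pushed at the head */

	printf("%s\n", find_child(&root, "sys", 3)->name);  /* "sys" */
	return 0;
}

remove_proc_subtree() a few hunks earlier walks the same links iteratively: detach the first child, descend, then climb back out via ->parent, with no recursion needed.
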
+ */ +struct proc_dir_entry { + unsigned int low_ino; + umode_t mode; + nlink_t nlink; + kuid_t uid; + kgid_t gid; + loff_t size; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + struct proc_dir_entry *next, *parent, *subdir; + void *data; + atomic_t count; /* use count */ + atomic_t in_use; /* number of callers into module in progress; */ + /* negative -> it's going away RSN */ + struct completion *pde_unload_completion; + struct list_head pde_openers; /* who did ->open, but not ->release */ + spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + u8 namelen; + char name[]; }; -#ifdef CONFIG_MMU -#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -extern void get_vmalloc_info(struct vmalloc_info *vmi); -#else - -#define VMALLOC_TOTAL 0UL -#define get_vmalloc_info(vmi) \ -do { \ - (vmi)->used = 0; \ - (vmi)->largest_chunk = 0; \ -} while(0) -#endif - -extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); - -extern const struct file_operations proc_tid_children_operations; -extern const struct file_operations proc_pid_maps_operations; -extern const struct file_operations proc_tid_maps_operations; -extern const struct file_operations proc_pid_numa_maps_operations; -extern const struct file_operations proc_tid_numa_maps_operations; -extern const struct file_operations proc_pid_smaps_operations; -extern const struct file_operations proc_tid_smaps_operations; -extern const struct file_operations proc_clear_refs_operations; -extern const struct file_operations proc_pagemap_operations; -extern const struct file_operations proc_net_operations; -extern const struct inode_operations proc_net_inode_operations; -extern const struct inode_operations proc_pid_link_inode_operations; +union proc_op { + int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_read)(struct task_struct *task, char *page); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); +}; -struct proc_maps_private { +struct proc_inode { struct pid *pid; - struct task_struct *task; -#ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; -#endif -#ifdef CONFIG_NUMA - struct mempolicy *task_mempolicy; -#endif + int fd; + union proc_op op; + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; + struct proc_ns ns; + struct inode vfs_inode; }; -void proc_init_inodecache(void); +/* + * General functions + */ +static inline struct proc_inode *PROC_I(const struct inode *inode) +{ + return container_of(inode, struct proc_inode, vfs_inode); +} + +static inline struct proc_dir_entry *PDE(const struct inode *inode) +{ + return PROC_I(inode)->pde; +} + +static inline void *__PDE_DATA(const struct inode *inode) +{ + return PDE(inode)->data; +} static inline struct pid *proc_pid(struct inode *inode) { @@ -94,11 +97,6 @@ static inline struct task_struct *get_proc_task(struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int proc_fd(struct inode *inode) -{ - return 
PROC_I(inode)->fd; -} - static inline int task_dumpable(struct task_struct *task) { int dumpable = 0; @@ -114,15 +112,6 @@ static inline int task_dumpable(struct task_struct *task) return 0; } -static inline int pid_delete_dentry(const struct dentry * dentry) -{ - /* Is the task we represent dead? - * If so, then don't put the dentry on the lru list, - * kill it immediately. - */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; -} - static inline unsigned name_to_int(struct dentry *dentry) { const char *name = dentry->d_name.name; @@ -145,63 +134,165 @@ out: return ~0U; } -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, - struct dentry *dentry); -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir); +/* + * Offset of the first process in the /proc root directory.. + */ +#define FIRST_PROCESS_ENTRY 256 -struct pde_opener { - struct inode *inode; - struct file *file; - int (*release)(struct inode *, struct file *); - struct list_head lh; -}; -void pde_users_dec(struct proc_dir_entry *pde); +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 13 + +/* + * array.c + */ +extern const struct file_operations proc_tid_children_operations; + +extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_status(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); + +/* + * base.c + */ +extern const struct dentry_operations pid_dentry_operations; +extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int proc_setattr(struct dentry *, struct iattr *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); +extern int pid_revalidate(struct dentry *, unsigned int); +extern int pid_delete_dentry(const struct dentry *); +extern int proc_pid_readdir(struct file *, void *, filldir_t); +extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +extern loff_t mem_lseek(struct file *, loff_t, int); + +/* Lookups */ +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int, + instantiate_t, struct task_struct *, const void *); +/* + * generic.c + */ extern spinlock_t proc_subdir_lock; -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int); -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); -unsigned long task_vsize(struct mm_struct *); -unsigned long task_statm(struct mm_struct *, - unsigned long *, unsigned long *, unsigned long *, unsigned long *); -void task_mem(struct seq_file *, struct mm_struct *); +extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, + struct dentry *); +extern int proc_readdir(struct file *, void *, filldir_t); +extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { atomic_inc(&pde->count); return pde; } -void pde_put(struct proc_dir_entry *pde); - -int proc_fill_super(struct 
super_block *); -struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -int proc_remount(struct super_block *sb, int *flags, char *data); +extern void pde_put(struct proc_dir_entry *); /* - * These are generic /proc routines that use the internal - * "struct proc_dir_entry" tree to traverse the filesystem. - * - * The /proc root directory has extended versions to take care - * of the /proc/<pid> subdirectories. + * inode.c */ -int proc_readdir(struct file *, void *, filldir_t); -struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +struct pde_opener { + struct file *file; + struct list_head lh; + int closing; + struct completion *c; +}; +extern const struct inode_operations proc_pid_link_inode_operations; +extern void proc_init_inodecache(void); +extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +extern int proc_fill_super(struct super_block *); +extern void proc_entry_rundown(struct proc_dir_entry *); -/* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, - struct task_struct *, const void *); -int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - const char *name, int len, - instantiate_t instantiate, struct task_struct *task, const void *ptr); -int pid_revalidate(struct dentry *dentry, unsigned int flags); -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); -extern const struct dentry_operations pid_dentry_operations; -int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int proc_setattr(struct dentry *dentry, struct iattr *attr); +/* + * proc_devtree.c + */ +#ifdef CONFIG_PROC_DEVICETREE +extern void proc_device_tree_init(void); +#endif +/* + * proc_namespaces.c + */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; +/* + * proc_net.c + */ +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +/* + * proc_self.c + */ +extern int proc_setup_self(struct super_block *); + +/* + * proc_sysctl.c + */ +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *); +#else +static inline void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } +#endif + +/* + * proc_tty.c + */ +#ifdef CONFIG_TTY +extern void proc_tty_init(void); +#else +static inline void proc_tty_init(void) {} +#endif + +/* + * root.c + */ +extern struct proc_dir_entry proc_root; + +extern void proc_self_init(void); +extern int proc_remount(struct super_block *, int *, char *); + +/* + * task_[no]mmu.c + */ +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *task_mempolicy; +#endif +}; + +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_tid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_tid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_tid_smaps_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const 
struct file_operations proc_pagemap_operations; + +extern unsigned long task_vsize(struct mm_struct *); +extern unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +extern void task_mem(struct seq_file *, struct mm_struct *); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index eda6f017f272..0a22194e5d58 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -11,10 +11,12 @@ #include <linux/mm.h> #include <linux/proc_fs.h> +#include <linux/kcore.h> #include <linux/user.h> #include <linux/capability.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/notifier.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/printk.h> @@ -27,6 +29,7 @@ #include <linux/ioport.h> #include <linux/memory.h> #include <asm/sections.h> +#include "internal.h" #define CORE_STR "CORE" @@ -564,7 +567,6 @@ static const struct file_operations proc_kcore_operations = { .llseek = default_llseek, }; -#ifdef CONFIG_MEMORY_HOTPLUG /* just remember that we have to update kcore */ static int __meminit kcore_callback(struct notifier_block *self, unsigned long action, void *arg) @@ -578,8 +580,11 @@ static int __meminit kcore_callback(struct notifier_block *self, } return NOTIFY_OK; } -#endif +static struct notifier_block kcore_callback_nb __meminitdata = { + .notifier_call = kcore_callback, + .priority = 0, +}; static struct kcore_list kcore_vmalloc; @@ -631,7 +636,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - hotplug_memory_notifier(kcore_callback, 0); + register_hotmemory_notifier(&kcore_callback_nb); return 0; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 1efaaa19c4f3..5aa847a603c0 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -11,6 +11,7 @@ #include <linux/swap.h> #include <linux/vmstat.h> #include <linux/atomic.h> +#include <linux/vmalloc.h> #include <asm/page.h> #include <asm/pgtable.h> #include "internal.h" diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c deleted file mode 100644 index 8ae221dfd010..000000000000 --- a/fs/proc/mmu.c +++ /dev/null @@ -1,60 +0,0 @@ -/* mmu.c: mmu memory info files - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -#include <linux/spinlock.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <asm/pgtable.h> -#include "internal.h" - -void get_vmalloc_info(struct vmalloc_info *vmi) -{ - struct vm_struct *vma; - unsigned long free_area_size; - unsigned long prev_end; - - vmi->used = 0; - - if (!vmlist) { - vmi->largest_chunk = VMALLOC_TOTAL; - } - else { - vmi->largest_chunk = 0; - - prev_end = VMALLOC_START; - - read_lock(&vmlist_lock); - - for (vma = vmlist; vma; vma = vma->next) { - unsigned long addr = (unsigned long) vma->addr; - - /* - * Some archs keep another range for modules in vmlist - */ - if (addr < VMALLOC_START) - continue; - if (addr >= VMALLOC_END) - break; - - vmi->used += vma->size; - - free_area_size = addr - prev_end; - if (vmi->largest_chunk < free_area_size) - vmi->largest_chunk = free_area_size; - - prev_end = vma->size + addr; - } - - if (VMALLOC_END - prev_end > vmi->largest_chunk) - vmi->largest_chunk = VMALLOC_END - prev_end; - - read_unlock(&vmlist_lock); - } -} diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 66b51c0383da..54bdc6701e9f 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -51,7 +51,7 @@ static int ns_delete_dentry(const struct dentry *dentry) static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = dentry->d_inode; - const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); @@ -95,8 +95,8 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, inode->i_op = &ns_inode_operations; inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + ei->ns.ns_ops = ns_ops; + ei->ns.ns = ns; unlock_new_inode(inode); } else { ns_ops->put(ns); @@ -128,7 +128,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); + ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); if (IS_ERR(ns_path.dentry)) { error = ERR_CAST(ns_path.dentry); goto out_put_task; @@ -148,7 +148,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl { struct inode *inode = dentry->d_inode; struct proc_inode *ei = PROC_I(inode); - const struct proc_ns_operations *ns_ops = ei->ns_ops; + const struct proc_ns_operations *ns_ops = ei->ns.ns_ops; struct task_struct *task; void *ns; char name[50]; @@ -202,7 +202,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = S_IFLNK|S_IRWXUGO; inode->i_op = &proc_ns_link_inode_operations; - ei->ns_ops = ns_ops; + ei->ns.ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -337,6 +337,11 @@ out_invalid: return ERR_PTR(-EINVAL); } +struct proc_ns *get_proc_ns(struct inode *inode) +{ + return &PROC_I(inode)->ns; +} + bool proc_ns_inode(struct inode *inode) { return inode->i_fop == &ns_file_operations; diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 30b590f5bd35..505afc950e0a 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -41,7 +41,7 @@ static int property_proc_show(struct seq_file *m, void *v) static int property_proc_open(struct inode *inode, struct file *file) { - return single_open(file, property_proc_show, PDE(inode)->data); + return single_open(file, 
property_proc_show, __PDE_DATA(inode)); } static const struct file_operations property_proc_fops = { diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index b4ac6572474f..986e83220d56 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -26,6 +26,10 @@ #include "internal.h" +static inline struct net *PDE_NET(struct proc_dir_entry *pde) +{ + return pde->parent->data; +} static struct net *get_proc_net(const struct inode *inode) { diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26bac..41a6ea93f486 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/bitops.h> +#include <linux/user_namespace.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/parser.h> @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); @@ -137,6 +141,8 @@ static void proc_kill_sb(struct super_block *sb) struct pid_namespace *ns; ns = (struct pid_namespace *)sb->s_fs_info; + if (ns->proc_self) + dput(ns->proc_self); kill_anon_super(sb); put_pid_ns(ns); } diff --git a/fs/proc/self.c b/fs/proc/self.c index aa5cc3bff140..6b6a993b5c25 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,6 +1,8 @@ -#include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/namei.h> +#include <linux/slab.h> +#include <linux/pid_namespace.h> +#include "internal.h" /* * /proc/self: @@ -48,12 +50,43 @@ static const struct inode_operations proc_self_inode_operations = { .put_link = proc_self_put_link, }; -void __init proc_self_init(void) +static unsigned self_inum; + +int proc_setup_self(struct super_block *s) { - struct proc_dir_entry *proc_self_symlink; - mode_t mode; + struct inode *root_inode = s->s_root->d_inode; + struct pid_namespace *ns = s->s_fs_info; + struct dentry *self; + + mutex_lock(&root_inode->i_mutex); + self = d_alloc_name(s->s_root, "self"); + if (self) { + struct inode *inode = new_inode_pseudo(s); + if (inode) { + inode->i_ino = self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_self_inode_operations; + d_add(self, inode); + } else { + dput(self); + self = ERR_PTR(-ENOMEM); + } + } else { + self = ERR_PTR(-ENOMEM); + } + mutex_unlock(&root_inode->i_mutex); + if (IS_ERR(self)) { + pr_err("proc_fill_super: can't allocate /proc/self\n"); + return PTR_ERR(self); + } + ns->proc_self = self; + return 0; +} - mode = S_IFLNK | S_IRWXUGO; - proc_self_symlink = proc_create("self", mode, NULL, NULL ); - proc_self_symlink->proc_iops = &proc_self_inode_operations; +void __init proc_self_init(void) +{ + proc_alloc_inum(&self_inum); } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b870f740ab5a..17f7e080d7ff 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -8,7 +8,7 @@ */ #include <linux/mm.h> -#include <linux/proc_fs.h> +#include <linux/kcore.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/elfcore.h> @@ -22,6 +22,7 @@ #include <linux/list.h> #include <asm/uaccess.h> #include <asm/io.h> +#include "internal.h" /* List representing chunks of contiguous memory areas and their offsets in * vmcore file. 
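
The PDE_NET() helper above leans on a convention this patch makes official: a directory created with proc_mkdir_data() carries the owning object in ->data, and files inside it fetch that object from the parent entry, either by open-coding pde->parent->data or via the newly exported proc_get_parent_data(). A sketch of a file handler using it (the reiserfs/procfs.c conversion further down does exactly this; struct demo_obj and the demo_* names are hypothetical):

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

struct demo_obj { int id; };            /* hypothetical per-directory state */

static int demo_show(struct seq_file *m, void *v)
{
	struct demo_obj *obj = m->private;  /* handed over by single_open() */
	seq_printf(m, "id=%d\n", obj->id);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	/* The object was attached to the *directory*:
	 *   dir = proc_mkdir_data("demo", 0, parent, obj);
	 * so a file created inside dir recovers it from its parent. */
	return single_open(file, demo_show, proc_get_parent_data(inode));
}

PDE_DATA() stays reserved for the file's own ->data; reiserfs below uses that slot for the show callback while the parent directory carries the super_block.
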
@@ -698,7 +699,7 @@ void vmcore_cleanup(void) struct list_head *pos, *next; if (proc_vmcore) { - remove_proc_entry(proc_vmcore->name, proc_vmcore->parent); + proc_remove(proc_vmcore); proc_vmcore = NULL; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 288f068740f6..32cbd7c8a90c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -83,7 +83,7 @@ struct ramoops_context { size_t console_size; size_t ftrace_size; int dump_oops; - int ecc_size; + struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; unsigned int dump_write_cnt; unsigned int dump_read_cnt; @@ -136,6 +136,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, char **buf, struct pstore_info *psi) { ssize_t size; + ssize_t ecc_notice_size; struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz; @@ -156,12 +157,18 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_nsec = 0; size = persistent_ram_old_size(prz); - *buf = kmalloc(size, GFP_KERNEL); + + /* ECC correction notice */ + ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + + *buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL); if (*buf == NULL) return -ENOMEM; + memcpy(*buf, persistent_ram_old(prz), size); + persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); - return size; + return size + ecc_notice_size; } static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) @@ -323,7 +330,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, for (i = 0; i < cxt->max_dump_cnt; i++) { size_t sz = cxt->record_size; - cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, cxt->ecc_size); + cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, + &cxt->ecc_info); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", @@ -353,7 +361,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -407,7 +415,7 @@ static int ramoops_probe(struct platform_device *pdev) cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; cxt->dump_oops = pdata->dump_oops; - cxt->ecc_size = pdata->ecc_size; + cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; @@ -465,9 +473,9 @@ static int ramoops_probe(struct platform_device *pdev) record_size = pdata->record_size; dump_oops = pdata->dump_oops; - pr_info("attached 0x%lx@0x%llx, ecc: %d\n", + pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n", cxt->size, (unsigned long long)cxt->phys_addr, - cxt->ecc_size); + cxt->ecc_info.ecc_size, cxt->ecc_info.block_size); return 0; @@ -539,7 +547,7 @@ static void ramoops_register_dummy(void) * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC * (using 1 byte for ECC isn't much of use anyway). */ - dummy_data->ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; + dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 
16 : ramoops_ecc; dummy = platform_device_register_data(NULL, "ramoops", -1, dummy_data, sizeof(struct ramoops_platform_data)); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 0306303be372..59337326e288 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -82,12 +82,12 @@ static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, uint8_t *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_size]; + uint16_t par[prz->ecc_info.ecc_size]; /* Initialize the parity buffer */ memset(par, 0, sizeof(par)); encode_rs8(prz->rs_decoder, data, len, par, 0); - for (i = 0; i < prz->ecc_size; i++) + for (i = 0; i < prz->ecc_info.ecc_size; i++) ecc[i] = par[i]; } @@ -95,9 +95,9 @@ static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz, void *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_size]; + uint16_t par[prz->ecc_info.ecc_size]; - for (i = 0; i < prz->ecc_size; i++) + for (i = 0; i < prz->ecc_info.ecc_size; i++) par[i] = ecc[i]; return decode_rs8(prz->rs_decoder, data, par, len, NULL, 0, NULL, 0, NULL); @@ -110,15 +110,15 @@ static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz, uint8_t *buffer_end = buffer->data + prz->buffer_size; uint8_t *block; uint8_t *par; - int ecc_block_size = prz->ecc_block_size; - int ecc_size = prz->ecc_size; - int size = prz->ecc_block_size; + int ecc_block_size = prz->ecc_info.block_size; + int ecc_size = prz->ecc_info.ecc_size; + int size = ecc_block_size; - if (!prz->ecc_size) + if (!ecc_size) return; block = buffer->data + (start & ~(ecc_block_size - 1)); - par = prz->par_buffer + (start / ecc_block_size) * prz->ecc_size; + par = prz->par_buffer + (start / ecc_block_size) * ecc_size; do { if (block + ecc_block_size > buffer_end) @@ -133,7 +133,7 @@ static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; - if (!prz->ecc_size) + if (!prz->ecc_info.ecc_size) return; persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer), @@ -146,14 +146,14 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) uint8_t *block; uint8_t *par; - if (!prz->ecc_size) + if (!prz->ecc_info.ecc_size) return; block = buffer->data; par = prz->par_buffer; while (block < buffer->data + buffer_size(prz)) { int numerr; - int size = prz->ecc_block_size; + int size = prz->ecc_info.block_size; if (block + size > buffer->data + prz->buffer_size) size = buffer->data + prz->buffer_size - block; numerr = persistent_ram_decode_rs8(prz, block, size, par); @@ -166,44 +166,49 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) block); prz->bad_blocks++; } - block += prz->ecc_block_size; - par += prz->ecc_size; + block += prz->ecc_info.block_size; + par += prz->ecc_info.ecc_size; } } static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, - int ecc_size) + struct persistent_ram_ecc_info *ecc_info) { int numerr; struct persistent_ram_buffer *buffer = prz->buffer; int ecc_blocks; size_t ecc_total; - int ecc_symsize = 8; - int ecc_poly = 0x11d; - if (!ecc_size) + if (!ecc_info || !ecc_info->ecc_size) return 0; - prz->ecc_block_size = 128; - prz->ecc_size = ecc_size; + prz->ecc_info.block_size = ecc_info->block_size ?: 128; + prz->ecc_info.ecc_size = ecc_info->ecc_size ?: 16; + prz->ecc_info.symsize = ecc_info->symsize ?: 8; + prz->ecc_info.poly = ecc_info->poly ?: 0x11d; - ecc_blocks = DIV_ROUND_UP(prz->buffer_size, prz->ecc_block_size); - ecc_total = (ecc_blocks + 1) * 
prz->ecc_size; + ecc_blocks = DIV_ROUND_UP(prz->buffer_size - prz->ecc_info.ecc_size, + prz->ecc_info.block_size + + prz->ecc_info.ecc_size); + ecc_total = (ecc_blocks + 1) * prz->ecc_info.ecc_size; if (ecc_total >= prz->buffer_size) { pr_err("%s: invalid ecc_size %u (total %zu, buffer size %zu)\n", - __func__, prz->ecc_size, ecc_total, prz->buffer_size); + __func__, prz->ecc_info.ecc_size, + ecc_total, prz->buffer_size); return -EINVAL; } prz->buffer_size -= ecc_total; prz->par_buffer = buffer->data + prz->buffer_size; - prz->par_header = prz->par_buffer + ecc_blocks * prz->ecc_size; + prz->par_header = prz->par_buffer + + ecc_blocks * prz->ecc_info.ecc_size; /* * first consecutive root is 0 * primitive element to generate roots = 1 */ - prz->rs_decoder = init_rs(ecc_symsize, ecc_poly, 0, 1, prz->ecc_size); + prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly, + 0, 1, prz->ecc_info.ecc_size); if (prz->rs_decoder == NULL) { pr_info("persistent_ram: init_rs failed\n"); return -EINVAL; @@ -230,6 +235,9 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, { ssize_t ret; + if (!prz->ecc_info.ecc_size) + return 0; + if (prz->corrected_bytes || prz->bad_blocks) ret = snprintf(str, len, "" "\n%d Corrected bytes, %d unrecoverable blocks\n", @@ -391,11 +399,11 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, } static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, - int ecc_size) + struct persistent_ram_ecc_info *ecc_info) { int ret; - ret = persistent_ram_init_ecc(prz, ecc_size); + ret = persistent_ram_init_ecc(prz, ecc_info); if (ret) return ret; @@ -444,7 +452,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, - u32 sig, int ecc_size) + u32 sig, struct persistent_ram_ecc_info *ecc_info) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -459,7 +467,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, if (ret) goto err; - ret = persistent_ram_post_init(prz, sig, ecc_size); + ret = persistent_ram_post_init(prz, sig, ecc_info); if (ret) goto err; diff --git a/fs/read_write.c b/fs/read_write.c index e6ddc8dceb96..605dbbcb1973 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -128,7 +128,7 @@ EXPORT_SYMBOL(generic_file_llseek_size); * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by - * @offset and @whence under i_mutex. + * @offset and @whence. 
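
Back in the persistent_ram_init_ecc() hunk above: the old sizing computed ecc_blocks from the full buffer and only then shrank the buffer, so parity was reserved for blocks that no longer existed after the shrink. The new formula solves for blocks of the post-reservation data area directly; each data block pays for its own parity, plus one extra record for the zone header. A worked check with the defaults from the patch (128-byte blocks, 16 parity bytes) and a hypothetical 4 KiB zone:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	size_t buffer_size = 4096;              /* hypothetical zone size */
	size_t block_size = 128, ecc_size = 16; /* the defaults above */

	/* blocks = ceil((4096 - 16) / (128 + 16)) = 29 */
	size_t blocks = DIV_ROUND_UP(buffer_size - ecc_size,
				     block_size + ecc_size);
	size_t total  = (blocks + 1) * ecc_size;   /* 30 * 16 = 480 */
	size_t data   = buffer_size - total;       /* 3616 */

	/* Check: 3616 data bytes need ceil(3616 / 128) = 29 blocks. */
	printf("blocks=%zu parity=%zu data=%zu need=%zu\n",
	       blocks, total, data, DIV_ROUND_UP(data, block_size));
	return 0;
}

With the old arithmetic the same zone would have reserved parity for 33 records (32 data blocks plus the header) even though only 28 data blocks fit once the reservation was taken out.
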
*/ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { @@ -459,6 +459,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ ret = rw_verify_area(WRITE, file, pos, count); if (ret >= 0) { count = ret; + file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else @@ -468,6 +469,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ add_wchar(current, ret); } inc_syscw(current); + file_end_write(file); } return ret; @@ -515,8 +517,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, return ret; } -SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, - size_t count, loff_t pos) +SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, + size_t, count, loff_t, pos) { struct fd f; ssize_t ret = -EBADF; @@ -534,17 +536,9 @@ SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos) -{ - return SYSC_pread64((unsigned int) fd, (char __user *) buf, - (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pread64, SyS_pread64); -#endif -SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, - size_t count, loff_t pos) +SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, + size_t, count, loff_t, pos) { struct fd f; ssize_t ret = -EBADF; @@ -562,14 +556,6 @@ SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos) -{ - return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf, - (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64); -#endif /* * Reduce an iovec's length in-place. 
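
One convention worth noting before the compat wrappers added below: a 32-bit caller cannot pass a 64-bit file offset in one register, so the compat preadv/pwritev entry points take it as a pos_low/pos_high pair and splice it back together. The reassembly, restated as a runnable check (values hypothetical):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t pos_low = 0x89abcdef, pos_high = 0x01234567;
	/* Exactly the expression used in the compat wrappers below. */
	int64_t pos = ((int64_t)pos_high << 32) | pos_low;

	printf("%#llx\n", (unsigned long long)pos);  /* 0x123456789abcdef */
	return 0;
}

The hand-written SyS_pread64/SyS_pwrite64 aliases deleted above existed for the same register-width reason; they can go because SYSCALL_DEFINEx now emits the argument-narrowing glue itself on architectures that need it.
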
Return the resulting number of segments @@ -592,7 +578,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) } EXPORT_SYMBOL(iov_shorten); -ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, +static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) { struct kiocb kiocb; @@ -617,7 +603,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, } /* Do it by hand, with file-ops */ -ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, +static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, unsigned long nr_segs, loff_t *ppos, io_fn_t fn) { struct iovec *vector = iov; @@ -759,6 +745,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } else { fn = (io_fn_t)file->f_op->write; fnv = file->f_op->aio_write; + file_start_write(file); } if (fnv) @@ -767,6 +754,9 @@ static ssize_t do_readv_writev(int type, struct file *file, else ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + if (type != READ) + file_end_write(file); + out: if (iov != iovstack) kfree(iov); @@ -897,8 +887,203 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, return ret; } -ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, - loff_t max) +#ifdef CONFIG_COMPAT + +static ssize_t compat_do_readv_writev(int type, struct file *file, + const struct compat_iovec __user *uvector, + unsigned long nr_segs, loff_t *pos) +{ + compat_ssize_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + ssize_t ret; + io_fn_t fn; + iov_fn_t fnv; + + ret = -EINVAL; + if (!file->f_op) + goto out; + + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, + UIO_FASTIOV, iovstack, &iov); + if (ret <= 0) + goto out; + + tot_len = ret; + ret = rw_verify_area(type, file, pos, tot_len); + if (ret < 0) + goto out; + + fnv = NULL; + if (type == READ) { + fn = file->f_op->read; + fnv = file->f_op->aio_read; + } else { + fn = (io_fn_t)file->f_op->write; + fnv = file->f_op->aio_write; + file_start_write(file); + } + + if (fnv) + ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, + pos, fnv); + else + ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + + if (type != READ) + file_end_write(file); + +out: + if (iov != iovstack) + kfree(iov); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file); + else + fsnotify_modify(file); + } + return ret; +} + +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_READ)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) + goto out; + + ret = compat_do_readv_writev(READ, file, vec, vlen, pos); + +out: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen) +{ + struct fd f = fdget(fd); + ssize_t ret; + loff_t pos; + + if (!f.file) + return -EBADF; + pos = f.file->f_pos; + ret = compat_readv(f.file, vec, vlen, &pos); + f.file->f_pos = pos; + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + 
struct fd f; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PREAD) + ret = compat_readv(f.file, vec, vlen, &pos); + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, u32, pos_low, u32, pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + return compat_sys_preadv64(fd, vec, vlen, pos); +} + +static size_t compat_writev(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_WRITE)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) + goto out; + + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); + +out: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, + const struct compat_iovec __user *, vec, + unsigned long, vlen) +{ + struct fd f = fdget(fd); + ssize_t ret; + loff_t pos; + + if (!f.file) + return -EBADF; + pos = f.file->f_pos; + ret = compat_writev(f.file, vec, vlen, &pos); + f.file->f_pos = pos; + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + struct fd f; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PWRITE) + ret = compat_writev(f.file, vec, vlen, &pos); + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, u32, pos_low, u32, pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + return compat_sys_pwritev64(fd, vec, vlen, pos); +} +#endif + +static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, + size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; @@ -1022,3 +1207,43 @@ SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, si return do_sendfile(out_fd, in_fd, NULL, count, 0); } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, + compat_off_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + off_t off; + ssize_t ret; + + if (offset) { + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, + compat_loff_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + ssize_t ret; + + if (offset) { + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) + return -EFAULT; + ret = do_sendfile(out_fd, in_fd, &pos, count, 0); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} +#endif diff --git a/fs/read_write.h b/fs/read_write.h index d3e00ef67420..0ec530d9305b 100644 --- a/fs/read_write.h +++ b/fs/read_write.h @@ -7,10 +7,3 @@ typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, unsigned long, loff_t); - -ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, - unsigned long 
nr_segs, size_t len, loff_t *ppos, iov_fn_t fn); -ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, io_fn_t fn); -ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, - loff_t max); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 6165bd4784f6..dcaafcfc23b0 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -234,68 +234,9 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, return ret; } -/* Write @count bytes at position @ppos in a file indicated by @file - from the buffer @buf. - - generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want - something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was - written for (ext2/3). This is for several reasons: - - * It has no understanding of any filesystem specific optimizations. - - * It enters the filesystem repeatedly for each page that is written. - - * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key - * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time - * to reiserfs which allows for fewer tree traversals. - - * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks. - - * Asking the block allocation code for blocks one at a time is slightly less efficient. - - All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to - use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. This new code should make - things right finally. - - Future Features: providing search_by_key with hints. - -*/ -static ssize_t reiserfs_file_write(struct file *file, /* the file we are going to write into */ - const char __user * buf, /* pointer to user supplied data - (in userspace) */ - size_t count, /* amount of bytes to write */ - loff_t * ppos /* pointer to position in file that we start writing at. Should be updated to - * new current position before returning. */ - ) -{ - struct inode *inode = file_inode(file); // Inode of the file that we are writing to. - /* To simplify coding at this time, we store - locked pages in array for now */ - struct reiserfs_transaction_handle th; - th.t_trans_id = 0; - - /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items - * lying around (most of the disk, in fact). Despite the filesystem - * now being a v3.6 format, the old items still can't support large - * file sizes. Catch this case here, as the rest of the VFS layer is - * oblivious to the different limitations between old and new items. - * reiserfs_setattr catches this for truncates. This chunk is lifted - * from generic_write_checks. 
*/ - if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && - *ppos + count > MAX_NON_LFS) { - if (*ppos >= MAX_NON_LFS) { - return -EFBIG; - } - if (count > MAX_NON_LFS - (unsigned long)*ppos) - count = MAX_NON_LFS - (unsigned long)*ppos; - } - - return do_sync_write(file, buf, count, ppos); -} - const struct file_operations reiserfs_file_operations = { .read = do_sync_read, - .write = reiserfs_file_write, + .write = do_sync_write, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = reiserfs_compat_ioctl, diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 9cc0740adffa..33532f79b4f7 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -394,20 +394,24 @@ static int set_sb(struct super_block *sb, void *data) return -ENOENT; } +struct reiserfs_seq_private { + struct super_block *sb; + int (*show) (struct seq_file *, struct super_block *); +}; + static void *r_start(struct seq_file *m, loff_t * pos) { - struct proc_dir_entry *de = m->private; - struct super_block *s = de->parent->data; + struct reiserfs_seq_private *priv = m->private; loff_t l = *pos; if (l) return NULL; - if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, s))) + if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb))) return NULL; - up_write(&s->s_umount); - return s; + up_write(&priv->sb->s_umount); + return priv->sb; } static void *r_next(struct seq_file *m, void *v, loff_t * pos) @@ -426,9 +430,8 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct proc_dir_entry *de = m->private; - int (*show) (struct seq_file *, struct super_block *) = de->data; - return show(m, v); + struct reiserfs_seq_private *priv = m->private; + return priv->show(m, v); } static const struct seq_operations r_ops = { @@ -440,11 +443,15 @@ static const struct seq_operations r_ops = { static int r_open(struct inode *inode, struct file *file) { - int ret = seq_open(file, &r_ops); + struct reiserfs_seq_private *priv; + int ret = seq_open_private(file, &r_ops, + sizeof(struct reiserfs_seq_private)); if (!ret) { struct seq_file *m = file->private_data; - m->private = PDE(inode); + priv = m->private; + priv->sb = proc_get_parent_data(inode); + priv->show = PDE_DATA(inode); } return ret; } @@ -453,7 +460,7 @@ static const struct file_operations r_file_operations = { .open = r_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, .owner = THIS_MODULE, }; @@ -479,9 +486,8 @@ int reiserfs_proc_info_init(struct super_block *sb) *s = '!'; spin_lock_init(&__PINFO(sb).lock); - REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); + REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb); if (REISERFS_SB(sb)->procdir) { - REISERFS_SB(sb)->procdir->data = sb; add_file(sb, "version", show_version); add_file(sb, "super", show_super); add_file(sb, "per-level", show_per_level); @@ -499,29 +505,17 @@ int reiserfs_proc_info_init(struct super_block *sb) int reiserfs_proc_info_done(struct super_block *sb) { struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; - char b[BDEVNAME_SIZE]; - char *s; + if (de) { + char b[BDEVNAME_SIZE]; + char *s; - /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); - s = strchr(b, '/'); - if (s) - *s = '!'; + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; - if (de) { - remove_proc_entry("journal", de); - remove_proc_entry("oidmap", de); - 
remove_proc_entry("on-disk-super", de); - remove_proc_entry("bitmap", de); - remove_proc_entry("per-level", de); - remove_proc_entry("super", de); - remove_proc_entry("version", de); - } - spin_lock(&__PINFO(sb).lock); - __PINFO(sb).exiting = 1; - spin_unlock(&__PINFO(sb).lock); - if (proc_info_root) { - remove_proc_entry(b, proc_info_root); + remove_proc_subtree(b, proc_info_root); REISERFS_SB(sb)->procdir = NULL; } return 0; diff --git a/fs/seq_file.c b/fs/seq_file.c index 38bb59f3f2ad..774c1eb7f1c9 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -599,6 +599,24 @@ int single_open(struct file *file, int (*show)(struct seq_file *, void *), } EXPORT_SYMBOL(single_open); +int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), + void *data, size_t size) +{ + char *buf = kmalloc(size, GFP_KERNEL); + int ret; + if (!buf) + return -ENOMEM; + ret = single_open(file, show, data); + if (ret) { + kfree(buf); + return ret; + } + ((struct seq_file *)file->private_data)->buf = buf; + ((struct seq_file *)file->private_data)->size = size; + return 0; +} +EXPORT_SYMBOL(single_open_size); + int single_release(struct inode *inode, struct file *file) { const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; diff --git a/fs/signalfd.c b/fs/signalfd.c index b53486961735..424b7b65321f 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,7 @@ #include <linux/signalfd.h> #include <linux/syscalls.h> #include <linux/proc_fs.h> +#include <linux/compat.h> void signalfd_cleanup(struct sighand_struct *sighand) { @@ -311,3 +312,33 @@ SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, { return sys_signalfd4(ufd, user_mask, sizemask, 0); } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, + const compat_sigset_t __user *,sigmask, + compat_size_t, sigsetsize, + int, flags) +{ + compat_sigset_t ss32; + sigset_t tmp; + sigset_t __user *ksigmask; + + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&tmp, &ss32); + ksigmask = compat_alloc_user_space(sizeof(sigset_t)); + if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) + return -EFAULT; + + return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); +} + +COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd, + const compat_sigset_t __user *,sigmask, + compat_size_t, sigsetsize) +{ + return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); +} +#endif diff --git a/fs/splice.c b/fs/splice.c index 29e394e49ddd..e6b25598c8c4 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> +#include <linux/compat.h> #include "internal.h" /* @@ -218,7 +219,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, page_nr++; ret += buf->len; - if (pipe->inode) + if (pipe->files) do_wakeup = 1; if (!--spd->nr_pages) @@ -828,7 +829,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, ops->release(pipe, buf); pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; - if (pipe->inode) + if (pipe->files) sd->need_wakeup = true; } @@ -1000,8 +1001,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - sb_start_write(inode->i_sb); - pipe_lock(pipe); splice_from_pipe_begin(&sd); @@ -1037,7 +1036,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, *ppos += ret; balance_dirty_pages_ratelimited(mapping); } - 
sb_end_write(inode->i_sb); return ret; } @@ -1117,7 +1115,10 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, else splice_write = default_file_splice_write; - return splice_write(pipe, out, ppos, len, flags); + file_start_write(out); + ret = splice_write(pipe, out, ppos, len, flags); + file_end_write(out); + return ret; } /* @@ -1183,7 +1184,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, */ pipe = current->splice_pipe; if (unlikely(!pipe)) { - pipe = alloc_pipe_info(NULL); + pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; @@ -1690,6 +1691,27 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, + unsigned int, nr_segs, unsigned int, flags) +{ + unsigned i; + struct iovec __user *iov; + if (nr_segs > UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} +#endif + SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) diff --git a/fs/sync.c b/fs/sync.c index 2c5d6639a66a..905f3f6b3d85 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -283,8 +283,8 @@ EXPORT_SYMBOL(generic_write_sync); * already-instantiated disk blocks, there are no guarantees here that the data * will be available after a crash. */ -SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, - unsigned int flags) +SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, + unsigned int, flags) { int ret; struct fd f; @@ -365,29 +365,11 @@ out_put: out: return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes, - long flags) -{ - return SYSC_sync_file_range((int) fd, offset, nbytes, - (unsigned int) flags); -} -SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range); -#endif /* It would be nice if people remember that not all the world's an i386 when they introduce new system calls */ -SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags, - loff_t offset, loff_t nbytes) +SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, + loff_t, offset, loff_t, nbytes) { return sys_sync_file_range(fd, offset, nbytes, flags); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_sync_file_range2(long fd, long flags, - loff_t offset, loff_t nbytes) -{ - return SYSC_sync_file_range2((int) fd, (unsigned int) flags, - offset, nbytes); -} -SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); -#endif diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25c..e8e0e71b29d5 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -165,21 +165,8 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return NULL; - while (1) { - int v, t; - - v = atomic_read(&sd->s_active); - if (unlikely(v < 0)) - return NULL; - - t = atomic_cmpxchg(&sd->s_active, v, v + 1); - if (likely(t == v)) - break; - if (t < 0) - return NULL; - - cpu_relax(); - } + if (!atomic_inc_unless_negative(&sd->s_active)) + return NULL; if (likely(!ignore_lockdep(sd))) rwsem_acquire_read(&sd->dep_map, 0, 
1, _RET_IP_); @@ -281,6 +268,10 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) */ parent_sd = sd->s_parent; + WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), + "sysfs: free using entry: %s/%s\n", + parent_sd ? parent_sd->s_name : "", sd->s_name); + if (sysfs_type(sd) == SYSFS_KOBJ_LINK) sysfs_put(sd->s_symlink.target_sd); if (sysfs_type(sd) & SYSFS_COPY_NAME) @@ -399,7 +390,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) sd->s_name = name; sd->s_mode = mode; - sd->s_flags = type; + sd->s_flags = type | SYSFS_FLAG_REMOVED; return sd; @@ -479,6 +470,9 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; } + /* Mark the entry added into directory tree */ + sd->s_flags &= ~SYSFS_FLAG_REMOVED; + return 0; } @@ -1012,6 +1006,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) enum kobj_ns_type type; const void *ns; ino_t ino; + loff_t off; type = sysfs_ns_type(parent_sd); ns = sysfs_info(dentry->d_sb)->ns[type]; @@ -1020,6 +1015,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } if (filp->f_pos == 1) { if (parent_sd->s_parent) @@ -1028,8 +1025,11 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } mutex_lock(&sysfs_mutex); + off = filp->f_pos; for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); pos; pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { @@ -1041,27 +1041,43 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) len = strlen(name); ino = pos->s_ino; type = dt_type(pos); - filp->f_pos = pos->s_hash; + off = filp->f_pos = pos->s_hash; filp->private_data = sysfs_get(pos); mutex_unlock(&sysfs_mutex); - ret = filldir(dirent, name, len, filp->f_pos, ino, type); + ret = filldir(dirent, name, len, off, ino, type); mutex_lock(&sysfs_mutex); if (ret < 0) break; } mutex_unlock(&sysfs_mutex); - if ((filp->f_pos > 1) && !pos) { /* EOF */ - filp->f_pos = INT_MAX; + + /* don't reference last entry if its refcount is dropped */ + if (!pos) { filp->private_data = NULL; + + /* EOF and not changed as 0 or 1 in read/write path */ + if (off == filp->f_pos && off > 1) + filp->f_pos = INT_MAX; } return 0; } +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, .release = sysfs_dir_release, - .llseek = generic_file_llseek, + .llseek = sysfs_dir_llseek, }; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec733..afd83273e6ce 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/user_namespace.h> #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if 
(!info) return ERR_PTR(-ENOMEM); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ac838b844936..f21acf0ef01f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1568,6 +1568,12 @@ static int ubifs_remount_rw(struct ubifs_info *c) c->remounting_rw = 1; c->ro_mount = 0; + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + return err; + } + err = check_free_space(c); if (err) goto out; @@ -1684,12 +1690,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) err = dbg_check_space_info(c); } - if (c->space_fixup) { - err = ubifs_fixup_free_space(c); - if (err) - goto out; - } - mutex_unlock(&c->umount_mutex); return err; diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 95425b59ce0a..b6c2f94e041e 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -26,8 +26,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, count = size >> uspi->s_fshift; if (count > UFS_MAXFRAG) return NULL; - ubh = (struct ufs_buffer_head *) - kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); + ubh = kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); if (!ubh) return NULL; ubh->fragment = fragment; diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d02201df855b..6313b69b6644 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -45,11 +45,11 @@ xfs-y += xfs_aops.o \ xfs_itable.o \ xfs_message.o \ xfs_mru_cache.o \ - xfs_super.o \ - xfs_xattr.o \ xfs_rename.o \ + xfs_super.o \ xfs_utils.o \ xfs_vnodeops.o \ + xfs_xattr.o \ kmem.o \ uuid.o @@ -58,6 +58,7 @@ xfs-y += xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ xfs_attr_leaf.o \ + xfs_attr_remote.o \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ @@ -73,6 +74,7 @@ xfs-y += xfs_alloc.o \ xfs_inode.o \ xfs_log_recover.o \ xfs_mount.o \ + xfs_symlink.o \ xfs_trans.o # low-level transaction/log code diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index f2aeedb6a579..317aa86d96ea 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -30,6 +30,7 @@ struct xfs_trans; #define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ #define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ #define XFS_AGF_VERSION 1 #define XFS_AGI_VERSION 1 @@ -63,12 +64,29 @@ typedef struct xfs_agf { __be32 agf_spare0; /* spare field */ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ __be32 agf_spare1; /* spare field */ + __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ __be32 agf_flcount; /* count of blocks in freelist */ __be32 agf_freeblks; /* total free blocks */ + __be32 agf_longest; /* longest free space */ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ + uuid_t agf_uuid; /* uuid of filesystem */ + + /* + * reserve some contiguous space for future logged fields before we add + * the unlogged fields. This makes the range logging via flags and + * structure offsets much simpler. + */ + __be64 agf_spare64[16]; + + /* unlogged fields, written during buffer writeback. 
*/ + __be64 agf_lsn; /* last write sequence */ + __be32 agf_crc; /* crc of agf sector */ + __be32 agf_spare2; + + /* structure must be padded to 64 bit alignment */ } xfs_agf_t; #define XFS_AGF_MAGICNUM 0x00000001 @@ -83,7 +101,8 @@ typedef struct xfs_agf { #define XFS_AGF_FREEBLKS 0x00000200 #define XFS_AGF_LONGEST 0x00000400 #define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_NUM_BITS 12 +#define XFS_AGF_UUID 0x00001000 +#define XFS_AGF_NUM_BITS 13 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -98,7 +117,8 @@ typedef struct xfs_agf { { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ { XFS_AGF_LONGEST, "LONGEST" }, \ - { XFS_AGF_BTREEBLKS, "BTREEBLKS" } + { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ + { XFS_AGF_UUID, "UUID" } /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) @@ -132,6 +152,7 @@ typedef struct xfs_agi { __be32 agi_root; /* root of inode btree */ __be32 agi_level; /* levels in inode btree */ __be32 agi_freecount; /* number of free inodes */ + __be32 agi_newino; /* new inode just allocated */ __be32 agi_dirino; /* last directory inode chunk */ /* @@ -139,6 +160,13 @@ typedef struct xfs_agi { * still being referenced. */ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; + + uuid_t agi_uuid; /* uuid of filesystem */ + __be32 agi_crc; /* crc of agi sector */ + __be32 agi_pad32; + __be64 agi_lsn; /* last write sequence */ + + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; #define XFS_AGI_MAGICNUM 0x00000001 @@ -171,11 +199,31 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops; */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) #define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) +#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ + (__be32 *)(bp)->b_addr) + +/* + * Size of the AGFL. For CRC-enabled filesystems we steal a couple of + * slots in the beginning of the block for a proper header with the + * location information and CRC. + */ +#define XFS_AGFL_SIZE(mp) \ + (((mp)->m_sb.sb_sectsize - \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + sizeof(struct xfs_agfl) : 0)) / \ + sizeof(xfs_agblock_t)) + typedef struct xfs_agfl { - __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ + __be32 agfl_magicnum; + __be32 agfl_seqno; + uuid_t agfl_uuid; + __be64 agfl_lsn; + __be32 agfl_crc; + __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ } xfs_agfl_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0ad23253e8b1..5673bcfda2f0 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -33,7 +33,9 @@ #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" struct workqueue_struct *xfs_alloc_wq; @@ -430,53 +432,84 @@ xfs_alloc_fixup_trees( return 0; } -static void +static bool xfs_agfl_verify( struct xfs_buf *bp) { -#ifdef WHEN_CRCS_COME_ALONG - /* - * we cannot actually do any verification of the AGFL because mkfs does - * not initialise the AGFL to zero or NULL. Hence the only valid part of - * the AGFL is what the AGF says is active. We can't get to the AGF, so - * we can't verify just those entries are valid. 
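A quick illustration of the new XFS_AGFL_SIZE() arithmetic (an editorial sketch, not part of the patch): without CRCs the whole sector is __be32 slots; with CRCs the header defined above is subtracted first. Note that sizeof() also counts any tail padding the ABI adds so the structure size stays a multiple of the 64-bit agfl_lsn's alignment, and that the switch from agfl_bno[1] to a flexible agfl_bno[] keeps sizeof() from counting one slot too many:

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the on-disk header above (uuid_t is 16 bytes); the
 * kernel type is xfs_agfl_t, this copy exists only for the sketch. */
struct agfl_hdr {
        uint32_t magicnum;
        uint32_t seqno;
        uint8_t  uuid[16];
        uint64_t lsn;
        uint32_t crc;
        /* the __be32 slot array follows on disk */
};

int main(void)
{
        unsigned int sectsize = 512;    /* smallest sector size */

        /* v4 (no CRCs): every byte of the sector holds slots */
        printf("v4 slots: %u\n", sectsize / (unsigned int)sizeof(uint32_t));

        /* v5 (CRCs): header first, tail padding included in sizeof() */
        printf("v5 slots: %zu\n",
               (sectsize - sizeof(struct agfl_hdr)) / sizeof(uint32_t));
        return 0;
}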
- * - * This problem goes away when the CRC format change comes along as that - * requires the AGFL to be initialised by mkfs. At that point, we can - * verify the blocks in the agfl -active or not- lie within the bounds - * of the AG. Until then, just leave this check ifdef'd out. - */ struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); - int agfl_ok = 1; - int i; + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + return false; + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + return false; + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { - if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) - agfl_ok = 0; + return false; } + return true; +} + +static void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int agfl_ok = 1; + + /* + * There is no verification of non-crc AGFLs because mkfs does not + * initialise the AGFL to zero or NULL. Hence the only valid part of the + * AGFL is what the AGF says is active. We can't get to the AGF, so we + * can't verify just those entries are valid. + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agfl, agfl_crc)); + + agfl_ok = agfl_ok && xfs_agfl_verify(bp); if (!agfl_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); } -#endif } static void xfs_agfl_write_verify( struct xfs_buf *bp) { - xfs_agfl_verify(bp); -} + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; -static void -xfs_agfl_read_verify( - struct xfs_buf *bp) -{ - xfs_agfl_verify(bp); + /* no verification of non-crc AGFLs */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_agfl_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (bip) + XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agfl, agfl_crc)); } const struct xfs_buf_ops xfs_agfl_buf_ops = { @@ -842,7 +875,7 @@ xfs_alloc_ag_vextent_near( */ int dofirst; /* set to do first algorithm */ - dofirst = random32() & 1; + dofirst = prandom_u32() & 1; #endif restart: @@ -1982,18 +2015,18 @@ xfs_alloc_get_freelist( int btreeblk) /* destination is a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. freelist structure */ xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ + __be32 *agfl_bno; int error; int logflags; - xfs_mount_t *mp; /* mount structure */ + xfs_mount_t *mp = tp->t_mountp; xfs_perag_t *pag; /* per allocation group data */ - agf = XFS_BUF_TO_AGF(agbp); /* * Freelist is empty, give up. 
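For reference, the xfs_verify_cksum()/xfs_update_cksum() calls used above checksum the entire buffer while treating the CRC field itself -- located by offsetof() -- as zero. A self-contained sketch of the technique; the crc32c() prototype and the ~0U seed are stand-ins for the kernel's implementation:

#include <stddef.h>
#include <stdint.h>

uint32_t crc32c(uint32_t seed, const void *buf, size_t len);

/* Hash up to the 4-byte CRC field, hash four zero bytes in its
 * place, then hash the remainder of the buffer. */
static uint32_t buf_cksum(const char *buf, size_t len, size_t crc_off)
{
        uint32_t zero = 0;
        uint32_t crc;

        crc = crc32c(~0U, buf, crc_off);
        crc = crc32c(crc, &zero, sizeof(zero));
        return crc32c(crc, buf + crc_off + sizeof(zero),
                      len - crc_off - sizeof(zero));
}

Verification recomputes this value and compares it with the stored field; update writes it back, which is why the write verifiers in these hunks compute the CRC only after the LSN has been stamped.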
*/ + agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2001,15 +2034,17 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - mp = tp->t_mountp; - if ((error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) + error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), + &agflbp); + if (error) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); + + /* * Get the block number and update the data structures. */ - bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) @@ -2058,11 +2093,14 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_freeblks), offsetof(xfs_agf_t, agf_longest), offsetof(xfs_agf_t, agf_btreeblks), + offsetof(xfs_agf_t, agf_uuid), sizeof(xfs_agf_t) }; trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); } @@ -2099,12 +2137,13 @@ xfs_alloc_put_freelist( int btreeblk) /* block came from a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. free block array */ __be32 *blockp;/* pointer to array entry */ int error; int logflags; xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ + __be32 *agfl_bno; + int startoff; agf = XFS_BUF_TO_AGF(agbp); mp = tp->t_mountp; @@ -2112,7 +2151,6 @@ xfs_alloc_put_freelist( if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), &agflbp))) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) agf->agf_fllast = 0; @@ -2133,32 +2171,38 @@ xfs_alloc_put_freelist( xfs_alloc_log_agf(tp, agbp, logflags); ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); - blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); + startoff = (char *)blockp - (char *)agflbp->b_addr; + xfs_alloc_log_agf(tp, agbp, logflags); - xfs_trans_log_buf(tp, agflbp, - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + - sizeof(xfs_agblock_t) - 1)); + + xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); + xfs_trans_log_buf(tp, agflbp, startoff, + startoff + sizeof(xfs_agblock_t) - 1); return 0; } -static void +static bool xfs_agf_verify( + struct xfs_mount *mp, struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_agf *agf; - int agf_ok; + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); - agf = XFS_BUF_TO_AGF(bp); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + return false; - agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + 
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) + return false; /* * during growfs operations, the perag is not fully initialised, @@ -2166,33 +2210,58 @@ xfs_agf_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag) - agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == - bp->b_pag->pag_agno; + if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) + return false; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) + return false; + + return true; - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } } static void xfs_agf_read_verify( struct xfs_buf *bp) { - xfs_agf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + int agf_ok = 1; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agf, agf_crc)); + + agf_ok = agf_ok && xfs_agf_verify(mp, bp); + + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_agf_write_verify( struct xfs_buf *bp) { - xfs_agf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agf_verify(mp, bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agf, agf_crc)); } const struct xfs_buf_ops xfs_agf_buf_ops = { diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b1ddef6b2689..30c4c1434faf 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -33,6 +33,7 @@ #include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" STATIC struct xfs_btree_cur * @@ -272,7 +273,7 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -static void +static bool xfs_allocbt_verify( struct xfs_buf *bp) { @@ -280,66 +281,103 @@ xfs_allocbt_verify( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ /* * magic number and level verification * - * During growfs operations, we can't verify the exact level as the - * perag is not fully initialised and hence not attached to the buffer. - * In this case, check against the maximum tree depth. + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. 
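Each buffer type converted in this series ends up with the same pairing as xfs_agf_buf_ops above: the read verifier checks the CRC first (the contents are fresh off disk and untrusted) and then the structure; the write verifier checks the structure, stamps the LSN from any attached buf log item, and recomputes the CRC last so it covers the final bytes written. The shape, reduced to a schematic with illustrative names and comment-only bodies:

/* Schematic only -- see the AGF/AGFL hunks above for real instances. */
static void example_read_verify(struct xfs_buf *bp)
{
        /* 1. verify the CRC over the whole buffer
         * 2. structural checks: magic, uuid, bounds, owner */
}

static void example_write_verify(struct xfs_buf *bp)
{
        /* 1. structural checks
         * 2. stamp the LSN from bp->b_fspriv if a log item is attached
         * 3. recompute the CRC as the very last step */
}

const struct xfs_buf_ops example_buf_ops = {
        .verify_read    = example_read_verify,
        .verify_write   = example_write_verify,
};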
+ * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. */ level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; + case cpu_to_be32(XFS_ABTC_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag) - sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; - else - sblock_ok = level < mp->m_ag_maxlevels; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; break; default: - sblock_ok = 0; - break; + return false; } /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_allocbt_read_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!(xfs_btree_sblock_verify_crc(bp) && + xfs_allocbt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_allocbt_write_verify( struct xfs_buf *bp) { - xfs_allocbt_verify(bp); + if (!xfs_allocbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_allocbt_buf_ops = { @@ -444,6 +482,9 @@ xfs_allocbt_init_cursor( 
cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + return cur; } diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 7e89a2b429dd..e3a3f7424192 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -31,8 +31,10 @@ struct xfs_mount; * by blockcount and blockno. All blocks look the same to make the code * simpler; if we have time later, we'll make the optimizations. */ -#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ -#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ /* * Data record/key structure @@ -59,10 +61,10 @@ typedef __be32 xfs_alloc_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_ALLOC_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5f707e537171..3244c988d379 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -953,13 +953,13 @@ xfs_vm_writepage( unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); /* - * Just skip the page if it is fully outside i_size, e.g. due - * to a truncate operation that is in progress. + * Skip the page if it is fully outside i_size, e.g. due to a + * truncate operation that is in progress. We must redirty the + * page so that reclaim stops reclaiming it. Otherwise + * xfs_vm_releasepage() is called on it and gets confused. */ - if (page->index >= end_index + 1 || offset_into_page == 0) { - unlock_page(page); - return 0; - } + if (page->index >= end_index + 1 || offset_into_page == 0) + goto redirty; /* * The page straddles i_size. It must be zeroed out on each diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 888683844d98..20fe3fe9d341 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -15,7 +15,6 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ - #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" @@ -35,6 +34,7 @@ #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" @@ -74,13 +74,6 @@ STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -/* - * Routines to manipulate out-of-line attribute values. 
- */ -STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); -STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); - -#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ STATIC int xfs_attr_name_to_xname( @@ -820,7 +813,7 @@ xfs_attr_inactive(xfs_inode_t *dp) error = 0; goto out; } - error = xfs_attr_root_inactive(&trans, dp); + error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out; @@ -906,7 +899,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; @@ -914,14 +907,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Look up the given attribute in the leaf block. Figure out if * the given flags produce an error or call for an atomic rename. */ - retval = xfs_attr_leaf_lookup_int(bp, args); + retval = xfs_attr3_leaf_lookup_int(bp, args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { xfs_trans_brelse(args->trans, bp); - return(retval); + return retval; } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) { /* pure create op */ xfs_trans_brelse(args->trans, bp); - return(retval); + return retval; } trace_xfs_attr_leaf_replace(args); @@ -937,7 +930,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Add the attribute to the leaf block, transitioning to a Btree * if required. */ - retval = xfs_attr_leaf_add(bp, args); + retval = xfs_attr3_leaf_add(bp, args); if (retval == ENOSPC) { /* * Promote the attribute list to the Btree format, then @@ -945,7 +938,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * can manage its own transactions. */ xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1010,7 +1003,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) return(error); @@ -1032,19 +1025,19 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - xfs_attr_leaf_remove(bp, args); + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1076,9 +1069,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) /* * Added a "remote" value, just clear the incomplete flag. 
*/ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); } - return(error); + return error; } /* @@ -1101,24 +1094,24 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error == ENOATTR) { xfs_trans_brelse(args->trans, bp); - return(error); + return error; } - xfs_attr_leaf_remove(bp, args); + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1128,7 +1121,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); - return(error); + return error; } /* @@ -1138,7 +1131,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) if (committed) xfs_trans_ijoin(args->trans, dp, 0); } - return(0); + return 0; } /* @@ -1156,21 +1149,21 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); args->blkno = 0; - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return error; - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error != EEXIST) { xfs_trans_brelse(args->trans, bp); - return(error); + return error; } - error = xfs_attr_leaf_getvalue(bp, args); + error = xfs_attr3_leaf_getvalue(bp, args); xfs_trans_brelse(args->trans, bp); if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { error = xfs_attr_rmtval_get(args); } - return(error); + return error; } /* @@ -1185,11 +1178,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); if (error) return XFS_ERROR(error); - error = xfs_attr_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); return XFS_ERROR(error); } @@ -1236,7 +1229,7 @@ restart: * Search to see if name already exists, and get back a pointer * to where it should go. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; blk = &state->path.blk[ state->path.active-1 ]; @@ -1258,7 +1251,7 @@ restart: args->rmtblkcnt = 0; } - retval = xfs_attr_leaf_add(blk->bp, state->args); + retval = xfs_attr3_leaf_add(blk->bp, state->args); if (retval == ENOSPC) { if (state->path.active == 1) { /* @@ -1268,7 +1261,7 @@ restart: */ xfs_da_state_free(state); xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1307,7 +1300,7 @@ restart: * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. 
*/ xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_split(state); + error = xfs_da3_split(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1329,7 +1322,7 @@ restart: /* * Addition succeeded, update Btree hashvals. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } /* @@ -1370,7 +1363,7 @@ restart: * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) goto out; @@ -1400,7 +1393,7 @@ restart: state->blocksize = state->mp->m_sb.sb_blocksize; state->node_ents = state->mp->m_attr_node_ents; state->inleaf = 0; - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; @@ -1409,15 +1402,15 @@ restart: */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + error = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_join(state); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1450,7 +1443,7 @@ restart: /* * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); if (error) goto out; } @@ -1495,7 +1488,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error || (retval != EEXIST)) { if (error == 0) error = retval; @@ -1524,7 +1517,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Mark the attribute as INCOMPLETE, then bunmapi() the * remote value. */ - error = xfs_attr_leaf_setflag(args); + error = xfs_attr3_leaf_setflag(args); if (error) goto out; error = xfs_attr_rmtval_remove(args); @@ -1545,15 +1538,15 @@ xfs_attr_node_removename(xfs_da_args_t *args) */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. 
*/ if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_da_join(state); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -1591,13 +1584,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1699,7 +1692,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_node_read(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1718,7 +1711,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_node_read(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1758,7 +1751,7 @@ xfs_attr_node_get(xfs_da_args_t *args) /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) { retval = error; } else if (retval == EEXIST) { @@ -1769,7 +1762,7 @@ xfs_attr_node_get(xfs_da_args_t *args) /* * Get the value, local or "remote" */ - retval = xfs_attr_leaf_getvalue(blk->bp, args); + retval = xfs_attr3_leaf_getvalue(blk->bp, args); if (!retval && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { retval = xfs_attr_rmtval_get(args); @@ -1794,7 +1787,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) attrlist_cursor_kern_t *cursor; xfs_attr_leafblock_t *leaf; xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; int error, i; struct xfs_buf *bp; @@ -1810,27 +1805,33 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, + error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { + struct xfs_attr_leaf_entry *entries; + node = bp->b_addr; switch (be16_to_cpu(node->hdr.info.magic)) { case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(NULL, bp); bp = NULL; break; case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: leaf = bp->b_addr; - if (cursor->hashval > be32_to_cpu(leaf->entries[ - be16_to_cpu(leaf->hdr.count)-1].hashval)) { + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(NULL, bp); bp = NULL; 
- } else if (cursor->hashval <= - be32_to_cpu(leaf->entries[0].hashval)) { + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { trace_xfs_attr_list_wrong_blk(context); xfs_trans_brelse(NULL, bp); bp = NULL; @@ -1852,27 +1853,31 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - error = xfs_da_node_read(NULL, context->dp, + __uint16_t magic; + + error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1, &bp, XFS_ATTR_FORK); if (error) return(error); node = bp->b_addr; - if (node->hdr.info.magic == - cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) break; - if (unlikely(node->hdr.info.magic != - cpu_to_be16(XFS_DA_NODE_MAGIC))) { + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", XFS_ERRLEVEL_LOW, context->dp->i_mount, node); xfs_trans_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); + return XFS_ERROR(EFSCORRUPTED); } - btree = node->btree; - for (i = 0; i < be16_to_cpu(node->hdr.count); - btree++, i++) { + + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { if (cursor->hashval <= be32_to_cpu(btree->hashval)) { cursor->blkno = be32_to_cpu(btree->before); @@ -1881,9 +1886,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) break; } } - if (i == be16_to_cpu(node->hdr.count)) { + if (i == nodehdr.count) { xfs_trans_brelse(NULL, bp); - return(0); + return 0; } xfs_trans_brelse(NULL, bp); } @@ -1897,310 +1902,21 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - error = xfs_attr_leaf_list_int(bp, context); + error = xfs_attr3_leaf_list_int(bp, context); if (error) { xfs_trans_brelse(NULL, bp); return error; } - if (context->seen_enough || leaf->hdr.info.forw == 0) + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + if (context->seen_enough || leafhdr.forw == 0) break; - cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); + cursor->blkno = leafhdr.forw; xfs_trans_brelse(NULL, bp); - error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, + error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1, &bp); if (error) return error; } xfs_trans_brelse(NULL, bp); - return(0); -} - - -/*======================================================================== - * External routines for manipulating out-of-line attribute values. - *========================================================================*/ - -/* - * Read the value associated with an attribute from the out-of-line buffer - * that we stored it in. 
- */ -int -xfs_attr_rmtval_get(xfs_da_args_t *args) -{ - xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; - xfs_mount_t *mp; - xfs_daddr_t dblkno; - void *dst; - xfs_buf_t *bp; - int nmap, error, tmp, valuelen, blkcnt, i; - xfs_dablk_t lblkno; - - trace_xfs_attr_rmtval_get(args); - - ASSERT(!(args->flags & ATTR_KERNOVAL)); - - mp = args->dp->i_mount; - dst = args->value; - valuelen = args->valuelen; - lblkno = args->rmtblkno; - while (valuelen > 0) { - nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap >= 1); - - for (i = 0; (i < nmap) && (valuelen > 0); i++) { - ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && - (map[i].br_startblock != HOLESTARTBLOCK)); - dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp, NULL); - if (error) - return(error); - - tmp = min_t(int, valuelen, BBTOB(bp->b_length)); - xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); - xfs_buf_relse(bp); - dst += tmp; - valuelen -= tmp; - - lblkno += map[i].br_blockcount; - } - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. - */ -STATIC int -xfs_attr_rmtval_set(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_fileoff_t lfileoff; - xfs_inode_t *dp; - xfs_bmbt_irec_t map; - xfs_daddr_t dblkno; - void *src; - xfs_buf_t *bp; - xfs_dablk_t lblkno; - int blkcnt, valuelen, nmap, error, tmp, committed; - - trace_xfs_attr_rmtval_set(args); - - dp = args->dp; - mp = dp->i_mount; - src = args->value; - - /* - * Find a "hole" in the attribute address space large enough for - * us to drop the new attribute's value into. - */ - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - lfileoff = 0; - error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, - XFS_ATTR_FORK); - if (error) { - return(error); - } - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; - args->rmtblkcnt = blkcnt; - - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - */ - xfs_bmap_init(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, args->total, &map, &nmap, - args->flist); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; - - /* - * Start the next trans in the chain. - */ - error = xfs_trans_roll(&args->trans, dp); - if (error) - return (error); - } - - /* - * Roll through the "value", copying the attribute value to the - * already-allocated blocks. Blocks are written synchronously - * so that we can know they are all on disk before we turn off - * the INCOMPLETE flag. 
- */ - lblkno = args->rmtblkno; - valuelen = args->valuelen; - while (valuelen > 0) { - int buflen; - - /* - * Try to remember where we decided to put the value. - */ - xfs_bmap_init(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); - if (!bp) - return ENOMEM; - - buflen = BBTOB(bp->b_length); - tmp = min_t(int, valuelen, buflen); - xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); - if (tmp < buflen) - xfs_buf_zero(bp, tmp, buflen - tmp); - - error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ - xfs_buf_relse(bp); - if (error) - return error; - src += tmp; - valuelen -= tmp; - - lblkno += map.br_blockcount; - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Remove the value associated with an attribute by deleting the - * out-of-line buffer that it is stored on. - */ -STATIC int -xfs_attr_rmtval_remove(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; - - trace_xfs_attr_rmtval_remove(args); - - mp = args->dp->i_mount; - - /* - * Roll through the "value", invalidating the attribute value's - * blocks. - */ - lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; - while (valuelen > 0) { - /* - * Try to remember where we decided to put the value. - */ - nmap = 1; - error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); - if (error) - return(error); - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } - - valuelen -= map.br_blockcount; - - lblkno += map.br_blockcount; - } - - /* - * Keep de-allocating extents until the remote-value region is gone. - */ - lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; - done = 0; - while (!done) { - xfs_bmap_init(args->flist, args->firstblock); - error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - 1, args->firstblock, args->flist, - &done); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, args->dp, 0); - - /* - * Close out trans and start the next one in the chain. - */ - error = xfs_trans_roll(&args->trans, args->dp); - if (error) - return (error); - } - return(0); + return 0; } diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e920d68ef509..de8dd58da46c 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -140,7 +140,6 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. 
*/ int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_list_int(struct xfs_attr_list_context *); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index ee24993c7d12..08d5457c948e 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -31,6 +32,7 @@ #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -39,6 +41,9 @@ #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" + /* * xfs_attr_leaf.c @@ -53,85 +58,226 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, - struct xfs_buf **bpp); -STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, - xfs_da_args_t *args, int freemap_index); -STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, - struct xfs_buf *leaf_buffer); -STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, +STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, + xfs_dablk_t which_block, struct xfs_buf **bpp); +STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, int freemap_index); +STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_buf *leaf_buffer); +STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); -STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *leaf_blk_1, - xfs_da_state_blk_t *leaf_blk_2, - int *number_entries_in_blk1, - int *number_usedbytes_in_blk1); +STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + struct xfs_attr3_icleaf_hdr *ichdr1, + xfs_da_state_blk_t *leaf_blk_2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); /* * Routines used for shrinking the Btree. */ -STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, +STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, struct xfs_buf *bp, int level); -STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, +STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, struct xfs_buf *bp); -STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, +STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dablk_t blkno, int blkcnt); /* * Utility routines. 
*/ -STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, - int src_start, - xfs_attr_leafblock_t *dst_leaf, - int dst_start, int move_count, - xfs_mount_t *mp); +STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf, + struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, + struct xfs_attr_leafblock *dst_leaf, + struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, + int move_count, struct xfs_mount *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -static void -xfs_attr_leaf_verify( +void +xfs_attr3_leaf_hdr_from_disk( + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + int i; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->usedbytes = be16_to_cpu(hdr3->usedbytes); + to->firstused = be16_to_cpu(hdr3->firstused); + to->holes = hdr3->holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base); + to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size); + } + return; + } + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->usedbytes = be16_to_cpu(from->hdr.usedbytes); + to->firstused = be16_to_cpu(from->hdr.firstused); + to->holes = from->hdr.holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base); + to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size); + } +} + +void +xfs_attr3_leaf_hdr_to_disk( + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + int i; + + ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || + from->magic == XFS_ATTR3_LEAF_MAGIC); + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->usedbytes = cpu_to_be16(from->usedbytes); + hdr3->firstused = cpu_to_be16(from->firstused); + hdr3->holes = from->holes; + hdr3->pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base); + hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size); + } + return; + } + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.usedbytes = cpu_to_be16(from->usedbytes); + to->hdr.firstused = cpu_to_be16(from->firstused); + to->hdr.holes = from->holes; + to->hdr.pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base); + to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size); + } +} + +static bool +xfs_attr3_leaf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr_leaf_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_attr_leafblock *leaf = bp->b_addr; + 
struct xfs_attr3_icleaf_hdr ichdr; - block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) + return false; } + if (ichdr.count == 0) + return false; + + /* XXX: need to range check rest of attr header values */ + /* XXX: hash order check? */ + + return true; } static void -xfs_attr_leaf_read_verify( +xfs_attr3_leaf_write_verify( struct xfs_buf *bp) { - xfs_attr_leaf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_attr3_leaf_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF); } +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ static void -xfs_attr_leaf_write_verify( - struct xfs_buf *bp) +xfs_attr3_leaf_read_verify( + struct xfs_buf *bp) { - xfs_attr_leaf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_ATTR3_LEAF_CRC_OFF)) || + !xfs_attr3_leaf_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } -const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { - .verify_read = xfs_attr_leaf_read_verify, - .verify_write = xfs_attr_leaf_write_verify, +const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .verify_read = xfs_attr3_leaf_read_verify, + .verify_write = xfs_attr3_leaf_write_verify, }; int -xfs_attr_leaf_read( +xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); + return err; } /*======================================================================== @@ -172,7 +318,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int dsize; xfs_mount_t *mp = dp->i_mount; - offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ + /* rounded down */ + offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; switch (dp->i_d.di_format) { case XFS_DINODE_FMT_DEV: @@ -231,7 +378,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return 0; return dp->i_d.di_forkoff; } - dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); break; } @@ -243,7 +390,8 @@ 
xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -557,7 +705,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) } ASSERT(blkno == 0); - error = xfs_attr_leaf_create(args, blkno, &bp); + error = xfs_attr3_leaf_create(args, blkno, &bp); if (error) { error = xfs_da_shrink_inode(args, 0, bp); bp = NULL; @@ -586,9 +734,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); - error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ + error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == ENOATTR); - error = xfs_attr_leaf_add(bp, &nargs); + error = xfs_attr3_leaf_add(bp, &nargs); ASSERT(error != ENOSPC); if (error) goto out; @@ -801,7 +949,7 @@ xfs_attr_shortform_allfit( continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) return(0); - name_loc = xfs_attr_leaf_name_local(leaf, i); + name_loc = xfs_attr3_leaf_name_local(leaf, i); if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) @@ -821,29 +969,34 @@ xfs_attr_shortform_allfit( * Convert a leaf attribute list to shortform attribute list */ int -xfs_attr_leaf_to_shortform( - struct xfs_buf *bp, - xfs_da_args_t *args, - int forkoff) +xfs_attr3_leaf_to_shortform( + struct xfs_buf *bp, + struct xfs_da_args *args, + int forkoff) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_da_args_t nargs; - xfs_inode_t *dp; - char *tmpbuffer; - int error, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_da_args nargs; + struct xfs_inode *dp = args->dp; + char *tmpbuffer; + int error; + int i; trace_xfs_attr_leaf_to_sf(args); - dp = args->dp; tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); - ASSERT(tmpbuffer != NULL); + if (!tmpbuffer) + return ENOMEM; - ASSERT(bp != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount)); + leaf = (xfs_attr_leafblock_t *)tmpbuffer; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + /* XXX (dgc): buffer is about to be marked stale - why zero it? 
*/ memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount)); /* @@ -873,14 +1026,14 @@ xfs_attr_leaf_to_shortform( nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + + for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!entry->nameidx) continue; ASSERT(entry->flags & XFS_ATTR_LOCAL); - name_loc = xfs_attr_leaf_name_local(leaf, i); + name_loc = xfs_attr3_leaf_name_local(leaf, i); nargs.name = name_loc->nameval; nargs.namelen = name_loc->namelen; nargs.value = &name_loc->nameval[nargs.namelen]; @@ -893,61 +1046,75 @@ xfs_attr_leaf_to_shortform( out: kmem_free(tmpbuffer); - return(error); + return error; } /* * Convert from using a single leaf to a root node and a leaf. */ int -xfs_attr_leaf_to_node(xfs_da_args_t *args) +xfs_attr3_leaf_to_node( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - xfs_inode_t *dp; - struct xfs_buf *bp1, *bp2; - xfs_dablk_t blkno; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr icleafhdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr icnodehdr; + struct xfs_da_intnode *node; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp1 = NULL; + struct xfs_buf *bp2 = NULL; + xfs_dablk_t blkno; + int error; trace_xfs_attr_leaf_to_node(args); - dp = args->dp; - bp1 = bp2 = NULL; error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1); if (error) goto out; - bp2 = NULL; - error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, - XFS_ATTR_FORK); + error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK); if (error) goto out; + + /* copy leaf to new buffer, update identifiers */ + xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); - bp1 = NULL; - xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(mp)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; + hdr3->blkno = cpu_to_be64(bp2->b_bn); + } + xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(mp) - 1); /* * Set up the new root node. 
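One detail in xfs_attr3_leaf_to_node() above deserves a note: after the leaf is memcpy()d into its new buffer, the CRC path rewrites hdr3->blkno, because v5-format blocks are self-describing and record the disk address they live at; a verifier comparing the stored block number against the address the block was read from would otherwise reject the copy. A toy model of the restamp and the check it satisfies, with hypothetical structures:

#include <stdint.h>
#include <string.h>
#include <stdbool.h>

#define BLKSZ 256

struct blk_hdr {
        uint64_t blkno;         /* where this block believes it lives */
        uint64_t owner;         /* inode that owns it */
};

/* read-side check: a block read from daddr must claim to be at daddr */
static bool verify_blkno(const void *buf, uint64_t daddr)
{
        const struct blk_hdr *h = buf;

        return h->blkno == daddr;
}

/* copy a block to a new address and restamp its self-description */
static void copy_block(char *disk, uint64_t from, uint64_t to)
{
        struct blk_hdr *h;

        memcpy(disk + to * BLKSZ, disk + from * BLKSZ, BLKSZ);
        h = (struct blk_hdr *)(disk + to * BLKSZ);
        h->blkno = to;  /* without this, verify_blkno() fails on re-read */
}

int main(void)
{
        static char disk[4 * BLKSZ];
        struct blk_hdr *h = (struct blk_hdr *)disk;

        h->blkno = 0;
        h->owner = 42;
        copy_block(disk, 0, 2);
        return verify_blkno(disk + 2 * BLKSZ, 2) ? 0 : 1;
}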
*/ - error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); + error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); if (error) goto out; node = bp1->b_addr; + xfs_da3_node_hdr_from_disk(&icnodehdr, node); + btree = xfs_da3_node_tree_p(node); + leaf = bp2->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + /* both on-disk, don't endian-flip twice */ - node->btree[0].hashval = - leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; - node->btree[0].before = cpu_to_be32(blkno); - node->hdr.count = cpu_to_be16(1); - xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + btree[0].hashval = entries[icleafhdr.count - 1].hashval; + btree[0].before = cpu_to_be32(blkno); + icnodehdr.count = 1; + xfs_da3_node_hdr_to_disk(node, &icnodehdr); + xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1); error = 0; out: - return(error); + return error; } @@ -960,52 +1127,63 @@ out: * or a leaf in a node attribute list. */ STATIC int -xfs_attr_leaf_create( - xfs_da_args_t *args, - xfs_dablk_t blkno, - struct xfs_buf **bpp) +xfs_attr3_leaf_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + struct xfs_buf **bpp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_inode_t *dp; - struct xfs_buf *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; trace_xfs_attr_leaf_create(args); - dp = args->dp; - ASSERT(dp != NULL); error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); - bp->b_ops = &xfs_attr_leaf_buf_ops; + return error; + bp->b_ops = &xfs_attr3_leaf_buf_ops; + xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); leaf = bp->b_addr; - memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); - hdr = &leaf->hdr; - hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); - hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount)); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN); - } + memset(leaf, 0, XFS_LBSIZE(mp)); - hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + memset(&ichdr, 0, sizeof(ichdr)); + ichdr.firstused = XFS_LBSIZE(mp); - xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; + + ichdr.magic = XFS_ATTR3_LEAF_MAGIC; + + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); + } else { + ichdr.magic = XFS_ATTR_LEAF_MAGIC; + ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr); + } + ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; + + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(mp) - 1); *bpp = bp; - return(0); + return 0; } /* * Split the leaf node, rebalance, then add the new entry. 
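Stripped of the logging calls, xfs_attr3_leaf_create() above is a statement about layout: entries grow up from the header, name/value bytes grow down from the end of the block, firstused starts at the block size, and freemap[0] initially spans everything in between; only the header size differs between the two formats. The arithmetic, using invented sizes:

#include <stdio.h>

#define BLKSZ           4096
#define HDR_SIZE_V2     32      /* hypothetical header sizes */
#define HDR_SIZE_V3     80

struct freemap { int base, size; };

static struct freemap init_freemap(int blocksize, int crc_enabled)
{
        struct freemap fm;
        int firstused = blocksize;      /* no names yet: top of block */

        fm.base = crc_enabled ? HDR_SIZE_V3 : HDR_SIZE_V2;
        fm.size = firstused - fm.base;  /* all space between hdr and names */
        return fm;
}

int main(void)
{
        struct freemap fm = init_freemap(BLKSZ, 1);

        printf("free region: base=%d size=%d\n", fm.base, fm.size);
        return 0;
}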
*/ int -xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_attr3_leaf_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { xfs_dablk_t blkno; int error; @@ -1019,7 +1197,7 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, error = xfs_da_grow_inode(state->args, &blkno); if (error) return(error); - error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp); + error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); if (error) return(error); newblk->blkno = blkno; @@ -1029,8 +1207,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Rebalance the entries across the two leaves. * NOTE: rebalance() currently depends on the 2nd block being empty. */ - xfs_attr_leaf_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_attr3_leaf_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); @@ -1043,10 +1221,10 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, */ if (state->inleaf) { trace_xfs_attr_leaf_add_old(state->args); - error = xfs_attr_leaf_add(oldblk->bp, state->args); + error = xfs_attr3_leaf_add(oldblk->bp, state->args); } else { trace_xfs_attr_leaf_add_new(state->args); - error = xfs_attr_leaf_add(newblk->bp, state->args); + error = xfs_attr3_leaf_add(newblk->bp, state->args); } /* @@ -1061,22 +1239,23 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Add a name to the leaf attribute list structure. */ int -xfs_attr_leaf_add( +xfs_attr3_leaf_add( struct xfs_buf *bp, struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - int tablesize, entsize, sum, tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + int tablesize; + int entsize; + int sum; + int tmp; + int i; trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT((args->index >= 0) - && (args->index <= be16_to_cpu(leaf->hdr.count))); - hdr = &leaf->hdr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index >= 0 && args->index <= ichdr.count); entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen, args->trans->t_mountp->m_sb.sb_blocksize, NULL); @@ -1084,25 +1263,23 @@ xfs_attr_leaf_add( * Search through freemap for first-fit on new name length. 
* (may need to figure in size of entry struct too) */ - tablesize = (be16_to_cpu(hdr->count) + 1) - * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1]; - for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) { - if (tablesize > be16_to_cpu(hdr->firstused)) { - sum += be16_to_cpu(map->size); + tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) { + if (tablesize > ichdr.firstused) { + sum += ichdr.freemap[i].size; continue; } - if (!map->size) + if (!ichdr.freemap[i].size) continue; /* no space in this map */ tmp = entsize; - if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused)) + if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); - if (be16_to_cpu(map->size) >= tmp) { - tmp = xfs_attr_leaf_add_work(bp, args, i); - return(tmp); + if (ichdr.freemap[i].size >= tmp) { + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + goto out_log_hdr; } - sum += be16_to_cpu(map->size); + sum += ichdr.freemap[i].size; } /* @@ -1110,82 +1287,89 @@ xfs_attr_leaf_add( * and we don't have enough freespace, then compaction will do us * no good and we should just give up. */ - if (!hdr->holes && (sum < entsize)) - return(XFS_ERROR(ENOSPC)); + if (!ichdr.holes && sum < entsize) + return XFS_ERROR(ENOSPC); /* * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ - xfs_attr_leaf_compact(args, bp); + xfs_attr3_leaf_compact(args, &ichdr, bp); /* * After compaction, the block is guaranteed to have only one * free region, in freemap[0]. If it is not big enough, give up. */ - if (be16_to_cpu(hdr->freemap[0].size) - < (entsize + sizeof(xfs_attr_leaf_entry_t))) - return(XFS_ERROR(ENOSPC)); + if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { + tmp = ENOSPC; + goto out_log_hdr; + } + + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); - return(xfs_attr_leaf_add_work(bp, args, 0)); +out_log_hdr: + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); + return tmp; } /* * Add a name to a leaf attribute list structure. */ STATIC int -xfs_attr_leaf_add_work( - struct xfs_buf *bp, - xfs_da_args_t *args, - int mapindex) +xfs_attr3_leaf_add_work( + struct xfs_buf *bp, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, + int mapindex) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_leaf_map_t *map; - xfs_mount_t *mp; - int tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_mount *mp; + int tmp; + int i; trace_xfs_attr_leaf_add_work(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr = &leaf->hdr; - ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); - ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count))); + ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE); + ASSERT(args->index >= 0 && args->index <= ichdr->count); /* * Force open some space in the entry array and fill it in. 
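The freemap loop in xfs_attr3_leaf_add() above is a bounded first-fit search: a leaf tracks at most three free regions (XFS_ATTR_LEAF_MAPSIZE), and a region sitting below firstused must also absorb the entry-table slot the insert will add, which is roughly what the base-versus-firstused check approximates. A compact model of the search, with hypothetical constants:

#include <stdio.h>

#define MAPSIZE         3       /* XFS tracks 3 free regions per leaf */
#define ENTRY_SIZE      8       /* hypothetical per-entry table slot */

struct freemap { int base, size; };

/*
 * Return the index of the first region that fits a name/value pair of
 * entsize bytes, or -1. firstused is the lowest byte used by the
 * packed names at the top of the block.
 */
static int freemap_first_fit(const struct freemap *fm, int entsize,
                             int firstused)
{
        int i;

        for (i = MAPSIZE - 1; i >= 0; i--) {
                int need = entsize;

                if (!fm[i].size)
                        continue;               /* empty slot */
                if (fm[i].base < firstused)     /* borders the entry table */
                        need += ENTRY_SIZE;
                if (fm[i].size >= need)
                        return i;
        }
        return -1;
}

int main(void)
{
        struct freemap fm[MAPSIZE] = { {32, 40}, {200, 16}, {0, 0} };

        printf("fits in region %d\n", freemap_first_fit(fm, 24, 512));
        return 0;
}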
*/ - entry = &leaf->entries[args->index]; - if (args->index < be16_to_cpu(hdr->count)) { - tmp = be16_to_cpu(hdr->count) - args->index; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + if (args->index < ichdr->count) { + tmp = ichdr->count - args->index; tmp *= sizeof(xfs_attr_leaf_entry_t); - memmove((char *)(entry+1), (char *)entry, tmp); + memmove(entry + 1, entry, tmp); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } - be16_add_cpu(&hdr->count, 1); + ichdr->count++; /* * Allocate space for the new string (at the end of the run). */ - map = &hdr->freemap[mapindex]; mp = args->trans->t_mountp; - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->base) & 0x3) == 0); - ASSERT(be16_to_cpu(map->size) >= + ASSERT(ichdr->freemap[mapindex].base < XFS_LBSIZE(mp)); + ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); + ASSERT(ichdr->freemap[mapindex].size >= xfs_attr_leaf_newentsize(args->namelen, args->valuelen, mp->m_sb.sb_blocksize, NULL)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->size) & 0x3) == 0); - be16_add_cpu(&map->size, - -xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, &tmp)); - entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) + - be16_to_cpu(map->size)); + ASSERT(ichdr->freemap[mapindex].size < XFS_LBSIZE(mp)); + ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); + + ichdr->freemap[mapindex].size -= + xfs_attr_leaf_newentsize(args->namelen, args->valuelen, + mp->m_sb.sb_blocksize, &tmp); + + entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); entry->flags = tmp ? XFS_ATTR_LOCAL : 0; entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); @@ -1200,7 +1384,7 @@ xfs_attr_leaf_add_work( XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); ASSERT((args->index == 0) || (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); - ASSERT((args->index == be16_to_cpu(hdr->count)-1) || + ASSERT((args->index == ichdr->count - 1) || (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); /* @@ -1211,14 +1395,14 @@ xfs_attr_leaf_add_work( * as part of this transaction (a split operation for example). 
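xfs_attr3_leaf_add_work() above makes two moves: a memmove() opens a slot in the hash-sorted entry table, and the name/value bytes are carved from the end of the chosen free region, so freemap[mapindex].size shrinks and base + size becomes the new entry's nameidx. The same steps in miniature, with invented types:

#include <string.h>
#include <stdio.h>

struct entry { unsigned hash; int nameidx; };
struct freemap { int base, size; };

static int insert_entry(struct entry *tab, int *count, int index,
                        unsigned hash, struct freemap *fm, int entsize)
{
        /* open a slot: shift entries [index, count) up by one */
        memmove(&tab[index + 1], &tab[index],
                (*count - index) * sizeof(struct entry));
        (*count)++;

        /* take entsize bytes off the end of the free region */
        fm->size -= entsize;
        tab[index].hash = hash;
        tab[index].nameidx = fm->base + fm->size;
        return tab[index].nameidx;
}

int main(void)
{
        struct entry tab[8] = { {10, 480}, {30, 460} };
        struct freemap fm = { 32, 400 };        /* bytes 32..431 free */
        int count = 2;

        insert_entry(tab, &count, 1, 20, &fm, 24);
        printf("new entry at nameidx %d\n", tab[1].nameidx);    /* 408 */
        return 0;
}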
*/ if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); name_loc->namelen = args->namelen; name_loc->valuelen = cpu_to_be16(args->valuelen); memcpy((char *)name_loc->nameval, args->name, args->namelen); memcpy((char *)&name_loc->nameval[args->namelen], args->value, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->namelen = args->namelen; memcpy((char *)name_rmt->name, args->name, args->namelen); entry->flags |= XFS_ATTR_INCOMPLETE; @@ -1229,44 +1413,41 @@ xfs_attr_leaf_add_work( args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); } xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), xfs_attr_leaf_entsize(leaf, args->index))); /* * Update the control info for this leaf node */ - if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) { - /* both on-disk, don't endian-flip twice */ - hdr->firstused = entry->nameidx; - } - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - if (be16_to_cpu(map->base) == tmp) { - be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t)); - be16_add_cpu(&map->size, - -((int)sizeof(xfs_attr_leaf_entry_t))); + if (be16_to_cpu(entry->nameidx) < ichdr->firstused) + ichdr->firstused = be16_to_cpu(entry->nameidx); + + ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf)); + tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if (ichdr->freemap[i].base == tmp) { + ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); + ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t); } } - be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); - xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); - return(0); + ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); + return 0; } /* * Garbage collect a leaf attribute list block by copying it to a new buffer. 
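The compaction routine that follows is a copy-collector: rather than sliding records around in place, it memcpy()s the leaf into a scratch buffer, resets the destination header to empty, and replays every live entry back in packed order, dropping holes by simply not copying them. A toy version over a byte array, with an invented record table:

#include <string.h>
#include <stdio.h>

#define BLKSZ 64

struct rec { int off, len, live; };

/* repack live records so they sit contiguous at the top of the block */
static int compact(char *blk, struct rec *recs, int nrecs)
{
        char tmp[BLKSZ];
        int firstused = BLKSZ;
        int i;

        memcpy(tmp, blk, BLKSZ);        /* scratch copy, like tmpbuffer */
        memset(blk, 0, BLKSZ);          /* destination starts empty */

        for (i = 0; i < nrecs; i++) {
                if (!recs[i].live)
                        continue;       /* holes simply aren't copied */
                firstused -= recs[i].len;
                memcpy(blk + firstused, tmp + recs[i].off, recs[i].len);
                recs[i].off = firstused;
        }
        return firstused;               /* new, hole-free firstused */
}

int main(void)
{
        char blk[BLKSZ] = "....AAAA....BBBB";
        struct rec recs[] = { {4, 4, 1}, {12, 4, 0}, {8, 4, 1} };

        printf("firstused=%d\n", compact(blk, recs, 3));
        return 0;
}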
*/ STATIC void -xfs_attr_leaf_compact( +xfs_attr3_leaf_compact( struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr_d, struct xfs_buf *bp) { xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + struct xfs_attr3_icleaf_hdr ichdr_s; struct xfs_trans *trans = args->trans; struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; @@ -1283,34 +1464,69 @@ xfs_attr_leaf_compact( */ leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; leaf_d = bp->b_addr; - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - hdr_d->info = hdr_s->info; /* struct copy */ - hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp)); - /* handle truncation gracefully */ - if (!hdr_d->firstused) { - hdr_d->firstused = cpu_to_be16( - XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN); - } - hdr_d->usedbytes = 0; - hdr_d->count = 0; - hdr_d->holes = 0; - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + ichdr_s = *ichdr_d; /* struct copy */ + ichdr_d->firstused = XFS_LBSIZE(mp); + ichdr_d->usedbytes = 0; + ichdr_d->count = 0; + ichdr_d->holes = 0; + ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s); + ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, - be16_to_cpu(hdr_s->count), mp); + xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0, + ichdr_s.count, mp); + /* + * this logs the entire buffer, but the caller must write the header + * back to the buffer when it is finished modifying it. + */ xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); kmem_free(tmpbuffer); } /* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +static int +xfs_attr3_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_attr3_icleaf_hdr *leaf1hdr, + struct xfs_buf *leaf2_bp, + struct xfs_attr3_icleaf_hdr *leaf2hdr) +{ + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + + entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr); + entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr); + if (leaf1hdr->count > 0 && leaf2hdr->count > 0 && + ((be32_to_cpu(entries2[0].hashval) < + be32_to_cpu(entries1[0].hashval)) || + (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) < + be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) { + return 1; + } + return 0; +} + +int +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) +{ + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); + return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); +} + +/* * Redistribute the attribute list entries between two leaf nodes, * taking into account the size of the new entry. * @@ -1323,14 +1539,23 @@ xfs_attr_leaf_compact( * the "new" and "old" values can end up in different blocks. 
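xfs_attr3_leaf_order() above answers one question for the split/join code: given two hash-sorted blocks, does the second actually carry the lower keys? It compares both the first and the last hashvals so that overlapping ranges still order deterministically. As a standalone predicate:

#include <stdio.h>

/* return 1 if block b sorts before block a; both arrays are ascending */
static int leaf_order(const unsigned *a, int na, const unsigned *b, int nb)
{
        if (na > 0 && nb > 0 &&
            (b[0] < a[0] || b[nb - 1] < a[na - 1]))
                return 1;
        return 0;
}

int main(void)
{
        unsigned leaf1[] = { 50, 60, 70 };
        unsigned leaf2[] = { 10, 20, 30 };

        printf("%d\n", leaf_order(leaf1, 3, leaf2, 3)); /* 1: swap them */
        return 0;
}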
*/ STATIC void -xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_attr3_leaf_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_args_t *args; - xfs_da_state_blk_t *tmp_blk; - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - int count, totallen, max, space, swap; + struct xfs_da_args *args; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + int count; + int totallen; + int max; + int space; + int swap; /* * Set up environment. @@ -1339,9 +1564,9 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf2->hdr.count == 0); + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(ichdr2.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1353,16 +1578,23 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * second block, this code should never set "swap". */ swap = 0; - if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) { + if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) { + struct xfs_da_state_blk *tmp_blk; + struct xfs_attr3_icleaf_hdr tmp_ichdr; + tmp_blk = blk1; blk1 = blk2; blk2 = tmp_blk; + + /* struct copies to swap them rather than reconverting */ + tmp_ichdr = ichdr1; + ichdr1 = ichdr2; + ichdr2 = tmp_ichdr; + leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; swap = 1; } - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; /* * Examine entries until we reduce the absolute difference in @@ -1372,41 +1604,39 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * "inleaf" is true if the new entry should be inserted into blk1. * If "swap" is also true, then reverse the sense of "inleaf". */ - state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2, - &count, &totallen); + state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1, + blk2, &ichdr2, + &count, &totallen); if (swap) state->inleaf = !state->inleaf; /* * Move any entries required from leaf to leaf: */ - if (count < be16_to_cpu(hdr1->count)) { + if (count < ichdr1.count) { /* * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count = be16_to_cpu(hdr1->count) - count; - space = be16_to_cpu(hdr1->usedbytes) - totallen; + count = ichdr1.count - count; + space = ichdr1.usedbytes - totallen; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf2 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr2->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); + max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) - xfs_attr_leaf_compact(args, blk2->bp); + xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. 
*/ - xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, - leaf2, 0, count, state->mp); + xfs_attr3_leaf_moveents(leaf1, &ichdr1, ichdr1.count - count, + leaf2, &ichdr2, 0, count, state->mp); - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); - } else if (count > be16_to_cpu(hdr1->count)) { + } else if (count > ichdr1.count) { /* * I assert that since all callers pass in an empty * second buffer, this code should never execute. @@ -1417,36 +1647,37 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count -= be16_to_cpu(hdr1->count); - space = totallen - be16_to_cpu(hdr1->usedbytes); + count -= ichdr1.count; + space = totallen - ichdr1.usedbytes; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf1 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr1->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); + max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t); if (space > max) - xfs_attr_leaf_compact(args, blk1->bp); + xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. */ - xfs_attr_leaf_moveents(leaf2, 0, leaf1, - be16_to_cpu(hdr1->count), count, state->mp); - - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_attr3_leaf_moveents(leaf2, &ichdr2, 0, leaf1, &ichdr1, + ichdr1.count, count, state->mp); } + xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); + xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + /* * Copy out last hashval in each block for B-tree code. */ - blk1->hashval = be32_to_cpu( - leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu( - leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval); + entries1 = xfs_attr3_leaf_entryp(leaf1); + entries2 = xfs_attr3_leaf_entryp(leaf2); + blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval); /* * Adjust the expected index for insertion. @@ -1460,12 +1691,12 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * inserting. The index/blkno fields refer to the "old" entry, * while the index2/blkno2 fields refer to the "new" entry. */ - if (blk1->index > be16_to_cpu(leaf1->hdr.count)) { + if (blk1->index > ichdr1.count) { ASSERT(state->inleaf == 0); - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - ichdr1.count; args->index = args->index2 = blk2->index; args->blkno = args->blkno2 = blk2->blkno; - } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) { + } else if (blk1->index == ichdr1.count) { if (state->inleaf) { args->index = blk1->index; args->blkno = blk1->blkno; @@ -1477,8 +1708,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * is already stored in blkno2/index2, so don't * overwrite it overwise we corrupt the tree. 
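xfs_attr3_leaf_figure_balance(), called above and reworked in the hunk below, is a greedy linear scan: it walks the concatenated entries of both blocks keeping a running byte total, and stops once taking another entry would move the total further from the halfway mark than leaving it out. A reduced version over plain entry sizes:

#include <stdio.h>

/* split "sizes" (bytes per entry) so the two halves are nearly even */
static int figure_balance(const int *sizes, int n, int *leftbytes)
{
        int total = 0, half, run = 0, i;

        for (i = 0; i < n; i++)
                total += sizes[i];
        half = total / 2;

        for (i = 0; i < n; i++) {
                /* taking entry i overshoots the midpoint by more than
                 * leaving it out undershoots: stop here */
                if (run + sizes[i] - half > half - run)
                        break;
                run += sizes[i];
        }
        *leftbytes = run;
        return i;               /* entries [0, i) stay in block 1 */
}

int main(void)
{
        int sizes[] = { 40, 40, 40, 40, 200 };  /* one oversized entry */
        int leftbytes, split;

        split = figure_balance(sizes, 5, &leftbytes);
        printf("split=%d leftbytes=%d\n", split, leftbytes);
        return 0;
}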
*/ - blk2->index = blk1->index - - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - ichdr1.count; args->index = blk2->index; args->blkno = blk2->blkno; if (!state->extravalid) { @@ -1506,42 +1736,40 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * GROT: Do a double-split for this case? */ STATIC int -xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2, - int *countarg, int *usedbytesarg) +xfs_attr3_leaf_figure_balance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_attr3_icleaf_hdr *ichdr1, + struct xfs_da_state_blk *blk2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *countarg, + int *usedbytesarg) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - xfs_attr_leaf_entry_t *entry; - int count, max, index, totallen, half; - int lastdelta, foundit, tmp; - - /* - * Set up environment. - */ - leaf1 = blk1->bp->b_addr; - leaf2 = blk2->bp->b_addr; - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; - foundit = 0; - totallen = 0; + struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr; + struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr; + struct xfs_attr_leaf_entry *entry; + int count; + int max; + int index; + int totallen = 0; + int half; + int lastdelta; + int foundit = 0; + int tmp; /* * Examine entries until we reduce the absolute difference in * byte usage between the two blocks to a minimum. */ - max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count); - half = (max+1) * sizeof(*entry); - half += be16_to_cpu(hdr1->usedbytes) + - be16_to_cpu(hdr2->usedbytes) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + max = ichdr1->count + ichdr2->count; + half = (max + 1) * sizeof(*entry); + half += ichdr1->usedbytes + ichdr2->usedbytes + + xfs_attr_leaf_newentsize(state->args->namelen, + state->args->valuelen, + state->blocksize, NULL); half /= 2; lastdelta = state->blocksize; - entry = &leaf1->entries[0]; + entry = xfs_attr3_leaf_entryp(leaf1); for (count = index = 0; count < max; entry++, index++, count++) { #define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) @@ -1564,9 +1792,9 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, /* * Wrap around into the second block if necessary. */ - if (count == be16_to_cpu(hdr1->count)) { + if (count == ichdr1->count) { leaf1 = leaf2; - entry = &leaf1->entries[0]; + entry = xfs_attr3_leaf_entryp(leaf1); index = 0; } @@ -1597,7 +1825,7 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, *countarg = count; *usedbytesarg = totallen; - return(foundit); + return foundit; } /*======================================================================== @@ -1616,14 +1844,20 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, * GROT: allow for INCOMPLETE entries in calculation. */ int -xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) +xfs_attr3_leaf_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_attr_leafblock_t *leaf; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, bytes, forward, error, retval, i; - xfs_dablk_t blkno; - struct xfs_buf *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_da_state_blk *blk; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_buf *bp; + xfs_dablk_t blkno; + int bytes; + int forward; + int error; + int retval; + int i; trace_xfs_attr_leaf_toosmall(state->args); @@ -1633,13 +1867,11 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. 
*/ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = sizeof(xfs_attr_leaf_hdr_t) + - count * sizeof(xfs_attr_leaf_entry_t) + - be16_to_cpu(leaf->hdr.usedbytes); + leaf = blk->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + bytes = xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + ichdr.usedbytes; if (bytes > (state->blocksize >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); @@ -1651,14 +1883,14 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ - if (count == 0) { + if (ichdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ - forward = (info->forw != 0); + forward = (ichdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return(error); @@ -1667,7 +1899,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) } else { *action = 2; } - return(0); + return 0; } /* @@ -1678,28 +1910,28 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to shrink an attribute list over time. */ /* start with smaller blk num */ - forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + forward = ichdr.forw < ichdr.back; for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_attr3_icleaf_hdr ichdr2; if (forward) - blkno = be32_to_cpu(info->forw); + blkno = ichdr.forw; else - blkno = be32_to_cpu(info->back); + blkno = ichdr.back; if (blkno == 0) continue; - error = xfs_attr_leaf_read(state->args->trans, state->args->dp, + error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, blkno, -1, &bp); if (error) return(error); - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = state->blocksize - (state->blocksize>>2); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - leaf = bp->b_addr; - count += be16_to_cpu(leaf->hdr.count); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - bytes -= count * sizeof(xfs_attr_leaf_entry_t); - bytes -= sizeof(xfs_attr_leaf_hdr_t); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); + + bytes = state->blocksize - (state->blocksize >> 2) - + ichdr.usedbytes - ichdr2.usedbytes - + ((ichdr.count + ichdr2.count) * + sizeof(xfs_attr_leaf_entry_t)) - + xfs_attr3_leaf_hdr_size(leaf); + xfs_trans_brelse(state->args->trans, bp); if (bytes >= 0) break; /* fits with at least 25% to spare */ @@ -1715,10 +1947,10 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) { - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); } if (error) @@ -1738,32 +1970,35 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * If two leaves are 37% full, when combined they will leave 25% free. 
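The join policy in xfs_attr3_leaf_toosmall() above is encoded directly in its arithmetic: a block over 50% full is never a join candidate, and a merge with a sibling only proceeds if the combined payload fits within 75% of a block, leaving the survivor at least 25% free so it can absorb inserts before splitting again. As plain functions, with invented overheads:

#include <stdio.h>

#define HDR_SIZE        32      /* hypothetical per-block overhead */
#define ENTRY_SIZE      8

struct leaf { int count, usedbytes; };

static int leaf_bytes(const struct leaf *l)
{
        return HDR_SIZE + l->count * ENTRY_SIZE + l->usedbytes;
}

/* over 50% full: not a join candidate at all */
static int too_big_to_join(const struct leaf *l, int blocksize)
{
        return leaf_bytes(l) > blocksize / 2;
}

/* can a and b merge and still leave >= 25% of the block free? */
static int can_merge(const struct leaf *a, const struct leaf *b,
                     int blocksize)
{
        int bytes = blocksize - blocksize / 4
                    - a->usedbytes - b->usedbytes
                    - (a->count + b->count) * ENTRY_SIZE
                    - HDR_SIZE;

        return bytes >= 0;
}

int main(void)
{
        struct leaf a = { 10, 300 }, b = { 8, 250 };

        printf("join? %d, merge with sibling? %d\n",
               !too_big_to_join(&a, 4096), can_merge(&a, &b, 4096));
        return 0;
}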
*/ int -xfs_attr_leaf_remove( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_remove( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - xfs_attr_leaf_entry_t *entry; - int before, after, smallest, entsize; - int tablesize, tmp, i; - xfs_mount_t *mp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_mount *mp = args->trans->t_mountp; + int before; + int after; + int smallest; + int entsize; + int tablesize; + int tmp; + int i; trace_xfs_attr_leaf_remove(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr = &leaf->hdr; - mp = args->trans->t_mountp; - ASSERT((be16_to_cpu(hdr->count) > 0) - && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8))); - ASSERT((args->index >= 0) - && (args->index < be16_to_cpu(hdr->count))); - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - entry = &leaf->entries[args->index]; - ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + ASSERT(ichdr.count > 0 && ichdr.count < XFS_LBSIZE(mp) / 8); + ASSERT(args->index >= 0 && args->index < ichdr.count); + ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) + + xfs_attr3_leaf_hdr_size(leaf)); + + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); /* @@ -1772,30 +2007,28 @@ xfs_attr_leaf_remove( * find smallest free region in case we need to replace it, * adjust any map that borders the entry table, */ - tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - tmp = be16_to_cpu(map->size); + tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + tmp = ichdr.freemap[0].size; before = after = -1; smallest = XFS_ATTR_LEAF_MAPSIZE - 1; entsize = xfs_attr_leaf_entsize(leaf, args->index); - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - if (be16_to_cpu(map->base) == tablesize) { - be16_add_cpu(&map->base, - -((int)sizeof(xfs_attr_leaf_entry_t))); - be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t)); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + ASSERT(ichdr.freemap[i].base < XFS_LBSIZE(mp)); + ASSERT(ichdr.freemap[i].size < XFS_LBSIZE(mp)); + if (ichdr.freemap[i].base == tablesize) { + ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t); + ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t); } - if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) - == be16_to_cpu(entry->nameidx)) { + if (ichdr.freemap[i].base + ichdr.freemap[i].size == + be16_to_cpu(entry->nameidx)) { before = i; - } else if (be16_to_cpu(map->base) - == (be16_to_cpu(entry->nameidx) + entsize)) { + } else if (ichdr.freemap[i].base == + (be16_to_cpu(entry->nameidx) + entsize)) { after = i; - } else if (be16_to_cpu(map->size) < tmp) { - tmp = be16_to_cpu(map->size); + } else if (ichdr.freemap[i].size < tmp) { + tmp = ichdr.freemap[i].size; smallest = i; } } @@ -1806,36 +2039,30 @@ xfs_attr_leaf_remove( */ if ((before >= 0) || (after >= 0)) { if ((before >= 0) && (after >= 0)) { - map = &hdr->freemap[before]; - be16_add_cpu(&map->size, entsize); - be16_add_cpu(&map->size, - 
be16_to_cpu(hdr->freemap[after].size)); - hdr->freemap[after].base = 0; - hdr->freemap[after].size = 0; + ichdr.freemap[before].size += entsize; + ichdr.freemap[before].size += ichdr.freemap[after].size; + ichdr.freemap[after].base = 0; + ichdr.freemap[after].size = 0; } else if (before >= 0) { - map = &hdr->freemap[before]; - be16_add_cpu(&map->size, entsize); + ichdr.freemap[before].size += entsize; } else { - map = &hdr->freemap[after]; - /* both on-disk, don't endian flip twice */ - map->base = entry->nameidx; - be16_add_cpu(&map->size, entsize); + ichdr.freemap[after].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[after].size += entsize; } } else { /* * Replace smallest region (if it is smaller than free'd entry) */ - map = &hdr->freemap[smallest]; - if (be16_to_cpu(map->size) < entsize) { - map->base = cpu_to_be16(be16_to_cpu(entry->nameidx)); - map->size = cpu_to_be16(entsize); + if (ichdr.freemap[smallest].size < entsize) { + ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[smallest].size = entsize; } } /* * Did we remove the first entry? */ - if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused)) + if (be16_to_cpu(entry->nameidx) == ichdr.firstused) smallest = 1; else smallest = 0; @@ -1843,20 +2070,20 @@ xfs_attr_leaf_remove( /* * Compress the remaining entries and zero out the removed stuff. */ - memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize); - be16_add_cpu(&hdr->usedbytes, -entsize); + memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize); + ichdr.usedbytes -= entsize; xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index), + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), entsize)); - tmp = (be16_to_cpu(hdr->count) - args->index) - * sizeof(xfs_attr_leaf_entry_t); - memmove((char *)entry, (char *)(entry+1), tmp); - be16_add_cpu(&hdr->count, -1); + tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t); + memmove(entry, entry + 1, tmp); + ichdr.count--; xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); - entry = &leaf->entries[be16_to_cpu(hdr->count)]; - memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t))); + + entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count]; + memset(entry, 0, sizeof(xfs_attr_leaf_entry_t)); /* * If we removed the first entry, re-find the first used byte @@ -1866,130 +2093,130 @@ xfs_attr_leaf_remove( */ if (smallest) { tmp = XFS_LBSIZE(mp); - entry = &leaf->entries[0]; - for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) { - ASSERT(be16_to_cpu(entry->nameidx) >= - be16_to_cpu(hdr->firstused)); + entry = xfs_attr3_leaf_entryp(leaf); + for (i = ichdr.count - 1; i >= 0; entry++, i--) { + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); if (be16_to_cpu(entry->nameidx) < tmp) tmp = be16_to_cpu(entry->nameidx); } - hdr->firstused = cpu_to_be16(tmp); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - tmp - XFS_ATTR_LEAF_NAME_ALIGN); - } + ichdr.firstused = tmp; + if (!ichdr.firstused) + ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN; } else { - hdr->holes = 1; /* mark as needing compaction */ + ichdr.holes = 1; /* mark as needing compaction */ } + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); /* * Check if 
leaf is less than 50% full, caller may want to * "join" the leaf with a sibling if so. */ - tmp = sizeof(xfs_attr_leaf_hdr_t); - tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t); - tmp += be16_to_cpu(leaf->hdr.usedbytes); - return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */ + tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t); + + return tmp < mp->m_attr_magicpct; /* leaf is < 37% full */ } /* * Move all the attribute list entries from drop_leaf into save_leaf. */ void -xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_attr3_leaf_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; - xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; - xfs_mount_t *mp; - char *tmpbuffer; + struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr; + struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr; + struct xfs_attr3_icleaf_hdr drophdr; + struct xfs_attr3_icleaf_hdr savehdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_mount *mp = state->mp; trace_xfs_attr_leaf_unbalance(state->args); - /* - * Set up environment. - */ - mp = state->mp; - ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - drop_hdr = &drop_leaf->hdr; - save_hdr = &save_leaf->hdr; + xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); + entry = xfs_attr3_leaf_entryp(drop_leaf); /* * Save last hashval from dying block for later Btree fixup. */ - drop_blk->hashval = be32_to_cpu( - drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval); + drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval); /* * Check if we need a temp buffer, or can we do it in place. * Note that we don't check "leaf" for holes because we will * always be dropping it, toosmall() decided that for us already. */ - if (save_hdr->holes == 0) { + if (savehdr.holes == 0) { /* * dest leaf has no holes, so we add there. May need * to make some room in the entry array. */ - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + save_leaf, &savehdr, 0, + drophdr.count, mp); } else { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, - be16_to_cpu(save_hdr->count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + save_leaf, &savehdr, + savehdr.count, drophdr.count, mp); } } else { /* * Destination has holes, so we make a temporary copy * of the leaf and add them both to that. 
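The branch below captures the whole of xfs_attr3_leaf_unbalance(): when the surviving leaf has no holes, the dying leaf's entries are moved straight in, at the front or back according to hash order; when it does have holes, both leaves are replayed into a scratch block that then replaces the survivor. Because sibling leaves never interleave by hash, the scratch-block merge reduces to two bulk copies in the right order:

#include <stdio.h>
#include <string.h>

/* sibling leaves never interleave by hash, so a merge through the
 * scratch block is two bulk copies in ascending order */
static int merge_leaves(const unsigned *lo, int nlo,
                        const unsigned *hi, int nhi, unsigned *tmp)
{
        memcpy(tmp, lo, nlo * sizeof(*lo));
        memcpy(tmp + nlo, hi, nhi * sizeof(*hi));
        return nlo + nhi;
}

int main(void)
{
        unsigned save[] = { 40, 70 }, drop[] = { 10, 20 };
        unsigned tmp[8];
        int n, i;

        /* drop_leaf carries the lower hashes here, so it goes first */
        n = merge_leaves(drop, 2, save, 2, tmp);
        for (i = 0; i < n; i++)
                printf("%u ", tmp[i]);          /* 10 20 40 70 */
        printf("\n");
        return 0;
}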
*/ - tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); - ASSERT(tmpbuffer != NULL); - memset(tmpbuffer, 0, state->blocksize); - tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer; - tmp_hdr = &tmp_leaf->hdr; - tmp_hdr->info = save_hdr->info; /* struct copy */ - tmp_hdr->count = 0; - tmp_hdr->firstused = cpu_to_be16(state->blocksize); - if (!tmp_hdr->firstused) { - tmp_hdr->firstused = cpu_to_be16( - state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN); - } - tmp_hdr->usedbytes = 0; - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(save_hdr->count), mp); + struct xfs_attr_leafblock *tmp_leaf; + struct xfs_attr3_icleaf_hdr tmphdr; + + tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP); + memset(tmp_leaf, 0, state->blocksize); + memset(&tmphdr, 0, sizeof(tmphdr)); + + tmphdr.magic = savehdr.magic; + tmphdr.forw = savehdr.forw; + tmphdr.back = savehdr.back; + tmphdr.firstused = state->blocksize; + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, 0, + drophdr.count, mp); + xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + savehdr.count, mp); } else { - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0, - be16_to_cpu(save_hdr->count), mp); - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, 0, + savehdr.count, mp); + xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + drophdr.count, mp); } - memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); - kmem_free(tmpbuffer); + memcpy(save_leaf, tmp_leaf, state->blocksize); + savehdr = tmphdr; /* struct copy */ + kmem_free(tmp_leaf); } + xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->blocksize - 1); /* * Copy out last hashval in each block for B-tree code. */ - save_blk->hashval = be32_to_cpu( - save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval); + entry = xfs_attr3_leaf_entryp(save_leaf); + save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval); } /*======================================================================== @@ -2010,31 +2237,33 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Don't change the args->value unless we find the attribute. 
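The lookup below is a binary search with a twist: hashvals are not unique, so once the probe narrows to a small span the code backs up to the first entry carrying the target hash and then walks every duplicate until a name matches. The positioning logic, self-contained:

#include <stdio.h>

/* find the index of the first element equal to hash, or -1 */
static int find_first(const unsigned *tab, int n, unsigned hash)
{
        int probe = n / 2, span = n / 2;

        /* coarse binary search; small arrays fall through immediately */
        while (span > 4) {
                span /= 2;
                if (tab[probe] < hash)
                        probe += span;
                else if (tab[probe] > hash)
                        probe -= span;
                else
                        break;
        }
        /* back up to the first duplicate... */
        while (probe > 0 && tab[probe] >= hash)
                probe--;
        /* ...then forward to the first entry >= hash */
        while (probe < n && tab[probe] < hash)
                probe++;

        if (probe == n || tab[probe] != hash)
                return -1;
        return probe;
}

int main(void)
{
        unsigned tab[] = { 1, 3, 3, 3, 7, 9, 9, 12, 15, 20, 22, 30 };

        printf("%d\n", find_first(tab, 12, 3)); /* 1 */
        return 0;
}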
*/ int -xfs_attr_leaf_lookup_int( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_lookup_int( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - int probe, span; - xfs_dahash_t hashval; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + xfs_dahash_t hashval; + int probe; + int span; trace_xfs_attr_leaf_lookup(args); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); /* * Binary search. (note: small blocks will skip this loop) */ hashval = args->hashval; - probe = span = be16_to_cpu(leaf->hdr.count) / 2; - for (entry = &leaf->entries[probe]; span > 4; - entry = &leaf->entries[probe]) { + probe = span = ichdr.count / 2; + for (entry = &entries[probe]; span > 4; entry = &entries[probe]) { span /= 2; if (be32_to_cpu(entry->hashval) < hashval) probe += span; @@ -2043,35 +2272,31 @@ xfs_attr_leaf_lookup_int( else break; } - ASSERT((probe >= 0) && - (!leaf->hdr.count - || (probe < be16_to_cpu(leaf->hdr.count)))); - ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); + ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count)); + ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval); /* * Since we may have duplicate hashval's, find the first matching * hashval in the leaf. */ - while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) { + while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) { entry--; probe--; } - while ((probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) < hashval)) { + while (probe < ichdr.count && + be32_to_cpu(entry->hashval) < hashval) { entry++; probe++; } - if ((probe == be16_to_cpu(leaf->hdr.count)) || - (be32_to_cpu(entry->hashval) != hashval)) { + if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* * Duplicate keys may be present, so search all of them for a match. */ - for ( ; (probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) == hashval); + for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval); entry++, probe++) { /* * GROT: Add code to remove incomplete entries. 
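The xfs_attr3_leaf_getvalue() hunk below follows the usual two-call contract for variable-length attributes: a size-only query (ATTR_KERNOVAL) reports the length without copying, and an undersized buffer gets the length back with ERANGE rather than a truncated value. The same contract in miniature:

#include <errno.h>
#include <string.h>

/*
 * Copy "value" (vlen bytes) into the caller's buffer. *buflen carries
 * the buffer size in and the value size out. buf == NULL means
 * "size query only", like ATTR_KERNOVAL.
 */
static int get_value(const char *value, int vlen, char *buf, int *buflen)
{
        if (buf == NULL) {              /* size query */
                *buflen = vlen;
                return 0;
        }
        if (*buflen < vlen) {           /* undersized: report, don't copy */
                *buflen = vlen;
                return ERANGE;
        }
        *buflen = vlen;
        memcpy(buf, value, vlen);
        return 0;
}

int main(void)
{
        char buf[8];
        int len = 0;

        get_value("hello", 5, NULL, &len);      /* size query: len == 5 */
        len = sizeof(buf);
        return get_value("hello", 5, buf, &len);        /* 0, buf filled */
}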
@@ -2085,21 +2310,22 @@ xfs_attr_leaf_lookup_int( continue; } if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, probe); + name_loc = xfs_attr3_leaf_name_local(leaf, probe); if (name_loc->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) + if (memcmp(args->name, name_loc->nameval, + args->namelen) != 0) continue; if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - return(XFS_ERROR(EEXIST)); + return XFS_ERROR(EEXIST); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, probe); + name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); if (name_rmt->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_rmt->name, - args->namelen) != 0) + if (memcmp(args->name, name_rmt->name, + args->namelen) != 0) continue; if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; @@ -2107,11 +2333,11 @@ xfs_attr_leaf_lookup_int( args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, be32_to_cpu(name_rmt->valuelen)); - return(XFS_ERROR(EEXIST)); + return XFS_ERROR(EEXIST); } } args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* @@ -2119,40 +2345,40 @@ xfs_attr_leaf_lookup_int( * list structure. */ int -xfs_attr_leaf_getvalue( - struct xfs_buf *bp, - xfs_da_args_t *args) +xfs_attr3_leaf_getvalue( + struct xfs_buf *bp, + struct xfs_da_args *args) { - int valuelen; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + int valuelen; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); + ASSERT(args->index < ichdr.count); - entry = &leaf->entries[args->index]; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); ASSERT(name_loc->namelen == args->namelen); ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); valuelen = be16_to_cpu(name_loc->valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; - return(0); + return 0; } if (args->valuelen < valuelen) { args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + return XFS_ERROR(ERANGE); } args->valuelen = valuelen; memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); valuelen = be32_to_cpu(name_rmt->valuelen); @@ -2160,15 +2386,15 @@ xfs_attr_leaf_getvalue( args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; - return(0); + return 0; } if (args->valuelen < valuelen) { args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + return XFS_ERROR(ERANGE); } args->valuelen = valuelen; } - return(0); 
+ return 0; } /*======================================================================== @@ -2181,13 +2407,21 @@ xfs_attr_leaf_getvalue( */ /*ARGSUSED*/ STATIC void -xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, - xfs_attr_leafblock_t *leaf_d, int start_d, - int count, xfs_mount_t *mp) +xfs_attr3_leaf_moveents( + struct xfs_attr_leafblock *leaf_s, + struct xfs_attr3_icleaf_hdr *ichdr_s, + int start_s, + struct xfs_attr_leafblock *leaf_d, + struct xfs_attr3_icleaf_hdr *ichdr_d, + int start_d, + int count, + struct xfs_mount *mp) { - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_attr_leaf_entry_t *entry_s, *entry_d; - int desti, tmp, i; + struct xfs_attr_leaf_entry *entry_s; + struct xfs_attr_leaf_entry *entry_d; + int desti; + int tmp; + int i; /* * Check for nothing to do. @@ -2198,45 +2432,41 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Set up environment. */ - ASSERT(leaf_s->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - ASSERT(leaf_d->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - ASSERT((be16_to_cpu(hdr_s->count) > 0) && - (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8))); - ASSERT(be16_to_cpu(hdr_s->firstused) >= - ((be16_to_cpu(hdr_s->count) - * sizeof(*entry_s))+sizeof(*hdr_s))); - ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8)); - ASSERT(be16_to_cpu(hdr_d->firstused) >= - ((be16_to_cpu(hdr_d->count) - * sizeof(*entry_d))+sizeof(*hdr_d))); - - ASSERT(start_s < be16_to_cpu(hdr_s->count)); - ASSERT(start_d <= be16_to_cpu(hdr_d->count)); - ASSERT(count <= be16_to_cpu(hdr_s->count)); + ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC || + ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC); + ASSERT(ichdr_s->magic == ichdr_d->magic); + ASSERT(ichdr_s->count > 0 && ichdr_s->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s)) + + xfs_attr3_leaf_hdr_size(leaf_s)); + ASSERT(ichdr_d->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d)) + + xfs_attr3_leaf_hdr_size(leaf_d)); + + ASSERT(start_s < ichdr_s->count); + ASSERT(start_d <= ichdr_d->count); + ASSERT(count <= ichdr_s->count); + /* * Move the entries in the destination leaf up to make a hole? */ - if (start_d < be16_to_cpu(hdr_d->count)) { - tmp = be16_to_cpu(hdr_d->count) - start_d; + if (start_d < ichdr_d->count) { + tmp = ichdr_d->count - start_d; tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_d->entries[start_d]; - entry_d = &leaf_d->entries[start_d + count]; - memmove((char *)entry_d, (char *)entry_s, tmp); + entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count]; + memmove(entry_d, entry_s, tmp); } /* * Copy all entries in the same (sorted) order, * but allocate attribute info packed and in sequence. */ - entry_s = &leaf_s->entries[start_s]; - entry_d = &leaf_d->entries[start_d]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; desti = start_d; for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { - ASSERT(be16_to_cpu(entry_s->nameidx) - >= be16_to_cpu(hdr_s->firstused)); + ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused); tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); #ifdef GROT /* @@ -2245,36 +2475,34 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, * off for 6.2, should be revisited later. */ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials?
*/ - memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); - be16_add_cpu(&hdr_s->usedbytes, -tmp); - be16_add_cpu(&hdr_s->count, -1); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_s->count -= 1; entry_d--; /* to compensate for ++ in loop hdr */ desti--; if ((start_s + i) < offset) result++; /* insertion index adjustment */ } else { #endif /* GROT */ - be16_add_cpu(&hdr_d->firstused, -tmp); + ichdr_d->firstused -= tmp; /* both on-disk, don't endian flip twice */ entry_d->hashval = entry_s->hashval; - /* both on-disk, don't endian flip twice */ - entry_d->nameidx = hdr_d->firstused; + entry_d->nameidx = cpu_to_be16(ichdr_d->firstused); entry_d->flags = entry_s->flags; ASSERT(be16_to_cpu(entry_d->nameidx) + tmp <= XFS_LBSIZE(mp)); - memmove(xfs_attr_leaf_name(leaf_d, desti), - xfs_attr_leaf_name(leaf_s, start_s + i), tmp); + memmove(xfs_attr3_leaf_name(leaf_d, desti), + xfs_attr3_leaf_name(leaf_s, start_s + i), tmp); ASSERT(be16_to_cpu(entry_s->nameidx) + tmp <= XFS_LBSIZE(mp)); - memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp); - be16_add_cpu(&hdr_s->usedbytes, -tmp); - be16_add_cpu(&hdr_d->usedbytes, tmp); - be16_add_cpu(&hdr_s->count, -1); - be16_add_cpu(&hdr_d->count, 1); - tmp = be16_to_cpu(hdr_d->count) - * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_d->usedbytes += tmp; + ichdr_s->count -= 1; + ichdr_d->count += 1; + tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf_d); + ASSERT(ichdr_d->firstused >= tmp); #ifdef GROT } #endif /* GROT */ @@ -2283,71 +2511,40 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Zero out the entries we just copied. */ - if (start_s == be16_to_cpu(hdr_s->count)) { + if (start_s == ichdr_s->count) { tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; ASSERT(((char *)entry_s + tmp) <= ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + memset(entry_s, 0, tmp); } else { /* * Move the remaining entries down to fill the hole, * then zero the entries at the top. 
*/ - tmp = be16_to_cpu(hdr_s->count) - count; - tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s + count]; - entry_d = &leaf_s->entries[start_s]; - memmove((char *)entry_d, (char *)entry_s, tmp); + tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t); + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count]; + entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + memmove(entry_d, entry_s, tmp); tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count]; ASSERT(((char *)entry_s + tmp) <= ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + memset(entry_s, 0, tmp); } /* * Fill in the freemap information */ - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) * - sizeof(xfs_attr_leaf_entry_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - - be16_to_cpu(hdr_d->freemap[0].base)); - hdr_d->freemap[1].base = 0; - hdr_d->freemap[2].base = 0; - hdr_d->freemap[1].size = 0; - hdr_d->freemap[2].size = 0; - hdr_s->holes = 1; /* leaf may not be compact */ -} - -/* - * Compare two leaf blocks "order". - * Return 0 unless leaf2 should go before leaf1. - */ -int -xfs_attr_leaf_order( - struct xfs_buf *leaf1_bp, - struct xfs_buf *leaf2_bp) -{ - xfs_attr_leafblock_t *leaf1, *leaf2; - - leaf1 = leaf1_bp->b_addr; - leaf2 = leaf2_bp->b_addr; - ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) && - (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC))); - if ((be16_to_cpu(leaf1->hdr.count) > 0) && - (be16_to_cpu(leaf2->hdr.count) > 0) && - ((be32_to_cpu(leaf2->entries[0].hashval) < - be32_to_cpu(leaf1->entries[0].hashval)) || - (be32_to_cpu(leaf2->entries[ - be16_to_cpu(leaf2->hdr.count)-1].hashval) < - be32_to_cpu(leaf1->entries[ - be16_to_cpu(leaf1->hdr.count)-1].hashval)))) { - return(1); - } - return(0); + ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d); + ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t); + ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + ichdr_d->freemap[1].base = 0; + ichdr_d->freemap[2].base = 0; + ichdr_d->freemap[1].size = 0; + ichdr_d->freemap[2].size = 0; + ichdr_s->holes = 1; /* leaf may not be compact */ } /* @@ -2358,15 +2555,16 @@ xfs_attr_leaf_lasthash( struct xfs_buf *bp, int *count) { - xfs_attr_leafblock_t *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); + entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) - return(0); - return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval); + *count = ichdr.count; + if (!ichdr.count) + return 0; + return be32_to_cpu(entries[ichdr.count - 1].hashval); } /* @@ -2376,20 +2574,21 @@ xfs_attr_leaf_lasthash( STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) { + struct xfs_attr_leaf_entry *entries; xfs_attr_leaf_name_local_t *name_loc; xfs_attr_leaf_name_remote_t *name_rmt; int size; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, index); + entries = xfs_attr3_leaf_entryp(leaf); + if (entries[index].flags 
& XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, index); size = xfs_attr_leaf_entsize_local(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, index); size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); } - return(size); + return size; } /* @@ -2414,35 +2613,40 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) *local = 0; } } - return(size); + return size; } /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ int -xfs_attr_leaf_list_int( - struct xfs_buf *bp, - xfs_attr_list_context_t *context) +xfs_attr3_leaf_list_int( + struct xfs_buf *bp, + struct xfs_attr_list_context *context) { - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - int retval, i; + struct attrlist_cursor_kern *cursor; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_entry *entry; + int retval; + int i; + + trace_xfs_attr_list_leaf(context); - ASSERT(bp != NULL); leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + cursor = context->cursor; cursor->initted = 1; - trace_xfs_attr_list_leaf(context); - /* * Re-find our place in the leaf block if this is a new syscall. */ if (context->resynch) { - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + entry = &entries[0]; + for (i = 0; i < ichdr.count; entry++, i++) { if (be32_to_cpu(entry->hashval) == cursor->hashval) { if (cursor->offset == context->dupcnt) { context->dupcnt = 0; @@ -2455,12 +2659,12 @@ xfs_attr_leaf_list_int( break; } } - if (i == be16_to_cpu(leaf->hdr.count)) { + if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return(0); + return 0; } } else { - entry = &leaf->entries[0]; + entry = &entries[0]; i = 0; } context->resynch = 0; @@ -2469,7 +2673,7 @@ xfs_attr_leaf_list_int( * We have found our place, start copying out the new attributes. */ retval = 0; - for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { + for (; i < ichdr.count; entry++, i++) { if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); cursor->offset = 0; @@ -2480,7 +2684,7 @@ xfs_attr_leaf_list_int( if (entry->flags & XFS_ATTR_LOCAL) { xfs_attr_leaf_name_local_t *name_loc = - xfs_attr_leaf_name_local(leaf, i); + xfs_attr3_leaf_name_local(leaf, i); retval = context->put_listent(context, entry->flags, @@ -2492,7 +2696,7 @@ xfs_attr_leaf_list_int( return retval; } else { xfs_attr_leaf_name_remote_t *name_rmt = - xfs_attr_leaf_name_remote(leaf, i); + xfs_attr3_leaf_name_remote(leaf, i); int valuelen = be32_to_cpu(name_rmt->valuelen); @@ -2532,7 +2736,7 @@ xfs_attr_leaf_list_int( cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return(retval); + return retval; } @@ -2544,14 +2748,16 @@ xfs_attr_leaf_list_int( * Clear the INCOMPLETE flag on an entry in a leaf block. 
*/ int -xfs_attr_leaf_clearflag(xfs_da_args_t *args) +xfs_attr3_leaf_clearflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; + int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; xfs_attr_leaf_name_local_t *name_loc; int namelen; char *name; @@ -2561,23 +2767,25 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return(error); leaf = bp->b_addr; - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); - ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); + ASSERT(args->index >= 0); + if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); namelen = name_loc->namelen; name = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); namelen = name_rmt->namelen; name = (char *)name_rmt->name; } @@ -2592,7 +2800,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) if (args->rmtblkno) { ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); xfs_trans_log_buf(args->trans, bp, @@ -2609,34 +2817,41 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) * Set the INCOMPLETE flag on an entry in a leaf block. */ int -xfs_attr_leaf_setflag(xfs_da_args_t *args) +xfs_attr3_leaf_setflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; +#endif trace_xfs_attr_leaf_setflag(args); /* * Set up the operation. 
*/ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) return(error); leaf = bp->b_addr; - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; +#endif + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); entry->flags |= XFS_ATTR_INCOMPLETE; xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if ((entry->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = xfs_attr_leaf_name_remote(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp, @@ -2657,14 +2872,20 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) * Note that they could be in different blocks, or in the same block. */ int -xfs_attr_leaf_flipflags(xfs_da_args_t *args) +xfs_attr3_leaf_flipflags( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_entry_t *entry1, *entry2; - xfs_attr_leaf_name_remote_t *name_rmt; - struct xfs_buf *bp1, *bp2; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr_leaf_entry *entry1; + struct xfs_attr_leaf_entry *entry2; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp1; + struct xfs_buf *bp2; int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; xfs_attr_leaf_name_local_t *name_loc; int namelen1, namelen2; char *name1, *name2; @@ -2675,7 +2896,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Read the block containing the "old" attr */ - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); if (error) return error; @@ -2683,7 +2904,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, -1, &bp2); if (error) return error; @@ -2692,31 +2913,35 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) } leaf1 = bp1->b_addr; - ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); - ASSERT(args->index >= 0); - entry1 = &leaf1->entries[ args->index ]; + entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index]; leaf2 = bp2->b_addr; - ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); - ASSERT(args->index2 >= 0); - entry2 = &leaf2->entries[ args->index2 ]; + entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; #ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + ASSERT(args->index < ichdr1.count); + ASSERT(args->index >= 0); + + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(args->index2 < ichdr2.count); + ASSERT(args->index2 >= 0); + if (entry1->flags & XFS_ATTR_LOCAL) { - name_loc = xfs_attr_leaf_name_local(leaf1, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf1, args->index); namelen1 = name_loc->namelen; name1 = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); namelen1 = name_rmt->namelen; name1 = (char *)name_rmt->name; } if (entry2->flags & XFS_ATTR_LOCAL) { - name_loc = 
xfs_attr_leaf_name_local(leaf2, args->index2); + name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2); namelen2 = name_loc->namelen; name2 = (char *)name_loc->nameval; } else { - name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); namelen2 = name_rmt->namelen; name2 = (char *)name_rmt->name; } @@ -2733,7 +2958,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); if (args->rmtblkno) { ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); name_rmt->valuelen = cpu_to_be32(args->valuelen); xfs_trans_log_buf(args->trans, bp1, @@ -2744,7 +2969,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); name_rmt->valueblk = 0; name_rmt->valuelen = 0; xfs_trans_log_buf(args->trans, bp2, @@ -2756,7 +2981,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) */ error = xfs_trans_roll(&args->trans, args->dp); - return(error); + return error; } /*======================================================================== @@ -2768,12 +2993,14 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) * We're doing a depth-first traversal in order to invalidate everything. */ int -xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) +xfs_attr3_root_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp) { - xfs_da_blkinfo_t *info; - xfs_daddr_t blkno; - struct xfs_buf *bp; - int error; + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + xfs_daddr_t blkno; + int error; /* * Read block 0 to see what we have to work with. @@ -2781,40 +3008,46 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); - blkno = XFS_BUF_ADDR(bp); + return error; + blkno = bp->b_bn; /* * Invalidate the tree, even if the "tree" is only a single leaf block. * This is a depth-first traversal! */ info = bp->b_addr; - if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - error = xfs_attr_node_inactive(trans, dp, bp, 1); - } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { - error = xfs_attr_leaf_inactive(trans, dp, bp); - } else { + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, bp, 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, bp); + break; + default: error = XFS_ERROR(EIO); xfs_trans_brelse(*trans, bp); + break; } if (error) - return(error); + return error; /* * Invalidate the incore copy of the root block. */ error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); if (error) - return(error); + return error; xfs_trans_binval(*trans, bp); /* remove from cache */ /* * Commit the invalidate and start the next transaction. 
*/ error = xfs_trans_roll(trans, dp); - return (error); + return error; } /* @@ -2822,7 +3055,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * We're doing a depth-first traversal in order to invalidate everything. */ STATIC int -xfs_attr_node_inactive( +xfs_attr3_node_inactive( struct xfs_trans **trans, struct xfs_inode *dp, struct xfs_buf *bp, @@ -2832,26 +3065,28 @@ xfs_attr_node_inactive( xfs_da_intnode_t *node; xfs_dablk_t child_fsb; xfs_daddr_t parent_blkno, child_blkno; - int error, count, i; + int error, i; struct xfs_buf *child_bp; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr ichdr; /* * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - return(XFS_ERROR(EIO)); + return XFS_ERROR(EIO); } node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - parent_blkno = XFS_BUF_ADDR(bp); /* save for re-read later */ - count = be16_to_cpu(node->hdr.count); - if (!count) { + xfs_da3_node_hdr_from_disk(&ichdr, node); + parent_blkno = bp->b_bn; + if (!ichdr.count) { xfs_trans_brelse(*trans, bp); - return(0); + return 0; } - child_fsb = be32_to_cpu(node->btree[0].before); + btree = xfs_da3_node_tree_p(node); + child_fsb = be32_to_cpu(btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ /* @@ -2859,14 +3094,14 @@ xfs_attr_node_inactive( * over the leaves removing all of them. If this is higher up * in the tree, recurse downward. */ - for (i = 0; i < count; i++) { + for (i = 0; i < ichdr.count; i++) { /* * Read the subsidiary block to see what we have to work with. * Don't do this in a transaction. This is a depth-first * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, + error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, XFS_ATTR_FORK); if (error) return(error); @@ -2878,18 +3113,24 @@ xfs_attr_node_inactive( * Invalidate the subtree, however we have to. */ info = child_bp->b_addr; - if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - error = xfs_attr_node_inactive(trans, dp, - child_bp, level+1); - } else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) { - error = xfs_attr_leaf_inactive(trans, dp, - child_bp); - } else { + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, + child_bp, level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, + child_bp); + break; + default: error = XFS_ERROR(EIO); xfs_trans_brelse(*trans, child_bp); + break; } if (error) - return(error); + return error; /* * Remove the subsidiary block from the cache @@ -2898,7 +3139,7 @@ xfs_attr_node_inactive( error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp, XFS_ATTR_FORK); if (error) - return(error); + return error; xfs_trans_binval(*trans, child_bp); } @@ -2906,12 +3147,12 @@ xfs_attr_node_inactive( * If we're not done, re-read the parent to get the next * child block number. 
*/ - if ((i+1) < count) { - error = xfs_da_node_read(*trans, dp, 0, parent_blkno, + if (i + 1 < ichdr.count) { + error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, &bp, XFS_ATTR_FORK); if (error) - return(error); - child_fsb = be32_to_cpu(node->btree[i+1].before); + return error; + child_fsb = be32_to_cpu(btree[i + 1].before); xfs_trans_brelse(*trans, bp); } /* @@ -2919,10 +3160,10 @@ xfs_attr_node_inactive( */ error = xfs_trans_roll(trans, dp); if (error) - return (error); + return error; } - return(0); + return 0; } /* @@ -2932,29 +3173,35 @@ xfs_attr_node_inactive( * caught holding something that the logging code wants to flush to disk. */ STATIC int -xfs_attr_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) +xfs_attr3_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_inactive_list_t *list, *lp; - int error, count, size, tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_attr_inactive_list *list; + struct xfs_attr_inactive_list *lp; + int error; + int count; + int size; + int tmp; + int i; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); /* * Count the number of "remote" value extents. */ count = 0; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { if (be16_to_cpu(entry->nameidx) && ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr_leaf_name_remote(leaf, i); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) count++; } @@ -2965,24 +3212,24 @@ xfs_attr_leaf_inactive( */ if (count == 0) { xfs_trans_brelse(*trans, bp); - return(0); + return 0; } /* * Allocate storage for a list of all the "remote" value extents. */ size = count * sizeof(xfs_attr_inactive_list_t); - list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP); + list = kmem_alloc(size, KM_SLEEP); /* * Identify each of the "remote" value extents. */ lp = list; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { if (be16_to_cpu(entry->nameidx) && ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr_leaf_name_remote(leaf, i); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) { lp->valueblk = be32_to_cpu(name_rmt->valueblk); lp->valuelen = XFS_B_TO_FSB(dp->i_mount, @@ -2998,15 +3245,15 @@ xfs_attr_leaf_inactive( */ error = 0; for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr_leaf_freextent(trans, dp, + tmp = xfs_attr3_leaf_freextent(trans, dp, lp->valueblk, lp->valuelen); if (error == 0) error = tmp; /* save only the 1st errno */ } - kmem_free((xfs_caddr_t)list); - return(error); + kmem_free(list); + return error; } /* @@ -3014,14 +3261,20 @@ xfs_attr_leaf_inactive( * invalidate any buffers that are incore/in transactions. 
*/ STATIC int -xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dablk_t blkno, int blkcnt) +xfs_attr3_leaf_freextent( + struct xfs_trans **trans, + struct xfs_inode *dp, + xfs_dablk_t blkno, + int blkcnt) { - xfs_bmbt_irec_t map; - xfs_dablk_t tblkno; - int tblkcnt, dblkcnt, nmap, error; - xfs_daddr_t dblkno; - xfs_buf_t *bp; + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_dablk_t tblkno; + xfs_daddr_t dblkno; + int tblkcnt; + int dblkcnt; + int nmap; + int error; /* * Roll through the "value", invalidating the attribute value's diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 77de139a58f0..f9d7846097e2 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -89,7 +90,7 @@ typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ __be32 hashval; /* hash value of name */ - __be16 nameidx; /* index into buffer of name/value */ + __be16 nameidx; /* index into buffer of name/value */ __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ __u8 pad2; /* unused pad byte */ } xfs_attr_leaf_entry_t; @@ -115,6 +116,54 @@ typedef struct xfs_attr_leafblock { } xfs_attr_leafblock_t; /* + * CRC enabled leaf structures. Called "version 3" structures to match the + * version number of the directory and dablk structures for this feature, and + * attr2 is already taken by the variable inode attribute fork size feature. + */ +struct xfs_attr3_leaf_hdr { + struct xfs_da3_blkinfo info; + __be16 count; + __be16 usedbytes; + __be16 firstused; + __u8 holes; + __u8 pad1; + struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + +#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) + +struct xfs_attr3_leafblock { + struct xfs_attr3_leaf_hdr hdr; + struct xfs_attr_leaf_entry entries[1]; + + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced, the locations accessed purely from helper functions. + * + * struct xfs_attr_leaf_name_local + * struct xfs_attr_leaf_name_remote + */ +}; + +/* + * incore, neutral version of the attribute leaf header + */ +struct xfs_attr3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t usedbytes; + __uint16_t firstused; + __u8 holes; + struct { + __uint16_t base; + __uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + +/* * Flags used in the leaf_entry[i].flags field. * NOTE: the INCOMPLETE bit must not collide with the flags bits specified * on the system call, they are "or"ed together for various operations. 
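Editor's aside, not a hunk from the patch: the xfs_attr3_icleaf_hdr above is the CPU-endian view of whichever on-disk header (v2 xfs_attr_leaf_hdr or v3 xfs_attr3_leaf_hdr) a leaf buffer carries. The conversion helper xfs_attr3_leaf_hdr_from_disk(), declared at the end of this header but defined in xfs_attr_leaf.c outside the hunks shown here, plausibly reduces to a per-field endian swap selected by the magic number. A sketch reconstructed from the structures above:

void
xfs_attr3_leaf_hdr_from_disk(
	struct xfs_attr3_icleaf_hdr	*to,
	struct xfs_attr_leafblock	*from)
{
	int	i;

	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
	       from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));

	if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;

		/* v3: the block chaining fields live inside xfs_da3_blkinfo */
		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
		to->back = be32_to_cpu(hdr3->info.hdr.back);
		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
		to->count = be16_to_cpu(hdr3->count);
		to->usedbytes = be16_to_cpu(hdr3->usedbytes);
		to->firstused = be16_to_cpu(hdr3->firstused);
		to->holes = hdr3->holes;
		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
			to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
			to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
		}
		return;
	}

	/* v2: the same fields, read from the original xfs_attr_leaf_hdr */
	to->forw = be32_to_cpu(from->hdr.info.forw);
	to->back = be32_to_cpu(from->hdr.info.back);
	to->magic = be16_to_cpu(from->hdr.info.magic);
	to->count = be16_to_cpu(from->hdr.count);
	to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
	to->firstused = be16_to_cpu(from->hdr.firstused);
	to->holes = from->hdr.holes;
	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
		to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
		to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
	}
}

Once converted, callers throughout the patch read ichdr.count, ichdr.firstused and so on without caring which on-disk format the buffer uses.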
@@ -147,26 +196,43 @@ typedef struct xfs_attr_leafblock { */ #define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) +static inline int +xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return sizeof(struct xfs_attr3_leaf_hdr); + return sizeof(struct xfs_attr_leaf_hdr); +} + +static inline struct xfs_attr_leaf_entry * +xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return &((struct xfs_attr3_leafblock *)leafp)->entries[0]; + return &leafp->entries[0]; +} + /* * Cast typed pointers for "local" and "remote" name/value structs. */ -static inline xfs_attr_leaf_name_remote_t * -xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) +static inline char * +xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx) { - return (xfs_attr_leaf_name_remote_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; + struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp); + + return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)]; } -static inline xfs_attr_leaf_name_local_t * -xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) +static inline xfs_attr_leaf_name_remote_t * +xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) { - return (xfs_attr_leaf_name_local_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; + return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx); } -static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) +static inline xfs_attr_leaf_name_local_t * +xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) { - return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; + return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx); } /* @@ -221,37 +287,37 @@ int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); /* * Internal routines when attribute fork size == XFS_LBSIZE(mp). */ -int xfs_attr_leaf_to_node(struct xfs_da_args *args); -int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, +int xfs_attr3_leaf_to_node(struct xfs_da_args *args); +int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp, struct xfs_da_args *args, int forkoff); -int xfs_attr_leaf_clearflag(struct xfs_da_args *args); -int xfs_attr_leaf_setflag(struct xfs_da_args *args); -int xfs_attr_leaf_flipflags(xfs_da_args_t *args); +int xfs_attr3_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr3_leaf_setflag(struct xfs_da_args *args); +int xfs_attr3_leaf_flipflags(struct xfs_da_args *args); /* * Routines used for growing the Btree. */ -int xfs_attr_leaf_split(struct xfs_da_state *state, +int xfs_attr3_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); -int xfs_attr_leaf_lookup_int(struct xfs_buf *leaf, +int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); -int xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); -int xfs_attr_leaf_add(struct xfs_buf *leaf_buffer, +int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer, +int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_list_int(struct xfs_buf *bp, +int xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* * Routines used for shrinking the Btree. 
*/ -int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval); -void xfs_attr_leaf_unbalance(struct xfs_da_state *state, +int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); +int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); /* * Utility routines. @@ -261,10 +327,12 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); -int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, +int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); +void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from); -extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c new file mode 100644 index 000000000000..dee84466dcc9 --- /dev/null +++ b/fs/xfs/xfs_attr_remote.c @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +/* + * Each contiguous block has a header, so it is not just a simple attribute + * length to FSB conversion. 
+ */
+static int
+xfs_attr3_rmt_blocks(
+	struct xfs_mount *mp,
+	int attrlen)
+{
+	int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp,
+			mp->m_sb.sb_blocksize);
+	return (attrlen + buflen - 1) / buflen;
+}
+
+static bool
+xfs_attr3_rmt_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+		return false;
+	if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (bp->b_bn != be64_to_cpu(rmt->rm_blkno))
+		return false;
+	if (be32_to_cpu(rmt->rm_offset) +
+	    be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
+		return false;
+	if (rmt->rm_owner == 0)
+		return false;
+
+	return true;
+}
+
+static void
+xfs_attr3_rmt_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+			      XFS_ATTR3_RMT_CRC_OFF) ||
+	    !xfs_attr3_rmt_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+static void
+xfs_attr3_rmt_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_attr3_rmt_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (bip) {
+		struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+		rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	}
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+			 XFS_ATTR3_RMT_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+	.verify_read = xfs_attr3_rmt_read_verify,
+	.verify_write = xfs_attr3_rmt_write_verify,
+};
+
+static int
+xfs_attr3_rmt_hdr_set(
+	struct xfs_mount *mp,
+	xfs_ino_t	ino,
+	uint32_t	offset,
+	uint32_t	size,
+	struct xfs_buf	*bp)
+{
+	struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return 0;
+
+	rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+	rmt->rm_offset = cpu_to_be32(offset);
+	rmt->rm_bytes = cpu_to_be32(size);
+	uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+	rmt->rm_owner = cpu_to_be64(ino);
+	rmt->rm_blkno = cpu_to_be64(bp->b_bn);
+	bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+	return sizeof(struct xfs_attr3_rmt_hdr);
+}
+
+/*
+ * Checking of the remote attribute header is split into two parts: the verifier
+ * does CRC, location and bounds checking; the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+	struct xfs_mount *mp,
+	xfs_ino_t	ino,
+	uint32_t	offset,
+	uint32_t	size,
+	struct xfs_buf	*bp)
+{
+	struct xfs_attr3_rmt_hdr *rmt = bp->b_addr;
+
+	if (offset != be32_to_cpu(rmt->rm_offset))
+		return false;
+	if (size != be32_to_cpu(rmt->rm_bytes))
+		return false;
+	if (ino != be64_to_cpu(rmt->rm_owner))
+		return false;
+
+	/* ok */
+	return true;
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
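+ *
+ * (Editor's note, not part of the patch: struct xfs_attr3_rmt_hdr, defined
+ * below in xfs_attr_remote.h, is 56 bytes: four __be32 fields, a 16-byte
+ * uuid_t and three __be64 fields. XFS_ATTR3_RMT_BUF_SPACE() of a 512-byte
+ * block is therefore 456 usable value bytes, which is why
+ * xfs_attr3_rmt_blocks() above uses ceiling division by the usable space
+ * per block rather than a plain byte-to-FSB conversion. Nine headers are
+ * 9 * 56 = 504 bytes, roughly one extra 512-byte block, matching the
+ * "every 9 headers" spill rate noted in xfs_attr_rmtval_set().)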
+ */
+int
+xfs_attr_rmtval_get(
+	struct xfs_da_args	*args)
+{
+	struct xfs_bmbt_irec	map[ATTR_RMTVALUE_MAPSIZE];
+	struct xfs_mount	*mp = args->dp->i_mount;
+	struct xfs_buf		*bp;
+	xfs_daddr_t		dblkno;
+	xfs_dablk_t		lblkno = args->rmtblkno;
+	void			*dst = args->value;
+	int			valuelen = args->valuelen;
+	int			nmap;
+	int			error;
+	int			blkcnt;
+	int			i;
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_get(args);
+
+	ASSERT(!(args->flags & ATTR_KERNOVAL));
+
+	while (valuelen > 0) {
+		nmap = ATTR_RMTVALUE_MAPSIZE;
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       args->rmtblkcnt, map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		ASSERT(nmap >= 1);
+
+		for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+			int byte_cnt;
+			char *src;
+
+			ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+			       (map[i].br_startblock != HOLESTARTBLOCK));
+			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+						   dblkno, blkcnt, 0, &bp,
+						   &xfs_attr3_rmt_buf_ops);
+			if (error)
+				return error;
+
+			byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length));
+			byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt);
+
+			src = bp->b_addr;
+			if (xfs_sb_version_hascrc(&mp->m_sb)) {
+				if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino,
+							offset, byte_cnt, bp)) {
+					xfs_alert(mp,
+"remote attribute header does not match required off/len/owner (0x%x/0x%x,0x%llx)",
+						offset, byte_cnt, args->dp->i_ino);
+					xfs_buf_relse(bp);
+					return EFSCORRUPTED;
+
+				}
+
+				src += sizeof(struct xfs_attr3_rmt_hdr);
+			}
+
+			memcpy(dst, src, byte_cnt);
+			xfs_buf_relse(bp);
+
+			offset += byte_cnt;
+			dst += byte_cnt;
+			valuelen -= byte_cnt;
+
+			lblkno += map[i].br_blockcount;
+		}
+	}
+	ASSERT(valuelen == 0);
+	return 0;
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+int
+xfs_attr_rmtval_set(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_bmbt_irec	map;
+	struct xfs_buf		*bp;
+	xfs_daddr_t		dblkno;
+	xfs_dablk_t		lblkno;
+	xfs_fileoff_t		lfileoff = 0;
+	void			*src = args->value;
+	int			blkcnt;
+	int			valuelen;
+	int			nmap;
+	int			error;
+	int			hdrcnt = 0;
+	bool			crcs = xfs_sb_version_hascrc(&mp->m_sb);
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_set(args);
+
+	/*
+	 * Find a "hole" in the attribute address space large enough for
+	 * us to drop the new attribute's value into. Because CRC enabled
+	 * attributes have headers, we can't just do a straight byte to FSB
+	 * conversion. We calculate the worst case block count in this case
+	 * and we may not need that many, so we have to handle this when
+	 * allocating the blocks below.
+	 */
+	if (!crcs)
+		blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+	else
+		blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+
+	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+				      XFS_ATTR_FORK);
+	if (error)
+		return error;
+
+	/* Start with the attribute data. We'll allocate the rest afterwards. */
+	if (crcs)
+		blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+
+	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+	args->rmtblkcnt = blkcnt;
+
+	/*
+	 * Roll through the "value", allocating blocks on disk as required.
+	 */
+	while (blkcnt > 0) {
+		int committed;
+
+		/*
+		 * Allocate a single extent, up to the size of the value.
+ */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + hdrcnt++; + + /* + * If we have enough blocks for the attribute data, calculate + * how many extra blocks we need for headers. We might run + * through this multiple times in the case that the additional + * headers in the blocks needed for the data fragments spills + * into requiring more blocks. e.g. for 512 byte blocks, we'll + * spill for another block every 9 headers we require in this + * loop. + */ + if (crcs && blkcnt == 0) { + int total_len; + + total_len = args->valuelen + + hdrcnt * sizeof(struct xfs_attr3_rmt_hdr); + blkcnt = XFS_B_TO_FSB(mp, total_len); + blkcnt -= args->rmtblkcnt; + args->rmtblkcnt += blkcnt; + } + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + valuelen = args->valuelen; + while (valuelen > 0) { + int byte_cnt; + char *buf; + + /* + * Try to remember where we decided to put the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); + if (!bp) + return ENOMEM; + bp->b_ops = &xfs_attr3_rmt_buf_ops; + + byte_cnt = BBTOB(bp->b_length); + byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); + if (valuelen < byte_cnt) + byte_cnt = valuelen; + + buf = bp->b_addr; + buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, + byte_cnt, bp); + memcpy(buf, src, byte_cnt); + + if (byte_cnt < BBTOB(bp->b_length)) + xfs_buf_zero(bp, byte_cnt, + BBTOB(bp->b_length) - byte_cnt); + + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; + + src += byte_cnt; + valuelen -= byte_cnt; + offset += byte_cnt; + hdrcnt--; + + lblkno += map.br_blockcount; + } + ASSERT(valuelen == 0); + ASSERT(hdrcnt == 0); + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. 
+ */ +int +xfs_attr_rmtval_remove(xfs_da_args_t *args) +{ + xfs_mount_t *mp; + xfs_bmbt_irec_t map; + xfs_buf_t *bp; + xfs_daddr_t dblkno; + xfs_dablk_t lblkno; + int valuelen, blkcnt, nmap, error, done, committed; + + trace_xfs_attr_rmtval_remove(args); + + mp = args->dp->i_mount; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + lblkno = args->rmtblkno; + valuelen = args->rmtblkcnt; + while (valuelen > 0) { + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + args->rmtblkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + valuelen -= map.br_blockcount; + + lblkno += map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, + &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, args->dp, 0); + + /* + * Close out trans and start the next one in the chain. + */ + error = xfs_trans_roll(&args->trans, args->dp); + if (error) + return (error); + } + return(0); +} + diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h new file mode 100644 index 000000000000..c7cca60a062a --- /dev/null +++ b/fs/xfs/xfs_attr_remote.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_REMOTE_H__ +#define __XFS_ATTR_REMOTE_H__ + +#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ + +struct xfs_attr3_rmt_hdr { + __be32 rm_magic; + __be32 rm_offset; + __be32 rm_bytes; + __be32 rm_crc; + uuid_t rm_uuid; + __be64 rm_owner; + __be64 rm_blkno; + __be64 rm_lsn; +}; + +#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) + +#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? 
\ + sizeof(struct xfs_attr3_rmt_hdr) : 0)) + +extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; + +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_rmtval_set(struct xfs_da_args *args); +int xfs_attr_rmtval_remove(struct xfs_da_args *args); + +#endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index b44af9211bd9..89042848f9ec 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -25,6 +25,7 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" +#include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" @@ -47,180 +48,78 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_symlink.h" kmem_zone_t *xfs_bmap_free_item_zone; /* - * Prototypes for internal bmap routines. - */ - -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents( - struct xfs_btree_cur *cur, - struct xfs_inode *ip, - int whichfork); -#else -#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#endif - - -/* - * Called from xfs_bmap_add_attrfork to handle extents format files. - */ -STATIC int /* error */ -xfs_bmap_add_attrfork_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ - -/* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Miscellaneous helper functions */ -STATIC int /* error */ -xfs_bmap_add_attrfork_local( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ /* - * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. - * It figures out where to ask the underlying allocator to put the new extent. - */ -STATIC int /* error */ -xfs_bmap_alloc( - xfs_bmalloca_t *ap); /* bmap alloc argument struct */ - -/* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. - */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. - */ -STATIC void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free); /* list item to be freed */ - -/* - * Convert an extents-format file into a btree-format file. - * The new file will have a root block (in the inode) and a single child block. 
- */ -STATIC int /* error */ -xfs_bmap_extents_to_btree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ - xfs_btree_cur_t **curp, /* cursor returned to caller */ - int wasdel, /* converting a delayed alloc */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ - -/* - * Convert a local file to an extents file. - * This code is sort of bogus, since the file data needs to get - * logged so it won't be lost. The bmap-level manipulations are ok, though. - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork, /* data or attr fork */ - void (*init_fn)(struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp)); - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int whichfork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ - -/* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". - */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len); /* delayed extent length */ - -#ifdef DEBUG -/* - * Perform various validation checks on the values being returned - * from xfs_bmapi(). + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. 
*/ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap); -#else -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) -#endif /* DEBUG */ - -STATIC int -xfs_bmap_count_tree( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ifork_t *ifp, - xfs_fsblock_t blockno, - int levelin, - int *count); - -STATIC void -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count); +void +xfs_bmap_compute_maxlevels( + xfs_mount_t *mp, /* file system mount structure */ + int whichfork) /* data or attr fork */ +{ + int level; /* btree level */ + uint maxblocks; /* max blocks at this level */ + uint maxleafents; /* max leaf entries possible */ + int maxrootrecs; /* max records in root block */ + int minleafrecs; /* min records in leaf block */ + int minnoderecs; /* min records in node block */ + int sz; /* root block size */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - int *count); + /* + * The maximum number of extents in a file, hence the maximum + * number of leaf entries, is controlled by the type of di_nextents + * (a signed 32-bit number, xfs_extnum_t), or by di_anextents + * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted + * with ATTR2 and then mounted back with ATTR1, keeping the + * di_forkoff's fixed but probably at various positions. Therefore, + * for both ATTR1 and ATTR2 we have to assume the worst case scenario + * of a minimum size available. + */ + if (whichfork == XFS_DATA_FORK) { + maxleafents = MAXEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + } else { + maxleafents = MAXAEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + } + maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); + minleafrecs = mp->m_bmap_dmnr[0]; + minnoderecs = mp->m_bmap_dmnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) { + if (maxblocks <= maxrootrecs) + maxblocks = 1; + else + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + } + mp->m_bm_maxlevels[whichfork] = level; +} /* - * Bmap internal routines. + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} STATIC int /* error */ xfs_bmbt_lookup_eq( @@ -290,6 +189,1070 @@ xfs_bmbt_update( } /* + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". 
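The sizing loop in xfs_bmap_compute_maxlevels above is just repeated ceiling division by the minimum btree fanout until everything fits under the (possibly smaller) root. A standalone C sketch of the same arithmetic, with made-up fanout values rather than real m_bmap_dmnr numbers:

#include <stdio.h>

/* Illustrative model only; names and fanout values are invented. */
static int compute_maxlevels(unsigned maxleafents, int minleafrecs,
			     int minnoderecs, int maxrootrecs)
{
	/* worst case: every leaf holds only the minimum record count */
	unsigned maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
	int level;

	for (level = 1; maxblocks > 1; level++) {
		if (maxblocks <= (unsigned)maxrootrecs)
			maxblocks = 1;	/* the rest fits in the root block */
		else
			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
	}
	return level;
}

int main(void)
{
	/* 2^31 - 1 extents (MAXEXTNUM) with invented fanouts */
	printf("data fork max levels: %d\n",
	       compute_maxlevels(2147483647u, 125, 252, 9));
	return 0;
}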
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t *ip,		/* incore inode pointer */
+	xfs_filblks_t len)		/* delayed extent length */
+{
+	int		level;		/* btree level number */
+	int		maxrecs;	/* maximum record count at this level */
+	xfs_mount_t	*mp;		/* mount structure */
+	xfs_filblks_t	rval;		/* return value */
+
+	mp = ip->i_mount;
+	maxrecs = mp->m_bmap_dmxr[0];
+	for (level = 0, rval = 0;
+	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+	     level++) {
+		len += maxrecs - 1;
+		do_div(len, maxrecs);
+		rval += len;
+		if (len == 1)
+			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+				level - 1;
+		if (level == 0)
+			maxrecs = mp->m_bmap_dmxr[1];
+	}
+	return rval;
+}
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			offset;
+
+	if (mp->m_sb.sb_inodesize == 256) {
+		offset = XFS_LITINO(mp, ip->i_d.di_version) -
+				XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	} else {
+		offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+	}
+
+	ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+	return offset;
+}
+
+/*
+ * Helper routine to reset inode di_forkoff field when switching
+ * attribute fork from local to extent format - we reset it where
+ * possible to make space available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	if (whichfork == XFS_ATTR_FORK &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+		if (dfl_forkoff > ip->i_d.di_forkoff)
+			ip->i_d.di_forkoff = dfl_forkoff;
+	}
+}
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Count leaf blocks given a range of extent records.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+	xfs_ifork_t		*ifp,
+	xfs_extnum_t		idx,
+	int			numrecs,
+	int			*count)
+{
+	int		b;
+
+	for (b = 0; b < numrecs; b++) {
+		xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
+		*count += xfs_bmbt_get_blockcount(frp);
+	}
+}
+
+/*
+ * Count leaf blocks given a range of extent records originally
+ * in btree format.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	int			numrecs,
+	int			*count)
+{
+	int		b;
+	xfs_bmbt_rec_t	*frp;
+
+	for (b = 1; b <= numrecs; b++) {
+		frp = XFS_BMBT_REC_ADDR(mp, block, b);
+		*count += xfs_bmbt_disk_get_blockcount(frp);
+	}
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
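The charging scheme in xfs_bmap_worst_indlen above can be modelled in userspace: each pass divides the extent count by the level's maximum fanout, and once a level collapses to a single block, one block per remaining level is charged. A minimal sketch with illustrative fanout values:

#include <stdio.h>

/* Userspace model; fanouts and the level count are invented. */
static unsigned long long worst_indlen(unsigned long long len,
				       int leaf_maxrecs, int node_maxrecs,
				       int maxlevels)
{
	unsigned long long rval = 0;
	int maxrecs = leaf_maxrecs;
	int level;

	for (level = 0; level < maxlevels; level++) {
		len = (len + maxrecs - 1) / maxrecs;	/* do_div() in kernel */
		rval += len;
		if (len == 1)
			return rval + maxlevels - level - 1;
		if (level == 0)
			maxrecs = node_maxrecs;
	}
	return rval;
}

int main(void)
{
	printf("worst-case indirect blocks for 10000 blocks: %llu\n",
	       worst_indlen(10000, 125, 252, 5));
	return 0;
}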
+ */ +STATIC int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + __be64 *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + struct xfs_btree_block *block, *nextblock; + int numrecs; + + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + + if (--level) { + /* Not at node above leaves, count this level of nodes */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + while (nextbno != NULLFSBLOCK) { + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + if (unlikely((error = + xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + numrecs = be16_to_cpu(block->bb_numrecs); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + } + } + return 0; +} + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + xfs_bmap_count_leaves(ifp, 0, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count); + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
+ */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + + return 0; +} + +/* + * Debug/sanity checking code + */ + +STATIC int +xfs_bmap_sanity_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + int level) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && + block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) + return 0; + + if (be16_to_cpu(block->bb_level) != level || + be16_to_cpu(block->bb_numrecs) == 0 || + be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return 0; + + return 1; +} + +#ifdef DEBUG +STATIC struct xfs_buf * +xfs_bmap_get_bp( + struct xfs_btree_cur *cur, + xfs_fsblock_t bno) +{ + struct xfs_log_item_desc *lidp; + int i; + + if (!cur) + return NULL; + + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } + + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; + } + + return NULL; +} + +STATIC void +xfs_check_block( + struct xfs_btree_block *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + __be64 *pp, *thispa; /* pointer to block address */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(be16_to_cpu(block->bb_level) > 0); + + prevp = NULL; + for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { + dmxr = mp->m_bmap_dmxr[0]; + keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + + if (prevp) { + ASSERT(be64_to_cpu(prevp->br_startoff) < + be64_to_cpu(keyp->br_startoff)); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + if (root) + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + else + pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + + for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { + if (root) + thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + else + thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + if (*thispa == *pp) { + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + __func__, j, i, + (unsigned long long)be64_to_cpu(*thispa)); + panic("%s: ptrs are equal in node\n", + __func__); + } + } + } +} + +/* + * Check that the extents for the inode ip are in the right order in all + * btree leaves. 
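xfs_check_block above enforces two per-node invariants: keys must ascend, and no child pointer may appear twice. A standalone restatement of those checks, assuming plain arrays in place of on-disk btree blocks:

#include <assert.h>
#include <stdio.h>

struct key { unsigned long long startoff; };

/* Illustrative stand-in for one btree node's keys and child pointers. */
static void check_block(const struct key *keys,
			const unsigned long long *ptrs, int numrecs)
{
	for (int i = 0; i < numrecs; i++) {
		/* keys must be strictly ascending */
		if (i > 0)
			assert(keys[i - 1].startoff < keys[i].startoff);
		/* no child pointer may appear twice ("ptrs are equal") */
		for (int j = i + 1; j < numrecs; j++)
			assert(ptrs[i] != ptrs[j]);
	}
}

int main(void)
{
	struct key keys[] = { { 0 }, { 16 }, { 64 } };
	unsigned long long ptrs[] = { 17, 42, 99 };

	check_block(keys, ptrs, 3);
	printf("node passes sanity checks\n");
	return 0;
}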
+ */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep; /* pointer to current extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ + xfs_bmbt_rec_t *nextp; /* pointer to next extent */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + for (;;) { + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = xfs_btree_get_numrecs(block); + + /* + * Read-ahead the next leaf block, if any. + */ + + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + + /* + * Check all the extents to make sure they are OK. + * If we had a previous block, the last entry should + * conform with the first entry in this one. + */ + + ep = XFS_BMBT_REC_ADDR(mp, block, 1); + if (i) { + ASSERT(xfs_bmbt_disk_get_startoff(&last) + + xfs_bmbt_disk_get_blockcount(&last) <= + xfs_bmbt_disk_get_startoff(ep)); + } + for (j = 1; j < num_recs; j++) { + nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + ASSERT(xfs_bmbt_disk_get_startoff(ep) + + xfs_bmbt_disk_get_blockcount(ep) <= + xfs_bmbt_disk_get_startoff(nextp)); + ep = nextp; + } + + last = *ep; + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. 
+	 */
+	if (bno == NULLFSBLOCK)
+		break;
+
+	bp_release = 0;
+	bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+	if (!bp) {
+		bp_release = 1;
+		error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+					XFS_BMAP_BTREE_REF,
+					&xfs_bmbt_buf_ops);
+		if (error)
+			goto error_norelse;
+	}
+	block = XFS_BUF_TO_BLOCK(bp);
+	}
+	if (bp_release) {
+		bp_release = 0;
+		xfs_trans_brelse(NULL, bp);
+	}
+	return;
+
+error0:
+	xfs_warn(mp, "%s: at error0", __func__);
+	if (bp_release)
+		xfs_trans_brelse(NULL, bp);
+error_norelse:
+	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+		__func__, i);
+	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+	return;
+}
+
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ */
+void
+xfs_bmap_trace_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	cnt,		/* count of entries in the list */
+	int		whichfork,	/* data or attr fork */
+	unsigned long	caller_ip)
+{
+	xfs_extnum_t	idx;		/* extent record index */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		state = 0;
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+	for (idx = 0; idx < cnt; idx++)
+		trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the caller's original parameters.  Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_bmbt_irec_t		*mval,
+	int			nmap,
+	int			ret_nmap)
+{
+	int			i;	/* index to map values */
+
+	ASSERT(ret_nmap <= nmap);
+
+	for (i = 0; i < ret_nmap; i++) {
+		ASSERT(mval[i].br_blockcount > 0);
+		if (!(flags & XFS_BMAPI_ENTIRE)) {
+			ASSERT(mval[i].br_startoff >= bno);
+			ASSERT(mval[i].br_blockcount <= len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+			       bno + len);
+		} else {
+			ASSERT(mval[i].br_startoff < bno + len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+			       bno);
+		}
+		ASSERT(i == 0 ||
+		       mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+		       mval[i].br_startoff);
+		ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+		       mval[i].br_startblock != HOLESTARTBLOCK);
+		ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+		       mval[i].br_state == XFS_EXT_UNWRITTEN);
+	}
+}
+
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)		do { } while (0)
+#define	xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
+
+/*
+ * bmap free list manipulation functions
+ */
+
+/*
+ * Add the extent to the list of extents to be freed at transaction end.
+ * The list is maintained sorted (by block number).
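The sorted insert performed by xfs_bmap_add_free, defined next, is a classic singly-linked-list splice: walk to the first node at or beyond the new start block and link in before it. A self-contained sketch (allocation failure handling omitted; it is an illustration only):

#include <stdio.h>
#include <stdlib.h>

struct free_item {
	unsigned long long startblock;
	unsigned int blockcount;
	struct free_item *next;
};

static void add_free(struct free_item **head, unsigned long long bno,
		     unsigned int len)
{
	struct free_item *new = malloc(sizeof(*new));	/* no NULL check: sketch */
	struct free_item *prev = NULL, *cur;

	new->startblock = bno;
	new->blockcount = len;
	/* find the first entry that starts at or beyond the new extent */
	for (cur = *head; cur != NULL; prev = cur, cur = cur->next) {
		if (cur->startblock >= bno)
			break;
	}
	if (prev)
		prev->next = new;
	else
		*head = new;
	new->next = cur;
}

int main(void)
{
	struct free_item *list = NULL;

	add_free(&list, 900, 4);
	add_free(&list, 100, 2);
	add_free(&list, 500, 1);
	for (struct free_item *f = list; f; f = f->next)
		printf("%llu+%u\n", f->startblock, f->blockcount);
	return 0;
}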
+ */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + xfs_mount_t *mp) /* mount point structure */ +{ + xfs_bmap_free_item_t *cur; /* current (next) element */ + xfs_bmap_free_item_t *new; /* new element */ + xfs_bmap_free_item_t *prev; /* previous element */ +#ifdef DEBUG + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_bmap_free_item_zone != NULL); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xbfi_startblock = bno; + new->xbfi_blockcount = (xfs_extlen_t)len; + for (prev = NULL, cur = flist->xbf_first; + cur != NULL; + prev = cur, cur = cur->xbfi_next) { + if (cur->xbfi_startblock >= bno) + break; + } + if (prev) + prev->xbfi_next = new; + else + flist->xbf_first = new; + new->xbfi_next = cur; + flist->xbf_count++; +} + +/* + * Remove the entry "free" from the free item list. Prev points to the + * previous entry, unless "free" is the head of the list. + */ +STATIC void +xfs_bmap_del_free( + xfs_bmap_free_t *flist, /* free item list header */ + xfs_bmap_free_item_t *prev, /* previous item on list, if any */ + xfs_bmap_free_item_t *free) /* list item to be freed */ +{ + if (prev) + prev->xbfi_next = free->xbfi_next; + else + flist->xbf_first = free->xbfi_next; + flist->xbf_count--; + kmem_zone_free(xfs_bmap_free_item_zone, free); +} + + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. We never free any extents in + * the first transaction. + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent item */ + unsigned int logres; /* new log reservation */ + unsigned int logcount; /* new log count */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + logres = ntp->t_log_res; + logcount = ntp->t_log_count; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. 
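xfs_bmap_finish, whose body continues below, follows the log intent/done pattern: the first transaction logs an extent-free intent (EFI) and commits, and a duplicated follow-on transaction frees each extent and logs the matching done item (EFD). A toy userspace model of that shape, with printf standing in for transaction logging (everything here is a stub, not kernel API):

#include <stdio.h>

struct extent { unsigned long long bno; unsigned int len; };

static void finish_frees(const struct extent *list, int count)
{
	/* transaction 1: record the intent, then commit it */
	for (int i = 0; i < count; i++)
		printf("EFI: will free %llu+%u\n", list[i].bno, list[i].len);
	printf("commit transaction 1\n");

	/* transaction 2: do the work and record completion */
	for (int i = 0; i < count; i++) {
		printf("free extent %llu+%u\n", list[i].bno, list[i].len);
		printf("EFD: freed %llu+%u\n", list[i].bno, list[i].len);
	}
	printf("commit transaction 2\n");
}

int main(void)
{
	struct extent todo[] = { { 1000, 8 }, { 2000, 16 } };

	finish_frees(todo, 2);
	return 0;
}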
+ */ + if (error) + return error; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + + if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, + logcount))) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +/* + * Free up any items left in the list. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist) /* list of bmap_free_items */ +{ + xfs_bmap_free_item_t *free; /* free list item */ + xfs_bmap_free_item_t *next; + + if (flist->xbf_count == 0) + return; + ASSERT(flist->xbf_first != NULL); + for (free = flist->xbf_first; free; free = next) { + next = free->xbfi_next; + xfs_bmap_del_free(flist, NULL, free); + } + ASSERT(flist->xbf_count == 0); +} + +/* + * Inode fork format manipulation functions + */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. 
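The btree-to-extents collapse described above amounts to adopting the single leaf's records into the inode and discarding the leaf. A toy model under that assumption (invented types, no transactions or logging):

#include <stdio.h>
#include <string.h>

#define INLINE_MAX 8

struct leaf { int nrecs; long recs[INLINE_MAX]; };
struct inode_fork {
	int format;			/* 0 = btree, 1 = extents */
	int nrecs;
	long inline_recs[INLINE_MAX];
	struct leaf *root_child;	/* the only leaf under the root */
};

static int btree_to_extents(struct inode_fork *ifp)
{
	struct leaf *child = ifp->root_child;

	if (ifp->format != 0 || !child || child->nrecs > INLINE_MAX)
		return -1;		/* conversion not possible */
	memcpy(ifp->inline_recs, child->recs, child->nrecs * sizeof(long));
	ifp->nrecs = child->nrecs;
	ifp->root_child = NULL;		/* "pitch the leaf block" */
	ifp->format = 1;
	return 0;
}

int main(void)
{
	struct leaf child = { 3, { 10, 20, 30 } };
	struct inode_fork ifp = { 0, 0, { 0 }, &child };

	printf("converted: %s\n", btree_to_extents(&ifp) ? "no" : "yes");
	return 0;
}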
+ */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + /* REFERENCED */ + struct xfs_btree_block *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + __be64 *pp; /* ptr to block address */ + struct xfs_btree_block *rblock;/* root btree block */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(be16_to_cpu(rblock->bb_level) == 1); + ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) + return error; +#endif + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + cblock = XFS_BUF_TO_BLOCK(cbp); + if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + return 0; +} + +/* + * Convert an extents-format file into a btree-format file. + * The new file will have a root block (in the inode) and a single child block. + */ +STATIC int /* error */ +xfs_bmap_extents_to_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first-block-allocated */ + xfs_bmap_free_t *flist, /* blocks freed in xaction */ + xfs_btree_cur_t **curp, /* cursor returned to caller */ + int wasdel, /* converting a delayed alloc */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *ablock; /* allocated (child) bt block */ + xfs_buf_t *abp; /* buffer for ablock */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_rec_t *arp; /* child record pointer */ + struct xfs_btree_block *block; /* btree root block */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t i, cnt; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_key_t *kp; /* root block key pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_ptr_t *pp; /* root block address pointer */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + + /* + * Make space in the inode incore. + */ + xfs_iroot_realloc(ip, 1, whichfork); + ifp->if_flags |= XFS_IFBROOT; + + /* + * Fill in the root. 
+ */ + block = ifp->if_broot; + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS); + + /* + * Need a cursor. Can't allocate until bb_level is filled in. + */ + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + /* + * Convert to a btree with two levels, one record in root. + */ + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = mp; + args.firstblock = *firstblock; + if (*firstblock == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = *firstblock; + } + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = wasdel; + *logflagsp = 0; + if ((error = xfs_alloc_vextent(&args))) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + /* + * Allocation can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || + (flist->xbf_low && + args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); + abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); + /* + * Fill in the child block. + */ + abp->b_ops = &xfs_bmbt_buf_ops; + ablock = XFS_BUF_TO_BLOCK(abp); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (cnt = i = 0; i < nextents; i++) { + ep = xfs_iext_get_ext(ifp, i); + if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { + arp->l0 = cpu_to_be64(ep->l0); + arp->l1 = cpu_to_be64(ep->l1); + arp++; cnt++; + } + } + ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + xfs_btree_set_numrecs(ablock, cnt); + + /* + * Fill in the root key and pointer. + */ + kp = XFS_BMBT_KEY_ADDR(mp, block, 1); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + be16_to_cpu(block->bb_level))); + *pp = cpu_to_be64(args.fsbno); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); + ASSERT(*curp == NULL); + *curp = cur; + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); + return 0; +} + +/* + * Convert a local file to an extents file. + * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. 
+ * (The bmap-level manipulations are ok, though).
+ */
+STATIC int				/* error */
+xfs_bmap_local_to_extents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
+	xfs_extlen_t	total,		/* total blocks needed by transaction */
+	int		*logflagsp,	/* inode logging flags */
+	int		whichfork,
+	void		(*init_fn)(struct xfs_trans *tp,
+				   struct xfs_buf *bp,
+				   struct xfs_inode *ip,
+				   struct xfs_ifork *ifp))
+{
+	int		error;		/* error return value */
+	int		flags;		/* logging flags returned */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	/*
+	 * We don't want to deal with the case of keeping inode data inline yet.
+	 * So sending the data fork of a regular inode is invalid.
+	 */
+	ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	flags = 0;
+	error = 0;
+	if (ifp->if_bytes) {
+		xfs_alloc_arg_t	args;	/* allocation arguments */
+		xfs_buf_t	*bp;	/* buffer for extent block */
+		xfs_bmbt_rec_host_t *ep;/* extent record pointer */
+
+		ASSERT((ifp->if_flags &
+			(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
+		memset(&args, 0, sizeof(args));
+		args.tp = tp;
+		args.mp = ip->i_mount;
+		args.firstblock = *firstblock;
+		/*
+		 * Allocate a block.  We know we need only one, since the
+		 * file currently fits in an inode.
+		 */
+		if (*firstblock == NULLFSBLOCK) {
+			args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+			args.type = XFS_ALLOCTYPE_START_BNO;
+		} else {
+			args.fsbno = *firstblock;
+			args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		}
+		args.total = total;
+		args.minlen = args.maxlen = args.prod = 1;
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			goto done;
+
+		/* Can't fail, the space was reserved. */
+		ASSERT(args.fsbno != NULLFSBLOCK);
+		ASSERT(args.len == 1);
+		*firstblock = args.fsbno;
+		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+
+		/* initialise the block and copy the data */
+		init_fn(tp, bp, ip, ifp);
+
+		/* account for the change in fork size and log everything */
+		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+		xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
+		xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+		xfs_iext_add(ifp, 0, 1);
+		ep = xfs_iext_get_ext(ifp, 0);
+		xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+		trace_xfs_bmap_post_update(ip, 0,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+				_THIS_IP_);
+		XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+		ip->i_d.di_nblocks = 1;
+		xfs_trans_mod_dquot_byino(tp, ip,
+			XFS_TRANS_DQ_BCOUNT, 1L);
+		flags |= xfs_ilog_fext(whichfork);
+	} else {
+		ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+		xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+	}
+	ifp->if_flags &= ~XFS_IFINLINE;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	flags |= XFS_ILOG_CORE;
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
 * Called from xfs_bmap_add_attrfork to handle btree format files.
 */
 STATIC int					/* error */
@@ -360,29 +1323,22 @@ xfs_bmap_add_attrfork_extents(
 }
 
 /*
- * Block initialisation functions for local to extent format conversion.
- * As these get more complex, they will be moved to the relevant files,
- * but for now they are too simple to worry about.
+ * Block initialisation function for local to extent format conversion.
+ *
+ * This shouldn't actually be called by anyone, so make sure debug kernels cause
+ * a noticeable failure.
*/ STATIC void xfs_bmap_local_to_extents_init_fn( + struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp) { + ASSERT(0); bp->b_ops = &xfs_bmbt_buf_ops; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); -} - -STATIC void -xfs_symlink_local_to_remote( - struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp) -{ - /* remote symlink blocks are not verifiable until CRCs come along */ - bp->b_ops = NULL; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); } /* @@ -394,8 +1350,7 @@ xfs_symlink_local_to_remote( * * XXX (dgc): investigate whether directory conversion can use the generic * formatting callout. It should be possible - it's just a very complex - * formatter. it would also require passing the transaction through to the init - * function. + * formatter. */ STATIC int /* error */ xfs_bmap_add_attrfork_local( @@ -432,6 +1387,640 @@ xfs_bmap_add_attrfork_local( } /* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + xfs_inode_t *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ +{ + xfs_fsblock_t firstblock; /* 1st block/ag allocated */ + xfs_bmap_free_t flist; /* freed extent records */ + xfs_mount_t *mp; /* mount structure */ + xfs_trans_t *tp; /* transaction pointer */ + int blks; /* space reservation */ + int version = 1; /* superblock attr version */ + int committed; /* xaction was committed */ + int logflags; /* logging flags */ + int error; /* error return value */ + + ASSERT(XFS_IFORK_Q(ip) == 0); + + mp = ip->i_mount; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); + if (rsvd) + tp->t_flags |= XFS_TRANS_RESERVE; + if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) + goto error0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + if (XFS_IFORK_Q(ip)) + goto error1; + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + /* + * For inodes coming from pre-6.2 filesystems. 
+ */ + ASSERT(ip->i_d.di_aformat == 0); + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + } + ASSERT(ip->i_d.di_anextents == 0); + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_UUID: + ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if (mp->m_flags & XFS_MOUNT_ATTR2) + version = 2; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + goto error1; + } + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_flags = XFS_IFEXTENTS; + logflags = 0; + xfs_bmap_init(&flist, &firstblock); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + &logflags); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, + &flist, &logflags); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + &logflags); + break; + default: + error = 0; + break; + } + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (error) + goto error2; + if (!xfs_sb_version_hasattr(&mp->m_sb) || + (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + __int64_t sbfields = 0; + + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr(&mp->m_sb)) { + xfs_sb_version_addattr(&mp->m_sb); + sbfields |= XFS_SB_VERSIONNUM; + } + if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { + xfs_sb_version_addattr2(&mp->m_sb); + sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + } + if (sbfields) { + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, sbfields); + } else + spin_unlock(&mp->m_sb_lock); + } + + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error2; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +error2: + xfs_bmap_cancel(&flist); +error1: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +error0: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; +} + +/* + * Internal and external extent tree search functions. + */ + +/* + * Read in the extents to if_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. If the file system cannot contain unwritten + * extents, the records are checked for no "state" flags. + */ +int /* error */ +xfs_bmap_read_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ + xfs_extnum_t i, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? 
XFS_EXTFMT_NOSTATE : + XFS_EXTFMT_INODE(ip); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_trans_brelse(tp, bp); + } + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + i = 0; + /* + * Loop over all leaf nodes. Copy information to the extent records. + */ + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + xfs_extnum_t start; + + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(i + num_recs > room)) { + ASSERT(i + num_recs <= room); + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, (btree extents).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_ERRLEVEL_LOW, ip->i_mount, block); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, 0), + error0); + /* + * Read-ahead the next leaf block, if any. + */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + if (nextbno != NULLFSBLOCK) + xfs_btree_reada_bufl(mp, nextbno, 1, + &xfs_bmbt_buf_ops); + /* + * Copy records into the extent records. + */ + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + start = i; + for (j = 0; j < num_recs; j++, i++, frp++) { + xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); + trp->l0 = be64_to_cpu(frp->l0); + trp->l1 = be64_to_cpu(frp->l1); + } + if (exntf == XFS_EXTFMT_NOSTATE) { + /* + * Check all attribute bmap btree records and + * any "older" data bmap btree records for a + * set bit in the "extent flag" position. + */ + if (unlikely(xfs_check_nostate_extents(ifp, + start, num_recs))) { + XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; + } + } + xfs_trans_brelse(tp, bp); + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + } + ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + return 0; +error0: + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); +} + + +/* + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. 
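The lookup contract spelled out above reduces to a few comparisons: find the extent containing bno, return the next entry if bno is in a hole, and flag EOF when bno lies past the last extent. A userspace sketch of those semantics, using a linear scan over a sorted array where the kernel's xfs_iext_bno_to_ext uses a binary search:

#include <stdio.h>

struct irec { unsigned long long startoff, blockcount; };

static int search_extents(const struct irec *ext, int nextents,
			  unsigned long long bno, int *eofp)
{
	for (int i = 0; i < nextents; i++) {
		if (bno < ext[i].startoff + ext[i].blockcount) {
			*eofp = 0;
			return i;	/* containing extent, or next after a hole */
		}
	}
	*eofp = 1;
	return nextents - 1;		/* the previous (last) extent, if any */
}

int main(void)
{
	struct irec ext[] = { { 0, 10 }, { 50, 5 } };	/* hole at 10..49 */
	int eof, idx;

	idx = search_extents(ext, 2, 20, &eof);		/* inside the hole */
	printf("bno 20  -> extent %d, eof %d\n", idx, eof);
	idx = search_extents(ext, 2, 100, &eof);	/* past eof */
	printf("bno 100 -> extent %d, eof %d\n", idx, eof);
	return 0;
}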
+ */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_multi_extents( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t lastx; /* last extent index */ + + /* + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. + */ + gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; + gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; + gotp->br_state = XFS_EXT_INVALID; +#if XFS_BIG_BLKNOS + gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; +#else + gotp->br_startblock = 0xffffa5a5; +#endif + prevp->br_startoff = NULLFILEOFF; + + ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); + if (lastx > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); + } + if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, gotp); + *eofp = 0; + } else { + if (lastx > 0) { + *gotp = *prevp; + } + *eofp = 1; + ep = NULL; + } + *lastxp = lastx; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int fork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + XFS_STATS_INC(xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, fork); + + ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); + + if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x\n", + (unsigned long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; +} + +/* + * Returns the file-relative block number of the first unused block(s) + * in the file with at least "len" logically contiguous blocks free. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + * Return 0 if the file is currently local (in-inode). 
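The first-fit hole search implemented just below is pure gap arithmetic over the sorted extent list: the first gap of at least "len" blocks at or beyond "lowest" wins, otherwise the block past the last extent is used. A standalone model of that loop:

#include <stdio.h>

struct irec { unsigned long long startoff, blockcount; };

static unsigned long long first_unused(const struct irec *ext, int n,
				       unsigned long long lowest,
				       unsigned long long len)
{
	unsigned long long max = lowest;

	for (int i = 0; i < n; i++) {
		unsigned long long off = ext[i].startoff;
		unsigned long long lastaddr;

		/* does the hole before this extent work? */
		if (off >= lowest + len && off - max >= len)
			return max;
		lastaddr = off + ext[i].blockcount;
		max = lastaddr > lowest ? lastaddr : lowest;
	}
	return max;			/* first block past EOF */
}

int main(void)
{
	struct irec ext[] = { { 0, 4 }, { 6, 2 }, { 20, 1 } };

	/* the hole at 8..19 is the first with at least 3 free blocks */
	printf("first_unused(len=3) = %llu\n", first_unused(ext, 3, 0, 3));
	return 0;
}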
+ */ +int /* error */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block + 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int /* error */ +xfs_bmap_last_before( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_fileoff_t bno; /* input file offset */ + int eof; /* hit end of file */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_bmbt_irec_t got; /* current extent value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent used */ + xfs_bmbt_irec_t prev; /* previous extent value */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + bno = *last_block - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + if (eof || xfs_bmbt_get_startoff(ep) > bno) { + if (prev.br_startoff == NULLFILEOFF) + *last_block = 0; + else + *last_block = prev.br_startoff + prev.br_blockcount; + } + /* + * Otherwise *last_block is already the right answer. 
+	 */
+	return 0;
+}
+
+STATIC int
+xfs_bmap_last_extent(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_bmbt_irec	*rec,
+	int			*is_empty)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	int			error;
+	int			nextents;
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*is_empty = 1;
+		return 0;
+	}
+
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+	*is_empty = 0;
+	return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file.  When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	bma->aeof = 0;
+	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+				     &is_empty);
+	if (error || is_empty)
+		return error;
+
+	/*
+	 * Check if we are allocating at or past the last extent, or at least
+	 * into the last delayed allocated extent.
+	 */
+	bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+		(bma->offset >= rec.br_startoff &&
+		 isnullstartblock(rec.br_startblock));
+	return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.  All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ */
+int
+xfs_bmap_eof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		endoff,
+	int			whichfork,
+	int			*eof)
+{
+	struct xfs_bmbt_irec	rec;
+	int			error;
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+	if (error || *eof)
+		return error;
+
+	*eof = endoff >= rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int
+xfs_bmap_last_offset(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*last_block,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	*last_block = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+		return 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		return XFS_ERROR(EIO);
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+	if (error || is_empty)
+		return error;
+
+	*last_block = rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
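Both EOF predicates above boil down to comparisons against the last extent record; restated minimally (invented struct, userspace only):

#include <stdio.h>

struct irec { unsigned long long startoff, blockcount; int delayed; };

/* mirrors xfs_bmap_eof(): is endoff beyond the last allocated block? */
static int past_eof(const struct irec *last, unsigned long long endoff)
{
	return endoff >= last->startoff + last->blockcount;
}

/* mirrors xfs_bmap_isaeof(): will an allocation at "offset" append? */
static int is_aeof(const struct irec *last, unsigned long long offset)
{
	return offset >= last->startoff + last->blockcount ||
	       (offset >= last->startoff && last->delayed);
}

int main(void)
{
	struct irec last = { 100, 10, 0 };

	printf("past_eof(105) = %d\n", past_eof(&last, 105));	/* 0 */
	printf("is_aeof(110)  = %d\n", is_aeof(&last, 110));	/* 1 */
	return 0;
}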
+ */ +int /* 1=>1 block, 0=>otherwise */ +xfs_bmap_one_block( + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int rval; /* return value */ + xfs_bmbt_irec_t s; /* internal version of extent */ + +#ifndef DEBUG + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; +#endif /* !DEBUG */ + if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + return 0; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_get_all(ep, &s); + rval = s.br_startoff == 0 && s.br_blockcount == 1; + if (rval && whichfork == XFS_DATA_FORK) + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); + return rval; +} + +/* + * Extent tree manipulation functions used during allocation. + */ + +/* * Convert a delayed allocation to a real allocation. */ STATIC int /* error */ @@ -1894,6 +3483,10 @@ done: } /* + * Functions used in the extent read, allocate and remove paths + */ + +/* * Adjust the size of the new extent based on di_extsize and rt extsize. */ STATIC int @@ -2666,1628 +4259,6 @@ xfs_bmap_alloc( } /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. - */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - /* REFERENCED */ - struct xfs_btree_block *cblock;/* child btree block */ - xfs_fsblock_t cbno; /* child block number */ - xfs_buf_t *cbp; /* child block's buffer */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ - __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; - ASSERT(be16_to_cpu(rblock->bb_level) == 1); - ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); - ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); - cbno = be64_to_cpu(*pp); - *logflagsp = 0; -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, cbno, 1))) - return error; -#endif - error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - cblock = XFS_BUF_TO_BLOCK(cbp); - if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) - return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); - ip->i_d.di_nblocks--; - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(tp, cbp); - if (cur->bc_bufs[0] == cbp) - cur->bc_bufs[0] = NULL; - xfs_iroot_realloc(ip, -1, whichfork); - ASSERT(ifp->if_broot == NULL); - ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); - return 0; -} - -/* - * Called by xfs_bmapi to 
update file extent records and the btree - * after removing space (or undoing a delayed allocation). - */ -STATIC int /* error */ -xfs_bmap_del_extent( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t *idx, /* extent number to update/delete */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *del, /* data to remove from extents */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ - xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ - xfs_fsblock_t del_endblock=0; /* first block past del */ - xfs_fileoff_t del_endoff; /* first offset past del */ - int delay; /* current block is delayed allocated */ - int do_fx; /* free extent at end of routine */ - xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ - int error; /* error return value */ - int flags; /* inode logging flags */ - xfs_bmbt_irec_t got; /* current extent entry */ - xfs_fileoff_t got_endoff; /* first offset past got */ - int i; /* temp state */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t nblks; /* quota/sb block count */ - xfs_bmbt_irec_t new; /* new record to be inserted */ - /* REFERENCED */ - uint qfield; /* quota field to update */ - xfs_filblks_t temp; /* for indirect length calculations */ - xfs_filblks_t temp2; /* for indirect length calculations */ - int state = 0; - - XFS_STATS_INC(xs_del_exlist); - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, *idx); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= del->br_startoff); - del_endoff = del->br_startoff + del->br_blockcount; - got_endoff = got.br_startoff + got.br_blockcount; - ASSERT(got_endoff >= del_endoff); - delay = isnullstartblock(got.br_startblock); - ASSERT(isnullstartblock(del->br_startblock) == delay); - flags = 0; - qfield = 0; - error = 0; - /* - * If deleting a real allocation, must free up the disk space. - */ - if (!delay) { - flags = XFS_ILOG_CORE; - /* - * Realtime allocation. Free it and record di_nblocks update. - */ - if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; - xfs_filblks_t len; - - ASSERT(do_mod(del->br_blockcount, - mp->m_sb.sb_rextsize) == 0); - ASSERT(do_mod(del->br_startblock, - mp->m_sb.sb_rextsize) == 0); - bno = del->br_startblock; - len = del->br_blockcount; - do_div(bno, mp->m_sb.sb_rextsize); - do_div(len, mp->m_sb.sb_rextsize); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; - do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; - qfield = XFS_TRANS_DQ_RTBCOUNT; - } - /* - * Ordinary allocation. - */ - else { - do_fx = 1; - nblks = del->br_blockcount; - qfield = XFS_TRANS_DQ_BCOUNT; - } - /* - * Set up del_endblock and cur for later. 
- */ - del_endblock = del->br_startblock + del->br_blockcount; - if (cur) { - if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, got.br_blockcount, - &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - } - da_old = da_new = 0; - } else { - da_old = startblockval(got.br_startblock); - da_new = 0; - nblks = 0; - do_fx = 0; - } - /* - * Set flag value to use in switch statement. - * Left-contig is 2, right-contig is 1. - */ - switch (((got.br_startoff == del->br_startoff) << 1) | - (got_endoff == del_endoff)) { - case 3: - /* - * Matches the whole extent. Delete the entry. - */ - xfs_iext_remove(ip, *idx, 1, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); - --*idx; - if (delay) - break; - - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - flags |= XFS_ILOG_CORE; - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_btree_delete(cur, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - break; - - case 2: - /* - * Deleting the first part of the extent. - */ - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, del_endoff); - temp = got.br_blockcount - del->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) - goto done; - break; - - case 1: - /* - * Deleting the last part of the extent. - */ - temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - da_new = temp; - break; - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (!cur) { - flags |= xfs_ilog_fext(whichfork); - break; - } - if ((error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount - del->br_blockcount, - got.br_state))) - goto done; - break; - - case 0: - /* - * Deleting the middle of the extent. - */ - temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - new.br_startoff = del_endoff; - temp2 = got_endoff - del_endoff; - new.br_blockcount = temp2; - new.br_state = got.br_state; - if (!delay) { - new.br_startblock = del_endblock; - flags |= XFS_ILOG_CORE; - if (cur) { - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, temp, - got.br_state))) - goto done; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto done; - cur->bc_rec.b = new; - error = xfs_btree_insert(cur, &i); - if (error && error != ENOSPC) - goto done; - /* - * If get no-space back from btree insert, - * it tried a split, and we have a zero - * block reservation. - * Fix up our state and return the error. - */ - if (error == ENOSPC) { - /* - * Reset the cursor, don't trust - * it after any insert operation. 
- */ - if ((error = xfs_bmbt_lookup_eq(cur, - got.br_startoff, - got.br_startblock, - temp, &i))) - goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - /* - * Update the btree record back - * to the original value. - */ - if ((error = xfs_bmbt_update(cur, - got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state))) - goto done; - /* - * Reset the extent record back - * to the original value. - */ - xfs_bmbt_set_blockcount(ep, - got.br_blockcount); - flags = 0; - error = XFS_ERROR(ENOSPC); - goto done; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, done); - } else - flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - } else { - ASSERT(whichfork == XFS_DATA_FORK); - temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = nullstartblock((int)temp2); - da_new = temp + temp2; - while (da_new > da_old) { - if (temp) { - temp--; - da_new--; - xfs_bmbt_set_startblock(ep, - nullstartblock((int)temp)); - } - if (da_new == da_old) - break; - if (temp2) { - temp2--; - da_new--; - new.br_startblock = - nullstartblock((int)temp2); - } - } - } - trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_insert(ip, *idx + 1, 1, &new, state); - ++*idx; - break; - } - /* - * If we need to, add to list of extents to delete. - */ - if (do_fx) - xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, - mp); - /* - * Adjust inode # blocks in the file. - */ - if (nblks) - ip->i_d.di_nblocks -= nblks; - /* - * Adjust quota data. - */ - if (qfield) - xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); - - /* - * Account for change in delayed indirect blocks. - * Nothing to do for disk quota accounting here. - */ - ASSERT(da_old >= da_new); - if (da_old > da_new) { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); - } -done: - *logflagsp = flags; - return error; -} - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. - */ -STATIC void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ -{ - if (prev) - prev->xbfi_next = free->xbfi_next; - else - flist->xbf_first = free->xbfi_next; - flist->xbf_count--; - kmem_zone_free(xfs_bmap_free_item_zone, free); -} - -/* - * Convert an extents-format file into a btree-format file. - * The new file will have a root block (in the inode) and a single child block. 
- */ -STATIC int /* error */ -xfs_bmap_extents_to_btree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ - xfs_btree_cur_t **curp, /* cursor returned to caller */ - int wasdel, /* converting a delayed alloc */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *ablock; /* allocated (child) bt block */ - xfs_buf_t *abp; /* buffer for ablock */ - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_bmbt_rec_t *arp; /* child record pointer */ - struct xfs_btree_block *block; /* btree root block */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - int error; /* error return value */ - xfs_extnum_t i, cnt; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_key_t *kp; /* root block key pointer */ - xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_ptr_t *pp; /* root block address pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - - /* - * Make space in the inode incore. - */ - xfs_iroot_realloc(ip, 1, whichfork); - ifp->if_flags |= XFS_IFBROOT; - - /* - * Fill in the root. - */ - block = ifp->if_broot; - block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - block->bb_level = cpu_to_be16(1); - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - - /* - * Need a cursor. Can't allocate until bb_level is filled in. - */ - mp = ip->i_mount; - cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; - /* - * Convert to a btree with two levels, one record in root. - */ - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); - memset(&args, 0, sizeof(args)); - args.tp = tp; - args.mp = mp; - args.firstblock = *firstblock; - if (*firstblock == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); - } else if (flist->xbf_low) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = *firstblock; - } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = *firstblock; - } - args.minlen = args.maxlen = args.prod = 1; - args.wasdel = wasdel; - *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - xfs_iroot_realloc(ip, -1, whichfork); - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; - } - /* - * Allocation can't fail, the space was reserved. - */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (flist->xbf_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); - *firstblock = cur->bc_private.b.firstblock = args.fsbno; - cur->bc_private.b.allocated++; - ip->i_d.di_nblocks++; - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); - /* - * Fill in the child block. 
- */ - abp->b_ops = &xfs_bmbt_buf_ops; - ablock = XFS_BUF_TO_BLOCK(abp); - ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - ablock->bb_level = 0; - ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (cnt = i = 0; i < nextents; i++) { - ep = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { - arp->l0 = cpu_to_be64(ep->l0); - arp->l1 = cpu_to_be64(ep->l1); - arp++; cnt++; - } - } - ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); - xfs_btree_set_numrecs(ablock, cnt); - - /* - * Fill in the root key and pointer. - */ - kp = XFS_BMBT_KEY_ADDR(mp, block, 1); - arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, - be16_to_cpu(block->bb_level))); - *pp = cpu_to_be64(args.fsbno); - - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); - xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); - ASSERT(*curp == NULL); - *curp = cur; - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); - return 0; -} - -/* - * Calculate the default attribute fork offset for newly created inodes. - */ -uint -xfs_default_attroffset( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - uint offset; - - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { - offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - - ASSERT(offset < XFS_LITINO(mp)); - return offset; -} - -/* - * Helper routine to reset inode di_forkoff field when switching - * attribute fork from local to extent format - we reset it where - * possible to make space available for inline data fork extents. - */ -STATIC void -xfs_bmap_forkoff_reset( - xfs_mount_t *mp, - xfs_inode_t *ip, - int whichfork) -{ - if (whichfork == XFS_ATTR_FORK && - ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_UUID && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { - uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - - if (dfl_forkoff > ip->i_d.di_forkoff) - ip->i_d.di_forkoff = dfl_forkoff; - } -} - -/* - * Convert a local file to an extents file. - * This code is out of bounds for data forks of regular files, - * since the file data needs to get logged so things will stay consistent. - * (The bmap-level manipulations are ok, though). - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork, - void (*init_fn)(struct xfs_buf *bp, - struct xfs_inode *ip, - struct xfs_ifork *ifp)) -{ - int error; /* error return value */ - int flags; /* logging flags returned */ - xfs_ifork_t *ifp; /* inode fork pointer */ - - /* - * We don't want to deal with the case of keeping inode data inline yet. - * So sending the data fork of a regular inode is invalid. 
- */ - ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - flags = 0; - error = 0; - if (ifp->if_bytes) { - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_host_t *ep;/* extent record pointer */ - - ASSERT((ifp->if_flags & - (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); - memset(&args, 0, sizeof(args)); - args.tp = tp; - args.mp = ip->i_mount; - args.firstblock = *firstblock; - /* - * Allocate a block. We know we need only one, since the - * file currently fits in an inode. - */ - if (*firstblock == NULLFSBLOCK) { - args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); - args.type = XFS_ALLOCTYPE_START_BNO; - } else { - args.fsbno = *firstblock; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - } - args.total = total; - args.minlen = args.maxlen = args.prod = 1; - error = xfs_alloc_vextent(&args); - if (error) - goto done; - - /* Can't fail, the space was reserved. */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(args.len == 1); - *firstblock = args.fsbno; - bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - - /* initialise the block and copy the data */ - init_fn(bp, ip, ifp); - - /* account for the change in fork size and log everything */ - xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); - xfs_bmap_forkoff_reset(args.mp, ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); - xfs_iext_add(ifp, 0, 1); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); - trace_xfs_bmap_post_update(ip, 0, - whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, - _THIS_IP_); - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - ip->i_d.di_nblocks = 1; - xfs_trans_mod_dquot_byino(tp, ip, - XFS_TRANS_DQ_BCOUNT, 1L); - flags |= xfs_ilog_fext(whichfork); - } else { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); - xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); - } - ifp->if_flags &= ~XFS_IFINLINE; - ifp->if_flags |= XFS_IFEXTENTS; - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - flags |= XFS_ILOG_CORE; -done: - *logflagsp = flags; - return error; -} - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. 
- */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; -#if XFS_BIG_BLKNOS - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; -#else - gotp->br_startblock = 0xffffa5a5; -#endif - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int fork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - - XFS_STATS_INC(xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, fork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && - !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x\n", - (unsigned long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state, *lastxp); - *lastxp = NULLEXTNUM; - *eofp = 1; - return NULL; - } - return ep; -} - -/* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". - */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ -{ - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ - - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; - for (level = 0, rval = 0; - level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); - level++) { - len += maxrecs - 1; - do_div(len, maxrecs); - rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - - level - 1; - if (level == 0) - maxrecs = mp->m_bmap_dmxr[1]; - } - return rval; -} - -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. 
- */ -int /* error code */ -xfs_bmap_add_attrfork( - xfs_inode_t *ip, /* incore inode pointer */ - int size, /* space new attribute needs */ - int rsvd) /* xact may use reserved blks */ -{ - xfs_fsblock_t firstblock; /* 1st block/ag allocated */ - xfs_bmap_free_t flist; /* freed extent records */ - xfs_mount_t *mp; /* mount structure */ - xfs_trans_t *tp; /* transaction pointer */ - int blks; /* space reservation */ - int version = 1; /* superblock attr version */ - int committed; /* xaction was committed */ - int logflags; /* logging flags */ - int error; /* error return value */ - - ASSERT(XFS_IFORK_Q(ip) == 0); - - mp = ip->i_mount; - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); - blks = XFS_ADDAFORK_SPACE_RES(mp); - if (rsvd) - tp->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) - goto error0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); - return error; - } - if (XFS_IFORK_Q(ip)) - goto error1; - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { - /* - * For inodes coming from pre-6.2 filesystems. - */ - ASSERT(ip->i_d.di_aformat == 0); - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - } - ASSERT(ip->i_d.di_anextents == 0); - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; - break; - case XFS_DINODE_FMT_UUID: - ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; - else if (mp->m_flags & XFS_MOUNT_ATTR2) - version = 2; - break; - default: - ASSERT(0); - error = XFS_ERROR(EINVAL); - goto error1; - } - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_flags = XFS_IFEXTENTS; - logflags = 0; - xfs_bmap_init(&flist, &firstblock); - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_LOCAL: - error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, - &logflags); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, - &flist, &logflags); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, - &logflags); - break; - default: - error = 0; - break; - } - if (logflags) - xfs_trans_log_inode(tp, ip, logflags); - if (error) - goto error2; - if (!xfs_sb_version_hasattr(&mp->m_sb) || - (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { - __int64_t sbfields = 0; - - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr(&mp->m_sb)) { - xfs_sb_version_addattr(&mp->m_sb); - sbfields |= XFS_SB_VERSIONNUM; - } - if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { - xfs_sb_version_addattr2(&mp->m_sb); - sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); - } - if (sbfields) { - spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, sbfields); - } else - spin_unlock(&mp->m_sb_lock); - } - - error = xfs_bmap_finish(&tp, &flist, &committed); - if (error) - goto error2; - return 
xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); -error2: - xfs_bmap_cancel(&flist); -error1: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - return error; -} - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -/* ARGSUSED */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ -{ - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ -#ifdef DEBUG - xfs_agnumber_t agno; - xfs_agblock_t agbno; - - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); - ASSERT(!isnullstartblock(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif - ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); - new->xbfi_startblock = bno; - new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; - flist->xbf_count++; -} - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - xfs_mount_t *mp, /* file system mount structure */ - int whichfork) /* data or attr fork */ -{ - int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ - int maxrootrecs; /* max records in root block */ - int minleafrecs; /* min records in leaf block */ - int minnoderecs; /* min records in node block */ - int sz; /* root block size */ - - /* - * The maximum number of extents in a file, hence the maximum - * number of leaf entries, is controlled by the type of di_nextents - * (a signed 32-bit number, xfs_extnum_t), or by di_anextents - * (a signed 16-bit number, xfs_aextnum_t). - * - * Note that we can no longer assume that if we are in ATTR1 that - * the fork offset of all the inodes will be - * (xfs_default_attroffset(ip) >> 3) because we could have mounted - * with ATTR2 and then mounted back with ATTR1, keeping the - * di_forkoff's fixed but probably at various positions. Therefore, - * for both ATTR1 and ATTR2 we have to assume the worst case scenario - * of a minimum size available. - */ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; - sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; - sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } - maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); - minleafrecs = mp->m_bmap_dmnr[0]; - minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) { - if (maxblocks <= maxrootrecs) - maxblocks = 1; - else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - } - mp->m_bm_maxlevels[whichfork] = level; -} - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. 
Frees all the extents that need freeing, which must be done - * last due to locking considerations. We never free any extents in - * the first transaction. - * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. - */ -int /* error */ -xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed) /* xact committed or not */ -{ - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - unsigned int logres; /* new log reservation */ - unsigned int logcount; /* new log count */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ - - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; - return 0; - } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, - free->xbfi_blockcount); - logres = ntp->t_log_res; - logcount = ntp->t_log_count; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. - */ - if (error) - return error; - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(ntp->t_ticket); - - if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, - logcount))) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = ntp->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); - return error; - } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, - free->xbfi_blockcount); - xfs_bmap_del_free(flist, NULL, free); - } - return 0; -} - -/* - * Free up any items left in the list. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ -{ - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; - - if (flist->xbf_count == 0) - return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); - } - ASSERT(flist->xbf_count == 0); -} - -/* - * Returns the file-relative block number of the first unused block(s) - * in the file with at least "len" logically contiguous blocks free. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - * Return 0 if the file is currently local (in-inode). 
- */ -int /* error */ -xfs_bmap_first_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *first_unused, /* unused block */ - int whichfork) /* data or attr fork */ -{ - int error; /* error return value */ - int idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_fileoff_t lastaddr; /* last block number seen */ - xfs_fileoff_t lowest; /* lowest useful block */ - xfs_fileoff_t max; /* starting useful block */ - xfs_fileoff_t off; /* offset for this block */ - xfs_extnum_t nextents; /* number of extent entries */ - - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *first_unused = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - off = xfs_bmbt_get_startoff(ep); - /* - * See if the hole before this extent will work. - */ - if (off >= lowest + len && off - max >= len) { - *first_unused = max; - return 0; - } - lastaddr = off + xfs_bmbt_get_blockcount(ep); - max = XFS_FILEOFF_MAX(lastaddr, lowest); - } - *first_unused = max; - return 0; -} - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent records. - * Returns 0 for local files, as they do not have extent records. - */ -int /* error */ -xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ -{ - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; - } - /* - * Otherwise *last_block is already the right answer. 
- */ - return 0; -} - -STATIC int -xfs_bmap_last_extent( - struct xfs_trans *tp, - struct xfs_inode *ip, - int whichfork, - struct xfs_bmbt_irec *rec, - int *is_empty) -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int error; - int nextents; - - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(tp, ip, whichfork); - if (error) - return error; - } - - nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *is_empty = 1; - return 0; - } - - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); - *is_empty = 0; - return 0; -} - -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - * - * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be - * at, or past the EOF. - */ -STATIC int -xfs_bmap_isaeof( - struct xfs_bmalloca *bma, - int whichfork) -{ - struct xfs_bmbt_irec rec; - int is_empty; - int error; - - bma->aeof = 0; - error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, - &is_empty); - if (error || is_empty) - return error; - - /* - * Check if we are allocation or past the last extent, or at least into - * the last delayed allocated extent. - */ - bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || - (bma->offset >= rec.br_startoff && - isnullstartblock(rec.br_startblock)); - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. All offsets are considered outside - * the end of file for an empty fork, so 1 is returned in *eof in that case. - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof) -{ - struct xfs_bmbt_irec rec; - int error; - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); - if (error || *eof) - return error; - - *eof = endoff >= rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent records. - * Returns 0 for local files, as they do not have extent records. - */ -int -xfs_bmap_last_offset( - struct xfs_trans *tp, - struct xfs_inode *ip, - xfs_fileoff_t *last_block, - int whichfork) -{ - struct xfs_bmbt_irec rec; - int is_empty; - int error; - - *last_block = 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) - return 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return XFS_ERROR(EIO); - - error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); - if (error || is_empty) - return error; - - *last_block = rec.br_startoff + rec.br_blockcount; - return 0; -} - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. 
- */ -int /* 1=>1 block, 0=>otherwise */ -xfs_bmap_one_block( - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int rval; /* return value */ - xfs_bmbt_irec_t s; /* internal version of extent */ - -#ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; -#endif /* !DEBUG */ - if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) - return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_get_all(ep, &s); - rval = s.br_startoff == 0 && s.br_blockcount == 1; - if (rval && whichfork == XFS_DATA_FORK) - ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); - return rval; -} - -STATIC int -xfs_bmap_sanity_check( - struct xfs_mount *mp, - struct xfs_buf *bp, - int level) -{ - struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - - if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) || - be16_to_cpu(block->bb_level) != level || - be16_to_cpu(block->bb_numrecs) == 0 || - be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return 0; - return 1; -} - -/* - * Read in the extents to if_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. If the file system cannot contain unwritten - * extents, the records are checked for no "state" flags. - */ -int /* error */ -xfs_bmap_read_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ - xfs_extnum_t i, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - /* REFERENCED */ - xfs_extnum_t room; /* number of entries there's room for */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : - XFS_EXTFMT_INODE(ip); - block = ifp->if_broot; - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - return error; - block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); - if (level == 0) - break; - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); - xfs_trans_brelse(tp, bp); - } - /* - * Here with bp and block set to the leftmost leaf node in the tree. 
- */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - i = 0; - /* - * Loop over all leaf nodes. Copy information to the extent records. - */ - for (;;) { - xfs_bmbt_rec_t *frp; - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - xfs_extnum_t start; - - num_recs = xfs_btree_get_numrecs(block); - if (unlikely(i + num_recs > room)) { - ASSERT(i + num_recs <= room); - xfs_warn(ip->i_mount, - "corrupt dinode %Lu, (btree extents).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", - XFS_ERRLEVEL_LOW, ip->i_mount, block); - goto error0; - } - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, 0), - error0); - /* - * Read-ahead the next leaf block, if any. - */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1, - &xfs_bmbt_buf_ops); - /* - * Copy records into the extent records. - */ - frp = XFS_BMBT_REC_ADDR(mp, block, 1); - start = i; - for (j = 0; j < num_recs; j++, i++, frp++) { - xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); - trp->l0 = be64_to_cpu(frp->l0); - trp->l1 = be64_to_cpu(frp->l1); - } - if (exntf == XFS_EXTFMT_NOSTATE) { - /* - * Check all attribute bmap btree records and - * any "older" data bmap btree records for a - * set bit in the "extent flag" position. - */ - if (unlikely(xfs_check_nostate_extents(ifp, - start, num_recs))) { - XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - goto error0; - } - } - xfs_trans_brelse(tp, bp); - bno = nextbno; - /* - * If we've reached the end, stop. - */ - if (bno == NULLFSBLOCK) - break; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); - if (error) - return error; - block = XFS_BUF_TO_BLOCK(bp); - } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); - XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); - return 0; -error0: - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); -} - -#ifdef DEBUG -/* - * Add bmap trace insert entries for all the contents of the extent records. - */ -void -xfs_bmap_trace_exlist( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ - unsigned long caller_ip) -{ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int state = 0; - - if (whichfork == XFS_ATTR_FORK) - state |= BMAP_ATTRFORK; - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); -} - -/* - * Validate that the bmbt_irecs being returned from bmapi are valid - * given the callers original parameters. Specifically check the - * ranges of the returned irecs to ensure that they only extent beyond - * the given parameters if the XFS_BMAPI_ENTIRE flag was set. 
- */ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap) -{ - int i; /* index to map values */ - - ASSERT(ret_nmap <= nmap); - - for (i = 0; i < ret_nmap; i++) { - ASSERT(mval[i].br_blockcount > 0); - if (!(flags & XFS_BMAPI_ENTIRE)) { - ASSERT(mval[i].br_startoff >= bno); - ASSERT(mval[i].br_blockcount <= len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= - bno + len); - } else { - ASSERT(mval[i].br_startoff < bno + len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount > - bno); - } - ASSERT(i == 0 || - mval[i - 1].br_startoff + mval[i - 1].br_blockcount == - mval[i].br_startoff); - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); - ASSERT(mval[i].br_state == XFS_EXT_NORM || - mval[i].br_state == XFS_EXT_UNWRITTEN); - } -} -#endif /* DEBUG */ - - -/* * Trim the returned map to the required bounds */ STATIC void @@ -5151,6 +5122,328 @@ error0: } /* + * Called by xfs_bmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). + */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extents */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; + + XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. 
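+		 * The start block and length are converted from filesystem
+		 * blocks to realtime extents (dividing both by sb_rextsize)
+		 * before the range is handed to xfs_rtfree_extent(); e.g.
+		 * with a realtime extent size of 4 blocks, freeing 8 blocks
+		 * at block 16 frees 2 realtime extents starting at extent 4.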
+ */ + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); + do_div(len, mp->m_sb.sb_rextsize); + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + } + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + --*idx; + if (delay) + break; + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. 
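+		 * The existing record is trimmed to the blocks before the
+		 * deleted range and a new record is inserted for the blocks
+		 * after it; e.g. deleting blocks 10-19 out of an extent
+		 * covering blocks 0-29 leaves two records, 0-9 and 20-29.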
+		 */
+		temp = del->br_startoff - got.br_startoff;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		new.br_startoff = del_endoff;
+		temp2 = got_endoff - del_endoff;
+		new.br_blockcount = temp2;
+		new.br_state = got.br_state;
+		if (!delay) {
+			new.br_startblock = del_endblock;
+			flags |= XFS_ILOG_CORE;
+			if (cur) {
+				if ((error = xfs_bmbt_update(cur,
+						got.br_startoff,
+						got.br_startblock, temp,
+						got.br_state)))
+					goto done;
+				if ((error = xfs_btree_increment(cur, 0, &i)))
+					goto done;
+				cur->bc_rec.b = new;
+				error = xfs_btree_insert(cur, &i);
+				if (error && error != ENOSPC)
+					goto done;
+				/*
+				 * If we get no space back from the btree
+				 * insert, it tried a split, and we have a
+				 * zero block reservation.
+				 * Fix up our state and return the error.
+				 */
+				if (error == ENOSPC) {
+					/*
+					 * Reset the cursor, don't trust
+					 * it after any insert operation.
+					 */
+					if ((error = xfs_bmbt_lookup_eq(cur,
+							got.br_startoff,
+							got.br_startblock,
+							temp, &i)))
+						goto done;
+					XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+					/*
+					 * Update the btree record back
+					 * to the original value.
+					 */
+					if ((error = xfs_bmbt_update(cur,
+							got.br_startoff,
+							got.br_startblock,
+							got.br_blockcount,
+							got.br_state)))
+						goto done;
+					/*
+					 * Reset the extent record back
+					 * to the original value.
+					 */
+					xfs_bmbt_set_blockcount(ep,
+						got.br_blockcount);
+					flags = 0;
+					error = XFS_ERROR(ENOSPC);
+					goto done;
+				}
+				XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			} else
+				flags |= xfs_ilog_fext(whichfork);
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+		} else {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			temp = xfs_bmap_worst_indlen(ip, temp);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			temp2 = xfs_bmap_worst_indlen(ip, temp2);
+			new.br_startblock = nullstartblock((int)temp2);
+			da_new = temp + temp2;
+			while (da_new > da_old) {
+				if (temp) {
+					temp--;
+					da_new--;
+					xfs_bmbt_set_startblock(ep,
+						nullstartblock((int)temp));
+				}
+				if (da_new == da_old)
+					break;
+				if (temp2) {
+					temp2--;
+					da_new--;
+					new.br_startblock =
+						nullstartblock((int)temp2);
+				}
+			}
+		}
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+		++*idx;
+		break;
+	}
+	/*
+	 * If we need to, add to list of extents to delete.
+	 */
+	if (do_fx)
+		xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+			mp);
+	/*
+	 * Adjust inode # blocks in the file.
+	 */
+	if (nblks)
+		ip->i_d.di_nblocks -= nblks;
+	/*
+	 * Adjust quota data.
+	 */
+	if (qfield)
+		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+	/*
+	 * Account for change in delayed indirect blocks.
+	 * Nothing to do for disk quota accounting here.
+	 */
+	ASSERT(da_old >= da_new);
+	if (da_old > da_new) {
+		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+			(int64_t)(da_old - da_new), 0);
+	}
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
 * Unmap (remove) blocks from a file.
 * If nexts is nonzero then the number of extents to remove is limited to
 * that value.
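 * (Bounding the number of extents removed per call lets callers such as
 * truncate cap the work done, and hence the log reservation required,
 * in a single transaction.)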
If not all extents in the block range can be removed then @@ -5811,416 +6104,6 @@ xfs_getbmap( return error; } -#ifdef DEBUG -STATIC struct xfs_buf * -xfs_bmap_get_bp( - struct xfs_btree_cur *cur, - xfs_fsblock_t bno) -{ - struct xfs_log_item_desc *lidp; - int i; - - if (!cur) - return NULL; - - for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { - if (!cur->bc_bufs[i]) - break; - if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) - return cur->bc_bufs[i]; - } - - /* Chase down all the log items to see if the bp is there */ - list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { - struct xfs_buf_log_item *bip; - bip = (struct xfs_buf_log_item *)lidp->lid_item; - if (bip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_ADDR(bip->bli_buf) == bno) - return bip->bli_buf; - } - - return NULL; -} - -STATIC void -xfs_check_block( - struct xfs_btree_block *block, - xfs_mount_t *mp, - int root, - short sz) -{ - int i, j, dmxr; - __be64 *pp, *thispa; /* pointer to block address */ - xfs_bmbt_key_t *prevp, *keyp; - - ASSERT(be16_to_cpu(block->bb_level) > 0); - - prevp = NULL; - for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { - dmxr = mp->m_bmap_dmxr[0]; - keyp = XFS_BMBT_KEY_ADDR(mp, block, i); - - if (prevp) { - ASSERT(be64_to_cpu(prevp->br_startoff) < - be64_to_cpu(keyp->br_startoff)); - } - prevp = keyp; - - /* - * Compare the block numbers to see if there are dups. - */ - if (root) - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); - else - pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); - - for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { - if (root) - thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); - else - thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); - if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", - __func__, j, i, - (unsigned long long)be64_to_cpu(*thispa)); - panic("%s: ptrs are equal in node\n", - __func__); - } - } - } -} - -/* - * Check that the extents for the inode ip are in the right order in all - * btree leaves. - */ - -STATIC void -xfs_bmap_check_leaf_extents( - xfs_btree_cur_t *cur, /* btree cursor or null */ - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork) /* data or attr fork */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_extnum_t i=0, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - xfs_bmbt_rec_t *ep; /* pointer to current extent */ - xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ - xfs_bmbt_rec_t *nextp; /* pointer to next extent */ - int bp_release = 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { - return; - } - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - block = ifp->if_broot; - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - xfs_check_block(block, mp, 1, ifp->if_broot_bytes); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. 
- */ - while (level-- > 0) { - /* See if buf is in cur first */ - bp_release = 0; - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (!bp) { - bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - goto error_norelse; - } - block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); - if (level == 0) - break; - - /* - * Check this block for basic sanity (increasing keys and - * no duplicate blocks). - */ - - xfs_check_block(block, mp, 0, 0); - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - } - - /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - i = 0; - - /* - * Loop over all leaf nodes checking that all extents are in the right order. - */ - for (;;) { - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - - - num_recs = xfs_btree_get_numrecs(block); - - /* - * Read-ahead the next leaf block, if any. - */ - - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - - /* - * Check all the extents to make sure they are OK. - * If we had a previous block, the last entry should - * conform with the first entry in this one. - */ - - ep = XFS_BMBT_REC_ADDR(mp, block, 1); - if (i) { - ASSERT(xfs_bmbt_disk_get_startoff(&last) + - xfs_bmbt_disk_get_blockcount(&last) <= - xfs_bmbt_disk_get_startoff(ep)); - } - for (j = 1; j < num_recs; j++) { - nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); - ASSERT(xfs_bmbt_disk_get_startoff(ep) + - xfs_bmbt_disk_get_blockcount(ep) <= - xfs_bmbt_disk_get_startoff(nextp)); - ep = nextp; - } - - last = *ep; - i += num_recs; - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - bno = nextbno; - /* - * If we've reached the end, stop. - */ - if (bno == NULLFSBLOCK) - break; - - bp_release = 0; - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (!bp) { - bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - goto error_norelse; - } - block = XFS_BUF_TO_BLOCK(bp); - } - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - return; - -error0: - xfs_warn(mp, "%s: at error0", __func__); - if (bp_release) - xfs_trans_brelse(NULL, bp); -error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", - __func__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); - return; -} -#endif - -/* - * Count fsblocks of the given fork. - */ -int /* error */ -xfs_bmap_count_blocks( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - int *count) /* out: count of blocks */ -{ - struct xfs_btree_block *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - __be64 *pp; /* pointer to block address */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); - return 0; - } - - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
- */ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); - bno = be64_to_cpu(*pp); - ASSERT(bno != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); - - if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, - mp); - return XFS_ERROR(EFSCORRUPTED); - } - - return 0; -} - -/* - * Recursively walks each level of a btree - * to count total fsblocks is use. - */ -STATIC int /* error */ -xfs_bmap_count_tree( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fsblock_t blockno, /* file system block number */ - int levelin, /* level in btree */ - int *count) /* Count of blocks */ -{ - int error; - xfs_buf_t *bp, *nbp; - int level = levelin; - __be64 *pp; - xfs_fsblock_t bno = blockno; - xfs_fsblock_t nextbno; - struct xfs_btree_block *block, *nextblock; - int numrecs; - - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - - if (--level) { - /* Not at node above leaves, count this level of nodes */ - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - while (nextbno != NULLFSBLOCK) { - error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - nextblock = XFS_BUF_TO_BLOCK(nbp); - nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); - xfs_trans_brelse(tp, nbp); - } - - /* Dive to the next level */ - pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); - bno = be64_to_cpu(*pp); - if (unlikely((error = - xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_trans_brelse(tp, bp); - } else { - /* count all level 1 nodes and their leaves */ - for (;;) { - nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); - numrecs = be16_to_cpu(block->bb_numrecs); - xfs_bmap_disk_count_leaves(mp, block, numrecs, count); - xfs_trans_brelse(tp, bp); - if (nextbno == NULLFSBLOCK) - break; - bno = nextbno; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); - if (error) - return error; - *count += 1; - block = XFS_BUF_TO_BLOCK(bp); - } - } - return 0; -} - -/* - * Count leaf blocks given a range of extent records. - */ -STATIC void -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count) -{ - int b; - - for (b = 0; b < numrecs; b++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); - *count += xfs_bmbt_get_blockcount(frp); - } -} - -/* - * Count leaf blocks given a range of extent records originally - * in btree format. - */ -STATIC void -xfs_bmap_disk_count_leaves( - struct xfs_mount *mp, - struct xfs_btree_block *block, - int numrecs, - int *count) -{ - int b; - xfs_bmbt_rec_t *frp; - - for (b = 1; b <= numrecs; b++) { - frp = XFS_BMBT_REC_ADDR(mp, block, b); - *count += xfs_bmbt_disk_get_blockcount(frp); - } -} - /* * dead simple method of punching delalyed allocation blocks from a range in * the inode. 
Walks a block at a time so will be slow, but is only executed in @@ -6295,16 +6178,3 @@ next_block: return error; } - -/* - * Convert the given file system block to a disk block. We have to treat it - * differently based on whether the file is a real time file or not, because the - * bmap code does. - */ -xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 061b45cbe614..3a86c3fa6de1 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -37,6 +37,7 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Determine the extent state. @@ -59,24 +60,31 @@ xfs_extent_state( */ void xfs_bmdr_to_bmbt( - struct xfs_mount *mp, + struct xfs_inode *ip, xfs_bmdr_block_t *dblock, int dblocklen, struct xfs_btree_block *rblock, int rblocklen) { + struct xfs_mount *mp = ip->i_mount; int dmxr; xfs_bmbt_key_t *fkp; __be64 *fpp; xfs_bmbt_key_t *tkp; __be64 *tpp; - rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); + rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; - rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); fkp = XFS_BMDR_KEY_ADDR(dblock, 1); tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); @@ -424,7 +432,13 @@ xfs_bmbt_to_bmdr( xfs_bmbt_key_t *tkp; __be64 *tpp; - ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == + cpu_to_be64(XFS_BUF_DADDR_NULL)); + } else + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)); ASSERT(rblock->bb_level != 0); @@ -708,59 +722,89 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } -static void +static int xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); unsigned int level; - int lblock_ok; /* block passes checks */ - /* magic number and level verification. + switch (block->bb_magic) { + case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) + return false; + /* + * XXX: need a better way of verifying the owner here. Right now + * just make sure there has been one set. + */ + if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) + return false; + /* fall through */ + case cpu_to_be32(XFS_BMAP_MAGIC): + break; + default: + return false; + } + + /* + * numrecs and level verification. 
* - * We don't know waht fork we belong to, so just verify that the level + * We don't know what fork we belong to, so just verify that the level * is less than the maximum of the two. Later checks will be more * precise. */ level = be16_to_cpu(block->bb_level); - lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && - level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); - - /* numrecs verification */ - lblock_ok = lblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return false; /* sibling pointer verification */ - lblock_ok = lblock_ok && - block->bb_u.l.bb_leftsib && - (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && - block->bb_u.l.bb_rightsib && - (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || - XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); - - if (!lblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.l.bb_leftsib || + (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) + return false; + if (!block->bb_u.l.bb_rightsib || + (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) + return false; + + return true; + } static void xfs_bmbt_read_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!(xfs_btree_lblock_verify_crc(bp) && + xfs_bmbt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + } static void xfs_bmbt_write_verify( struct xfs_buf *bp) { - xfs_bmbt_verify(bp); + if (!xfs_bmbt_verify(bp)) { + xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn); + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + xfs_btree_lblock_calc_crc(bp); } const struct xfs_buf_ops xfs_bmbt_buf_ops = { @@ -838,6 +882,8 @@ xfs_bmbt_init_cursor( cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); cur->bc_private.b.ip = ip; diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 88469ca08696..70c43d9f72c1 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -18,7 +18,8 @@ #ifndef __XFS_BMAP_BTREE_H__ #define __XFS_BMAP_BTREE_H__ -#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ struct xfs_btree_cur; struct xfs_btree_block; @@ -136,10 +137,10 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN +#define XFS_BMBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? 
\ + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) #define XFS_BMBT_REC_ADDR(mp, block, index) \ ((xfs_bmbt_rec_t *) \ @@ -186,12 +187,12 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; #define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) -#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ - (int)(XFS_BTREE_LBLOCK_LEN + \ +#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ + (int)(XFS_BMBT_BLOCK_LEN(mp) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) -#define XFS_BMAP_BROOT_SPACE(bb) \ - (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) +#define XFS_BMAP_BROOT_SPACE(mp, bb) \ + (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) #define XFS_BMDR_SPACE_CALC(nrecs) \ (int)(sizeof(xfs_bmdr_block_t) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) @@ -204,7 +205,7 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* * Prototypes for xfs_bmap.c to call. */ -extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int, +extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index db010408d701..8804b8a3c310 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -30,9 +30,11 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_buf_item.h" #include "xfs_btree.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Cursor allocation zone. @@ -42,9 +44,13 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. */ -const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { - XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC +static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC } }; +#define xfs_btree_magic(cur) \ + xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] STATIC int /* error (0 or EFSCORRUPTED) */ @@ -54,30 +60,38 @@ xfs_btree_check_lblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer for block, if any */ { - int lblock_ok; /* block passes checks */ + int lblock_ok = 1; /* block passes checks */ struct xfs_mount *mp; /* file system mount point */ mp = cur->bc_mp; - lblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + lblock_ok = lblock_ok && + uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.l.bb_blkno == cpu_to_be64( + bp ? 
bp->b_bn : XFS_BUF_DADDR_NULL); + } + + lblock_ok = lblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && block->bb_u.l.bb_leftsib && (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_leftsib))) && + be64_to_cpu(block->bb_u.l.bb_leftsib))) && block->bb_u.l.bb_rightsib && (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || XFS_FSB_SANITY_CHECK(mp, - be64_to_cpu(block->bb_u.l.bb_rightsib))); + be64_to_cpu(block->bb_u.l.bb_rightsib))); + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, XFS_RANDOM_BTREE_CHECK_LBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, - mp); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -90,16 +104,26 @@ xfs_btree_check_sblock( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block */ { + struct xfs_mount *mp; /* file system mount point */ struct xfs_buf *agbp; /* buffer for ag. freespace struct */ struct xfs_agf *agf; /* ag. freespace structure */ xfs_agblock_t agflen; /* native ag. freespace length */ - int sblock_ok; /* block passes checks */ + int sblock_ok = 1; /* block passes checks */ + mp = cur->bc_mp; agbp = cur->bc_private.a.agbp; agf = XFS_BUF_TO_AGF(agbp); agflen = be32_to_cpu(agf->agf_length); - sblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + sblock_ok = sblock_ok && + uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.s.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + sblock_ok = sblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= cur->bc_ops->get_maxrecs(cur, level) && @@ -109,13 +133,13 @@ xfs_btree_check_sblock( (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && block->bb_u.s.bb_rightsib; - if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, + + if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK, XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", - XFS_ERRLEVEL_LOW, cur->bc_mp, block); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -194,6 +218,72 @@ xfs_btree_check_ptr( #endif /* + * Calculate CRC on the whole btree block and stuff it into the + * long-form btree header. + * + * Prior to calculating the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modification was that made + * it to disk.
+ */ +void +xfs_btree_lblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_LBLOCK_CRC_OFF); +} + +bool +xfs_btree_lblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_LBLOCK_CRC_OFF); + return true; +} + +/* + * Calculate CRC on the whole btree block and stuff it into the + * short-form btree header. + * + * Prior to calculating the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modification was that made + * it to disk. + */ +void +xfs_btree_sblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_SBLOCK_CRC_OFF); +} + +bool +xfs_btree_sblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_BTREE_SBLOCK_CRC_OFF); + return true; +} + +/* * Delete the btree cursor. */ void @@ -277,10 +367,8 @@ xfs_btree_dup_cursor( *ncur = NULL; return error; } - new->bc_bufs[i] = bp; - ASSERT(!xfs_buf_geterror(bp)); - } else - new->bc_bufs[i] = NULL; + } + new->bc_bufs[i] = bp; } *ncur = new; return 0; @@ -321,9 +409,14 @@ xfs_btree_dup_cursor( */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
- XFS_BTREE_LBLOCK_LEN : - XFS_BTREE_SBLOCK_LEN; + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_LBLOCK_CRC_LEN; + return XFS_BTREE_LBLOCK_LEN; + } + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_SBLOCK_CRC_LEN; + return XFS_BTREE_SBLOCK_LEN; } /* @@ -863,43 +956,85 @@ xfs_btree_set_sibling( } void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + buf->bb_magic = cpu_to_be32(magic); + buf->bb_level = cpu_to_be16(level); + buf->bb_numrecs = cpu_to_be16(numrecs); + + if (flags & XFS_BTREE_LONG_PTRS) { + buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.l.bb_owner = cpu_to_be64(owner); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.l.bb_pad = 0; + } + } else { + /* owner is a 32 bit value on short blocks */ + __u32 __owner = (__u32)owner; + + buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + } + } +} + +void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags) { - struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); - - new->bb_magic = cpu_to_be32(magic); - new->bb_level = cpu_to_be16(level); - new->bb_numrecs = cpu_to_be16(numrecs); - - if (flags & XFS_BTREE_LONG_PTRS) { - new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); - new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); - } else { - new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - } + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + magic, level, numrecs, owner, flags); } STATIC void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, - int numrecs, - struct xfs_buf *bp) + int numrecs) { - xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], - level, numrecs, cur->bc_flags); + __u64 owner; + + /* + * we can pull the owner from the cursor right now as the different + * owners align directly with the pointer size of the btree. This may + * change in future, but is safe for current users of the generic btree + * code. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + owner = cur->bc_private.b.ip->i_ino; + else + owner = cur->bc_private.a.agno; + + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_magic(cur), level, numrecs, + owner, cur->bc_flags); } /* * Return true if ptr is the last record in the btree and - * we need to track updateѕ to this record. The decision + * we need to track updates to this record. The decision * will be further refined in the update_lastrec method.
*/ STATIC int @@ -1147,6 +1282,7 @@ xfs_btree_log_keys( XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); if (bp) { + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); @@ -1171,6 +1307,7 @@ xfs_btree_log_recs( XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); @@ -1195,6 +1332,7 @@ xfs_btree_log_ptrs( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); int level = xfs_btree_get_level(block); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); @@ -1223,7 +1361,12 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), - XFS_BTREE_SBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.s.bb_owner), + offsetof(struct xfs_btree_block, bb_u.s.bb_crc), + XFS_BTREE_SBLOCK_CRC_LEN }; static const short loffsets[] = { /* table of offsets (long) */ offsetof(struct xfs_btree_block, bb_magic), @@ -1231,17 +1374,40 @@ xfs_btree_log_block( offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), - XFS_BTREE_LBLOCK_LEN + offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.l.bb_owner), + offsetof(struct xfs_btree_block, bb_u.l.bb_crc), + offsetof(struct xfs_btree_block, bb_u.l.bb_pad), + XFS_BTREE_LBLOCK_CRC_LEN }; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGBI(cur, bp, fields); if (bp) { + int nbits; + + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + /* + * We don't log the CRC when updating a btree + * block but instead recreate it during log + * recovery. As the log buffers have checksums + * of their own this is safe and avoids logging a crc + * update in a lot of places. + */ + if (fields == XFS_BB_ALL_BITS) + fields = XFS_BB_ALL_BITS_CRC; + nbits = XFS_BB_NUM_BITS_CRC; + } else { + nbits = XFS_BB_NUM_BITS; + } xfs_btree_offsets(fields, (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? loffsets : soffsets, - XFS_BB_NUM_BITS, &first, &last); + nbits, &first, &last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, @@ -2204,7 +2370,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); + xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); /* * Split the entries between the old and the new block evenly. @@ -2513,7 +2679,7 @@ xfs_btree_new_root( nptr = 2; } /* Fill in the new block's btree header and log it. 
*/ - xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); + xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index f932897194eb..6e6c915673fe 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -42,11 +42,15 @@ extern kmem_zone_t *xfs_btree_cur_zone; * Generic btree header. * * This is a combination of the actual format used on disk for short and long - * format btrees. The first three fields are shared by both format, but - * the pointers are different and should be used with care. + * format btrees. The first three fields are shared by both format, but the + * pointers are different and should be used with care. * - * To get the size of the actual short or long form headers please use - * the size macros below. Never use sizeof(xfs_btree_block). + * To get the size of the actual short or long form headers please use the size + * macros below. Never use sizeof(xfs_btree_block). + * + * The blkno, crc, lsn, owner and uuid fields are only available in filesystems + * with the crc feature bit, and all accesses to them must be conditional on + * that flag. */ struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ @@ -56,10 +60,23 @@ struct xfs_btree_block { struct { __be32 bb_leftsib; __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; } s; /* short form pointers */ struct { __be64 bb_leftsib; __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ } l; /* long form pointers */ } bb_u; /* rest */ }; @@ -67,6 +84,16 @@ struct xfs_btree_block { #define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ #define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ +/* sizes of CRC enabled btree blocks */ +#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) +#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) + + +#define XFS_BTREE_SBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.s.bb_crc) +#define XFS_BTREE_LBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.l.bb_crc) + /* * Generic key, ptr and record wrapper structures. @@ -101,13 +128,11 @@ union xfs_btree_rec { #define XFS_BB_NUMRECS 0x04 #define XFS_BB_LEFTSIB 0x08 #define XFS_BB_RIGHTSIB 0x10 +#define XFS_BB_BLKNO 0x20 #define XFS_BB_NUM_BITS 5 #define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) - -/* - * Magic numbers for btree blocks. - */ -extern const __uint32_t xfs_magics[]; +#define XFS_BB_NUM_BITS_CRC 8 +#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -256,6 +281,7 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ #define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ +#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_NOERROR 0 @@ -393,8 +419,20 @@ xfs_btree_init_block( __u32 magic, __u16 level, __u16 numrecs, + __u64 owner, unsigned int flags); +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); + /* * Common btree core entry points. 
*/ @@ -408,6 +446,14 @@ int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); /* + * btree block CRC helpers + */ +void xfs_btree_lblock_calc_crc(struct xfs_buf *); +bool xfs_btree_lblock_verify_crc(struct xfs_buf *); +void xfs_btree_sblock_calc_crc(struct xfs_buf *); +bool xfs_btree_sblock_verify_crc(struct xfs_buf *); + +/* * Internal btree helpers also used by xfs_bmap.c. */ void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8459b5d8cb71..82b70bda9f47 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1022,7 +1022,9 @@ xfs_buf_iodone_work( bool read = !!(bp->b_flags & XBF_READ); bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - if (read && bp->b_ops) + + /* only validate buffers that were read without errors */ + if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) bp->b_ops->verify_read(bp); if (bp->b_iodone) diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index ee36c88ecfde..2573d2a75fc8 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -24,19 +24,20 @@ extern kmem_zone_t *xfs_buf_item_zone; * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ -#define XFS_BLF_INODE_BUF 0x1 +#define XFS_BLF_INODE_BUF (1<<0) /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ -#define XFS_BLF_CANCEL 0x2 +#define XFS_BLF_CANCEL (1<<1) + /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. */ -#define XFS_BLF_UDQUOT_BUF 0x4 -#define XFS_BLF_PDQUOT_BUF 0x8 -#define XFS_BLF_GDQUOT_BUF 0x10 +#define XFS_BLF_UDQUOT_BUF (1<<2) +#define XFS_BLF_PDQUOT_BUF (1<<3) +#define XFS_BLF_GDQUOT_BUF (1<<4) #define XFS_BLF_CHUNK 128 #define XFS_BLF_SHIFT 7 @@ -61,6 +62,55 @@ typedef struct xfs_buf_log_format { } xfs_buf_log_format_t; /* + * All buffers now need to tell recovery where the magic number + * is so that it can verify and calculate the CRCs on the buffer correctly + * once the changes have been replayed into the buffer. + * + * The type value is held in the upper 5 bits of the blf_flags field, which is + * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. 
+ */ +#define XFS_BLFT_BITS 5 +#define XFS_BLFT_SHIFT 11 +#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) + +enum xfs_blft { + XFS_BLFT_UNKNOWN_BUF = 0, + XFS_BLFT_UDQUOT_BUF, + XFS_BLFT_PDQUOT_BUF, + XFS_BLFT_GDQUOT_BUF, + XFS_BLFT_BTREE_BUF, + XFS_BLFT_AGF_BUF, + XFS_BLFT_AGFL_BUF, + XFS_BLFT_AGI_BUF, + XFS_BLFT_DINO_BUF, + XFS_BLFT_SYMLINK_BUF, + XFS_BLFT_DIR_BLOCK_BUF, + XFS_BLFT_DIR_DATA_BUF, + XFS_BLFT_DIR_FREE_BUF, + XFS_BLFT_DIR_LEAF1_BUF, + XFS_BLFT_DIR_LEAFN_BUF, + XFS_BLFT_DA_NODE_BUF, + XFS_BLFT_ATTR_LEAF_BUF, + XFS_BLFT_ATTR_RMT_BUF, + XFS_BLFT_SB_BUF, + XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), +}; + +static inline void +xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) +{ + ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); + blf->blf_flags &= ~XFS_BLFT_MASK; + blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); +} + +static inline __uint16_t +xfs_blft_from_flags(struct xfs_buf_log_format *blf) +{ + return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; +} + +/* * buf log item flags */ #define XFS_BLI_HOLD 0x01 @@ -113,6 +163,10 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, + enum xfs_blft); +void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); + #endif /* __KERNEL__ */ #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 4d7696a02418..9b26a99ebfe9 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -38,6 +39,8 @@ #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" /* * xfs_da_btree.c @@ -52,69 +55,195 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_da_root_split(xfs_da_state_t *state, +STATIC int xfs_da3_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_root, xfs_da_state_blk_t *new_child); -STATIC int xfs_da_node_split(xfs_da_state_t *state, +STATIC int xfs_da3_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_blk, xfs_da_state_blk_t *split_blk, xfs_da_state_blk_t *blk_to_add, int treelevel, int *result); -STATIC void xfs_da_node_rebalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *node_blk_1, xfs_da_state_blk_t *node_blk_2); -STATIC void xfs_da_node_add(xfs_da_state_t *state, +STATIC void xfs_da3_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *old_node_blk, xfs_da_state_blk_t *new_node_blk); /* * Routines used for shrinking the Btree. */ -STATIC int xfs_da_root_join(xfs_da_state_t *state, +STATIC int xfs_da3_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk); -STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval); -STATIC void xfs_da_node_remove(xfs_da_state_t *state, +STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da3_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk); -STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *src_node_blk, xfs_da_state_blk_t *dst_node_blk); /* * Utility routines. 
*/ -STATIC uint xfs_da_node_lasthash(struct xfs_buf *bp, int *count); -STATIC int xfs_da_node_order(struct xfs_buf *node1_bp, - struct xfs_buf *node2_bp); -STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, +STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); -STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); -static void -xfs_da_node_verify( + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +STATIC void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; + state->altpath.active = 0; +} + +/* + * Free a da-state structure. + */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + xfs_da_state_kill_altpath(state); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + +void +xfs_da3_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + + if (from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->__count); + to->level = be16_to_cpu(hdr3->__level); + return; + } + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); +} + +void +xfs_da3_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + ASSERT(from->magic == XFS_DA_NODE_MAGIC || + from->magic == XFS_DA3_NODE_MAGIC); + + if (from->magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->__count = cpu_to_be16(from->count); + hdr3->__level = cpu_to_be16(from->level); + return; + } + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); +} + +static bool +xfs_da3_node_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_da_node_hdr *hdr = bp->b_addr; - int block_ok = 0; - - block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); - block_ok = block_ok && - be16_to_cpu(hdr->level) > 0 && - be16_to_cpu(hdr->count) > 0 ; - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + struct xfs_da_intnode *hdr = bp->b_addr; + struct xfs_da3_icnode_hdr ichdr; + + xfs_da3_node_hdr_from_disk(&ichdr, hdr); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_DA3_NODE_MAGIC) + return false; 
+ + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_DA_NODE_MAGIC) + return false; } + if (ichdr.level == 0) + return false; + if (ichdr.level > XFS_DA_NODE_MAXDEPTH) + return false; + if (ichdr.count == 0) + return false; + /* + * we don't know if the node is for an attribute or directory tree, + * so only fail if the count is outside both bounds + */ + if (ichdr.count > mp->m_dir_node_ents && + ichdr.count > mp->m_attr_node_ents) + return false; + + /* XXX: hash order check? */ + + return true; } static void -xfs_da_node_write_verify( +xfs_da3_node_write_verify( struct xfs_buf *bp) { - xfs_da_node_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (!xfs_da3_node_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF); } /* @@ -124,40 +253,47 @@ xfs_da_node_write_verify( * format of the block being read. */ static void -xfs_da_node_read_verify( +xfs_da3_node_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_da_blkinfo *info = bp->b_addr; switch (be16_to_cpu(info->magic)) { + case XFS_DA3_NODE_MAGIC: + if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DA3_NODE_CRC_OFF)) + break; + /* fall through */ case XFS_DA_NODE_MAGIC: - xfs_da_node_verify(bp); - break; + if (!xfs_da3_node_verify(bp)) + break; + return; case XFS_ATTR_LEAF_MAGIC: - bp->b_ops = &xfs_attr_leaf_buf_ops; + bp->b_ops = &xfs_attr3_leaf_buf_ops; bp->b_ops->verify_read(bp); return; case XFS_DIR2_LEAFN_MAGIC: - bp->b_ops = &xfs_dir2_leafn_buf_ops; + case XFS_DIR3_LEAFN_MAGIC: + bp->b_ops = &xfs_dir3_leafn_buf_ops; bp->b_ops->verify_read(bp); return; default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - mp, info); - xfs_buf_ioerror(bp, EFSCORRUPTED); break; } + + /* corrupt block */ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); } -const struct xfs_buf_ops xfs_da_node_buf_ops = { - .verify_read = xfs_da_node_read_verify, - .verify_write = xfs_da_node_write_verify, +const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .verify_read = xfs_da3_node_read_verify, + .verify_write = xfs_da3_node_write_verify, }; - int -xfs_da_node_read( +xfs_da3_node_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, @@ -165,8 +301,35 @@ xfs_da_node_read( struct xfs_buf **bpp, int which_fork) { - return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, &xfs_da_node_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); + if (!err && tp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + type = XFS_BLFT_DA_NODE_BUF; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + type = XFS_BLFT_ATTR_LEAF_BUF; + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + type = XFS_BLFT_DIR_LEAFN_BUF; + break; + default: + type = 0; + ASSERT(0); + break; + } + xfs_trans_buf_set_type(tp, *bpp, type); + } + return err; }
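The read verifiers introduced above all share one shape: switch on the on-disk magic number, refuse to trust any field of a CRC-format block until the checksum passes, then fall through to the structural checks shared with the older format. Below is a minimal, self-contained C sketch of that shape, not the patch's code: blk_hdr, the two magic values and toy_cksum() are hypothetical stand-ins for the real XFS structures, XFS_BMAP_CRC_MAGIC/XFS_DA3_NODE_MAGIC and xfs_verify_cksum().

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define BLK_MAGIC_V2 0x0000d2u		/* hypothetical non-CRC format magic */
#define BLK_MAGIC_V3 0x0000d3u		/* hypothetical CRC-enabled format magic */

struct blk_hdr {
	uint32_t magic;
	uint32_t crc;			/* only meaningful in the v3 format */
	uint16_t level;
	uint16_t numrecs;
};

/* Toy checksum standing in for crc32c: sums every byte except the CRC field */
static uint32_t toy_cksum(const uint8_t *buf, size_t len, size_t crc_off)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < len; i++)
		if (i < crc_off || i >= crc_off + sizeof(uint32_t))
			sum += buf[i];
	return sum;
}

static bool blk_verify(const struct blk_hdr *hdr, size_t len, uint16_t max_level)
{
	switch (hdr->magic) {
	case BLK_MAGIC_V3:
		/* never trust v3 fields before the checksum passes */
		if (hdr->crc != toy_cksum((const uint8_t *)hdr, len,
					  offsetof(struct blk_hdr, crc)))
			return false;
		/* fall through to the checks shared with v2 */
	case BLK_MAGIC_V2:
		break;
	default:
		return false;		/* unknown magic: wrong or corrupt block */
	}
	return hdr->level <= max_level;	/* common structural check */
}

int main(void)
{
	struct blk_hdr hdr = { BLK_MAGIC_V3, 0, 1, 0 };

	/* writing the CRC does not change the sum, as its own bytes are skipped */
	hdr.crc = toy_cksum((const uint8_t *)&hdr, sizeof(hdr),
			    offsetof(struct blk_hdr, crc));
	return blk_verify(&hdr, sizeof(hdr), 8) ? 0 : 1;
}

Checking the CRC ahead of the fall-through is the point of the shape: once it passes, both format versions funnel into the same structural validation, so only the format-specific metadata needs per-version code.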
/*======================================================================== @@ -177,33 +340,46 @@ xfs_da_node_read( * Create the initial contents of an intermediate node. */ int -xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - struct xfs_buf **bpp, int whichfork) +xfs_da3_node_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + int level, + struct xfs_buf **bpp, + int whichfork) { - xfs_da_intnode_t *node; - struct xfs_buf *bp; - int error; - xfs_trans_t *tp; + struct xfs_da_intnode *node; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_da3_icnode_hdr ichdr = {0}; + struct xfs_buf *bp; + int error; trace_xfs_da_node_create(args); + ASSERT(level <= XFS_DA_NODE_MAXDEPTH); - tp = args->trans; error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); if (error) return(error); - ASSERT(bp != NULL); + bp->b_ops = &xfs_da3_node_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); node = bp->b_addr; - node->hdr.info.forw = 0; - node->hdr.info.back = 0; - node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); - node->hdr.info.pad = 0; - node->hdr.count = 0; - node->hdr.level = cpu_to_be16(level); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + ichdr.magic = XFS_DA3_NODE_MAGIC; + hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + } else { + ichdr.magic = XFS_DA_NODE_MAGIC; + } + ichdr.level = level; + + xfs_da3_node_hdr_to_disk(node, &ichdr); xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); - bp->b_ops = &xfs_da_node_buf_ops; *bpp = bp; return(0); } @@ -213,12 +389,18 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, * intermediate nodes, rebalance, etc. 
*/ int /* error */ -xfs_da_split(xfs_da_state_t *state) +xfs_da3_split( + struct xfs_da_state *state) { - xfs_da_state_blk_t *oldblk, *newblk, *addblk; - xfs_da_intnode_t *node; - struct xfs_buf *bp; - int max, action, error, i; + struct xfs_da_state_blk *oldblk; + struct xfs_da_state_blk *newblk; + struct xfs_da_state_blk *addblk; + struct xfs_da_intnode *node; + struct xfs_buf *bp; + int max; + int action; + int error; + int i; trace_xfs_da_split(state->args); @@ -246,7 +428,7 @@ xfs_da_split(xfs_da_state_t *state) */ switch (oldblk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_split(state, oldblk, newblk); + error = xfs_attr3_leaf_split(state, oldblk, newblk); if ((error != 0) && (error != ENOSPC)) { return(error); /* GROT: attr is inconsistent */ } @@ -261,12 +443,12 @@ xfs_da_split(xfs_da_state_t *state) if (state->inleaf) { state->extraafter = 0; /* before newblk */ trace_xfs_attr_leaf_split_before(state->args); - error = xfs_attr_leaf_split(state, oldblk, + error = xfs_attr3_leaf_split(state, oldblk, &state->extrablk); } else { state->extraafter = 1; /* after newblk */ trace_xfs_attr_leaf_split_after(state->args); - error = xfs_attr_leaf_split(state, newblk, + error = xfs_attr3_leaf_split(state, newblk, &state->extrablk); } if (error) @@ -280,7 +462,7 @@ xfs_da_split(xfs_da_state_t *state) addblk = newblk; break; case XFS_DA_NODE_MAGIC: - error = xfs_da_node_split(state, oldblk, newblk, addblk, + error = xfs_da3_node_split(state, oldblk, newblk, addblk, max - i, &action); addblk->bp = NULL; if (error) @@ -298,7 +480,7 @@ xfs_da_split(xfs_da_state_t *state) /* * Update the btree to show the new hashval for this child. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } if (!addblk) return(0); @@ -308,7 +490,7 @@ xfs_da_split(xfs_da_state_t *state) */ ASSERT(state->path.active == 0); oldblk = &state->path.blk[0]; - error = xfs_da_root_split(state, oldblk, addblk); + error = xfs_da3_root_split(state, oldblk, addblk); if (error) { addblk->bp = NULL; return(error); /* GROT: dir is inconsistent */ @@ -319,8 +501,12 @@ xfs_da_split(xfs_da_state_t *state) * just got bumped because of the addition of a new root node. * There might be three blocks involved if a double split occurred, * and the original block 0 could be at any position in the list. + * + * Note: the magic numbers and sibling pointers are in the same + * physical place for both v2 and v3 headers (by design). Hence it + * doesn't matter which version of the xfs_da_intnode structure we use + * here as the result will be the same using either structure. */ - node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { @@ -359,18 +545,25 @@ xfs_da_split(xfs_da_state_t *state) * the EOF, extending the inode in process. 
*/ STATIC int /* error */ -xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_root_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node, *oldroot; - xfs_da_args_t *args; - xfs_dablk_t blkno; - struct xfs_buf *bp; - int error, size; - xfs_inode_t *dp; - xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_dir2_leaf_t *leaf; + struct xfs_da_intnode *node; + struct xfs_da_intnode *oldroot; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + struct xfs_buf *bp; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_dir2_leaf *leaf; + xfs_dablk_t blkno; + int level; + int error; + int size; trace_xfs_da_root_split(state->args); @@ -379,29 +572,65 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * to a free space somewhere. */ args = state->args; - ASSERT(args != NULL); error = xfs_da_grow_inode(args, &blkno); if (error) - return(error); + return error; + dp = args->dp; tp = args->trans; mp = state->mp; error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); + return error; node = bp->b_addr; oldroot = blk1->bp->b_addr; - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) { - size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - - (char *)oldroot); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_icnode_hdr nodehdr; + + xfs_da3_node_hdr_from_disk(&nodehdr, oldroot); + btree = xfs_da3_node_tree_p(oldroot); + size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); + level = nodehdr.level; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { - ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + leaf = (xfs_dir2_leaf_t *)oldroot; - size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] - - (char *)leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + size = (int)((char *)&ents[leafhdr.count] - (char *)leaf); + level = 0; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } + + /* + * we can copy most of the information in the node from one block to + * another, but for CRC enabled headers we have to make sure that the + * block specific identifiers are kept intact. We update the buffer + * directly for this. + */ memcpy(node, oldroot, size); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; + + node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + } xfs_trans_log_buf(tp, bp, 0, size - 1); bp->b_ops = blk1->bp->b_ops; @@ -411,20 +640,25 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Set up the new root node. */ - error = xfs_da_node_create(args, + error = xfs_da3_node_create(args, (args->whichfork == XFS_DATA_FORK) ? 
mp->m_dirleafblk : 0, - be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); + level + 1, &bp, args->whichfork); if (error) - return(error); + return error; + node = bp->b_addr; - node->btree[0].hashval = cpu_to_be32(blk1->hashval); - node->btree[0].before = cpu_to_be32(blk1->blkno); - node->btree[1].hashval = cpu_to_be32(blk2->hashval); - node->btree[1].before = cpu_to_be32(blk2->blkno); - node->hdr.count = cpu_to_be16(2); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + btree[0].hashval = cpu_to_be32(blk1->hashval); + btree[0].before = cpu_to_be32(blk1->blkno); + btree[1].hashval = cpu_to_be32(blk2->hashval); + btree[1].before = cpu_to_be32(blk2->blkno); + nodehdr.count = 2; + xfs_da3_node_hdr_to_disk(node, &nodehdr); #ifdef DEBUG - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { ASSERT(blk1->blkno >= mp->m_dirleafblk && blk1->blkno < mp->m_dirfreeblk); ASSERT(blk2->blkno >= mp->m_dirleafblk && @@ -434,30 +668,34 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* Header is already logged by xfs_da_node_create */ xfs_trans_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, node->btree, - sizeof(xfs_da_node_entry_t) * 2)); + XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2)); - return(0); + return 0; } /* * Split the node, rebalance, then add the new entry. */ STATIC int /* error */ -xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk, - xfs_da_state_blk_t *addblk, - int treelevel, int *result) +xfs_da3_node_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk, + struct xfs_da_state_blk *addblk, + int treelevel, + int *result) { - xfs_da_intnode_t *node; - xfs_dablk_t blkno; - int newcount, error; - int useextra; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno; + int newcount; + int error; + int useextra; trace_xfs_da_node_split(state->args); node = oldblk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + xfs_da3_node_hdr_from_disk(&nodehdr, node); /* * With V2 dirs the extra block is data or freespace. @@ -467,7 +705,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, /* * Do we have to split the node? */ - if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) { + if (nodehdr.count + newcount > state->node_ents) { /* * Allocate a new node, add to the doubly linked chain of * nodes, then move some of our excess entries into it. @@ -476,14 +714,14 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, if (error) return(error); /* GROT: dir is inconsistent */ - error = xfs_da_node_create(state->args, blkno, treelevel, + error = xfs_da3_node_create(state->args, blkno, treelevel, &newblk->bp, state->args->whichfork); if (error) return(error); /* GROT: dir is inconsistent */ newblk->blkno = blkno; newblk->magic = XFS_DA_NODE_MAGIC; - xfs_da_node_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_da3_node_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); *result = 1; @@ -495,7 +733,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Insert the new entry(s) into the correct block * (updating last hashval in the process). 
* - * xfs_da_node_add() inserts BEFORE the given index, + * xfs_da3_node_add() inserts BEFORE the given index, * and as a result of using node_lookup_int() we always * point to a valid entry (not after one), but a split * operation always results in a new block whose hashvals @@ -504,22 +742,23 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * If we had double-split op below us, then add the extra block too. */ node = oldblk->bp->b_addr; - if (oldblk->index <= be16_to_cpu(node->hdr.count)) { + xfs_da3_node_hdr_from_disk(&nodehdr, node); + if (oldblk->index <= nodehdr.count) { oldblk->index++; - xfs_da_node_add(state, oldblk, addblk); + xfs_da3_node_add(state, oldblk, addblk); if (useextra) { if (state->extraafter) oldblk->index++; - xfs_da_node_add(state, oldblk, &state->extrablk); + xfs_da3_node_add(state, oldblk, &state->extrablk); state->extravalid = 0; } } else { newblk->index++; - xfs_da_node_add(state, newblk, addblk); + xfs_da3_node_add(state, newblk, addblk); if (useextra) { if (state->extraafter) newblk->index++; - xfs_da_node_add(state, newblk, &state->extrablk); + xfs_da3_node_add(state, newblk, &state->extrablk); state->extravalid = 0; } } @@ -534,33 +773,53 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * NOTE: if blk2 is empty, then it will get the upper half of blk1. */ STATIC void -xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_node_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node1, *node2, *tmpnode; - xfs_da_node_entry_t *btree_s, *btree_d; - int count, tmp; - xfs_trans_t *tp; + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_intnode *tmpnode; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da_node_entry *btree_s; + struct xfs_da_node_entry *btree_d; + struct xfs_da3_icnode_hdr nodehdr1; + struct xfs_da3_icnode_hdr nodehdr2; + struct xfs_trans *tp; + int count; + int tmp; + int swap = 0; trace_xfs_da_node_rebalance(state->args); node1 = blk1->bp->b_addr; node2 = blk2->bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr1, node1); + xfs_da3_node_hdr_from_disk(&nodehdr2, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + /* * Figure out how many entries need to move, and in which direction. * Swap the nodes around if that makes it simpler. 
*/ - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { + if (nodehdr1.count > 0 && nodehdr2.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) < + be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) { tmpnode = node1; node1 = node2; node2 = tmpnode; + xfs_da3_node_hdr_from_disk(&nodehdr1, node1); + xfs_da3_node_hdr_from_disk(&nodehdr2, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + swap = 1; } - ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2; + + count = (nodehdr1.count - nodehdr2.count) / 2; if (count == 0) return; tp = state->args->trans; @@ -571,10 +830,11 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Move elements in node2 up to make a hole. */ - if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) { + tmp = nodehdr2.count; + if (tmp > 0) { tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node2->btree[count]; + btree_s = &btree2[0]; + btree_d = &btree2[count]; memmove(btree_d, btree_s, tmp); } @@ -582,12 +842,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Move the req'd B-tree elements from high in node1 to * low in node2. */ - be16_add_cpu(&node2->hdr.count, count); + nodehdr2.count += count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count]; - btree_d = &node2->btree[0]; + btree_s = &btree1[nodehdr1.count - count]; + btree_d = &btree2[0]; memcpy(btree_d, btree_s, tmp); - be16_add_cpu(&node1->hdr.count, -count); + nodehdr1.count -= count; } else { /* * Move the req'd B-tree elements from low in node2 to @@ -595,49 +855,60 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, */ count = -count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; + btree_s = &btree2[0]; + btree_d = &btree1[nodehdr1.count]; memcpy(btree_d, btree_s, tmp); - be16_add_cpu(&node1->hdr.count, count); + nodehdr1.count += count; + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, btree_d, tmp)); /* * Move elements in node2 down to fill the hole. */ - tmp = be16_to_cpu(node2->hdr.count) - count; + tmp = nodehdr2.count - count; tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[count]; - btree_d = &node2->btree[0]; + btree_s = &btree2[count]; + btree_d = &btree2[0]; memmove(btree_d, btree_s, tmp); - be16_add_cpu(&node2->hdr.count, -count); + nodehdr2.count -= count; } /* * Log header of node 1 and all current bits of node 2. 
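The rebalance above boils down to two memory operations on a sorted entry array. A host-memory sketch of the node1-to-node2 direction (simplified entry type, host-endian, no transaction logging; 'count' here is the (count1 - count2) / 2 surplus computed above):

#include <string.h>
#include <stdint.h>

struct entry { uint32_t hashval; uint32_t before; };	/* stand-in for xfs_da_node_entry */

/*
 * Move 'count' trailing entries of node1 to the front of node2: open a
 * hole at the front of node2 with memmove, then memcpy the tail of
 * node1 into the hole and fix both counts.
 */
static void move_right(struct entry *btree1, int *count1,
		       struct entry *btree2, int *count2, int count)
{
	if (*count2 > 0)
		memmove(&btree2[count], &btree2[0],
			*count2 * sizeof(struct entry));
	memcpy(&btree2[0], &btree1[*count1 - count],
	       count * sizeof(struct entry));
	*count1 -= count;
	*count2 += count;
}

The opposite direction is a memcpy onto the end of node1 followed by a memmove that closes the hole in node2, exactly as in the else branch above.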
*/ + xfs_da3_node_hdr_to_disk(node1, &nodehdr1); xfs_trans_log_buf(tp, blk1->bp, - XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); + XFS_DA_LOGRANGE(node1, &node1->hdr, + xfs_da3_node_hdr_size(node1))); + + xfs_da3_node_hdr_to_disk(node2, &nodehdr2); xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, - sizeof(node2->hdr) + - sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); + xfs_da3_node_hdr_size(node2) + + (sizeof(btree2[0]) * nodehdr2.count))); /* * Record the last hashval from each block for upward propagation. * (note: don't use the swapped node pointers) */ - node1 = blk1->bp->b_addr; - node2 = blk2->bp->b_addr; - blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); + if (swap) { + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr1, node1); + xfs_da3_node_hdr_from_disk(&nodehdr2, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + } + blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); /* * Adjust the expected index for insertion. */ - if (blk1->index >= be16_to_cpu(node1->hdr.count)) { - blk2->index = blk1->index - be16_to_cpu(node1->hdr.count); - blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */ + if (blk1->index >= nodehdr1.count) { + blk2->index = blk1->index - nodehdr1.count; + blk1->index = nodehdr1.count + 1; /* make it invalid */ } } @@ -645,18 +916,23 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Add a new entry to an intermediate node. */ STATIC void -xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_da3_node_add( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int tmp; trace_xfs_da_node_add(state->args); node = oldblk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + + ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) ASSERT(newblk->blkno >= state->mp->m_dirleafblk && @@ -666,23 +942,25 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * We may need to make some room before we insert the new node. 
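The insert below uses the classic make-room-then-write pattern. A minimal userspace sketch, again with a simplified entry type and the log ranges omitted:

#include <string.h>
#include <stdint.h>

struct entry { uint32_t hashval; uint32_t before; };

/*
 * Insert 'ent' at 'index' in a node currently holding 'count' entries
 * (the array must have room for count + 1): shift the tail up one
 * slot, then write the new entry into the gap.
 */
static void node_insert(struct entry *btree, int count, int index,
			struct entry ent)
{
	if (index < count)
		memmove(&btree[index + 1], &btree[index],
			(count - index) * sizeof(struct entry));
	btree[index] = ent;
}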
*/ tmp = 0; - btree = &node->btree[ oldblk->index ]; - if (oldblk->index < be16_to_cpu(node->hdr.count)) { - tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree); - memmove(btree + 1, btree, tmp); + if (oldblk->index < nodehdr.count) { + tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree); + memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp); } - btree->hashval = cpu_to_be32(newblk->hashval); - btree->before = cpu_to_be32(newblk->blkno); + btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval); + btree[oldblk->index].before = cpu_to_be32(newblk->blkno); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); - be16_add_cpu(&node->hdr.count, 1); + XFS_DA_LOGRANGE(node, &btree[oldblk->index], + tmp + sizeof(*btree))); + + nodehdr.count += 1; + xfs_da3_node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); /* * Copy the last hash value from the oldblk to propagate upwards. */ - oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval); + oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); } /*======================================================================== @@ -694,14 +972,16 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * possibly deallocating that block, etc... */ int -xfs_da_join(xfs_da_state_t *state) +xfs_da3_join( + struct xfs_da_state *state) { - xfs_da_state_blk_t *drop_blk, *save_blk; - int action, error; + struct xfs_da_state_blk *drop_blk; + struct xfs_da_state_blk *save_blk; + int action = 0; + int error; trace_xfs_da_join(state->args); - action = 0; drop_blk = &state->path.blk[ state->path.active-1 ]; save_blk = &state->altpath.blk[ state->path.active-1 ]; ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); @@ -722,12 +1002,12 @@ xfs_da_join(xfs_da_state_t *state) */ switch (drop_blk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_toosmall(state, &action); + error = xfs_attr3_leaf_toosmall(state, &action); if (error) return(error); if (action == 0) return(0); - xfs_attr_leaf_unbalance(state, drop_blk, save_blk); + xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); break; case XFS_DIR2_LEAFN_MAGIC: error = xfs_dir2_leafn_toosmall(state, &action); @@ -742,18 +1022,18 @@ xfs_da_join(xfs_da_state_t *state) * Remove the offending node, fixup hashvals, * check for a toosmall neighbor. */ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_node_toosmall(state, &action); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_node_toosmall(state, &action); if (error) return(error); if (action == 0) return 0; - xfs_da_node_unbalance(state, drop_blk, save_blk); + xfs_da3_node_unbalance(state, drop_blk, save_blk); break; } - xfs_da_fixhashpath(state, &state->altpath); - error = xfs_da_blk_unlink(state, drop_blk, save_blk); + xfs_da3_fixhashpath(state, &state->altpath); + error = xfs_da3_blk_unlink(state, drop_blk, save_blk); xfs_da_state_kill_altpath(state); if (error) return(error); @@ -768,9 +1048,9 @@ xfs_da_join(xfs_da_state_t *state) * we only have one entry in the root, make the child block * the new root. 
*/ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_root_join(state, &state->path.blk[0]); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_root_join(state, &state->path.blk[0]); return(error); } @@ -782,9 +1062,13 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) if (level == 1) { ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - } else - ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + } else { + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + } ASSERT(!blkinfo->forw); ASSERT(!blkinfo->back); } @@ -797,52 +1081,61 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) * the old root to block 0 as the new root node. */ STATIC int -xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) +xfs_da3_root_join( + struct xfs_da_state *state, + struct xfs_da_state_blk *root_blk) { - xfs_da_intnode_t *oldroot; - xfs_da_args_t *args; - xfs_dablk_t child; - struct xfs_buf *bp; - int error; + struct xfs_da_intnode *oldroot; + struct xfs_da_args *args; + xfs_dablk_t child; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr oldroothdr; + struct xfs_da_node_entry *btree; + int error; trace_xfs_da_root_join(state->args); - args = state->args; - ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + + args = state->args; oldroot = root_blk->bp->b_addr; - ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(!oldroot->hdr.info.forw); - ASSERT(!oldroot->hdr.info.back); + xfs_da3_node_hdr_from_disk(&oldroothdr, oldroot); + ASSERT(oldroothdr.forw == 0); + ASSERT(oldroothdr.back == 0); /* * If the root has more than one child, then don't do anything. */ - if (be16_to_cpu(oldroot->hdr.count) > 1) - return(0); + if (oldroothdr.count > 1) + return 0; /* * Read in the (only) child block, then copy those bytes into * the root block's buffer and free the original child block. */ - child = be32_to_cpu(oldroot->btree[0].before); + btree = xfs_da3_node_tree_p(oldroot); + child = be32_to_cpu(btree[0].before); ASSERT(child != 0); - error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, + error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); - xfs_da_blkinfo_onlychild_validate(bp->b_addr, - be16_to_cpu(oldroot->hdr.level)); + return error; + xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* * This could be copying a leaf back into the root block in the case of * there only being a single leaf block left in the tree. Hence we have * to update the b_ops pointer as well to match the buffer type change - * that could occur. + * that could occur. For dir3 blocks we also need to update the block + * number in the buffer header. 
*/ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); root_blk->bp->b_ops = bp->b_ops; + xfs_trans_buf_copy_type(root_blk->bp, bp); + if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; + da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + } xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); @@ -858,14 +1151,21 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) * If nothing can be done, return 0. */ STATIC int -xfs_da_node_toosmall(xfs_da_state_t *state, int *action) +xfs_da3_node_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_da_intnode_t *node; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, forward, error, retval, i; - xfs_dablk_t blkno; - struct xfs_buf *bp; + struct xfs_da_intnode *node; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + xfs_dablk_t blkno; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr nodehdr; + int count; + int forward; + int error; + int retval; + int i; trace_xfs_da_node_toosmall(state->args); @@ -876,10 +1176,9 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) */ blk = &state->path.blk[ state->path.active-1 ]; info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); node = (xfs_da_intnode_t *)info; - count = be16_to_cpu(node->hdr.count); - if (count > (state->node_ents >> 1)) { + xfs_da3_node_hdr_from_disk(&nodehdr, node); + if (nodehdr.count > (state->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); /* blk over 50%, don't try to join */ } @@ -890,14 +1189,14 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ - if (count == 0) { + if (nodehdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ forward = (info->forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return(error); @@ -916,35 +1215,34 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. 
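The merge test computed just below reduces to simple arithmetic. A sketch with hypothetical parameter names (node_ents is the per-block entry capacity):

/*
 * A sibling may absorb this node only if the combined entry count still
 * leaves a quarter of the block's slots free, i.e. fits within 75% of
 * capacity; this is the "25% to spare" rule in the sibling scan below.
 */
static int fits_with_spare(int node_ents, int this_count, int sib_count)
{
	int budget = node_ents - (node_ents >> 2);	/* 75% of capacity */

	return this_count + sib_count <= budget;
}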
*/ + count = state->node_ents; + count -= state->node_ents >> 2; + count -= nodehdr.count; + /* start with smaller blk num */ - forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + forward = nodehdr.forw < nodehdr.back; for (i = 0; i < 2; forward = !forward, i++) { if (forward) - blkno = be32_to_cpu(info->forw); + blkno = nodehdr.forw; else - blkno = be32_to_cpu(info->back); + blkno = nodehdr.back; if (blkno == 0) continue; - error = xfs_da_node_read(state->args->trans, state->args->dp, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blkno, -1, &bp, state->args->whichfork); if (error) return(error); - ASSERT(bp != NULL); - node = (xfs_da_intnode_t *)info; - count = state->node_ents; - count -= state->node_ents >> 2; - count -= be16_to_cpu(node->hdr.count); node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - count -= be16_to_cpu(node->hdr.count); + xfs_da3_node_hdr_from_disk(&nodehdr, node); xfs_trans_brelse(state->args->trans, bp); - if (count >= 0) + + if (count - nodehdr.count >= 0) break; /* fits with at least 25% to spare */ } if (i >= 2) { *action = 0; - return(0); + return 0; } /* @@ -953,28 +1251,42 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) { - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } + } + if (error) + return error; + if (retval) { + *action = 0; + return 0; } *action = 1; - return(0); + return 0; +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da3_node_lasthash( + struct xfs_buf *bp, + int *count) +{ + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + + node = bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr, node); + if (count) + *count = nodehdr.count; + if (!nodehdr.count) + return 0; + btree = xfs_da3_node_tree_p(node); + return be32_to_cpu(btree[nodehdr.count - 1].hashval); } /* @@ -982,13 +1294,16 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * when we stop making changes, return. 
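A host-memory sketch of the propagation loop implemented in the hunk below (pblk is a simplified stand-in for xfs_da_state_blk; buffer logging is omitted). One subtlety worth noting: after this patch btree points at the array base rather than the indexed entry, so the unchanged btree->hashval early-exit in the hunk examines entry 0; the sketch keeps the pre-patch semantics and compares the indexed entry:

#include <stdint.h>

struct entry { uint32_t hashval; uint32_t before; };
struct pblk { struct entry *btree; int count; int index; };

/*
 * Rewrite the parent's entry for this child with the child's new last
 * hashval, then repeat toward the root, stopping at the first level
 * that is already consistent.
 */
static void fixhash(struct pblk *path, int level, uint32_t lasthash)
{
	for (; level >= 0; level--) {
		struct pblk *blk = &path[level];

		if (blk->btree[blk->index].hashval == lasthash)
			break;		/* ancestors already correct */
		blk->btree[blk->index].hashval = lasthash;
		lasthash = blk->btree[blk->count - 1].hashval;
	}
}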
*/ void -xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) +xfs_da3_fixhashpath( + struct xfs_da_state *state, + struct xfs_da_state_path *path) { - xfs_da_state_blk_t *blk; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - xfs_dahash_t lasthash=0; - int level, count; + struct xfs_da_state_blk *blk; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + xfs_dahash_t lasthash=0; + int level; + int count; trace_xfs_da_fixhashpath(state->args); @@ -1006,23 +1321,26 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) return; break; case XFS_DA_NODE_MAGIC: - lasthash = xfs_da_node_lasthash(blk->bp, &count); + lasthash = xfs_da3_node_lasthash(blk->bp, &count); if (count == 0) return; break; } for (blk--, level--; level >= 0; blk--, level--) { + struct xfs_da3_icnode_hdr nodehdr; + node = blk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - btree = &node->btree[ blk->index ]; + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); if (be32_to_cpu(btree->hashval) == lasthash) break; blk->hashval = lasthash; - btree->hashval = cpu_to_be32(lasthash); + btree[blk->index].hashval = cpu_to_be32(lasthash); xfs_trans_log_buf(state->args->trans, blk->bp, - XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + XFS_DA_LOGRANGE(node, &btree[blk->index], + sizeof(*btree))); - lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval); } } @@ -1030,104 +1348,120 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) * Remove an entry from an intermediate node. */ STATIC void -xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) +xfs_da3_node_remove( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int index; + int tmp; trace_xfs_da_node_remove(state->args); node = drop_blk->bp->b_addr; - ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + ASSERT(drop_blk->index < nodehdr.count); ASSERT(drop_blk->index >= 0); /* * Copy over the offending entry, or just zero it out. 
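The removal in the hunk below is the inverse of the insert sketched earlier: slide the tail down over the victim, then zero the vacated last slot so stale entries never reach disk. As a sketch (simplified types, no logging):

#include <string.h>
#include <stdint.h>

struct entry { uint32_t hashval; uint32_t before; };

/* Remove the entry at 'index'; returns the new count. */
static int node_remove(struct entry *btree, int count, int index)
{
	if (index < count - 1)
		memmove(&btree[index], &btree[index + 1],
			(count - index - 1) * sizeof(struct entry));
	memset(&btree[count - 1], 0, sizeof(struct entry));
	return count - 1;
}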
*/ - btree = &node->btree[drop_blk->index]; - if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) { - tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; + index = drop_blk->index; + btree = xfs_da3_node_tree_p(node); + if (index < nodehdr.count - 1) { + tmp = nodehdr.count - index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, btree + 1, tmp); + memmove(&btree[index], &btree[index + 1], tmp); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, tmp)); - btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; + XFS_DA_LOGRANGE(node, &btree[index], tmp)); + index = nodehdr.count - 1; } - memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); + memset(&btree[index], 0, sizeof(xfs_da_node_entry_t)); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); - be16_add_cpu(&node->hdr.count, -1); + XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); + nodehdr.count -= 1; + xfs_da3_node_hdr_to_disk(node, &nodehdr); xfs_trans_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node))); /* * Copy the last hash value from the block to propagate upwards. */ - btree--; - drop_blk->hashval = be32_to_cpu(btree->hashval); + drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval); } /* - * Unbalance the btree elements between two intermediate nodes, + * Unbalance the elements between two intermediate nodes, * move all Btree elements from one node into another. */ STATIC void -xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_node_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_intnode_t *drop_node, *save_node; - xfs_da_node_entry_t *btree; - int tmp; - xfs_trans_t *tp; + struct xfs_da_intnode *drop_node; + struct xfs_da_intnode *save_node; + struct xfs_da_node_entry *drop_btree; + struct xfs_da_node_entry *save_btree; + struct xfs_da3_icnode_hdr drop_hdr; + struct xfs_da3_icnode_hdr save_hdr; + struct xfs_trans *tp; + int sindex; + int tmp; trace_xfs_da_node_unbalance(state->args); drop_node = drop_blk->bp->b_addr; save_node = save_blk->bp->b_addr; - ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node); + xfs_da3_node_hdr_from_disk(&save_hdr, save_node); + drop_btree = xfs_da3_node_tree_p(drop_node); + save_btree = xfs_da3_node_tree_p(save_node); tp = state->args->trans; /* * If the dying block has lower hashvals, then move all the * elements in the remaining block up to make a hole. */ - if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) || - (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) < - be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval))) - { - btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)]; - tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, &save_node->btree[0], tmp); - btree = &save_node->btree[0]; + if ((be32_to_cpu(drop_btree[0].hashval) < + be32_to_cpu(save_btree[0].hashval)) || + (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) < + be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) { + /* XXX: check this - is memmove dst correct? 
*/ + tmp = save_hdr.count * sizeof(xfs_da_node_entry_t); + memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp); + + sindex = 0; xfs_trans_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * - sizeof(xfs_da_node_entry_t))); + XFS_DA_LOGRANGE(save_node, &save_btree[0], + (save_hdr.count + drop_hdr.count) * + sizeof(xfs_da_node_entry_t))); } else { - btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; + sindex = save_hdr.count; xfs_trans_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - be16_to_cpu(drop_node->hdr.count) * - sizeof(xfs_da_node_entry_t))); + XFS_DA_LOGRANGE(save_node, &save_btree[sindex], + drop_hdr.count * sizeof(xfs_da_node_entry_t))); } /* * Move all the B-tree elements from drop_blk to save_blk. */ - tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memcpy(btree, &drop_node->btree[0], tmp); - be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); + tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t); + memcpy(&save_btree[sindex], &drop_btree[0], tmp); + save_hdr.count += drop_hdr.count; + xfs_da3_node_hdr_to_disk(save_node, &save_hdr); xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, - sizeof(save_node->hdr))); + xfs_da3_node_hdr_size(save_node))); /* * Save the last hashval in the remaining block for upward propagation. */ - save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval); + save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval); } /*======================================================================== @@ -1146,16 +1480,24 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * pruned depth-first tree search. */ int /* error */ -xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) +xfs_da3_node_lookup_int( + struct xfs_da_state *state, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *curr; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - xfs_dablk_t blkno; - int probe, span, max, error, retval; - xfs_dahash_t hashval, btreehashval; - xfs_da_args_t *args; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *curr; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + xfs_dablk_t blkno; + xfs_dahash_t hashval; + xfs_dahash_t btreehashval; + int probe; + int span; + int max; + int error; + int retval; args = state->args; @@ -1171,7 +1513,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. 
*/ blk->blkno = blkno; - error = xfs_da_node_read(args->trans, args->dp, blkno, + error = xfs_da3_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; @@ -1180,66 +1522,75 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) } curr = blk->bp->b_addr; blk->magic = be16_to_cpu(curr->magic); - ASSERT(blk->magic == XFS_DA_NODE_MAGIC || - blk->magic == XFS_DIR2_LEAFN_MAGIC || - blk->magic == XFS_ATTR_LEAF_MAGIC); + + if (blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == XFS_ATTR3_LEAF_MAGIC) { + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } + + if (blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_DIR3_LEAFN_MAGIC) { + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); + break; + } + + blk->magic = XFS_DA_NODE_MAGIC; + /* * Search an intermediate node for a match. */ - if (blk->magic == XFS_DA_NODE_MAGIC) { - node = blk->bp->b_addr; - max = be16_to_cpu(node->hdr.count); - blk->hashval = be32_to_cpu(node->btree[max-1].hashval); + node = blk->bp->b_addr; + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); - /* - * Binary search. (note: small blocks will skip loop) - */ - probe = span = max / 2; - hashval = args->hashval; - for (btree = &node->btree[probe]; span > 4; - btree = &node->btree[probe]) { - span /= 2; - btreehashval = be32_to_cpu(btree->hashval); - if (btreehashval < hashval) - probe += span; - else if (btreehashval > hashval) - probe -= span; - else - break; - } - ASSERT((probe >= 0) && (probe < max)); - ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval)); + max = nodehdr.count; + blk->hashval = be32_to_cpu(btree[max - 1].hashval); - /* - * Since we may have duplicate hashval's, find the first - * matching hashval in the node. - */ - while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) { - btree--; - probe--; - } - while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) { - btree++; - probe++; - } + /* + * Binary search. (note: small blocks will skip loop) + */ + probe = span = max / 2; + hashval = args->hashval; + while (span > 4) { + span /= 2; + btreehashval = be32_to_cpu(btree[probe].hashval); + if (btreehashval < hashval) + probe += span; + else if (btreehashval > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || + (be32_to_cpu(btree[probe].hashval) == hashval)); - /* - * Pick the right block to descend on. - */ - if (probe == max) { - blk->index = max-1; - blkno = be32_to_cpu(node->btree[max-1].before); - } else { - blk->index = probe; - blkno = be32_to_cpu(btree->before); - } - } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); - break; - } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); - break; + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while (probe > 0 && + be32_to_cpu(btree[probe].hashval) >= hashval) { + probe--; + } + while (probe < max && + be32_to_cpu(btree[probe].hashval) < hashval) { + probe++; + } + + /* + * Pick the right block to descend on. 
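The lookup strategy in this hunk, in miniature: a coarse binary search that stops once the span shrinks, then a linear walk so that duplicate hashvals resolve to the first matching entry. A host-endian sketch (max is the entry count; the return value is the child slot to descend into, matching the probe == max case just below):

#include <stdint.h>

struct entry { uint32_t hashval; uint32_t before; };

static int node_probe(const struct entry *btree, int max, uint32_t hashval)
{
	int probe = max / 2;
	int span = max / 2;

	/* Coarse binary search; small blocks skip the loop entirely. */
	while (span > 4) {
		span /= 2;
		if (btree[probe].hashval < hashval)
			probe += span;
		else if (btree[probe].hashval > hashval)
			probe -= span;
		else
			break;
	}
	/* Back up to the first duplicate, then skip entries below us. */
	while (probe > 0 && btree[probe].hashval >= hashval)
		probe--;
	while (probe < max && btree[probe].hashval < hashval)
		probe++;
	return probe == max ? max - 1 : probe;
}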
+ */ + if (probe == max) { + blk->index = max - 1; + blkno = be32_to_cpu(btree[max - 1].before); + } else { + blk->index = probe; + blkno = be32_to_cpu(btree[probe].before); } } @@ -1254,7 +1605,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) retval = xfs_dir2_leafn_lookup_int(blk->bp, args, &blk->index, state); } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { - retval = xfs_attr_leaf_lookup_int(blk->bp, args); + retval = xfs_attr3_leaf_lookup_int(blk->bp, args); blk->index = args->index; args->blkno = blk->blkno; } else { @@ -1263,7 +1614,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) } if (((retval == ENOENT) || (retval == ENOATTR)) && (blk->hashval == args->hashval)) { - error = xfs_da_path_shift(state, &state->path, 1, 1, + error = xfs_da3_path_shift(state, &state->path, 1, 1, &retval); if (error) return(error); @@ -1285,16 +1636,52 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) *========================================================================*/ /* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da3_node_order( + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) +{ + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da3_icnode_hdr node1hdr; + struct xfs_da3_icnode_hdr node2hdr; + + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; + xfs_da3_node_hdr_from_disk(&node1hdr, node1); + xfs_da3_node_hdr_from_disk(&node2hdr, node2); + btree1 = xfs_da3_node_tree_p(node1); + btree2 = xfs_da3_node_tree_p(node2); + + if (node1hdr.count > 0 && node2hdr.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[node2hdr.count - 1].hashval) < + be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) { + return 1; + } + return 0; +} + +/* * Link a new block into a doubly linked list of blocks (of whatever type). */ int /* error */ -xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, - xfs_da_state_blk_t *new_blk) +xfs_da3_blk_link( + struct xfs_da_state *state, + struct xfs_da_state_blk *old_blk, + struct xfs_da_state_blk *new_blk) { - xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; - xfs_da_args_t *args; - int before=0, error; - struct xfs_buf *bp; + struct xfs_da_blkinfo *old_info; + struct xfs_da_blkinfo *new_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int before = 0; + int error; /* * Set up environment. 
@@ -1306,9 +1693,6 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || old_blk->magic == XFS_DIR2_LEAFN_MAGIC || old_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(old_blk->magic == be16_to_cpu(old_info->magic)); - ASSERT(new_blk->magic == be16_to_cpu(new_info->magic)); - ASSERT(old_blk->magic == new_blk->magic); switch (old_blk->magic) { case XFS_ATTR_LEAF_MAGIC: @@ -1318,7 +1702,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); break; case XFS_DA_NODE_MAGIC: - before = xfs_da_node_order(old_blk->bp, new_blk->bp); + before = xfs_da3_node_order(old_blk->bp, new_blk->bp); break; } @@ -1333,14 +1717,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(old_info->back), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); tmp_info = bp->b_addr; - ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); + ASSERT(tmp_info->magic == old_info->magic); ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); tmp_info->forw = cpu_to_be32(new_blk->blkno); xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); @@ -1354,7 +1738,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(old_info->forw), -1, &bp, args->whichfork); if (error) @@ -1375,59 +1759,20 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, } /* - * Compare two intermediate nodes for "order". - */ -STATIC int -xfs_da_node_order( - struct xfs_buf *node1_bp, - struct xfs_buf *node2_bp) -{ - xfs_da_intnode_t *node1, *node2; - - node1 = node1_bp->b_addr; - node2 = node2_bp->b_addr; - ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) && - node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < - be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { - return(1); - } - return(0); -} - -/* - * Pick up the last hashvalue from an intermediate node. - */ -STATIC uint -xfs_da_node_lasthash( - struct xfs_buf *bp, - int *count) -{ - xfs_da_intnode_t *node; - - node = bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if (count) - *count = be16_to_cpu(node->hdr.count); - if (!node->hdr.count) - return(0); - return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); -} - -/* * Unlink a block from a doubly linked list of blocks. 
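The sibling splice performed in the link-before path above, with plain pointers standing in for the on-disk forw/back block numbers (the kernel must read, update and log each sibling buffer instead of chasing pointers):

struct blkinfo { struct blkinfo *forw, *back; };	/* simplified */

/* Splice 'newb' into the list immediately before 'oldb'. */
static void link_before(struct blkinfo *oldb, struct blkinfo *newb)
{
	newb->forw = oldb;
	newb->back = oldb->back;
	if (oldb->back)
		oldb->back->forw = newb;
	oldb->back = newb;
}

The link-after case is the mirror image, swapping forw and back throughout.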
*/ STATIC int /* error */ -xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_blk_unlink( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; - xfs_da_args_t *args; - struct xfs_buf *bp; - int error; + struct xfs_da_blkinfo *drop_info; + struct xfs_da_blkinfo *save_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int error; /* * Set up environment. @@ -1439,8 +1784,6 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || save_blk->magic == XFS_DIR2_LEAFN_MAGIC || save_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == be16_to_cpu(save_info->magic)); - ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic)); ASSERT(save_blk->magic == drop_blk->magic); ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) || (be32_to_cpu(save_info->back) == drop_blk->blkno)); @@ -1454,7 +1797,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), -1, &bp, args->whichfork); if (error) @@ -1471,7 +1814,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_node_read(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), -1, &bp, args->whichfork); if (error) @@ -1499,15 +1842,22 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * the new bottom and the root. 
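And the companion unlink from the hunk above, in the same pointer idiom. The kernel variant is phrased in terms of the surviving block (save_blk) inheriting the dropped block's outer sibling pointer, but the effect is the standard doubly-linked-list bypass:

struct blkinfo { struct blkinfo *forw, *back; };	/* simplified */

/* Take 'dropb' out of the chain, leaving its neighbours joined. */
static void unlink_blk(struct blkinfo *dropb)
{
	if (dropb->back)
		dropb->back->forw = dropb->forw;
	if (dropb->forw)
		dropb->forw->back = dropb->back;
}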
*/ int /* error */ -xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, - int forward, int release, int *result) +xfs_da3_path_shift( + struct xfs_da_state *state, + struct xfs_da_state_path *path, + int forward, + int release, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_da_args_t *args; - xfs_dablk_t blkno=0; - int level, error; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + struct xfs_da_intnode *node; + struct xfs_da_args *args; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno = 0; + int level; + int error; trace_xfs_da_path_shift(state->args); @@ -1522,16 +1872,17 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { - ASSERT(blk->bp != NULL); node = blk->bp->b_addr; - ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + + if (forward && (blk->index < nodehdr.count - 1)) { blk->index++; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } else if (!forward && (blk->index > 0)) { blk->index--; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } } @@ -1557,45 +1908,60 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * Read the next child block. */ blk->blkno = blkno; - error = xfs_da_node_read(args->trans, args->dp, blkno, -1, + error = xfs_da3_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) return(error); - ASSERT(blk->bp != NULL); info = blk->bp->b_addr; ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - blk->magic = be16_to_cpu(info->magic); - if (blk->magic == XFS_DA_NODE_MAGIC) { + info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + + /* + * Note: we flatten the magic number to a single type so we + * don't have to compare against crc/non-crc types elsewhere. 
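The ascent loop above, reduced to array bookkeeping: find the lowest level of the recorded path that can step one entry sideways; everything below that level is re-read on the way back down. A sketch with a made-up lvl type (returns the pivot level, or -1 at the edge of the tree, the ENOENT case):

struct lvl { int index; int count; };

static int path_shift_up(struct lvl *path, int active, int forward)
{
	int level;

	for (level = active - 2; level >= 0; level--) {	/* skip bottom */
		if (forward && path[level].index < path[level].count - 1) {
			path[level].index++;
			return level;
		}
		if (!forward && path[level].index > 0) {
			path[level].index--;
			return level;
		}
	}
	return -1;
}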
+ */ + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + blk->magic = XFS_DA_NODE_MAGIC; node = (xfs_da_intnode_t *)info; - blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + xfs_da3_node_hdr_from_disk(&nodehdr, node); + btree = xfs_da3_node_tree_p(node); + blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); if (forward) blk->index = 0; else - blk->index = be16_to_cpu(node->hdr.count)-1; - blkno = be32_to_cpu(node->btree[blk->index].before); - } else { + blk->index = nodehdr.count - 1; + blkno = be32_to_cpu(btree[blk->index].before); + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; - switch(blk->magic) { - case XFS_ATTR_LEAF_MAGIC: - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, - NULL); - break; - case XFS_DIR2_LEAFN_MAGIC: - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, - NULL); - break; - default: - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC || - blk->magic == XFS_DIR2_LEAFN_MAGIC); - break; - } + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, + NULL); + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + blk->magic = XFS_DIR2_LEAFN_MAGIC; + ASSERT(level == path->active-1); + blk->index = 0; + blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, + NULL); + break; + default: + ASSERT(0); + break; } } *result = 0; - return(0); + return 0; } @@ -1782,22 +2148,36 @@ xfs_da_grow_inode( * a bmap btree split to do that. */ STATIC int -xfs_da_swap_lastblock( - xfs_da_args_t *args, - xfs_dablk_t *dead_blknop, - struct xfs_buf **dead_bufp) +xfs_da3_swap_lastblock( + struct xfs_da_args *args, + xfs_dablk_t *dead_blknop, + struct xfs_buf **dead_bufp) { - xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; - struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf; - xfs_fileoff_t lastoff; - xfs_inode_t *ip; - xfs_trans_t *tp; - xfs_mount_t *mp; - int error, w, entno, level, dead_level; - xfs_da_blkinfo_t *dead_info, *sib_info; - xfs_da_intnode_t *par_node, *dead_node; - xfs_dir2_leaf_t *dead_leaf2; - xfs_dahash_t dead_hash; + struct xfs_da_blkinfo *dead_info; + struct xfs_da_blkinfo *sib_info; + struct xfs_da_intnode *par_node; + struct xfs_da_intnode *dead_node; + struct xfs_dir2_leaf *dead_leaf2; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr par_hdr; + struct xfs_inode *ip; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_buf *dead_buf; + struct xfs_buf *last_buf; + struct xfs_buf *sib_buf; + struct xfs_buf *par_buf; + xfs_dahash_t dead_hash; + xfs_fileoff_t lastoff; + xfs_dablk_t dead_blkno; + xfs_dablk_t last_blkno; + xfs_dablk_t sib_blkno; + xfs_dablk_t par_blkno; + int error; + int w; + int entno; + int level; + int dead_level; trace_xfs_da_swap_lastblock(args); @@ -1821,7 +2201,7 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); + error = xfs_da3_node_read(tp, ip, last_blkno, -1, &last_buf, w); if (error) return error; /* @@ -1833,22 +2213,31 @@ xfs_da_swap_lastblock( /* * Get values from the moved block. 
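The switch above collapses each v2/v3 magic pair to a single in-core value so later code has one constant per block type to compare against. As a standalone table (magic values per the kernel headers; -1 flags corruption):

static int flatten_magic(unsigned int magic)
{
	switch (magic) {
	case 0xfebe:		/* XFS_DA_NODE_MAGIC */
	case 0x3ebe:		/* XFS_DA3_NODE_MAGIC */
		return 0xfebe;
	case 0xfbee:		/* XFS_ATTR_LEAF_MAGIC */
	case 0x3bee:		/* XFS_ATTR3_LEAF_MAGIC */
		return 0xfbee;
	case 0xd2ff:		/* XFS_DIR2_LEAFN_MAGIC */
	case 0x3dff:		/* XFS_DIR3_LEAFN_MAGIC */
		return 0xd2ff;
	default:
		return -1;
	}
}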
*/ - if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; + xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2); + ents = xfs_dir3_leaf_ents_p(dead_leaf2); dead_level = 0; - dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval); + dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval); } else { - ASSERT(dead_info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + struct xfs_da3_icnode_hdr deadhdr; + dead_node = (xfs_da_intnode_t *)dead_info; - dead_level = be16_to_cpu(dead_node->hdr.level); - dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval); + xfs_da3_node_hdr_from_disk(&deadhdr, dead_node); + btree = xfs_da3_node_tree_p(dead_node); + dead_level = deadhdr.level; + dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval); } sib_buf = par_buf = NULL; /* * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1870,7 +2259,7 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); + error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1894,31 +2283,31 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. 
*/ for (;;) { - error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - if (unlikely(par_node->hdr.info.magic != - cpu_to_be16(XFS_DA_NODE_MAGIC) || - (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { + xfs_da3_node_hdr_from_disk(&par_hdr, par_node); + if (level >= 0 && level != par_hdr.level + 1) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - level = be16_to_cpu(par_node->hdr.level); + level = par_hdr.level; + btree = xfs_da3_node_tree_p(par_node); for (entno = 0; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].hashval) < dead_hash; + entno < par_hdr.count && + be32_to_cpu(btree[entno].hashval) < dead_hash; entno++) continue; - if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) { + if (entno == par_hdr.count) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - par_blkno = be32_to_cpu(par_node->btree[entno].before); + par_blkno = be32_to_cpu(btree[entno].before); if (level == dead_level + 1) break; xfs_trans_brelse(tp, par_buf); @@ -1930,13 +2319,13 @@ xfs_da_swap_lastblock( */ for (;;) { for (; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].before) != last_blkno; + entno < par_hdr.count && + be32_to_cpu(btree[entno].before) != last_blkno; entno++) continue; - if (entno < be16_to_cpu(par_node->hdr.count)) + if (entno < par_hdr.count) break; - par_blkno = be32_to_cpu(par_node->hdr.info.forw); + par_blkno = par_hdr.forw; xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (unlikely(par_blkno == 0)) { @@ -1945,27 +2334,27 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); + error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; - if (unlikely( - be16_to_cpu(par_node->hdr.level) != level || - par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) { + xfs_da3_node_hdr_from_disk(&par_hdr, par_node); + if (par_hdr.level != level) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } + btree = xfs_da3_node_tree_p(par_node); entno = 0; } /* * Update the parent entry pointing to the moved block. */ - par_node->btree[entno].before = cpu_to_be32(dead_blkno); + btree[entno].before = cpu_to_be32(dead_blkno); xfs_trans_log_buf(tp, par_buf, - XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, - sizeof(par_node->btree[entno].before))); + XFS_DA_LOGRANGE(par_node, &btree[entno].before, + sizeof(btree[entno].before))); *dead_blknop = last_blkno; *dead_bufp = last_buf; return 0; @@ -2007,14 +2396,15 @@ xfs_da_shrink_inode( * Remove extents. If we get ENOSPC for a dir we have to move * the last block to the place we want to kill. 
*/ - if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, - xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, - &done)) == ENOSPC) { + error = xfs_bunmapi(tp, dp, dead_blkno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + 0, args->firstblock, args->flist, &done); + if (error == ENOSPC) { if (w != XFS_DATA_FORK) break; - if ((error = xfs_da_swap_lastblock(args, &dead_blkno, - &dead_buf))) + error = xfs_da3_swap_lastblock(args, &dead_blkno, + &dead_buf); + if (error) break; } else { break; @@ -2279,12 +2669,21 @@ xfs_da_read_buf( magic1 = be32_to_cpu(hdr->magic); if (unlikely( XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && + (magic != XFS_DA3_NODE_MAGIC) && (magic != XFS_ATTR_LEAF_MAGIC) && + (magic != XFS_ATTR3_LEAF_MAGIC) && (magic != XFS_DIR2_LEAF1_MAGIC) && + (magic != XFS_DIR3_LEAF1_MAGIC) && (magic != XFS_DIR2_LEAFN_MAGIC) && + (magic != XFS_DIR3_LEAFN_MAGIC) && (magic1 != XFS_DIR2_BLOCK_MAGIC) && + (magic1 != XFS_DIR3_BLOCK_MAGIC) && (magic1 != XFS_DIR2_DATA_MAGIC) && - (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)), + (magic1 != XFS_DIR3_DATA_MAGIC) && + (free->hdr.magic != + cpu_to_be32(XFS_DIR2_FREE_MAGIC)) && + (free->hdr.magic != + cpu_to_be32(XFS_DIR3_FREE_MAGIC)), mp, XFS_ERRTAG_DA_READ_BUF, XFS_RANDOM_DA_READ_BUF))) { trace_xfs_da_btree_corrupt(bp, _RET_IP_); @@ -2342,41 +2741,3 @@ out_free: return -1; return mappedbno; } - -kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ - -/* - * Allocate a dir-state structure. - * We don't put them on the stack since they're large. - */ -xfs_da_state_t * -xfs_da_state_alloc(void) -{ - return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); -} - -/* - * Kill the altpath contents of a da-state structure. - */ -STATIC void -xfs_da_state_kill_altpath(xfs_da_state_t *state) -{ - int i; - - for (i = 0; i < state->altpath.active; i++) - state->altpath.blk[i].bp = NULL; - state->altpath.active = 0; -} - -/* - * Free a da-state structure. - */ -void -xfs_da_state_free(xfs_da_state_t *state) -{ - xfs_da_state_kill_altpath(state); -#ifdef DEBUG - memset((char *)state, 0, sizeof(*state)); -#endif /* DEBUG */ - kmem_zone_free(xfs_da_state_zone, state); -} diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index ee5170c46ae1..6fb3371c63cf 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -20,7 +21,6 @@ struct xfs_bmap_free; struct xfs_inode; -struct xfs_mount; struct xfs_trans; struct zone; @@ -47,6 +47,33 @@ typedef struct xfs_da_blkinfo { } xfs_da_blkinfo_t; /* + * CRC enabled directory structure types + * + * The headers change size for the additional verification information, but + * otherwise the tree layouts and contents are unchanged. Hence the da btree + * code can use the struct xfs_da_blkinfo for manipulating the tree links and + * magic numbers without modification for both v2 and v3 nodes. 
+ */ +#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ +#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ + +struct xfs_da3_blkinfo { + /* + * the node link manipulation code relies on the fact that the first + * element of this structure is the struct xfs_da_blkinfo so it can + * ignore the differences in the rest of the structures. + */ + struct xfs_da_blkinfo hdr; + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +/* * This is the structure of the root and intermediate nodes in the Btree. * The leaf nodes are defined above. * @@ -57,19 +84,76 @@ typedef struct xfs_da_blkinfo { */ #define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ +typedef struct xfs_da_node_hdr { + struct xfs_da_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ +} xfs_da_node_hdr_t; + +struct xfs_da3_node_hdr { + struct xfs_da3_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ + __be32 __pad32; +}; + +#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc)) + +typedef struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ +} xfs_da_node_entry_t; + typedef struct xfs_da_intnode { - struct xfs_da_node_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __be16 count; /* count of active entries */ - __be16 level; /* level above leaves (leaf == 0) */ - } hdr; - struct xfs_da_node_entry { - __be32 hashval; /* hash value for this descendant */ - __be32 before; /* Btree block before this key */ - } btree[1]; /* variable sized array of keys */ + struct xfs_da_node_hdr hdr; + struct xfs_da_node_entry __btree[]; } xfs_da_intnode_t; -typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; -typedef struct xfs_da_node_entry xfs_da_node_entry_t; + +struct xfs_da3_intnode { + struct xfs_da3_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +}; + +/* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. 
+ */ +struct xfs_da3_icnode_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t level; +}; + +extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); +extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); + +static inline int +xfs_da3_node_hdr_size(struct xfs_da_intnode *dap) +{ + if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) + return sizeof(struct xfs_da3_node_hdr); + return sizeof(struct xfs_da_node_hdr); +} + +static inline struct xfs_da_node_entry * +xfs_da3_node_tree_p(struct xfs_da_intnode *dap) +{ + if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap; + return dap3->__btree; + } + return dap->__btree; +} + +extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); +extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); #define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize @@ -191,32 +275,34 @@ struct xfs_nameops { /* * Routines used for growing the Btree. */ -int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - struct xfs_buf **bpp, int whichfork); -int xfs_da_split(xfs_da_state_t *state); +int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno, + int level, struct xfs_buf **bpp, int whichfork); +int xfs_da3_split(xfs_da_state_t *state); /* * Routines used for shrinking the Btree. */ -int xfs_da_join(xfs_da_state_t *state); -void xfs_da_fixhashpath(xfs_da_state_t *state, - xfs_da_state_path_t *path_to_to_fix); +int xfs_da3_join(xfs_da_state_t *state); +void xfs_da3_fixhashpath(struct xfs_da_state *state, + struct xfs_da_state_path *path_to_to_fix); /* * Routines used for finding things in the Btree. */ -int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result); -int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, +int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result); +int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, int forward, int release, int *result); /* * Utility routines. */ -int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, +int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); -int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, +int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int which_fork); +extern const struct xfs_buf_ops xfs_da3_node_buf_ops; + /* * Utility routines. 
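A userspace illustration of the layout rule these headers rely on: the v3 structures embed the v2 header as their first member, so link and magic manipulation can treat every block as v2, while entry-array accessors switch on the magic exactly as xfs_da3_node_tree_p() does. Types are simplified, host-endian stand-ins:

#include <stdint.h>

struct blkinfo  { uint32_t forw, back; uint16_t magic, pad; };
struct blkinfo3 {
	struct blkinfo hdr;		/* must remain the first member */
	uint32_t crc;
	uint64_t blkno, lsn;
	unsigned char uuid[16];
	uint64_t owner;
};

struct entry { uint32_t hashval, before; };
struct node2 { struct blkinfo  info; uint16_t count, level;
	       struct entry btree[]; };
struct node3 { struct blkinfo3 info; uint16_t count, level; uint32_t pad32;
	       struct entry btree[]; };

/* One accessor serves both formats, keyed on the embedded v2 magic. */
static struct entry *node_tree_p(void *block)
{
	struct blkinfo *info = block;	/* valid for v2 and v3 alike */

	if (info->magic == 0x3ebe)	/* XFS_DA3_NODE_MAGIC */
		return ((struct node3 *)block)->btree;
	return ((struct node2 *)block)->btree;
}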
*/ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index 1d9643b3dce6..f7a0e95d197a 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -19,7 +19,7 @@ #define __XFS_DINODE_H__ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2)) +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) typedef struct xfs_timestamp { __be32 t_sec; /* timestamp seconds */ @@ -70,11 +70,36 @@ typedef struct xfs_dinode { /* di_next_unlinked is the only non-core field in the old dinode */ __be32 di_next_unlinked;/* agi unlinked list ptr */ -} __attribute__((packed)) xfs_dinode_t; + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_dinode_t; #define DI_MAX_FLUSH 0xffff /* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} + +/* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. * Since the pathconf interface is signed, we use 2^31 - 1 instead. * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. @@ -104,11 +129,11 @@ typedef enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp) \ - ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode))) +#define XFS_LITINO(mp, version) \ + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) -#define XFS_BROOT_SIZE_ADJ \ - (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) +#define XFS_BROOT_SIZE_ADJ(ip) \ + (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t)) /* * Inode data & attribute fork sizes, per inode. @@ -119,10 +144,10 @@ typedef enum xfs_dinode_fmt { #define XFS_DFORK_DSIZE(dip,mp) \ (XFS_DFORK_Q(dip) ? \ XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp)) + XFS_LITINO(mp, (dip)->di_version)) #define XFS_DFORK_ASIZE(dip,mp) \ (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ @@ -133,7 +158,7 @@ typedef enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)(dip) + sizeof(struct xfs_dinode)) + ((char *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 12afe07a91d7..e59f5fc816fe 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. 
* * This program is free software; you can redistribute it and/or @@ -28,11 +29,13 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" /* * Local function prototypes. @@ -56,52 +59,110 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } -static void -xfs_dir2_block_verify( +static bool +xfs_dir3_block_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_data_hdr *hdr = bp->b_addr; - int block_ok = 0; - - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); - block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; - - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + return false; } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; } static void -xfs_dir2_block_read_verify( +xfs_dir3_block_read_verify( struct xfs_buf *bp) { - xfs_dir2_block_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_DATA_CRC_OFF)) || + !xfs_dir3_block_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void -xfs_dir2_block_write_verify( +xfs_dir3_block_write_verify( struct xfs_buf *bp) { - xfs_dir2_block_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_block_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); } -const struct xfs_buf_ops xfs_dir2_block_buf_ops = { - .verify_read = xfs_dir2_block_read_verify, - .verify_write = xfs_dir2_block_write_verify, +const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .verify_read = xfs_dir3_block_read_verify, + .verify_write = xfs_dir3_block_write_verify, }; static int -xfs_dir2_block_read( +xfs_dir3_block_read( struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + int err; - return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, &xfs_dir2_block_buf_ops); + err = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir3_block_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + return err; +} + +static void +xfs_dir3_block_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *dp) +{ + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + bp->b_ops = &xfs_dir3_block_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + memset(hdr3, 0, 
sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + return; + + } + hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); } static void @@ -121,7 +182,7 @@ xfs_dir2_block_need_space( struct xfs_dir2_data_unused *enddup = NULL; *compact = 0; - bf = hdr->bestfree; + bf = xfs_dir3_data_bestfree_p(hdr); /* * If there are stale entries we'll use one for the leaf. @@ -303,7 +364,7 @@ xfs_dir2_block_addname( mp = dp->i_mount; /* Read the (one and only) directory block into bp. */ - error = xfs_dir2_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, &bp); if (error) return error; @@ -498,7 +559,7 @@ xfs_dir2_block_addname( xfs_dir2_data_log_header(tp, bp); xfs_dir2_block_log_tail(tp, bp); xfs_dir2_data_log_entry(tp, bp, dep); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } @@ -531,7 +592,7 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) return 0; - error = xfs_dir2_block_read(NULL, dp, &bp); + error = xfs_dir3_block_read(NULL, dp, &bp); if (error) return error; @@ -541,12 +602,12 @@ xfs_dir2_block_getdents( */ wantoff = xfs_dir2_dataptr_to_off(mp, *offset); hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); /* * Set up values for the loop. */ btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)(hdr + 1); + ptr = (char *)xfs_dir3_data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); /* @@ -665,7 +726,7 @@ xfs_dir2_block_lookup( dp = args->dp; mp = dp->i_mount; hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -711,12 +772,12 @@ xfs_dir2_block_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_block_read(tp, dp, &bp); + error = xfs_dir3_block_read(tp, dp, &bp); if (error) return error; hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -853,7 +914,7 @@ xfs_dir2_block_removename( xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, bp); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); /* * See if the size as a shortform is good enough. */ @@ -910,7 +971,7 @@ xfs_dir2_block_replace( */ dep->inumber = cpu_to_be64(args->inumber); xfs_dir2_data_log_entry(args->trans, bp, dep); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } @@ -958,6 +1019,8 @@ xfs_dir2_leaf_to_block( __be16 *tagp; /* end of entry (tag) */ int to; /* block/leaf to index */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_to_block(args); @@ -965,8 +1028,12 @@ xfs_dir2_leaf_to_block( tp = args->trans; mp = dp->i_mount; leaf = lbp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); ltp = xfs_dir2_leaf_tail_p(mp, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || + leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); /* * If there are data blocks other than the first one, take this * opportunity to remove trailing empty data blocks that may have @@ -974,9 +1041,12 @@ xfs_dir2_leaf_to_block( * These will show up in the leaf bests table. 
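The new read/write verifiers earlier in this patch fix a strict ordering: on read the CRC is checked before any structural checks, since a failed checksum means no field can be trusted; on write the structure is verified first, then the buffer log item's LSN is stamped into the header, and the CRC is computed last so that it covers the final contents. A toy standalone model of that ordering (plain C, not kernel code; the block layout and the checksum are illustrative stand-ins for the real crc32c-based xfs_verify_cksum()/xfs_update_cksum()):

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

struct blk {
	uint32_t magic;
	uint32_t crc;		/* covers the block with this field zeroed */
	uint64_t lsn;
	char	 payload[48];
};

/* toy checksum standing in for crc32c */
static uint32_t cksum(const struct blk *b)
{
	struct blk tmp = *b;
	const unsigned char *p = (const unsigned char *)&tmp;
	uint32_t sum = 0;

	tmp.crc = 0;
	for (size_t i = 0; i < sizeof(tmp); i++)
		sum = sum * 31 + p[i];
	return sum;
}

static bool read_verify(const struct blk *b, uint32_t good_magic)
{
	if (b->crc != cksum(b))		/* checksum first: nothing else is trustworthy */
		return false;
	return b->magic == good_magic;	/* then the structural checks */
}

static void write_seal(struct blk *b, uint64_t lsn)
{
	b->lsn = lsn;			/* stamp the LSN of this write... */
	b->crc = cksum(b);		/* ...then seal the block, CRC last */
}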
*/ while (dp->i_d.di_size > mp->m_dirblksize) { + int hdrsz; + + hdrsz = xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb)); bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == - mp->m_dirblksize - (uint)sizeof(*hdr)) { + mp->m_dirblksize - hdrsz) { if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) @@ -988,17 +1058,19 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); + error = xfs_dir3_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); if (error) return error; } hdr = dbp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + /* * Size of the "leaf" area in the block. */ size = (uint)sizeof(xfs_dir2_block_tail_t) + - (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale); /* * Look at the last data entry. */ @@ -1014,8 +1086,8 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. */ - dbp->b_ops = &xfs_dir2_block_buf_ops; - hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + xfs_dir3_block_init(mp, tp, dbp, dp); + needlog = 1; needscan = 0; /* @@ -1027,18 +1099,17 @@ xfs_dir2_leaf_to_block( * Initialize the block tail. */ btp = xfs_dir2_block_tail_p(mp, hdr); - btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); /* * Initialize the block leaf area. We compact out stale entries. */ lep = xfs_dir2_block_leaf_p(btp); - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { - if (leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + for (from = to = 0; from < leafhdr.count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; - lep[to++] = leaf->ents[from]; + lep[to++] = ents[from]; } ASSERT(to == be32_to_cpu(btp->count)); xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); @@ -1137,16 +1208,16 @@ xfs_dir2_sf_to_block( return error; } /* - * Initialize the data block. + * Initialize the data block, then convert it to block format. */ - error = xfs_dir2_data_init(args, blkno, &bp); + error = xfs_dir3_data_init(args, blkno, &bp); if (error) { kmem_free(sfp); return error; } - bp->b_ops = &xfs_dir2_block_buf_ops; + xfs_dir3_block_init(mp, tp, bp, dp); hdr = bp->b_addr; - hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + /* * Compute size of block "tail" area. */ @@ -1156,7 +1227,7 @@ xfs_dir2_sf_to_block( * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. */ - dup = (xfs_dir2_data_unused_t *)(hdr + 1); + dup = xfs_dir3_data_unused_p(hdr); needlog = needscan = 0; xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, &needscan); @@ -1178,8 +1249,7 @@ xfs_dir2_sf_to_block( /* * Create entry for . */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + XFS_DIR2_DATA_DOT_OFFSET); + dep = xfs_dir3_data_dot_entry_p(hdr); dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; @@ -1192,8 +1262,7 @@ xfs_dir2_sf_to_block( /* * Create entry for .. 
*/ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + XFS_DIR2_DATA_DOTDOT_OFFSET); + dep = xfs_dir3_data_dotdot_entry_p(hdr); dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; @@ -1203,7 +1272,7 @@ xfs_dir2_sf_to_block( blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)hdr)); - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = xfs_dir3_data_first_offset(hdr); /* * Loop over existing entries, stuff them in. */ @@ -1273,6 +1342,6 @@ xfs_dir2_sf_to_block( ASSERT(needscan == 0); xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); xfs_dir2_block_log_tail(tp, bp); - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); return 0; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index ffcf1774152e..c2930238005c 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -30,6 +31,8 @@ #include "xfs_dir2_format.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" STATIC xfs_dir2_data_free_t * xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); @@ -40,7 +43,7 @@ xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); * Return 0 if the buffer is good, otherwise an error. */ int -__xfs_dir2_data_check( +__xfs_dir3_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ { @@ -65,15 +68,17 @@ __xfs_dir2_data_check( mp = bp->b_target->bt_mount; hdr = bp->b_addr; - bf = hdr->bestfree; - p = (char *)(hdr + 1); + bf = xfs_dir3_data_bestfree_p(hdr); + p = (char *)xfs_dir3_data_entry_p(hdr); switch (hdr->magic) { + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(mp, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; break; + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): endp = (char *)hdr + mp->m_dirblksize; break; @@ -148,7 +153,8 @@ __xfs_dir2_data_check( (char *)dep - (char *)hdr); count++; lastfree = 0; - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (xfs_dir2_data_aoff_t) ((char *)dep - (char *)hdr)); @@ -168,7 +174,8 @@ __xfs_dir2_data_check( * Need to have seen all the entries and all the bestfree slots. 
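The check referenced here relies on a three-bit mask: as the walk visits each unused region it looks the region up in the bestfree table, each table slot may be claimed by at most one region, and at the end all three bits must be set (the freeseen == 7 test below). A simplified standalone sketch (plain C, not kernel code; the kernel version also pre-marks empty, zero-length slots as seen before the walk):

#include <stdbool.h>

/* slot is 0..2 for a tracked region, -1 for an untracked one */
static bool note_slot_seen(unsigned *freeseen, int slot)
{
	if (slot < 0)
		return true;			/* untracked: nothing to mark */
	if (*freeseen & (1u << slot))
		return false;			/* slot claimed twice: corrupt */
	*freeseen |= 1u << slot;
	return true;
}

/* after the walk, all three slots must be accounted for */
static bool all_slots_seen(unsigned freeseen)
{
	return freeseen == 7;
}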
*/ XFS_WANT_CORRUPTED_RETURN(freeseen == 7); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) @@ -185,21 +192,27 @@ __xfs_dir2_data_check( return 0; } -static void -xfs_dir2_data_verify( +static bool +xfs_dir3_data_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_data_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); - block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; - - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + return false; } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; } /* @@ -208,7 +221,7 @@ xfs_dir2_data_verify( * format buffer or a data format buffer on readahead. */ static void -xfs_dir2_data_reada_verify( +xfs_dir3_data_reada_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -216,11 +229,13 @@ xfs_dir2_data_reada_verify( switch (hdr->magic) { case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): - bp->b_ops = &xfs_dir2_block_buf_ops; + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + bp->b_ops = &xfs_dir3_block_buf_ops; bp->b_ops->verify_read(bp); return; case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - xfs_dir2_data_verify(bp); + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + xfs_dir3_data_verify(bp); return; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); @@ -230,51 +245,80 @@ xfs_dir2_data_reada_verify( } static void -xfs_dir2_data_read_verify( +xfs_dir3_data_read_verify( struct xfs_buf *bp) { - xfs_dir2_data_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_DATA_CRC_OFF)) || + !xfs_dir3_data_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void -xfs_dir2_data_write_verify( +xfs_dir3_data_write_verify( struct xfs_buf *bp) { - xfs_dir2_data_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_data_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); } -const struct xfs_buf_ops xfs_dir2_data_buf_ops = { - .verify_read = xfs_dir2_data_read_verify, - .verify_write = xfs_dir2_data_write_verify, +const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .verify_read = xfs_dir3_data_read_verify, + .verify_write = xfs_dir3_data_write_verify, }; -static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { - .verify_read = xfs_dir2_data_reada_verify, - .verify_write = xfs_dir2_data_write_verify, +static const 
struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .verify_read = xfs_dir3_data_reada_verify, + .verify_write = xfs_dir3_data_write_verify, }; int -xfs_dir2_data_read( +xfs_dir3_data_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, &xfs_dir2_data_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, &xfs_dir3_data_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + return err; } int -xfs_dir2_data_readahead( +xfs_dir3_data_readahead( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno) { return xfs_da_reada_buf(tp, dp, bno, mapped_bno, - XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); + XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); } /* @@ -288,12 +332,15 @@ xfs_dir2_data_freefind( { xfs_dir2_data_free_t *dfp; /* bestfree entry */ xfs_dir2_data_aoff_t off; /* offset value needed */ + struct xfs_dir2_data_free *bf; #if defined(DEBUG) && defined(__KERNEL__) int matched; /* matched the value */ int seenzero; /* saw a 0 bestfree entry */ #endif off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); + bf = xfs_dir3_data_bestfree_p(hdr); + #if defined(DEBUG) && defined(__KERNEL__) /* * Validate some consistency in the bestfree table. @@ -301,9 +348,11 @@ xfs_dir2_data_freefind( * one we're looking for it has to be exact. */ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); - for (dfp = &hdr->bestfree[0], seenzero = matched = 0; - dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + for (dfp = &bf[0], seenzero = matched = 0; + dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) { ASSERT(!dfp->length); @@ -319,7 +368,7 @@ xfs_dir2_data_freefind( else ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); - if (dfp > &hdr->bestfree[0]) + if (dfp > &bf[0]) ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); } #endif @@ -328,14 +377,12 @@ xfs_dir2_data_freefind( * it can't be there since they're sorted. */ if (be16_to_cpu(dup->length) < - be16_to_cpu(hdr->bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length)) + be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) return NULL; /* * Look at the three bestfree entries for our guy. 
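The early bail-out above works because the table is kept sorted by descending length: a region shorter than the last slot cannot be tracked at all, and a zero offset terminates the table. The same search as a standalone sketch (plain C, not kernel code; field widths mirror the on-disk slots, endianness ignored):

#include <stdint.h>
#include <stddef.h>

struct bestfree { uint16_t offset; uint16_t length; };

static const struct bestfree *
bestfree_find(const struct bestfree bf[3], uint16_t offset, uint16_t length)
{
	/* sorted descending: smaller than the last slot means untracked */
	if (length < bf[2].length)
		return NULL;
	for (int i = 0; i < 3; i++) {
		if (!bf[i].offset)
			return NULL;	/* an empty slot ends the table */
		if (bf[i].offset == offset)
			return &bf[i];
	}
	return NULL;
}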
*/ - for (dfp = &hdr->bestfree[0]; - dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT]; - dfp++) { + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) return NULL; if (be16_to_cpu(dfp->offset) == off) @@ -359,11 +406,12 @@ xfs_dir2_data_freeinsert( xfs_dir2_data_free_t *dfp; /* bestfree table pointer */ xfs_dir2_data_free_t new; /* new bestfree entry */ -#ifdef __KERNEL__ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif - dfp = hdr->bestfree; + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + + dfp = xfs_dir3_data_bestfree_p(hdr); new.length = dup->length; new.offset = cpu_to_be16((char *)dup - (char *)hdr); @@ -400,32 +448,36 @@ xfs_dir2_data_freeremove( xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ int *loghead) /* out: log data header */ { -#ifdef __KERNEL__ + struct xfs_dir2_data_free *bf; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + /* * It's the first entry, slide the next 2 up. */ - if (dfp == &hdr->bestfree[0]) { - hdr->bestfree[0] = hdr->bestfree[1]; - hdr->bestfree[1] = hdr->bestfree[2]; + bf = xfs_dir3_data_bestfree_p(hdr); + if (dfp == &bf[0]) { + bf[0] = bf[1]; + bf[1] = bf[2]; } /* * It's the second entry, slide the 3rd entry up. */ - else if (dfp == &hdr->bestfree[1]) - hdr->bestfree[1] = hdr->bestfree[2]; + else if (dfp == &bf[1]) + bf[1] = bf[2]; /* * Must be the last entry. */ else - ASSERT(dfp == &hdr->bestfree[2]); + ASSERT(dfp == &bf[2]); /* * Clear the 3rd entry, must be zero now. */ - hdr->bestfree[2].length = 0; - hdr->bestfree[2].offset = 0; + bf[2].length = 0; + bf[2].offset = 0; *loghead = 1; } @@ -441,23 +493,27 @@ xfs_dir2_data_freescan( xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* active data entry */ xfs_dir2_data_unused_t *dup; /* unused data entry */ + struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ -#ifdef __KERNEL__ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); -#endif + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + /* * Start by clearing the table. */ - memset(hdr->bestfree, 0, sizeof(hdr->bestfree)); + bf = xfs_dir3_data_bestfree_p(hdr); + memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)(hdr + 1); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + p = (char *)xfs_dir3_data_entry_p(hdr); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { btp = xfs_dir2_block_tail_p(mp, hdr); endp = (char *)xfs_dir2_block_leaf_p(btp); } else @@ -493,7 +549,7 @@ xfs_dir2_data_freescan( * Give back the buffer for the created block. 
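xfs_dir2_data_freeinsert() and xfs_dir2_data_freeremove() maintain that sorted order by sliding entries within the fixed three-slot array: an insert pushes smaller lengths toward the end (the smallest falls off), a removal slides the later slots up and zeroes the last one. A compressed standalone sketch of both moves (plain C, not kernel code; the kernel versions also tell the caller when the header needs logging):

#include <stdint.h>
#include <string.h>

struct bestfree { uint16_t offset; uint16_t length; };

/* insert keeps the slots sorted by descending length */
static void bestfree_insert(struct bestfree bf[3], struct bestfree nw)
{
	if (nw.length > bf[0].length) {
		bf[2] = bf[1];
		bf[1] = bf[0];
		bf[0] = nw;
	} else if (nw.length > bf[1].length) {
		bf[2] = bf[1];
		bf[1] = nw;
	} else if (nw.length > bf[2].length) {
		bf[2] = nw;
	}
}

/* removal slides the remaining slots up and clears the last one */
static void bestfree_remove(struct bestfree bf[3], int slot)
{
	memmove(&bf[slot], &bf[slot + 1], (2 - slot) * sizeof(bf[0]));
	bf[2].offset = 0;
	bf[2].length = 0;
}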
*/ int /* error */ -xfs_dir2_data_init( +xfs_dir3_data_init( xfs_da_args_t *args, /* directory operation args */ xfs_dir2_db_t blkno, /* logical dir block number */ struct xfs_buf **bpp) /* output block buffer */ @@ -502,6 +558,7 @@ xfs_dir2_data_init( xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + struct xfs_dir2_data_free *bf; int error; /* error return value */ int i; /* bestfree index */ xfs_mount_t *mp; /* filesystem mount point */ @@ -518,27 +575,40 @@ xfs_dir2_data_init( XFS_DATA_FORK); if (error) return error; - bp->b_ops = &xfs_dir2_data_buf_ops; + bp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF); /* * Initialize the header. */ hdr = bp->b_addr; - hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); - hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + } else + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + + bf = xfs_dir3_data_bestfree_p(hdr); + bf[0].offset = cpu_to_be16(xfs_dir3_data_entry_offset(hdr)); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { - hdr->bestfree[i].length = 0; - hdr->bestfree[i].offset = 0; + bf[i].length = 0; + bf[i].offset = 0; } /* * Set up an unused entry for the block's body. */ - dup = (xfs_dir2_data_unused_t *)(hdr + 1); + dup = xfs_dir3_data_unused_p(hdr); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - t = mp->m_dirblksize - (uint)sizeof(*hdr); - hdr->bestfree[0].length = cpu_to_be16(t); + t = mp->m_dirblksize - (uint)xfs_dir3_data_entry_offset(hdr); + bf[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); /* @@ -562,7 +632,9 @@ xfs_dir2_data_log_entry( xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - @@ -580,9 +652,11 @@ xfs_dir2_data_log_header( xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1); + xfs_trans_log_buf(tp, bp, 0, xfs_dir3_data_entry_offset(hdr) - 1); } /* @@ -597,7 +671,9 @@ xfs_dir2_data_log_unused( xfs_dir2_data_hdr_t *hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); /* * Log the first part of the unused entry. 
@@ -635,6 +711,7 @@ xfs_dir2_data_make_free( xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *postdup; /* unused entry after us */ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + struct xfs_dir2_data_free *bf; mp = tp->t_mountp; hdr = bp->b_addr; @@ -642,12 +719,14 @@ xfs_dir2_data_make_free( /* * Figure out where the end of the data area is. */ - if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) endptr = (char *)hdr + mp->m_dirblksize; else { xfs_dir2_block_tail_t *btp; /* block tail */ - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); btp = xfs_dir2_block_tail_p(mp, hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); } @@ -655,7 +734,7 @@ xfs_dir2_data_make_free( * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > sizeof(*hdr)) { + if (offset > xfs_dir3_data_entry_offset(hdr)) { __be16 *tagp; /* tag just before us */ tagp = (__be16 *)((char *)hdr + offset) - 1; @@ -681,6 +760,7 @@ xfs_dir2_data_make_free( * Previous and following entries are both free, * merge everything into a single free entry. */ + bf = xfs_dir3_data_bestfree_p(hdr); if (prevdup && postdup) { xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ @@ -695,7 +775,7 @@ xfs_dir2_data_make_free( * since the third bestfree is there, there might be more * entries. */ - needscan = (hdr->bestfree[2].length != 0); + needscan = (bf[2].length != 0); /* * Fix up the new big freespace. */ @@ -711,10 +791,10 @@ xfs_dir2_data_make_free( * Remove entry 1 first then entry 0. */ ASSERT(dfp && dfp2); - if (dfp == &hdr->bestfree[1]) { - dfp = &hdr->bestfree[0]; + if (dfp == &bf[1]) { + dfp = &bf[0]; ASSERT(dfp2 == dfp); - dfp2 = &hdr->bestfree[1]; + dfp2 = &bf[1]; } xfs_dir2_data_freeremove(hdr, dfp2, needlogp); xfs_dir2_data_freeremove(hdr, dfp, needlogp); @@ -722,7 +802,7 @@ xfs_dir2_data_make_free( * Now insert the new entry. 
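Stripped of the bestfree bookkeeping, the merge logic here is ordinary free-extent coalescing: a freed range absorbs the unused entry just before it, just after it, or both, whenever they are byte-adjacent. The adjacency test in isolation (a standalone sketch in plain C, not kernel code; a zero length stands for "no neighbour", and real entries would also have to respect the 16-bit length limit):

#include <stdint.h>

struct extent { uint16_t offset; uint16_t length; };

/* grow a freed range over free neighbours that touch it */
static struct extent coalesce(struct extent prev, struct extent freed,
			      struct extent next)
{
	struct extent out = freed;

	if (prev.length && prev.offset + prev.length == freed.offset) {
		out.offset = prev.offset;
		out.length += prev.length;
	}
	if (next.length && freed.offset + freed.length == next.offset)
		out.length += next.length;
	return out;
}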
*/ dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp); - ASSERT(dfp == &hdr->bestfree[0]); + ASSERT(dfp == &bf[0]); ASSERT(dfp->length == prevdup->length); ASSERT(!dfp[1].length); ASSERT(!dfp[2].length); @@ -751,7 +831,7 @@ xfs_dir2_data_make_free( */ else { needscan = be16_to_cpu(prevdup->length) > - be16_to_cpu(hdr->bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* @@ -779,7 +859,7 @@ xfs_dir2_data_make_free( */ else { needscan = be16_to_cpu(newdup->length) > - be16_to_cpu(hdr->bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* @@ -818,10 +898,13 @@ xfs_dir2_data_use_free( xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ + struct xfs_dir2_data_free *bf; hdr = bp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)); + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); ASSERT(offset >= (char *)dup - (char *)hdr); ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr); @@ -831,7 +914,8 @@ xfs_dir2_data_use_free( */ dfp = xfs_dir2_data_freefind(hdr, dup); oldlen = be16_to_cpu(dup->length); - ASSERT(dfp || oldlen <= be16_to_cpu(hdr->bestfree[2].length)); + bf = xfs_dir3_data_bestfree_p(hdr); + ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); /* * Check for alignment with front and back of the entry. */ @@ -845,7 +929,7 @@ xfs_dir2_data_use_free( */ if (matchfront && matchback) { if (dfp) { - needscan = (hdr->bestfree[2].offset != 0); + needscan = (bf[2].offset != 0); if (!needscan) xfs_dir2_data_freeremove(hdr, dfp, needlogp); } @@ -875,7 +959,7 @@ xfs_dir2_data_use_free( * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &hdr->bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -902,7 +986,7 @@ xfs_dir2_data_use_free( * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &hdr->bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -930,7 +1014,7 @@ xfs_dir2_data_use_free( * the 2 new will work. */ if (dfp) { - needscan = (hdr->bestfree[2].length != 0); + needscan = (bf[2].length != 0); if (!needscan) { xfs_dir2_data_freeremove(hdr, dfp, needlogp); xfs_dir2_data_freeinsert(hdr, newdup, needlogp); diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index 07270981f48f..a3b1bd841a80 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -36,6 +37,38 @@ #define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */ /* + * Directory Version 3 With CRCs. + * + * The tree formats are the same as for version 2 directories. The difference + * is in the block header and dirent formats. In many cases the v3 structures + * use v2 definitions as they are no different and this makes code sharing much + * easier. + * + * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the + * format is v2 then they switch to the existing v2 code, or if the format is v3 + * they implement the v3 functionality.
This means the existing dir2 is a mix of + * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called + * where there is a difference in the formats, otherwise the code is unchanged. + * + * Where it is possible, the code decides what to do based on the magic numbers + * in the blocks rather than feature bits in the superblock. This means the code + * is as independent of the external XFS code as possible and doesn't require + * passing struct xfs_mount pointers into places where it isn't really + * necessary. + * + * Version 3 includes: + * + * - a larger block header for CRC and identification purposes and so the + * offsets of all the structures inside the blocks are different. + * + * - new magic numbers to be able to detect the v2/v3 types on the fly. + */ + +#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */ +#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */ +#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ + +/* * Byte offset in data block and shortform entry. */ typedef __uint16_t xfs_dir2_data_off_t; @@ -195,16 +228,6 @@ xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr, xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) /* - * Offsets of . and .. in data space (always block 0) - */ -#define XFS_DIR2_DATA_DOT_OFFSET \ - ((xfs_dir2_data_aoff_t)sizeof(struct xfs_dir2_data_hdr)) -#define XFS_DIR2_DATA_DOTDOT_OFFSET \ - (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1)) -#define XFS_DIR2_DATA_FIRST_OFFSET \ - (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2)) - -/* * Describe a free area in the data block. * * The freespace will be formatted as a xfs_dir2_data_unused_t. @@ -226,6 +249,39 @@ typedef struct xfs_dir2_data_hdr { } xfs_dir2_data_hdr_t; /* + * define a structure for all the verification fields we are adding to the + * directory block structures. This will be used in several structures. + * The magic number must be the first entry to align with all the dir2 + * structures so we determine how to decode them just by the magic number. + */ +struct xfs_dir3_blk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +struct xfs_dir3_data_hdr { + struct xfs_dir3_blk_hdr hdr; + xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; +}; + +#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) + +static inline struct xfs_dir2_data_free * +xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr; + return hdr3->best_free; + } + return hdr->bestfree; +} + +/* + * Active entry in a data block. + * + * Aligned to 8 bytes. 
After the variable length name field there is a @@ -280,6 +336,94 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) be16_to_cpu(dup->length) - sizeof(__be16)); } +static inline size_t +xfs_dir3_data_hdr_size(bool dir3) +{ + if (dir3) + return sizeof(struct xfs_dir3_data_hdr); + return sizeof(struct xfs_dir2_data_hdr); +} + +static inline size_t +xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr) +{ + bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + return xfs_dir3_data_hdr_size(dir3); +} + +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_entry_offset(hdr)); +} + +static inline struct xfs_dir2_data_unused * +xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + xfs_dir3_data_entry_offset(hdr)); +} + +/* + * Offsets of . and .. in data space (always block 0) + * + * The macros are used for shortform directories as they have no headers to read + * the magic number out of. Shortform directories need to know the size of the + * data block header because the sfe embeds the block offset of the entry into + * it so that it doesn't change when format conversion occurs. Bad Things Happen + * if we don't follow this rule. + */ +#define XFS_DIR3_DATA_DOT_OFFSET(mp) \ + xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb)) +#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \ + (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1)) +#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \ + (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2)) + +static inline xfs_dir2_data_aoff_t +xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr) +{ + return xfs_dir3_data_entry_offset(hdr); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr) +{ + return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr) +{ + return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2); +} + +/* + * location of . and .. in data space (always block 0) + */ +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_dot_offset(hdr)); +} + +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_dotdot_offset(hdr)); +} + +static inline struct xfs_dir2_data_entry * +xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + xfs_dir3_data_first_offset(hdr)); +} + /* * Leaf block structures. * @@ -329,6 +473,21 @@ typedef struct xfs_dir2_leaf_hdr { __be16 stale; /* count of stale entries */ } xfs_dir2_leaf_hdr_t; +struct xfs_dir3_leaf_hdr { + struct xfs_da3_blkinfo info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ + __be32 pad; +}; + +struct xfs_dir3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t stale; +}; + /* * Leaf block entry. */ @@ -348,23 +507,50 @@ typedef struct xfs_dir2_leaf_tail { * Leaf block. 
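These offset helpers are simple arithmetic: a data entry is an 8-byte-aligned record of an 8-byte inumber, a 1-byte name length, the name itself and a 2-byte tag, so "." and ".." land at fixed distances past whichever header the block carries. The calculation as a standalone sketch (plain C, not kernel code):

#include <stddef.h>

/* 11 fixed bytes (8 inumber + 1 namelen + 2 tag) plus the name, rounded
   up to the 8-byte alignment of data entries */
static size_t entsize(size_t namelen)
{
	return (11 + namelen + 7) & ~(size_t)7;
}

/* offsets of ".", ".." and the first user entry, given a header size */
static void first_offsets(size_t hdr_size, size_t off[3])
{
	off[0] = hdr_size;		/* "."  */
	off[1] = off[0] + entsize(1);	/* ".." */
	off[2] = off[1] + entsize(2);	/* first real entry */
}

For the 16-byte v2 data header this gives 16, 32 and 48, matching the fixed XFS_DIR2_DATA_*_OFFSET constants the patch removes; the larger v3 header shifts everything up, which is exactly why the constants had to become functions of the header size.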
*/ typedef struct xfs_dir2_leaf { - xfs_dir2_leaf_hdr_t hdr; /* leaf header */ - xfs_dir2_leaf_entry_t ents[]; /* entries */ + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t __ents[]; /* entries */ } xfs_dir2_leaf_t; -/* - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ +struct xfs_dir3_leaf { + struct xfs_dir3_leaf_hdr hdr; /* leaf header */ + struct xfs_dir2_leaf_entry __ents[]; /* entries */ +}; -static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) +#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) + +static inline int +xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp) { - return (mp->m_dirblksize - (uint)sizeof(struct xfs_dir2_leaf_hdr)) / + if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) + return sizeof(struct xfs_dir3_leaf_hdr); + return sizeof(struct xfs_dir2_leaf_hdr); +} + +static inline int +xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) +{ + return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) / (uint)sizeof(struct xfs_dir2_leaf_entry); } /* * Get address of the bestcount field in the single-leaf block. */ +static inline struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp; + return lp3->__ents; + } + return lp->__ents; +} + +/* + * Get address of the bestcount field in the single-leaf block. + */ static inline struct xfs_dir2_leaf_tail * xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) { @@ -383,6 +569,10 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) } /* + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* * Convert dataptr to byte in file space */ static inline xfs_dir2_off_t @@ -520,19 +710,65 @@ typedef struct xfs_dir2_free { /* unused entries are -1 */ } xfs_dir2_free_t; -static inline int xfs_dir2_free_max_bests(struct xfs_mount *mp) +struct xfs_dir3_free_hdr { + struct xfs_dir3_blk_hdr hdr; + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ +}; + +struct xfs_dir3_free { + struct xfs_dir3_free_hdr hdr; + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +}; + +#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) + +/* + * In core version of the free block header, abstracted away from on-disk format + * differences. Use this in the code, and convert to/from the disk version using + * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. 
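The in-core/on-disk split described in this comment is a plain decode step: read the big-endian on-disk fields once into a native-endian struct, operate on that, and encode back when the buffer is dirtied. The decode half as a standalone sketch (plain C, not kernel code; the kernel's xfs_dir3_free_hdr_from_disk() also fills the in-core magic according to which on-disk format it found):

#include <stdint.h>

struct disk_free_hdr  { uint8_t firstdb[4], nvalid[4], nused[4]; };	/* big-endian */
struct icore_free_hdr { uint32_t firstdb, nvalid, nused; };		/* native */

static uint32_t be32_get(const uint8_t b[4])
{
	return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
	       ((uint32_t)b[2] << 8)  |  (uint32_t)b[3];
}

static void free_hdr_from_disk(struct icore_free_hdr *to,
			       const struct disk_free_hdr *from)
{
	to->firstdb = be32_get(from->firstdb);
	to->nvalid  = be32_get(from->nvalid);
	to->nused   = be32_get(from->nused);
}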
+ */ +struct xfs_dir3_icfree_hdr { + __uint32_t magic; + __uint32_t firstdb; + __uint32_t nvalid; + __uint32_t nused; + +}; + +void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from); + +static inline int +xfs_dir3_free_hdr_size(struct xfs_mount *mp) { - return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) / + if (xfs_sb_version_hascrc(&mp->m_sb)) + return sizeof(struct xfs_dir3_free_hdr); + return sizeof(struct xfs_dir2_free_hdr); +} + +static inline int +xfs_dir3_free_max_bests(struct xfs_mount *mp) +{ + return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) / sizeof(xfs_dir2_data_off_t); } +static inline __be16 * +xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp)); +} + /* * Convert data space db to the corresponding free db. */ static inline xfs_dir2_db_t xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) { - return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp); + return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp); } /* @@ -541,7 +777,7 @@ xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) static inline int xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) { - return db % xfs_dir2_free_max_bests(mp); + return db % xfs_dir3_free_max_bests(mp); } /* diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 60cd2fa4e047..721ba2fe8e54 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -33,97 +34,371 @@ #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Local function declarations. */ -#ifdef DEBUG -static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp); -#else -#define xfs_dir2_leaf_check(dp, bp) -#endif static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, int *indexp, struct xfs_buf **dbpp); -static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, +static void xfs_dir3_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); -static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); -static void -xfs_dir2_leaf_verify( +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. 
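xfs_dir2_db_to_fdb() and xfs_dir2_db_to_fdindex() are quotient/remainder arithmetic over the per-free-block capacity, which is why they had to switch to xfs_dir3_free_max_bests(): the larger v3 header leaves room for fewer bests per free block, so the same data block maps to a different (free block, slot) pair on a CRC filesystem. The mapping in isolation (a standalone sketch in plain C, not kernel code):

#include <stdint.h>

/* each free block carries max_bests entries, one per data block */
static void db_to_free_slot(uint32_t db, uint32_t max_bests,
			    uint32_t free_firstdb,
			    uint32_t *fdb, uint32_t *slot)
{
	*fdb  = free_firstdb + db / max_bests;	/* which free block */
	*slot = db % max_bests;			/* which entry in it */
}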
+ */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(mp, bp) \ +do { \ + if (!xfs_dir3_leaf1_check((mp), (bp))) \ + ASSERT(0); \ +} while (0); + +STATIC bool +xfs_dir3_leaf1_check( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(mp, bp) +#endif + +void +xfs_dir3_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + if (from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) { + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + } else { + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->stale = be16_to_cpu(hdr3->stale); + } + + ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); +} + +void +xfs_dir3_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + if (from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC) { + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); + } else { + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->stale = cpu_to_be16(from->stale); + } +} + +bool +xfs_dir3_leaf_check_int( + struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf) +{ + struct xfs_dir2_leaf_entry *ents; + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; + + ents = xfs_dir3_leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + + /* + * XXX (dgc): This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + if (hdr->count > xfs_dir3_max_leaf_ents(mp, leaf)) + return false; + + /* Leaves and bests don't overlap in leaf format. */ + if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + return false; + + /* Check hash value order, count stale entries. 
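xfs_dir3_leaf_check_int() enforces two invariants common to every leaf variant: entries are sorted by ascending hash value, and the header's stale count must equal the number of entries whose address is the null data pointer. Reduced to a standalone check (plain C, not kernel code; NULL_DATAPTR is a stand-in for XFS_DIR2_NULL_DATAPTR, and endianness is ignored):

#include <stdint.h>
#include <stdbool.h>

#define NULL_DATAPTR	0xffffffffu	/* stand-in for XFS_DIR2_NULL_DATAPTR */

struct leaf_ent { uint32_t hashval; uint32_t address; };

static bool leaf_ents_ok(const struct leaf_ent *ents, int count,
			 int expected_stale)
{
	int stale = 0;

	for (int i = 0; i < count; i++) {
		if (i + 1 < count && ents[i].hashval > ents[i + 1].hashval)
			return false;	/* hash order violated */
		if (ents[i].address == NULL_DATAPTR)
			stale++;
	}
	return stale == expected_stale;	/* stale count must match */
}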
*/ + for (i = stale = 0; i < hdr->count; i++) { + if (i + 1 < hdr->count) { + if (be32_to_cpu(ents[i].hashval) > + be32_to_cpu(ents[i + 1].hashval)) + return false; + } + if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (hdr->stale != stale) + return false; + return true; +} + +static bool +xfs_dir3_leaf_verify( struct xfs_buf *bp, - __be16 magic) + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + + if ((magic == XFS_DIR2_LEAF1_MAGIC && + leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) || + (magic == XFS_DIR2_LEAFN_MAGIC && + leafhdr.magic != XFS_DIR3_LEAFN_MAGIC)) + return false; + + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else { + if (leafhdr.magic != magic) + return false; + } + return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); +} + +static void +__read_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_LEAF_CRC_OFF)) || + !xfs_dir3_leaf_verify(bp, magic)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +__write_verify( + struct xfs_buf *bp, + __uint16_t magic) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; - int block_ok = 0; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; - block_ok = hdr->info.magic == magic; - if (!block_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + if (!xfs_dir3_leaf_verify(bp, magic)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + return; } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF); } static void -xfs_dir2_leaf1_read_verify( +xfs_dir3_leaf1_read_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); } static void -xfs_dir2_leaf1_write_verify( +xfs_dir3_leaf1_write_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); } -void -xfs_dir2_leafn_read_verify( +static void +xfs_dir3_leafn_read_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); } -void -xfs_dir2_leafn_write_verify( +static void +xfs_dir3_leafn_write_verify( struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); } -static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { - .verify_read = xfs_dir2_leaf1_read_verify, - .verify_write = xfs_dir2_leaf1_write_verify, +const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .verify_read = xfs_dir3_leaf1_read_verify, + .verify_write = xfs_dir3_leaf1_write_verify, }; -const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { - .verify_read = 
xfs_dir2_leafn_read_verify, - .verify_write = xfs_dir2_leafn_write_verify, +const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .verify_read = xfs_dir3_leafn_read_verify, + .verify_write = xfs_dir3_leafn_write_verify, }; static int -xfs_dir2_leaf_read( +xfs_dir3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; } int -xfs_dir2_leafn_read( +xfs_dir3_leafn_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. + */ +static void +xfs_dir3_leaf_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + xfs_ino_t owner, + __uint16_t type) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + + memset(leaf3, 0, sizeof(*leaf3)); + + leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC) + ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) + : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + leaf3->info.blkno = cpu_to_be64(bp->b_bn); + leaf3->info.owner = cpu_to_be64(owner); + uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid); + } else { + memset(leaf, 0, sizeof(*leaf)); + leaf->hdr.info.magic = cpu_to_be16(type); + } + + /* + * If it's a leaf-format directory initialize the tail. + * Caller is responsible for initialising the bests table. 
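The only real fork in xfs_dir3_leaf_init() is at the end: leaf1 blocks (single-leaf directories) carry a tail with a bests count in the last bytes of the block, while leafn blocks (node directories) do not, and each variant is wired to its own buffer ops and log item type. The tail geometry in isolation (a standalone sketch in plain C, not kernel code):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

enum leaf_kind { LEAF1, LEAFN };

struct leaf_tail { uint32_t bestcount; };

/* the tail occupies the very last bytes of the block */
static struct leaf_tail *leaf_tail(void *block, size_t blocksize)
{
	return (struct leaf_tail *)((char *)block + blocksize) - 1;
}

static void leaf_init(void *block, size_t blocksize, enum leaf_kind kind)
{
	memset(block, 0, blocksize);
	if (kind == LEAF1)	/* only leaf1 keeps a bests table and tail */
		leaf_tail(block, blocksize)->bestcount = 0;
}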
+ */ + if (type == XFS_DIR2_LEAF1_MAGIC) { + struct xfs_dir2_leaf_tail *ltp; + + ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp->bestcount = 0; + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF); + } else { + bp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + } +} + +int +xfs_dir3_leaf_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t bno, + struct xfs_buf **bpp, + __uint16_t magic) +{ + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && + bno < XFS_DIR2_FREE_FIRSTDB(mp)); + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, + XFS_DATA_FORK); + if (error) + return error; + + xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_log_header(tp, bp); + if (magic == XFS_DIR2_LEAF1_MAGIC) + xfs_dir3_leaf_log_tail(tp, bp); + *bpp = bp; + return 0; } /* @@ -149,6 +424,9 @@ xfs_dir2_block_to_leaf( int needlog; /* need to log block header */ int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_block_to_leaf(args); @@ -168,26 +446,33 @@ xfs_dir2_block_to_leaf( /* * Initialize the leaf block, get a buffer for it. */ - if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) { + error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC); + if (error) return error; - } - ASSERT(lbp != NULL); + leaf = lbp->b_addr; hdr = dbp->b_addr; - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); + bf = xfs_dir3_data_bestfree_p(hdr); + ents = xfs_dir3_leaf_ents_p(leaf); + /* * Set the counts in the leaf header. */ - leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count)); - leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + leafhdr.count = be32_to_cpu(btp->count); + leafhdr.stale = be32_to_cpu(btp->stale); + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + /* * Could compact these but I think we always do the conversion * after squeezing out stale entries. */ - memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1); + memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(tp, lbp, 0, leafhdr.count - 1); needscan = 0; needlog = 1; /* @@ -202,8 +487,13 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ - dbp->b_ops = &xfs_dir2_data_buf_ops; - hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + dbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + else + hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); /* @@ -212,21 +502,22 @@ xfs_dir2_block_to_leaf( ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = cpu_to_be32(1); bestsp = xfs_dir2_leaf_bests_p(ltp); - bestsp[0] = hdr->bestfree[0].length; + bestsp[0] = bf[0].length; /* * Log the data header and leaf bests table. 
*/ if (needlog) xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_leaf_check(dp, lbp); - xfs_dir2_data_check(dp, dbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); + xfs_dir3_leaf_check(mp, lbp); + xfs_dir3_data_check(dp, dbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, 0); return 0; } STATIC void -xfs_dir2_leaf_find_stale( - struct xfs_dir2_leaf *leaf, +xfs_dir3_leaf_find_stale( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int *lowstale, int *highstale) @@ -235,7 +526,7 @@ xfs_dir2_leaf_find_stale( * Find the first stale entry before our index, if any. */ for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) { - if (leaf->ents[*lowstale].address == + if (ents[*lowstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) break; } @@ -245,10 +536,8 @@ xfs_dir2_leaf_find_stale( * Stop if the result would require moving more entries than using * lowstale. */ - for (*highstale = index; - *highstale < be16_to_cpu(leaf->hdr.count); - ++*highstale) { - if (leaf->ents[*highstale].address == + for (*highstale = index; *highstale < leafhdr->count; ++*highstale) { + if (ents[*highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) break; if (*lowstale >= 0 && index - *lowstale <= *highstale - index) @@ -257,8 +546,9 @@ xfs_dir2_leaf_find_stale( } struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_find_entry( - xfs_dir2_leaf_t *leaf, /* leaf structure */ +xfs_dir3_leaf_find_entry( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, /* leaf table position */ int compact, /* need to compact leaves */ int lowstale, /* index of prev stale leaf */ @@ -266,7 +556,7 @@ xfs_dir2_leaf_find_entry( int *lfloglow, /* low leaf logging index */ int *lfloghigh) /* high leaf logging index */ { - if (!leaf->hdr.stale) { + if (!leafhdr->stale) { xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ /* @@ -274,18 +564,16 @@ xfs_dir2_leaf_find_entry( * * If there are no stale entries, just insert a hole at index. */ - lep = &leaf->ents[index]; - if (index < be16_to_cpu(leaf->hdr.count)) + lep = &ents[index]; + if (index < leafhdr->count) memmove(lep + 1, lep, - (be16_to_cpu(leaf->hdr.count) - index) * - sizeof(*lep)); + (leafhdr->count - index) * sizeof(*lep)); /* * Record low and high logging indices for the leaf. */ *lfloglow = index; - *lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add_cpu(&leaf->hdr.count, 1); + *lfloghigh = leafhdr->count++; return lep; } @@ -299,16 +587,17 @@ xfs_dir2_leaf_find_entry( * entries before and after our insertion point. */ if (compact == 0) - xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + xfs_dir3_leaf_find_stale(leafhdr, ents, index, + &lowstale, &highstale); /* * If the low one is better, use it. */ if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || + (highstale == leafhdr->count || index - lowstale - 1 < highstale - index)) { ASSERT(index - lowstale - 1 >= 0); - ASSERT(leaf->ents[lowstale].address == + ASSERT(ents[lowstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); /* @@ -316,37 +605,34 @@ xfs_dir2_leaf_find_entry( * for the new entry. 
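The lowstale/highstale choice above minimises data movement: reusing the stale slot below the insertion point shifts index - lowstale - 1 entries, reusing the one above shifts highstale - index, and the nearer slot wins (highstale == count means no stale slot exists above). The decision in isolation (a standalone sketch in plain C, not kernel code):

#include <stdbool.h>

/* true: reuse the low stale slot; false: reuse the high one */
static bool use_low_slot(int index, int lowstale, int highstale, int count)
{
	return lowstale >= 0 &&
	       (highstale == count || index - lowstale - 1 < highstale - index);
}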
*/ if (index - lowstale - 1 > 0) { - memmove(&leaf->ents[lowstale], - &leaf->ents[lowstale + 1], + memmove(&ents[lowstale], &ents[lowstale + 1], (index - lowstale - 1) * - sizeof(xfs_dir2_leaf_entry_t)); + sizeof(xfs_dir2_leaf_entry_t)); } *lfloglow = MIN(lowstale, *lfloglow); *lfloghigh = MAX(index - 1, *lfloghigh); - be16_add_cpu(&leaf->hdr.stale, -1); - return &leaf->ents[index - 1]; + leafhdr->stale--; + return &ents[index - 1]; } /* * The high one is better, so use that one. */ ASSERT(highstale - index >= 0); - ASSERT(leaf->ents[highstale].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); /* * Copy entries down to cover the stale entry and make room for the * new entry. */ if (highstale - index > 0) { - memmove(&leaf->ents[index + 1], - &leaf->ents[index], + memmove(&ents[index + 1], &ents[index], (highstale - index) * sizeof(xfs_dir2_leaf_entry_t)); } *lfloglow = MIN(index, *lfloglow); *lfloghigh = MAX(highstale, *lfloghigh); - be16_add_cpu(&leaf->hdr.stale, -1); - return &leaf->ents[index]; + leafhdr->stale--; + return &ents[index]; } /* @@ -383,6 +669,9 @@ xfs_dir2_leaf_addname( __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_addname(args); @@ -390,7 +679,7 @@ xfs_dir2_leaf_addname( tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; @@ -403,16 +692,19 @@ xfs_dir2_leaf_addname( index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); length = xfs_dir2_data_entsize(args->namelen); + /* * See if there are any entries with the same hash value * and space in their block for the new entry. * This is good because it puts multiple same-hash value entries * in a data block, improving the lookup of those entries. */ - for (use_block = -1, lep = &leaf->ents[index]; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; + for (use_block = -1, lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; index++, lep++) { if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) continue; @@ -445,7 +737,7 @@ xfs_dir2_leaf_addname( * How many bytes do we need in the leaf block? */ needbytes = 0; - if (!leaf->hdr.stale) + if (!leafhdr.stale) needbytes += sizeof(xfs_dir2_leaf_entry_t); if (use_block == -1) needbytes += sizeof(xfs_dir2_data_off_t); @@ -460,16 +752,15 @@ xfs_dir2_leaf_addname( * If we don't have enough free bytes but we can make enough * by compacting out stale entries, we'll do that. */ - if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < - needbytes && be16_to_cpu(leaf->hdr.stale) > 1) { + if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes && + leafhdr.stale > 1) compact = 1; - } + /* * Otherwise if we don't have enough free bytes we need to * convert to node form. */ - else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu( - leaf->hdr.count)] < needbytes) { + else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) { /* * Just checking or no space reservation, give up. 
*/ @@ -517,15 +808,15 @@ xfs_dir2_leaf_addname( * point later. */ if (compact) { - xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); } /* * There are stale entries, so we'll need log-low and log-high * impossibly bad values later. */ - else if (be16_to_cpu(leaf->hdr.stale)) { - lfloglow = be16_to_cpu(leaf->hdr.count); + else if (leafhdr.stale) { + lfloglow = leafhdr.count; lfloghigh = -1; } /* @@ -544,7 +835,7 @@ xfs_dir2_leaf_addname( /* * Initialize the block. */ - if ((error = xfs_dir2_data_init(args, use_block, &dbp))) { + if ((error = xfs_dir3_data_init(args, use_block, &dbp))) { xfs_trans_brelse(tp, lbp); return error; } @@ -557,23 +848,24 @@ xfs_dir2_leaf_addname( memmove(&bestsp[0], &bestsp[1], be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); be32_add_cpu(<p->bestcount, 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); } /* * If we're filling in a previously empty block just log it. */ else - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); hdr = dbp->b_addr; - bestsp[use_block] = hdr->bestfree[0].length; + bf = xfs_dir3_data_bestfree_p(hdr); + bestsp[use_block] = bf[0].length; grown = 1; } else { /* * Already had space in some data block. * Just read that one in. */ - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, use_block), -1, &dbp); if (error) { @@ -581,13 +873,14 @@ xfs_dir2_leaf_addname( return error; } hdr = dbp->b_addr; + bf = xfs_dir3_data_bestfree_p(hdr); grown = 0; } /* * Point to the biggest freespace in our data block. */ dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); ASSERT(be16_to_cpu(dup->length) >= length); needscan = needlog = 0; /* @@ -620,13 +913,13 @@ xfs_dir2_leaf_addname( * If the bests table needs to be changed, do it. * Log the change unless we've already done that. */ - if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(hdr->bestfree[0].length)) { - bestsp[use_block] = hdr->bestfree[0].length; + if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) { + bestsp[use_block] = bf[0].length; if (!grown) - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); } - lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, highstale, &lfloglow, &lfloghigh); /* @@ -638,82 +931,40 @@ xfs_dir2_leaf_addname( /* * Log the leaf fields and give up the buffers. */ - xfs_dir2_leaf_log_header(tp, lbp); - xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); - xfs_dir2_leaf_check(dp, lbp); - xfs_dir2_data_check(dp, dbp); + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + xfs_dir3_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(mp, lbp); + xfs_dir3_data_check(dp, dbp); return 0; } -#ifdef DEBUG -/* - * Check the internal consistency of a leaf1 block. - * Pop an assert if something is wrong. 
- */ -STATIC void -xfs_dir2_leaf_check( - struct xfs_inode *dp, /* incore directory inode */ - struct xfs_buf *bp) /* leaf's buffer */ -{ - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ - - leaf = bp->b_addr; - mp = dp->i_mount; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - /* - * This value is not restrictive enough. - * Should factor in the size of the bests table as well. - * We can deduce a value for that from di_size. - */ - ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - /* - * Leaves and bests don't overlap. - */ - ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= - (char *)xfs_dir2_leaf_bests_p(ltp)); - /* - * Check hash value order, count stale entries. - */ - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); -} -#endif /* DEBUG */ - /* * Compact out any stale entries in the leaf. * Log the header and changed leaf entries, if any. */ void -xfs_dir2_leaf_compact( +xfs_dir3_leaf_compact( xfs_da_args_t *args, /* operation arguments */ + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp) /* leaf buffer */ { int from; /* source leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ + struct xfs_dir2_leaf_entry *ents; leaf = bp->b_addr; - if (!leaf->hdr.stale) { + if (!leafhdr->stale) return; - } + /* * Compress out the stale entries in place. */ - for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) { - if (leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + ents = xfs_dir3_leaf_ents_p(leaf); + for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; /* * Only actually copy the entries that are different. @@ -721,19 +972,21 @@ xfs_dir2_leaf_compact( if (from > to) { if (loglow == -1) loglow = to; - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; } to++; } /* * Update and log the header, log the leaf entries. */ - ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to); - be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale))); - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(args->trans, bp); + ASSERT(leafhdr->stale == from - to); + leafhdr->count -= leafhdr->stale; + leafhdr->stale = 0; + + xfs_dir3_leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir3_leaf_log_header(args->trans, bp); if (loglow != -1) - xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1); + xfs_dir3_leaf_log_ents(args->trans, bp, loglow, to - 1); } /* @@ -745,8 +998,9 @@ xfs_dir2_leaf_compact( * and leaf logging indices. 
*/ void -xfs_dir2_leaf_compact_x1( - struct xfs_buf *bp, /* leaf buffer */ +xfs_dir3_leaf_compact_x1( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, /* insertion index */ int *lowstalep, /* out: stale entry before us */ int *highstalep, /* out: stale entry after us */ @@ -757,22 +1011,20 @@ xfs_dir2_leaf_compact_x1( int highstale; /* stale entry at/after index */ int index; /* insertion index */ int keepstale; /* source index of kept stale */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int lowstale; /* stale entry before index */ int newindex=0; /* new insertion index */ int to; /* destination copy index */ - leaf = bp->b_addr; - ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); + ASSERT(leafhdr->stale > 1); index = *indexp; - xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale); + xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale); /* * Pick the better of lowstale and highstale. */ if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || + (highstale == leafhdr->count || index - lowstale <= highstale - index)) keepstale = lowstale; else @@ -781,15 +1033,14 @@ xfs_dir2_leaf_compact_x1( * Copy the entries in place, removing all the stale entries * except keepstale. */ - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { + for (from = to = 0; from < leafhdr->count; from++) { /* * Notice the new value of index. */ if (index == from) newindex = to; if (from != keepstale && - leaf->ents[from].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { if (from == to) *lowlogp = to; continue; @@ -803,7 +1054,7 @@ xfs_dir2_leaf_compact_x1( * Copy only the entries that have moved. */ if (from > to) - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; to++; } ASSERT(from > to); @@ -817,8 +1068,8 @@ xfs_dir2_leaf_compact_x1( /* * Adjust the leaf header values. */ - be16_add_cpu(&leaf->hdr.count, -(from - to)); - leaf->hdr.stale = cpu_to_be16(1); + leafhdr->count -= from - to; + leafhdr->stale = 1; /* * Remember the low/high stale value only in the "right" * direction. @@ -826,8 +1077,8 @@ xfs_dir2_leaf_compact_x1( if (lowstale >= newindex) lowstale = -1; else - highstale = be16_to_cpu(leaf->hdr.count); - *highlogp = be16_to_cpu(leaf->hdr.count) - 1; + highstale = leafhdr->count; + *highlogp = leafhdr->count - 1; *lowstalep = lowstale; *highstalep = highstale; } @@ -965,7 +1216,7 @@ xfs_dir2_leaf_readbuf( * Read the directory block starting at the first mapping. */ mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_dir2_data_read(NULL, dp, map->br_startoff, + error = xfs_dir3_data_read(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); @@ -994,7 +1245,7 @@ xfs_dir2_leaf_readbuf( */ if (i > mip->ra_current && map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_dir2_data_readahead(NULL, dp, + xfs_dir3_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + @@ -1007,7 +1258,7 @@ xfs_dir2_leaf_readbuf( * use our mapping, but this is a very rare case. 
*/ else if (i > mip->ra_current) { - xfs_dir2_data_readahead(NULL, dp, + xfs_dir3_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + mip->ra_offset, -1); mip->ra_current = i; @@ -1133,17 +1384,17 @@ xfs_dir2_leaf_getdents( ASSERT(xfs_dir2_byte_to_db(mp, curoff) == map_info->curdb); hdr = bp->b_addr; - xfs_dir2_data_check(dp, bp); + xfs_dir3_data_check(dp, bp); /* * Find our position in the block. */ - ptr = (char *)(hdr + 1); + ptr = (char *)xfs_dir3_data_entry_p(hdr); byteoff = xfs_dir2_byte_to_off(mp, curoff); /* * Skip past the header. */ if (byteoff == 0) - curoff += (uint)sizeof(*hdr); + curoff += xfs_dir3_data_entry_offset(hdr); /* * Skip past entries until we reach our offset. */ @@ -1220,69 +1471,12 @@ xfs_dir2_leaf_getdents( return error; } -/* - * Initialize a new leaf block, leaf1 or leafn magic accepted. - */ -int -xfs_dir2_leaf_init( - xfs_da_args_t *args, /* operation arguments */ - xfs_dir2_db_t bno, /* directory block number */ - struct xfs_buf **bpp, /* out: leaf buffer */ - int magic) /* magic number for block */ -{ - struct xfs_buf *bp; /* leaf buffer */ - xfs_inode_t *dp; /* incore directory inode */ - int error; /* error return code */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_trans_t *tp; /* transaction pointer */ - - dp = args->dp; - ASSERT(dp != NULL); - tp = args->trans; - mp = dp->i_mount; - ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && - bno < XFS_DIR2_FREE_FIRSTDB(mp)); - /* - * Get the buffer for the block. - */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) - return error; - - /* - * Initialize the header. - */ - leaf = bp->b_addr; - leaf->hdr.info.magic = cpu_to_be16(magic); - leaf->hdr.info.forw = 0; - leaf->hdr.info.back = 0; - leaf->hdr.count = 0; - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(tp, bp); - /* - * If it's a leaf-format directory initialize the tail. - * In this case our caller has the real bests table to copy into - * the block. - */ - if (magic == XFS_DIR2_LEAF1_MAGIC) { - bp->b_ops = &xfs_dir2_leaf1_buf_ops; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ltp->bestcount = 0; - xfs_dir2_leaf_log_tail(tp, bp); - } else - bp->b_ops = &xfs_dir2_leafn_buf_ops; - *bpp = bp; - return 0; -} /* * Log the bests entries indicated from a leaf1 block. */ static void -xfs_dir2_leaf_log_bests( +xfs_dir3_leaf_log_bests( xfs_trans_t *tp, /* transaction pointer */ struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ @@ -1290,11 +1484,12 @@ xfs_dir2_leaf_log_bests( { __be16 *firstb; /* pointer to first entry */ __be16 *lastb; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)); + ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; @@ -1306,7 +1501,7 @@ xfs_dir2_leaf_log_bests( * Log the leaf entries indicated from a leaf1 or leafn block. 
*/ void -xfs_dir2_leaf_log_ents( +xfs_dir3_leaf_log_ents( xfs_trans_t *tp, /* transaction pointer */ struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ @@ -1314,13 +1509,17 @@ xfs_dir2_leaf_log_ents( { xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; - leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || - leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - firstlep = &leaf->ents[first]; - lastlep = &leaf->ents[last]; + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ents = xfs_dir3_leaf_ents_p(leaf); + firstlep = &ents[first]; + lastlep = &ents[last]; xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1329,34 +1528,38 @@ xfs_dir2_leaf_log_ents( * Log the header of the leaf1 or leafn block. */ void -xfs_dir2_leaf_log_header( +xfs_dir3_leaf_log_header( struct xfs_trans *tp, struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; - leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || - leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), - (uint)(sizeof(leaf->hdr) - 1)); + xfs_dir3_leaf_hdr_size(leaf) - 1); } /* * Log the tail of the leaf1 block. */ STATIC void -xfs_dir2_leaf_log_tail( +xfs_dir3_leaf_log_tail( struct xfs_trans *tp, struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ + struct xfs_mount *mp = tp->t_mountp; + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - mp = tp->t_mountp; - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); ltp = xfs_dir2_leaf_tail_p(mp, leaf); xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), (uint)(mp->m_dirblksize - 1)); @@ -1380,6 +1583,7 @@ xfs_dir2_leaf_lookup( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leaf_lookup(args); @@ -1391,12 +1595,14 @@ xfs_dir2_leaf_lookup( } tp = args->trans; dp = args->dp; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp->i_mount, lbp); leaf = lbp->b_addr; + ents = xfs_dir3_leaf_ents_p(leaf); /* * Get to the leaf entry and contained data entry address. */ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Point to the data entry. */ @@ -1440,18 +1646,23 @@ xfs_dir2_leaf_lookup_int( xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t cidb = -1; /* case match data block no. 
*/ enum xfs_dacmp cmp; /* name compare result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; - error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; *lbpp = lbp; leaf = lbp->b_addr; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(mp, lbp); + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + /* * Look for the first leaf entry with our hash value. */ @@ -1460,9 +1671,9 @@ xfs_dir2_leaf_lookup_int( * Loop over all the entries with the right hash value * looking to match the name. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip over stale leaf entries. */ @@ -1479,7 +1690,7 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), -1, &dbp); if (error) { @@ -1520,7 +1731,7 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, cidb), -1, &dbp); if (error) { @@ -1566,6 +1777,9 @@ xfs_dir2_leaf_removename( int needscan; /* need to rescan data frees */ xfs_dir2_data_off_t oldbest; /* old value of best free */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_removename(args); @@ -1580,16 +1794,19 @@ xfs_dir2_leaf_removename( mp = dp->i_mount; leaf = lbp->b_addr; hdr = dbp->b_addr; - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); + bf = xfs_dir3_data_bestfree_p(hdr); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); /* * Point to the leaf entry, use that to point to the data entry. */ - lep = &leaf->ents[index]; + lep = &ents[index]; db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); needscan = needlog = 0; - oldbest = be16_to_cpu(hdr->bestfree[0].length); + oldbest = be16_to_cpu(bf[0].length); ltp = xfs_dir2_leaf_tail_p(mp, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); @@ -1602,10 +1819,13 @@ xfs_dir2_leaf_removename( /* * We just mark the leaf entry stale by putting a null in it. */ - be16_add_cpu(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, lbp); + leafhdr.stale++; + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, lbp, index, index); + xfs_dir3_leaf_log_ents(tp, lbp, index, index); + /* * Scan the freespace in the data block again if necessary, * log the data block header if necessary. @@ -1618,16 +1838,16 @@ xfs_dir2_leaf_removename( * If the longest freespace in the data block has changed, * put the new value in the bests table and log that. 
*/ - if (be16_to_cpu(hdr->bestfree[0].length) != oldbest) { - bestsp[db] = hdr->bestfree[0].length; - xfs_dir2_leaf_log_bests(tp, lbp, db, db); + if (be16_to_cpu(bf[0].length) != oldbest) { + bestsp[db] = bf[0].length; + xfs_dir3_leaf_log_bests(tp, lbp, db, db); } - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the data block is now empty then get rid of the data block. */ - if (be16_to_cpu(hdr->bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(*hdr)) { + if (be16_to_cpu(bf[0].length) == + mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)) { ASSERT(db != mp->m_dirdatablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* @@ -1638,7 +1858,7 @@ xfs_dir2_leaf_removename( */ if (error == ENOSPC && args->total == 0) error = 0; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(mp, lbp); return error; } dbp = NULL; @@ -1661,8 +1881,8 @@ xfs_dir2_leaf_removename( memmove(&bestsp[db - i], bestsp, (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); be32_add_cpu(<p->bestcount, -(db - i)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); } else bestsp[db] = cpu_to_be16(NULLDATAOFF); } @@ -1672,7 +1892,7 @@ xfs_dir2_leaf_removename( else if (db != mp->m_dirdatablk) dbp = NULL; - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(mp, lbp); /* * See if we can convert to block form. */ @@ -1695,6 +1915,7 @@ xfs_dir2_leaf_replace( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leaf_replace(args); @@ -1706,10 +1927,11 @@ xfs_dir2_leaf_replace( } dp = args->dp; leaf = lbp->b_addr; + ents = xfs_dir3_leaf_ents_p(leaf); /* * Point to the leaf entry, get data address from it. */ - lep = &leaf->ents[index]; + lep = &ents[index]; /* * Point to the data entry. */ @@ -1723,7 +1945,7 @@ xfs_dir2_leaf_replace( dep->inumber = cpu_to_be64(args->inumber); tp = args->trans; xfs_dir2_data_log_entry(tp, dbp, dep); - xfs_dir2_leaf_check(dp, lbp); + xfs_dir3_leaf_check(dp->i_mount, lbp); xfs_trans_brelse(tp, lbp); return 0; } @@ -1745,17 +1967,22 @@ xfs_dir2_leaf_search_hash( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; leaf = lbp->b_addr; + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + #ifndef __KERNEL__ - if (!leaf->hdr.count) + if (!leafhdr.count) return 0; #endif /* * Note, the table cannot be empty, so we have to go through the loop. * Binary search the leaf entries looking for our hash value. */ - for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1, + for (lep = ents, low = 0, high = leafhdr.count - 1, hashwant = args->hashval; low <= high; ) { mid = (low + high) >> 1; @@ -1807,7 +2034,7 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. 
*/ - error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); if (error) return error; @@ -1817,10 +2044,12 @@ xfs_dir2_leaf_trim_data( #ifdef DEBUG { struct xfs_dir2_data_hdr *hdr = dbp->b_addr; + struct xfs_dir2_data_free *bf = xfs_dir3_data_bestfree_p(hdr); - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); - ASSERT(be16_to_cpu(hdr->bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(*hdr)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + ASSERT(be16_to_cpu(bf[0].length) == + mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); } #endif @@ -1839,23 +2068,29 @@ xfs_dir2_leaf_trim_data( bestsp = xfs_dir2_leaf_bests_p(ltp); be32_add_cpu(<p->bestcount, -1); memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); return 0; } static inline size_t -xfs_dir2_leaf_size( - struct xfs_dir2_leaf_hdr *hdr, +xfs_dir3_leaf_size( + struct xfs_dir3_icleaf_hdr *hdr, int counts) { - int entries; + int entries; + int hdrsize; + + entries = hdr->count - hdr->stale; + if (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR2_LEAFN_MAGIC) + hdrsize = sizeof(struct xfs_dir2_leaf_hdr); + else + hdrsize = sizeof(struct xfs_dir3_leaf_hdr); - entries = be16_to_cpu(hdr->count) - be16_to_cpu(hdr->stale); - return sizeof(xfs_dir2_leaf_hdr_t) + - entries * sizeof(xfs_dir2_leaf_entry_t) + - counts * sizeof(xfs_dir2_data_off_t) + - sizeof(xfs_dir2_leaf_tail_t); + return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t) + + counts * sizeof(xfs_dir2_data_off_t) + + sizeof(xfs_dir2_leaf_tail_t); } /* @@ -1879,6 +2114,8 @@ xfs_dir2_node_to_leaf( xfs_mount_t *mp; /* filesystem mount point */ int rval; /* successful free trim? */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir3_icfree_hdr freehdr; /* * There's more than a leaf level in the btree, so there must @@ -1928,7 +2165,11 @@ xfs_dir2_node_to_leaf( return 0; lbp = state->path.blk[0].bp; leaf = lbp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + /* * Read the freespace block. */ @@ -1936,44 +2177,49 @@ xfs_dir2_node_to_leaf( if (error) return error; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - ASSERT(!free->hdr.firstdb); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + + ASSERT(!freehdr.firstdb); /* * Now see if the leafn and free data will fit in a leaf1. * If not, release the buffer and give up. */ - if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) > - mp->m_dirblksize) { + if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > mp->m_dirblksize) { xfs_trans_brelse(tp, fbp); return 0; } /* * If the leaf has any stale entries in it, compress them out. - * The compact routine will log the header. 
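 *
 * (The xfs_dir3_leaf_size() check above is plain arithmetic: header, live
 * entries, bests array and tail must fit one directory block. A
 * hypothetical restatement, not part of this patch:)
 */
static inline int
example_leaf1_fits(
	int	hdrsize,	/* leaf1/leaf3 header size */
	int	live,		/* count - stale */
	int	bestcount,	/* entries in the bests array */
	int	dirblksize)	/* directory block size */
{
	return hdrsize + live * (int)sizeof(struct xfs_dir2_leaf_entry) +
	       bestcount * (int)sizeof(xfs_dir2_data_off_t) +
	       (int)sizeof(struct xfs_dir2_leaf_tail) <= dirblksize;
}
/*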
*/ - if (be16_to_cpu(leaf->hdr.stale)) - xfs_dir2_leaf_compact(args, lbp); - else - xfs_dir2_leaf_log_header(tp, lbp); + if (leafhdr.stale) + xfs_dir3_leaf_compact(args, &leafhdr, lbp); - lbp->b_ops = &xfs_dir2_leaf1_buf_ops; - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + lbp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF); + leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC) + ? XFS_DIR2_LEAF1_MAGIC + : XFS_DIR3_LEAF1_MAGIC; /* * Set up the leaf tail from the freespace block. */ ltp = xfs_dir2_leaf_tail_p(mp, leaf); - ltp->bestcount = free->hdr.nvalid; + ltp->bestcount = cpu_to_be32(freehdr.nvalid); + /* * Set up the leaf bests table. */ - memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests, - be32_to_cpu(ltp->bestcount) * sizeof(xfs_dir2_data_off_t)); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_check(dp, lbp); + memcpy(xfs_dir2_leaf_bests_p(ltp), xfs_dir3_free_bests_p(mp, free), + freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); + + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, lbp); + xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_check(mp, lbp); + /* * Get rid of the freespace block. */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5980f9b7fa9b..ecc6c661064c 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -32,20 +33,14 @@ #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Function declarations. */ static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, int index); -#ifdef DEBUG -static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp); -#else -#define xfs_dir2_leafn_check(dp, bp) -#endif -static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s, - int start_s, struct xfs_buf *bp_d, - int start_d, int count); static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -55,52 +50,126 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); -static void -xfs_dir2_free_verify( +/* + * Check internal consistency of a leafn block. 
+ */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(mp, bp) \ +do { \ + if (!xfs_dir3_leafn_check((mp), (bp))) \ + ASSERT(0); \ +} while (0); + +static bool +xfs_dir3_leafn_check( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(mp, bp) +#endif + +static bool +xfs_dir3_free_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; - int block_ok = 0; - block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); - if (!block_ok) { - XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", - XFS_ERRLEVEL_LOW, mp, hdr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) + return false; } + + /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ + + return true; } static void -xfs_dir2_free_read_verify( +xfs_dir3_free_read_verify( struct xfs_buf *bp) { - xfs_dir2_free_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_DIR3_FREE_CRC_OFF)) || + !xfs_dir3_free_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void -xfs_dir2_free_write_verify( +xfs_dir3_free_write_verify( struct xfs_buf *bp) { - xfs_dir2_free_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_free_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF); } -static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { - .verify_read = xfs_dir2_free_read_verify, - .verify_write = xfs_dir2_free_write_verify, +const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .verify_read = xfs_dir3_free_read_verify, + .verify_write = xfs_dir3_free_write_verify, }; static int -__xfs_dir2_free_read( +__xfs_dir3_free_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp) { - return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, &xfs_dir2_free_buf_ops); + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + + /* try read returns without an error or *bpp if it lands in a hole */ + if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); + return err; } int @@ -110,7 +179,7 @@ xfs_dir2_free_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); + 
return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp); } static int @@ -120,7 +189,95 @@ xfs_dir2_free_try_read( xfs_dablk_t fbno, struct xfs_buf **bpp) { - return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); + return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); +} + + +void +xfs_dir3_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + if (from->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)) { + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + } else { + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; + + to->magic = be32_to_cpu(hdr3->hdr.magic); + to->firstdb = be32_to_cpu(hdr3->firstdb); + to->nvalid = be32_to_cpu(hdr3->nvalid); + to->nused = be32_to_cpu(hdr3->nused); + } + + ASSERT(to->magic == XFS_DIR2_FREE_MAGIC || + to->magic == XFS_DIR3_FREE_MAGIC); +} + +static void +xfs_dir3_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC || + from->magic == XFS_DIR3_FREE_MAGIC); + + if (from->magic == XFS_DIR2_FREE_MAGIC) { + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); + } else { + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; + + hdr3->hdr.magic = cpu_to_be32(from->magic); + hdr3->firstdb = cpu_to_be32(from->firstdb); + hdr3->nvalid = cpu_to_be32(from->nvalid); + hdr3->nused = cpu_to_be32(from->nused); + } +} + +static int +xfs_dir3_free_get_buf( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dir2_db_t fbno, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + struct xfs_dir3_icfree_hdr hdr; + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF); + bp->b_ops = &xfs_dir3_free_buf_ops; + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. 
+ */ + hdr.magic = XFS_DIR2_FREE_MAGIC; + hdr.firstdb = 0; + hdr.nused = 0; + hdr.nvalid = 0; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + hdr.magic = XFS_DIR3_FREE_MAGIC; + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + } + xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); + *bpp = bp; + return 0; } /* @@ -134,13 +291,16 @@ xfs_dir2_free_log_bests( int last) /* last entry to log */ { xfs_dir2_free_t *free; /* freespace structure */ + __be16 *bests; free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + bests = xfs_dir3_free_bests_p(tp->t_mountp, free); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); xfs_trans_log_buf(tp, bp, - (uint)((char *)&free->bests[first] - (char *)free), - (uint)((char *)&free->bests[last] - (char *)free + - sizeof(free->bests[0]) - 1)); + (uint)((char *)&bests[first] - (char *)free), + (uint)((char *)&bests[last] - (char *)free + + sizeof(bests[0]) - 1)); } /* @@ -154,9 +314,9 @@ xfs_dir2_free_log_header( xfs_dir2_free_t *free; /* freespace structure */ free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), - (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); + xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1); } /* @@ -183,6 +343,7 @@ xfs_dir2_leaf_to_node( xfs_dir2_data_off_t off; /* freespace entry value */ __be16 *to; /* pointer to freespace entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; trace_xfs_dir2_leaf_to_node(args); @@ -199,44 +360,53 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, - XFS_DATA_FORK); + error = xfs_dir3_free_get_buf(tp, dp, fdb, &fbp); if (error) return error; - fbp->b_ops = &xfs_dir2_free_buf_ops; free = fbp->b_addr; + xfs_dir3_free_hdr_from_disk(&freehdr, free); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); - /* - * Initialize the freespace block header. - */ - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = 0; - ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize); - free->hdr.nvalid = ltp->bestcount; + ASSERT(be32_to_cpu(ltp->bestcount) <= + (uint)dp->i_d.di_size / mp->m_dirblksize); + /* * Copy freespace entries from the leaf block to the new block. * Count active entries. */ - for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests; - i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { + from = xfs_dir2_leaf_bests_p(ltp); + to = xfs_dir3_free_bests_p(mp, free); + for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { if ((off = be16_to_cpu(*from)) != NULLDATAOFF) n++; *to = cpu_to_be16(off); } - free->hdr.nused = cpu_to_be32(n); - - lbp->b_ops = &xfs_dir2_leafn_buf_ops; - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); /* - * Log everything. + * Now initialize the freespace block header. 
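 *
 * In the free header, nvalid is the number of slots in the bests array
 * while nused counts the slots that are not NULLDATAOFF. A recount in
 * isolation (hypothetical helper, not part of this patch) would be:
 */
static inline int
example_free_count_used(
	__be16	*bests,		/* freespace bests array */
	int	nvalid)		/* valid slots */
{
	int	i;
	int	n = 0;

	for (i = 0; i < nvalid; i++)
		if (bests[i] != cpu_to_be16(NULLDATAOFF))
			n++;
	return n;
}
/*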
*/ - xfs_dir2_leaf_log_header(tp, lbp); + freehdr.nused = n; + freehdr.nvalid = be32_to_cpu(ltp->bestcount); + + xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(tp, fbp, 0, freehdr.nvalid - 1); xfs_dir2_free_log_header(tp, fbp); - xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); - xfs_dir2_leafn_check(dp, lbp); + + /* + * Converting the leaf to a leafnode is just a matter of changing the + * magic number and the ops. Do the change directly to the buffer as + * it's less work (and less code) than decoding the header to host + * format and back again. + */ + if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)) + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + else + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + lbp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF); + xfs_dir3_leaf_log_header(tp, lbp); + xfs_dir3_leaf_check(mp, lbp); return 0; } @@ -260,6 +430,8 @@ xfs_dir2_leafn_add( int lowstale; /* previous stale entry */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_add(args, index); @@ -267,6 +439,8 @@ xfs_dir2_leafn_add( mp = dp->i_mount; tp = args->trans; leaf = bp->b_addr; + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); /* * Quick check just to make sure we are not going to index @@ -282,15 +456,15 @@ xfs_dir2_leafn_add( * a compact. */ - if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) { - if (!leaf->hdr.stale) + if (leafhdr.count == xfs_dir3_max_leaf_ents(mp, leaf)) { + if (!leafhdr.stale) return XFS_ERROR(ENOSPC); - compact = be16_to_cpu(leaf->hdr.stale) > 1; + compact = leafhdr.stale > 1; } else compact = 0; - ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval); - ASSERT(index == be16_to_cpu(leaf->hdr.count) || - be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); + ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval); + ASSERT(index == leafhdr.count || + be32_to_cpu(ents[index].hashval) >= args->hashval); if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; @@ -299,61 +473,51 @@ xfs_dir2_leafn_add( * Compact out all but one stale leaf entry. Leaves behind * the entry closest to index. */ - if (compact) { - xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); - } - /* - * Set impossible logging indices for this case. - */ - else if (leaf->hdr.stale) { - lfloglow = be16_to_cpu(leaf->hdr.count); + if (compact) + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); + else if (leafhdr.stale) { + /* + * Set impossible logging indices for this case. + */ + lfloglow = leafhdr.count; lfloghigh = -1; } /* * Insert the new entry, log everything. 
*/ - lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale, + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, highstale, &lfloglow, &lfloghigh); lep->hashval = cpu_to_be32(args->hashval); lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, args->blkno, args->index)); - xfs_dir2_leaf_log_header(tp, bp); - xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); - xfs_dir2_leafn_check(dp, bp); + + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, bp); + xfs_dir3_leaf_log_ents(tp, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(mp, bp); return 0; } #ifdef DEBUG -/* - * Check internal consistency of a leafn block. - */ -void -xfs_dir2_leafn_check( - struct xfs_inode *dp, - struct xfs_buf *bp) +static void +xfs_dir2_free_hdr_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_dir2_db_t db) { - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ + struct xfs_dir3_icfree_hdr hdr; - leaf = bp->b_addr; - mp = dp->i_mount; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp)); - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) { - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - } - if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); + xfs_dir3_free_hdr_from_disk(&hdr, bp->b_addr); + + ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(mp)) == 0); + ASSERT(hdr.firstdb <= db); + ASSERT(db < hdr.firstdb + hdr.nvalid); } +#else +#define xfs_dir2_free_hdr_check(mp, dp, db) #endif /* DEBUG */ /* @@ -365,15 +529,22 @@ xfs_dir2_leafn_lasthash( struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); - leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) + *count = leafhdr.count; + if (!leafhdr.count) return 0; - return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval); + + ents = xfs_dir3_leaf_ents_p(leaf); + return be32_to_cpu(ents[leafhdr.count - 1].hashval); } /* @@ -402,16 +573,19 @@ xfs_dir2_leafn_lookup_for_addname( xfs_dir2_db_t newdb; /* new data block number */ xfs_dir2_db_t newfdb; /* new free block number */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); -#ifdef __KERNEL__ - ASSERT(be16_to_cpu(leaf->hdr.count) > 0); -#endif - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + + xfs_dir3_leaf_check(mp, bp); + ASSERT(leafhdr.count > 0); + /* * Look up the hash value in the leaf entries. 
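 *
 * Leaf entries are kept sorted by hashval, so after the binary search all
 * candidates sit in one contiguous run. Scanning that run in isolation
 * (hypothetical helper, not part of this patch):
 */
static inline int
example_hash_run_end(
	struct xfs_dir2_leaf_entry	*ents,
	int				index,	 /* start of the run */
	int				count,	 /* live entry count */
	xfs_dahash_t			hashval) /* wanted hash value */
{
	while (index < count && be32_to_cpu(ents[index].hashval) == hashval)
		index++;
	return index;	/* first index past the run */
}
/*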
*/ @@ -424,15 +598,16 @@ xfs_dir2_leafn_lookup_for_addname( curbp = state->extrablk.bp; curfdb = state->extrablk.blkno; free = curbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); } length = xfs_dir2_data_entsize(args->namelen); /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. */ @@ -451,6 +626,8 @@ xfs_dir2_leafn_lookup_for_addname( * in hand, take a look at it. */ if (newdb != curdb) { + __be16 *bests; + curdb = newdb; /* * Convert the data block to the free block @@ -473,13 +650,8 @@ xfs_dir2_leafn_lookup_for_addname( if (error) return error; free = curbp->b_addr; - ASSERT(be32_to_cpu(free->hdr.magic) == - XFS_DIR2_FREE_MAGIC); - ASSERT((be32_to_cpu(free->hdr.firstdb) % - xfs_dir2_free_max_bests(mp)) == 0); - ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); - ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + - be32_to_cpu(free->hdr.nvalid)); + + xfs_dir2_free_hdr_check(mp, curbp, curdb); } /* * Get the index for our entry. @@ -488,8 +660,8 @@ xfs_dir2_leafn_lookup_for_addname( /* * If it has room, return it. */ - if (unlikely(free->bests[fi] == - cpu_to_be16(NULLDATAOFF))) { + bests = xfs_dir3_free_bests_p(mp, free); + if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) { XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", XFS_ERRLEVEL_LOW, mp); if (curfdb != newfdb) @@ -497,7 +669,7 @@ xfs_dir2_leafn_lookup_for_addname( return XFS_ERROR(EFSCORRUPTED); } curfdb = newfdb; - if (be16_to_cpu(free->bests[fi]) >= length) + if (be16_to_cpu(bests[fi]) >= length) goto out; } } @@ -511,6 +683,12 @@ out: state->extrablk.bp = curbp; state->extrablk.index = fi; state->extrablk.blkno = curfdb; + + /* + * Important: this magic number is not in the buffer - it's for + * buffer type information and therefore only the free/data type + * matters here, not whether CRCs are enabled or not. + */ state->extrablk.magic = XFS_DIR2_FREE_MAGIC; } else { state->extravalid = 0; @@ -545,16 +723,19 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir2_db_t newdb; /* new data block number */ xfs_trans_t *tp; /* transaction pointer */ enum xfs_dacmp cmp; /* comparison result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); -#ifdef __KERNEL__ - ASSERT(be16_to_cpu(leaf->hdr.count) > 0); -#endif - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + + xfs_dir3_leaf_check(mp, bp); + ASSERT(leafhdr.count > 0); + /* * Look up the hash value in the leaf entries. */ @@ -569,9 +750,9 @@ xfs_dir2_leafn_lookup_for_entry( /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(lep->hashval) == args->hashval; - lep++, index++) { + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { /* * Skip stale leaf entries. 
*/ @@ -604,13 +785,13 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_dir2_data_read(tp, dp, + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), -1, &curbp); if (error) return error; } - xfs_dir2_data_check(dp, curbp); + xfs_dir3_data_check(dp, curbp); curdb = newdb; } /* @@ -638,13 +819,13 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_ops = &xfs_dir2_data_buf_ops; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } } - ASSERT(index == be16_to_cpu(leaf->hdr.count) || - (args->op_flags & XFS_DA_OP_OKNOENT)); + ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT)); if (curbp) { if (args->cmpresult == XFS_CMP_DIFFERENT) { /* Giving back last used data block. */ @@ -653,7 +834,8 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_ops = &xfs_dir2_data_buf_ops; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -689,52 +871,50 @@ xfs_dir2_leafn_lookup_int( * Log entries and headers. Stale entries are preserved. */ static void -xfs_dir2_leafn_moveents( - xfs_da_args_t *args, /* operation arguments */ - struct xfs_buf *bp_s, /* source leaf buffer */ - int start_s, /* source leaf index */ - struct xfs_buf *bp_d, /* destination leaf buffer */ - int start_d, /* destination leaf index */ - int count) /* count of leaves to copy */ +xfs_dir3_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp_s, /* source */ + struct xfs_dir3_icleaf_hdr *shdr, + struct xfs_dir2_leaf_entry *sents, + int start_s,/* source leaf index */ + struct xfs_buf *bp_d, /* destination */ + struct xfs_dir3_icleaf_hdr *dhdr, + struct xfs_dir2_leaf_entry *dents, + int start_d,/* destination leaf index */ + int count) /* count of leaves to copy */ { - xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */ - xfs_dir2_leaf_t *leaf_s; /* source leaf structure */ - int stale; /* count stale leaves copied */ - xfs_trans_t *tp; /* transaction pointer */ + struct xfs_trans *tp = args->trans; + int stale; /* count stale leaves copied */ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); /* * Silently return if nothing to do. */ - if (count == 0) { + if (count == 0) return; - } - tp = args->trans; - leaf_s = bp_s->b_addr; - leaf_d = bp_d->b_addr; + /* * If the destination index is not the end of the current * destination leaf entries, open up a hole in the destination * to hold the new entries. */ - if (start_d < be16_to_cpu(leaf_d->hdr.count)) { - memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d], - (be16_to_cpu(leaf_d->hdr.count) - start_d) * - sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count, - count + be16_to_cpu(leaf_d->hdr.count) - 1); + if (start_d < dhdr->count) { + memmove(&dents[start_d + count], &dents[start_d], + (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(tp, bp_d, start_d + count, + count + dhdr->count - 1); } /* * If the source has stale leaves, count the ones in the copy range * so we can update the header correctly. 
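 *
 * Stale entries are marked by a null address, so the count is a simple
 * scan (hypothetical stand-alone form of the loop below, not part of
 * this patch):
 */
static inline int
example_count_stale(
	struct xfs_dir2_leaf_entry	*ents,
	int				start,
	int				count)
{
	int	i;
	int	stale = 0;

	for (i = start; i < start + count; i++)
		if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
			stale++;
	return stale;
}
/*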
*/ - if (leaf_s->hdr.stale) { + if (shdr->stale) { int i; /* temp leaf index */ for (i = start_s, stale = 0; i < start_s + count; i++) { - if (leaf_s->ents[i].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + if (sents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } } else @@ -742,29 +922,27 @@ xfs_dir2_leafn_moveents( /* * Copy the leaf entries from source to destination. */ - memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s], + memcpy(&dents[start_d], &sents[start_s], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + xfs_dir3_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + /* * If there are source entries after the ones we copied, * delete the ones we copied by sliding the next ones down. */ - if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) { - memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count], + if (start_s + count < shdr->count) { + memmove(&sents[start_s], &sents[start_s + count], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); + xfs_dir3_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); } + /* * Update the headers and log them. */ - be16_add_cpu(&leaf_s->hdr.count, -(count)); - be16_add_cpu(&leaf_s->hdr.stale, -(stale)); - be16_add_cpu(&leaf_d->hdr.count, count); - be16_add_cpu(&leaf_d->hdr.stale, stale); - xfs_dir2_leaf_log_header(tp, bp_s); - xfs_dir2_leaf_log_header(tp, bp_d); - xfs_dir2_leafn_check(args->dp, bp_s); - xfs_dir2_leafn_check(args->dp, bp_d); + shdr->count -= count; + shdr->stale -= stale; + dhdr->count += count; + dhdr->stale += stale; } /* @@ -773,21 +951,25 @@ xfs_dir2_leafn_moveents( */ int /* sort order */ xfs_dir2_leafn_order( - struct xfs_buf *leaf1_bp, /* leaf1 buffer */ - struct xfs_buf *leaf2_bp) /* leaf2 buffer */ + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ { - xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ - xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ - - leaf1 = leaf1_bp->b_addr; - leaf2 = leaf2_bp->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - if (be16_to_cpu(leaf1->hdr.count) > 0 && - be16_to_cpu(leaf2->hdr.count) > 0 && - (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) || - be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) < - be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval))) + struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr; + struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr; + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + + xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1); + xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = xfs_dir3_leaf_ents_p(leaf1); + ents2 = xfs_dir3_leaf_ents_p(leaf2); + + if (hdr1.count > 0 && hdr2.count > 0 && + (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || + be32_to_cpu(ents2[hdr2.count - 1].hashval) < + be32_to_cpu(ents1[hdr1.count - 1].hashval))) return 1; return 0; } @@ -816,6 +998,10 @@ xfs_dir2_leafn_rebalance( #endif int oldsum; /* old total leaf count */ int swap; /* swapped leaf blocks */ + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; args = state->args; /* @@ -830,11 +1016,17 @@ xfs_dir2_leafn_rebalance( } leaf1 = blk1->bp->b_addr; 
leaf2 = blk2->bp->b_addr; - oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); + xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1); + xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = xfs_dir3_leaf_ents_p(leaf1); + ents2 = xfs_dir3_leaf_ents_p(leaf2); + + oldsum = hdr1.count + hdr2.count; #ifdef DEBUG - oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); + oldstale = hdr1.stale + hdr2.stale; #endif mid = oldsum >> 1; + /* * If the old leaf count was odd then the new one will be even, * so we need to divide the new count evenly. @@ -842,10 +1034,10 @@ xfs_dir2_leafn_rebalance( if (oldsum & 1) { xfs_dahash_t midhash; /* middle entry hash value */ - if (mid >= be16_to_cpu(leaf1->hdr.count)) - midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval); + if (mid >= hdr1.count) + midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval); else - midhash = be32_to_cpu(leaf1->ents[mid].hashval); + midhash = be32_to_cpu(ents1[mid].hashval); isleft = args->hashval <= midhash; } /* @@ -859,30 +1051,42 @@ xfs_dir2_leafn_rebalance( * Calculate moved entry count. Positive means left-to-right, * negative means right-to-left. Then move the entries. */ - count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0); + count = hdr1.count - mid + (isleft == 0); if (count > 0) - xfs_dir2_leafn_moveents(args, blk1->bp, - be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count); + xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1, + hdr1.count - count, blk2->bp, + &hdr2, ents2, 0, count); else if (count < 0) - xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp, - be16_to_cpu(leaf1->hdr.count), count); - ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum); - ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale); + xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0, + blk1->bp, &hdr1, ents1, + hdr1.count, count); + + ASSERT(hdr1.count + hdr2.count == oldsum); + ASSERT(hdr1.stale + hdr2.stale == oldstale); + + /* log the changes made when moving the entries */ + xfs_dir3_leaf_hdr_to_disk(leaf1, &hdr1); + xfs_dir3_leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir3_leaf_log_header(args->trans, blk1->bp); + xfs_dir3_leaf_log_header(args->trans, blk2->bp); + + xfs_dir3_leaf_check(args->dp->i_mount, blk1->bp); + xfs_dir3_leaf_check(args->dp->i_mount, blk2->bp); + /* * Mark whether we're inserting into the old or new leaf. */ - if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count)) + if (hdr1.count < hdr2.count) state->inleaf = swap; - else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count)) + else if (hdr1.count > hdr2.count) state->inleaf = !swap; else - state->inleaf = - swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count)); + state->inleaf = swap ^ (blk1->index <= hdr1.count); /* * Adjust the expected index for insertion. */ if (!state->inleaf) - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - hdr1.count; /* * Finally sanity check just to make sure we are not returning a @@ -898,7 +1102,7 @@ xfs_dir2_leafn_rebalance( } static int -xfs_dir2_data_block_free( +xfs_dir3_data_block_free( xfs_da_args_t *args, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_free *free, @@ -909,57 +1113,66 @@ xfs_dir2_data_block_free( { struct xfs_trans *tp = args->trans; int logfree = 0; + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; - if (!hdr) { - /* One less used entry in the free table. 
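Every function touched by this hunk follows the same conversion discipline: decode the big-endian on-disk header into a CPU-native in-core copy once, operate on plain integers, and encode the result back just before logging. A minimal sketch of the pattern, using simplified stand-in structs rather than the real XFS definitions:

#include <linux/types.h>
#include <asm/byteorder.h>

/* illustrative stand-ins for the on-disk and in-core leaf headers */
struct disk_leaf_hdr {
	__be16	count;			/* on disk: big-endian */
	__be16	stale;
};

struct icleaf_hdr {
	__u16	count;			/* in core: CPU-endian */
	__u16	stale;
};

static void leaf_hdr_from_disk(struct icleaf_hdr *to,
			       const struct disk_leaf_hdr *from)
{
	to->count = be16_to_cpu(from->count);
	to->stale = be16_to_cpu(from->stale);
}

static void leaf_hdr_to_disk(struct disk_leaf_hdr *to,
			     const struct icleaf_hdr *from)
{
	to->count = cpu_to_be16(from->count);
	to->stale = cpu_to_be16(from->stale);
}

Working on the decoded copy is what lets expressions such as hdr1.count + hdr2.count above drop their be16_to_cpu() wrappers, and it confines the v2/v3 on-disk format differences to the two conversion helpers.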
*/ - be32_add_cpu(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + bests = xfs_dir3_free_bests_p(tp->t_mountp, free); + if (hdr) { /* - * If this was the last entry in the table, we can trim the - * table size back. There might be other entries at the end - * referring to non-existent data blocks, get those too. + * Data block is not empty, just set the free entry to the new + * value. */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ + bests[findex] = cpu_to_be16(longest); + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + return 0; + } - for (i = findex - 1; i >= 0; i--) { - if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) - break; - } - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } else { - /* Not the last entry, just punch it out. */ - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. - */ - if (!free->hdr.nused) { - int error; + /* One less used entry in the free table. */ + freehdr.nused--; - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. In this case some one - * else will eventually get rid of this block. - */ + /* + * If this was the last entry in the table, we can trim the table size + * back. There might be other entries at the end referring to + * non-existent data blocks, get those too. + */ + if (findex == freehdr.nvalid - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (bests[i] != cpu_to_be16(NULLDATAOFF)) + break; } + freehdr.nvalid = i + 1; + logfree = 0; } else { + /* Not the last entry, just punch it out. */ + bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + + xfs_dir3_free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_log_header(tp, fbp); + + /* + * If there are no useful entries left in the block, get rid of the + * block if we can. + */ + if (!freehdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; /* - * Data block is not empty, just set the free entry to the new - * value. + * It's possible to get ENOSPC if there is no + * space reservation. In this case someone + * else will eventually get rid of this block. */ - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; } /* Log the free entry that changed, unless we got rid of it. */ @@ -994,6 +1207,9 @@ xfs_dir2_leafn_remove( int needlog; /* need to log data header */ int needscan; /* need to rescan data frees */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_remove(args, index); @@ -1001,11 +1217,14 @@ xfs_dir2_leafn_remove( tp = args->trans; mp = dp->i_mount; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + /* * Point to the entry we're removing. */ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Extract the data block and offset from the entry.
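The nvalid handling above deserves a second look: when the slot being freed is the last valid one, the table is walked backwards so that any other trailing NULLDATAOFF slots are trimmed in the same pass. A self-contained sketch of that logic, assuming a CPU-endian copy of the bests[] table rather than the on-disk __be16 array:

#define NULL_OFF	0xffff	/* stand-in for NULLDATAOFF */

/* returns the new nvalid after freeing slot 'findex' */
static int trim_free_table(unsigned short *bests, int nvalid, int findex)
{
	int i;

	if (findex != nvalid - 1) {
		/* interior slot: punch it out, table length unchanged */
		bests[findex] = NULL_OFF;
		return nvalid;
	}

	/* last slot freed: also trim any dead slots just before it */
	for (i = findex - 1; i >= 0; i--) {
		if (bests[i] != NULL_OFF)
			break;
	}
	return i + 1;
}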
*/ @@ -1013,14 +1232,18 @@ xfs_dir2_leafn_remove( ASSERT(dblk->blkno == db); off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); + /* * Kill the leaf entry by marking it stale. * Log the leaf block changes. */ - be16_add_cpu(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, bp); + leafhdr.stale++; + xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(tp, bp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, bp, index, index); + xfs_dir3_leaf_log_ents(tp, bp, index, index); + /* * Make the data entry free. Keep track of the longest freespace * in the data block in case it changes. @@ -1028,7 +1251,8 @@ xfs_dir2_leafn_remove( dbp = dblk->bp; hdr = dbp->b_addr; dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); - longest = be16_to_cpu(hdr->bestfree[0].length); + bf = xfs_dir3_data_bestfree_p(hdr); + longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; xfs_dir2_data_make_free(tp, dbp, off, xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); @@ -1040,12 +1264,12 @@ xfs_dir2_leafn_remove( xfs_dir2_data_freescan(mp, hdr, &needlog); if (needlog) xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the longest data block freespace changes, need to update * the corresponding freeblock entry. */ - if (longest < be16_to_cpu(hdr->bestfree[0].length)) { + if (longest < be16_to_cpu(bf[0].length)) { int error; /* error return value */ struct xfs_buf *fbp; /* freeblock buffer */ xfs_dir2_db_t fdb; /* freeblock block number */ @@ -1062,20 +1286,25 @@ xfs_dir2_leafn_remove( if (error) return error; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); - ASSERT(be32_to_cpu(free->hdr.firstdb) == - xfs_dir2_free_max_bests(mp) * - (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); +#ifdef DEBUG + { + struct xfs_dir3_icfree_hdr freehdr; + xfs_dir3_free_hdr_from_disk(&freehdr, free); + ASSERT(freehdr.firstdb == xfs_dir3_free_max_bests(mp) * + (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); + } +#endif /* * Calculate which entry we need to fix. */ findex = xfs_dir2_db_to_fdindex(mp, db); - longest = be16_to_cpu(hdr->bestfree[0].length); + longest = be16_to_cpu(bf[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == mp->m_dirblksize - (uint)sizeof(*hdr)) { + if (longest == mp->m_dirblksize - + xfs_dir3_data_entry_offset(hdr)) { /* * Try to punch out the data block. */ @@ -1096,21 +1325,19 @@ xfs_dir2_leafn_remove( * If we got rid of the data block, we can eliminate that entry * in the free block. */ - error = xfs_dir2_data_block_free(args, hdr, free, + error = xfs_dir3_data_block_free(args, hdr, free, fdb, findex, fbp, longest); if (error) return error; } - xfs_dir2_leafn_check(dp, bp); + xfs_dir3_leaf_check(mp, bp); /* * Return indication of whether this leaf block is empty enough * to justify trying to join it with a neighbor. */ - *rval = - ((uint)sizeof(leaf->hdr) + - (uint)sizeof(leaf->ents[0]) * - (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) < + *rval = (xfs_dir3_leaf_hdr_size(leaf) + + (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < mp->m_dir_magicpct; return 0; } @@ -1143,11 +1370,11 @@ xfs_dir2_leafn_split( /* * Initialize the new leaf block. 
*/ - error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno), - &newblk->bp, XFS_DIR2_LEAFN_MAGIC); - if (error) { + error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(mp, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) return error; - } + newblk->blkno = blkno; newblk->magic = XFS_DIR2_LEAFN_MAGIC; /* @@ -1155,7 +1382,7 @@ xfs_dir2_leafn_split( * block into the leaves. */ xfs_dir2_leafn_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) { return error; } @@ -1171,8 +1398,8 @@ xfs_dir2_leafn_split( */ oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL); newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL); - xfs_dir2_leafn_check(args->dp, oldblk->bp); - xfs_dir2_leafn_check(args->dp, newblk->bp); + xfs_dir3_leaf_check(mp, oldblk->bp); + xfs_dir3_leaf_check(mp, newblk->bp); return error; } @@ -1198,9 +1425,10 @@ xfs_dir2_leafn_toosmall( int error; /* error return value */ int forward; /* sibling block direction */ int i; /* sibling counter */ - xfs_da_blkinfo_t *info; /* leaf block header */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int rval; /* result from path_shift */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; /* * Check for the degenerate case of the block being over 50% full. @@ -1208,11 +1436,13 @@ xfs_dir2_leafn_toosmall( * to coalesce with a sibling. */ blk = &state->path.blk[state->path.active - 1]; - info = blk->bp->b_addr; - ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]); + leaf = blk->bp->b_addr; + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + xfs_dir3_leaf_check(state->args->dp->i_mount, blk->bp); + + count = leafhdr.count - leafhdr.stale; + bytes = xfs_dir3_leaf_hdr_size(leaf) + count * sizeof(ents[0]); if (bytes > (state->blocksize >> 1)) { /* * Blk over 50%, don't try to join. @@ -1231,9 +1461,9 @@ xfs_dir2_leafn_toosmall( * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ - forward = (info->forw != 0); + forward = (leafhdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); if (error) return error; @@ -1247,15 +1477,17 @@ xfs_dir2_leafn_toosmall( * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ - forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back); + forward = leafhdr.forw < leafhdr.back; for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { - blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back); + struct xfs_dir3_icleaf_hdr hdr2; + + blkno = forward ? leafhdr.forw : leafhdr.back; if (blkno == 0) continue; /* * Read the sibling leaf block. */ - error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, + error = xfs_dir3_leafn_read(state->args->trans, state->args->dp, blkno, -1, &bp); if (error) return error; @@ -1263,13 +1495,15 @@ xfs_dir2_leafn_toosmall( /* * Count bytes in the two blocks combined. 
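Two thresholds drive the join logic in xfs_dir2_leafn_toosmall(): a block more than half full is never a join candidate, and an actual merge is only attempted when the combined live entries leave at least a quarter of a block spare. Pulled out on their own, simplified from the state->blocksize arithmetic in the hunks around here (sizes in bytes):

/* degenerate case: over 50% full, don't try to join */
static int over_half_full(unsigned int bytes_used, unsigned int blocksize)
{
	return bytes_used > (blocksize >> 1);
}

/* a merge only proceeds if it leaves >= 25% of the block free afterwards */
static int merge_fits(unsigned int combined_entry_bytes, unsigned int blocksize)
{
	return combined_entry_bytes <= blocksize - (blocksize >> 2);
}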
*/ - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); + count = leafhdr.count - leafhdr.stale; bytes = state->blocksize - (state->blocksize >> 2); + leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes -= count * (uint)sizeof(leaf->ents[0]); + xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf); + ents = xfs_dir3_leaf_ents_p(leaf); + count += hdr2.count - hdr2.stale; + bytes -= count * sizeof(ents[0]); + /* * Fits with at least 25% to spare. */ @@ -1291,10 +1525,10 @@ xfs_dir2_leafn_toosmall( */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); else - error = xfs_da_path_shift(state, &state->path, forward, 0, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &rval); if (error) { return error; @@ -1316,34 +1550,53 @@ xfs_dir2_leafn_unbalance( xfs_da_args_t *args; /* operation arguments */ xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + struct xfs_dir3_icleaf_hdr savehdr; + struct xfs_dir3_icleaf_hdr drophdr; + struct xfs_dir2_leaf_entry *sents; + struct xfs_dir2_leaf_entry *dents; args = state->args; ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + + xfs_dir3_leaf_hdr_from_disk(&savehdr, save_leaf); + xfs_dir3_leaf_hdr_from_disk(&drophdr, drop_leaf); + sents = xfs_dir3_leaf_ents_p(save_leaf); + dents = xfs_dir3_leaf_ents_p(drop_leaf); + /* * If there are any stale leaf entries, take this opportunity * to purge them. */ - if (drop_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, drop_blk->bp); - if (save_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, save_blk->bp); + if (drophdr.stale) + xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp); + if (savehdr.stale) + xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp); + /* * Move the entries from drop to the appropriate end of save. 
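Purging stale entries before the move keeps xfs_dir3_leafn_moveents() simple: it only ever copies live records. A hedged sketch of such an in-place compaction (the real xfs_dir3_leaf_compact() additionally tracks which entry ranges need logging, which is omitted here):

struct leaf_ent {
	unsigned int	hashval;
	unsigned int	address;
};
#define NULL_DATAPTR	0xffffffffU	/* stand-in for XFS_DIR2_NULL_DATAPTR */

/* slide live entries over stale ones; returns the new live count */
static unsigned int compact_stale(struct leaf_ent *ents, unsigned int count)
{
	unsigned int from, to = 0;

	for (from = 0; from < count; from++) {
		if (ents[from].address == NULL_DATAPTR)
			continue;		/* stale: drop the slot */
		if (to != from)
			ents[to] = ents[from];
		to++;
	}
	return to;	/* caller sets hdr.count = to and hdr.stale = 0 */
}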
*/ - drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval); + drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval); if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp)) - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0, - be16_to_cpu(drop_leaf->hdr.count)); + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, 0, + drophdr.count); else - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, - be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count)); - save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval); - xfs_dir2_leafn_check(args->dp, save_blk->bp); + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, + savehdr.count, drophdr.count); + save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); + + /* log the changes made when moving the entries */ + xfs_dir3_leaf_hdr_to_disk(save_leaf, &savehdr); + xfs_dir3_leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir3_leaf_log_header(args->trans, save_blk->bp); + xfs_dir3_leaf_log_header(args->trans, drop_blk->bp); + + xfs_dir3_leaf_check(args->dp->i_mount, save_blk->bp); + xfs_dir3_leaf_check(args->dp->i_mount, drop_blk->bp); } /* @@ -1372,7 +1625,7 @@ xfs_dir2_node_addname( * Look up the name. We're not supposed to find it, but * this gives us the insertion point. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; if (rval != ENOENT) { @@ -1398,7 +1651,7 @@ xfs_dir2_node_addname( * It worked, fix the hash values up the btree. */ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } else { /* * It didn't work, we need to split the leaf block. @@ -1410,7 +1663,7 @@ xfs_dir2_node_addname( /* * Split the leaf block and insert the new entry. */ - rval = xfs_da_split(state); + rval = xfs_da3_split(state); } done: xfs_da_state_free(state); @@ -1447,6 +1700,9 @@ xfs_dir2_node_addname_int( int needscan; /* need to rescan data frees */ __be16 *tagp; /* data entry tag pointer */ xfs_trans_t *tp; /* transaction pointer */ + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_dir2_data_free *bf; dp = args->dp; mp = dp->i_mount; @@ -1464,36 +1720,37 @@ xfs_dir2_node_addname_int( */ ifbno = fblk->blkno; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = fblk->index; + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + /* * This means the free entry showed that the data block had * space for our entry, so we remembered it. * Use that data block. */ if (findex >= 0) { - ASSERT(findex < be32_to_cpu(free->hdr.nvalid)); - ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF); - ASSERT(be16_to_cpu(free->bests[findex]) >= length); - dbno = be32_to_cpu(free->hdr.firstdb) + findex; - } - /* - * The data block looked at didn't have enough room. - * We'll start at the beginning of the freespace entries. - */ - else { + ASSERT(findex < freehdr.nvalid); + ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(bests[findex]) >= length); + dbno = freehdr.firstdb + findex; + } else { + /* + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. 
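The findex fast path above works because freespace blocks index data blocks linearly: each free block covers a fixed run of bests slots, so a (free block, index) pair and a data block number interconvert with trivial arithmetic. Conceptually, ignoring the XFS_DIR2_FREE_FIRSTDB(mp) base offset the real code applies:

/* hypothetical helpers, not the XFS ones; maxbests is a per-mount constant */
static long fdindex_to_dbno(long firstdb, int findex)
{
	return firstdb + findex;		/* the fast path above */
}

static void dbno_to_fdindex(long dbno, int maxbests, long *fdb, int *findex)
{
	*fdb = dbno / maxbests;			/* which freespace block */
	*findex = (int)(dbno % maxbests);	/* slot within that block */
}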
+ */ dbno = -1; findex = 0; } - } - /* - * Didn't come in with a freespace block, so don't have a data block. - */ - else { + } else { + /* + * Didn't come in with a freespace block, so no data block. + */ ifbno = dbno = -1; fbp = NULL; findex = 0; } + /* * If we don't have a data block yet, we're going to scan the * freespace blocks looking for one. Figure out what the @@ -1547,20 +1804,26 @@ xfs_dir2_node_addname_int( if (!fbp) continue; free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; } /* * Look at the current free entry. Is it good enough? + * + * The bests initialisation should be where the buffer is read in + * the above branch. But gcc is too stupid to realise that bests + * and the freehdr are actually initialised if they are placed + * there, so we have to do it here to avoid warnings. Blech. */ - if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF && - be16_to_cpu(free->bests[findex]) >= length) - dbno = be32_to_cpu(free->hdr.firstdb) + findex; + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && + be16_to_cpu(bests[findex]) >= length) + dbno = freehdr.firstdb + findex; else { /* * Are we done with the freeblock? */ - if (++findex == be32_to_cpu(free->hdr.nvalid)) { + if (++findex == freehdr.nvalid) { /* * Drop the block. */ @@ -1588,7 +1851,7 @@ xfs_dir2_node_addname_int( if (unlikely((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno)) || - (error = xfs_dir2_data_init(args, dbno, &dbp)))) + (error = xfs_dir3_data_init(args, dbno, &dbp)))) return error; /* @@ -1614,11 +1877,11 @@ xfs_dir2_node_addname_int( * If there wasn't a freespace block, the read will * return a NULL fbp. Allocate and initialize a new one. */ - if( fbp == NULL ) { - if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, - &fbno))) { + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno); + if (error) return error; - } if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { xfs_alert(mp, @@ -1646,27 +1909,24 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - error = xfs_da_get_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - -1, &fbp, XFS_DATA_FORK); + error = xfs_dir3_free_get_buf(tp, dp, fbno, &fbp); if (error) return error; - fbp->b_ops = &xfs_dir2_free_buf_ops; + free = fbp->b_addr; + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); /* - * Initialize the new block to be empty, and remember - * its first slot as our empty slot. + * Remember the first slot as our empty slot. */ - free = fbp->b_addr; - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = cpu_to_be32( - (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * - xfs_dir2_free_max_bests(mp)); + freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * + xfs_dir3_free_max_bests(mp); free->hdr.nvalid = 0; free->hdr.nused = 0; } else { free = fbp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + bests = xfs_dir3_free_bests_p(mp, free); + xfs_dir3_free_hdr_from_disk(&freehdr, free); } /* @@ -1677,20 +1937,21 @@ xfs_dir2_node_addname_int( * If it's after the end of the current entries in the * freespace block, extend that table.
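Flattening the scan loop above into one function makes the slot test easier to see: a usable slot must be in use (not NULLDATAOFF) and must advertise at least 'length' bytes of contiguous free space. A condensed sketch over a CPU-endian copy of the table:

/* returns the data block number of the first fitting slot, or -1 */
static long scan_free_table(const unsigned short *bests, int nvalid,
			    long firstdb, unsigned int length)
{
	int findex;

	for (findex = 0; findex < nvalid; findex++) {
		if (bests[findex] != 0xffff /* NULLDATAOFF */ &&
		    bests[findex] >= length)
			return firstdb + findex;
	}
	return -1;	/* exhausted: caller moves on to the next free block */
}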
*/ - if (findex >= be32_to_cpu(free->hdr.nvalid)) { - ASSERT(findex < xfs_dir2_free_max_bests(mp)); - free->hdr.nvalid = cpu_to_be32(findex + 1); + if (findex >= freehdr.nvalid) { + ASSERT(findex < xfs_dir3_free_max_bests(mp)); + freehdr.nvalid = findex + 1; /* * Tag new entry so nused will go up. */ - free->bests[findex] = cpu_to_be16(NULLDATAOFF); + bests[findex] = cpu_to_be16(NULLDATAOFF); } /* * If this entry was for an empty data block * (this should always be true) then update the header. */ - if (free->bests[findex] == cpu_to_be16(NULLDATAOFF)) { - be32_add_cpu(&free->hdr.nused, 1); + if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { + freehdr.nused++; + xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr); xfs_dir2_free_log_header(tp, fbp); } /* @@ -1699,7 +1960,8 @@ xfs_dir2_node_addname_int( * change again. */ hdr = dbp->b_addr; - free->bests[findex] = hdr->bestfree[0].length; + bf = xfs_dir3_data_bestfree_p(hdr); + bests[findex] = bf[0].length; logfree = 1; } /* @@ -1715,19 +1977,20 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), -1, &dbp); if (error) return error; hdr = dbp->b_addr; + bf = xfs_dir3_data_bestfree_p(hdr); logfree = 0; } - ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length); + ASSERT(be16_to_cpu(bf[0].length) >= length); /* * Point to the existing unused space. */ dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); needscan = needlog = 0; /* * Mark the first part of the unused space, inuse for us. @@ -1758,8 +2021,9 @@ xfs_dir2_node_addname_int( /* * If the freespace entry is now wrong, update it. */ - if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(hdr->bestfree[0].length)) { - free->bests[findex] = hdr->bestfree[0].length; + bests = xfs_dir3_free_bests_p(mp, free); /* gcc is so stupid */ + if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { + bests[findex] = bf[0].length; logfree = 1; } /* @@ -1777,7 +2041,7 @@ xfs_dir2_node_addname_int( /* * Lookup an entry in a node-format directory. - * All the real work happens in xfs_da_node_lookup_int. + * All the real work happens in xfs_da3_node_lookup_int. * The only real output is the inode number of the entry. */ int /* error */ @@ -1802,7 +2066,7 @@ xfs_dir2_node_lookup( /* * Fill in the path to the entry in the cursor. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { @@ -1857,7 +2121,7 @@ xfs_dir2_node_removename( /* * Look up the entry we're deleting, set up the cursor. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; /* @@ -1881,12 +2145,12 @@ xfs_dir2_node_removename( /* * Fix the hash values up the btree. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); /* * If we need to join leaf blocks, do it. */ if (rval && state->path.active > 1) - error = xfs_da_join(state); + error = xfs_da3_join(state); /* * If no errors so far, try conversion to leaf format. */ @@ -1928,7 +2192,7 @@ xfs_dir2_node_replace( /* * Lookup the entry to change in the btree. 
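The bookkeeping above leans on NULLDATAOFF doing double duty: a slot beyond the old nvalid is first tagged with it, so it counts as valid-but-unused, and nused is only bumped when a tagged slot receives its first real value. In outline, with simplified types:

#define NULL_OFF	0xffff

struct free_hdr {
	int	nvalid;		/* highest initialised slot + 1 */
	int	nused;		/* slots holding a real value */
};

static void use_free_slot(struct free_hdr *hdr, unsigned short *bests,
			  int findex, unsigned short longest)
{
	if (findex >= hdr->nvalid) {
		bests[findex] = NULL_OFF;	/* tag so nused goes up below */
		hdr->nvalid = findex + 1;
	}
	if (bests[findex] == NULL_OFF)
		hdr->nused++;			/* first real use of this slot */
	bests[findex] = longest;
}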
*/ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) { rval = error; } @@ -1937,19 +2201,22 @@ xfs_dir2_node_replace( * and locked it. But paranoia is good. */ if (rval == EEXIST) { + struct xfs_dir2_leaf_entry *ents; /* * Find the leaf entry. */ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); leaf = blk->bp->b_addr; - lep = &leaf->ents[blk->index]; + ents = xfs_dir3_leaf_ents_p(leaf); + lep = &ents[blk->index]; ASSERT(state->extravalid); /* * Point to the data entry. */ hdr = state->extrablk.bp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); @@ -1995,6 +2262,7 @@ xfs_dir2_node_trim_free( xfs_dir2_free_t *free; /* freespace structure */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; dp = args->dp; mp = dp->i_mount; @@ -2012,11 +2280,12 @@ xfs_dir2_node_trim_free( if (!bp) return 0; free = bp->b_addr; - ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); + xfs_dir3_free_hdr_from_disk(&freehdr, free); + /* * If there are used entries, there's nothing to do. */ - if (be32_to_cpu(free->hdr.nused) > 0) { + if (freehdr.nused > 0) { xfs_trans_brelse(tp, bp); *rvalp = 0; return 0; diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 7da79f6515fd..7cf573c88aad 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -30,7 +30,7 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); /* xfs_dir2_block.c */ -extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, @@ -43,17 +43,18 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp); +#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp); #else -#define xfs_dir2_data_check(dp,bp) +#define xfs_dir3_data_check(dp,bp) #endif -extern const struct xfs_buf_ops xfs_dir2_data_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; -extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); -extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, +extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); -extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, +extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno); extern struct xfs_dir2_data_free * @@ -61,7 +62,7 @@ xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr, int *loghead); -extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, +extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, 
struct xfs_buf **bpp); extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); @@ -77,24 +78,26 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; -extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, +extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); -extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, - struct xfs_buf *bp); -extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp, +extern void xfs_dir3_leaf_compact(struct xfs_da_args *args, + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp); +extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); -extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_buf **bpp, int magic); -extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, +extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_buf **bpp, __uint16_t magic); +extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); -extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, +extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp, struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); @@ -104,11 +107,18 @@ extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, struct xfs_buf *lbp, xfs_dir2_db_t db); extern struct xfs_dir2_leaf_entry * -xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact, - int lowstale, int highstale, - int *lfloglow, int *lfloghigh); +xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int compact, + int lowstale, int highstale, int *lfloglow, int *lfloghigh); extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); +extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from); +extern void xfs_dir3_leaf_hdr_to_disk(struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from); +extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); + /* xfs_dir2_node.c */ extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_buf *lbp); diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 1b9fc3ec7e4b..6157424dbf8f 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -278,7 +278,7 @@ xfs_dir2_block_to_sf( * Set up to loop over the block's entries. 
*/ btp = xfs_dir2_block_tail_p(mp, hdr); - ptr = (char *)(hdr + 1); + ptr = (char *)xfs_dir3_data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); sfep = xfs_dir2_sf_firstentry(sfp); /* @@ -535,7 +535,7 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = XFS_DIR2_DATA_FIRST_OFFSET, + for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount), oldsfep = xfs_dir2_sf_firstentry(oldsfp), add_datasize = xfs_dir2_data_entsize(args->namelen), eof = (char *)oldsfep == &buf[old_isize]; @@ -617,7 +617,7 @@ xfs_dir2_sf_addname_pick( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir2_data_entsize(args->namelen); - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* @@ -688,7 +688,7 @@ xfs_dir2_sf_check( dp = args->dp; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = XFS_DIR2_DATA_FIRST_OFFSET; + offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount); ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; @@ -812,9 +812,9 @@ xfs_dir2_sf_getdents( * mp->m_dirdatablk. */ dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOT_OFFSET); + XFS_DIR3_DATA_DOT_OFFSET(mp)); dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOTDOT_OFFSET); + XFS_DIR3_DATA_DOTDOT_OFFSET(mp)); /* * Put . entry unless we're starting past it. diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8025eb23ad72..a41f8bf1da37 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -36,6 +36,7 @@ #include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" +#include "xfs_cksum.h" #include "xfs_trace.h" /* @@ -85,17 +86,23 @@ xfs_qm_dqdestroy( */ void xfs_qm_adjust_dqlimits( - xfs_mount_t *mp, - xfs_disk_dquot_t *d) + struct xfs_mount *mp, + struct xfs_dquot *dq) { - xfs_quotainfo_t *q = mp->m_quotainfo; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *d = &dq->q_core; + int prealloc = 0; ASSERT(d->d_id); - if (q->qi_bsoftlimit && !d->d_blk_softlimit) + if (q->qi_bsoftlimit && !d->d_blk_softlimit) { d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); - if (q->qi_bhardlimit && !d->d_blk_hardlimit) + prealloc = 1; + } + if (q->qi_bhardlimit && !d->d_blk_hardlimit) { d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + prealloc = 1; + } if (q->qi_isoftlimit && !d->d_ino_softlimit) d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); if (q->qi_ihardlimit && !d->d_ino_hardlimit) @@ -104,6 +111,9 @@ xfs_qm_adjust_dqlimits( d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); + + if (prealloc) + xfs_dquot_set_prealloc_limits(dq); } /* @@ -239,6 +249,8 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); } xfs_trans_dquot_buf(tp, bp, @@ -248,25 +260,113 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } -static void +/* + * Initialize the dynamic speculative preallocation thresholds. The lo/hi + * watermarks correspond to the soft and hard limits by default. If a soft limit + * is not specified, we use 95% of the hard limit. 
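The watermark fallback described above has to divide 64-bit quantities, and in kernel code that means do_div(): it divides a u64 in place and returns the remainder, which is the portable way to do 64-bit division on 32-bit platforms. A sketch of the derivation, mirroring the semantics of the dquot hunk below:

#include <linux/types.h>
#include <asm/div64.h>

static void derive_wmarks(u64 hardlimit, u64 softlimit,
			  u64 *lo_wmark, u64 *hi_wmark, u64 low_space[3])
{
	u64 lo = softlimit;
	u64 space;

	if (!lo) {
		/* no soft limit configured: default to 95% of hard */
		lo = hardlimit;
		do_div(lo, 100);	/* in place: lo = hardlimit / 100 */
		lo *= 95;
	}
	*hi_wmark = hardlimit;
	*lo_wmark = lo;

	space = hardlimit;
	do_div(space, 100);		/* 1% of the hard limit */
	low_space[0] = space;		/* 1% */
	low_space[1] = space * 3;	/* 3% */
	low_space[2] = space * 5;	/* 5% */
}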
+ */ +void +xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) +{ + __uint64_t space; + + dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!dqp->q_prealloc_lo_wmark) { + dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; + do_div(dqp->q_prealloc_lo_wmark, 100); + dqp->q_prealloc_lo_wmark *= 95; + } + + space = dqp->q_prealloc_hi_wmark; + + do_div(space, 100); + dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; + dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; + dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; +} + +STATIC void +xfs_dquot_buf_calc_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) { + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + offsetof(struct xfs_dqblk, dd_crc)); + } +} + +STATIC bool +xfs_dquot_buf_verify_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int ndquots; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return true; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_qm_calc_dquots_per_chunk(mp, + XFS_BB_TO_FSB(mp, bp->b_length)); + + for (i = 0; i < ndquots; i++, d++) { + if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), + offsetof(struct xfs_dqblk, dd_crc))) + return false; + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + return false; + } + + return true; +} + +STATIC bool xfs_dquot_buf_verify( + struct xfs_mount *mp, struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - struct xfs_disk_dquot *ddq; xfs_dqid_t id = 0; + int ndquots; int i; /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_qm_calc_dquots_per_chunk(mp, bp->b_length); + + /* * On the first read of the buffer, verify that each dquot is valid. * We don't know what the id of the dquot is supposed to be, just that * they should be increasing monotonically within the buffer. If the * first id is corrupt, then it will fail on the second dquot in the * buffer so corruptions could point to the wrong dquot in this case. 
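Unlike most metadata blocks, a dquot buffer is an array of fixed-size records that each carry their own CRC, so verification and updating walk the buffer record by record, with offsetof() telling the checksum helper where the CRC field sits. The shape of that walk, using a made-up record type in place of struct xfs_dqblk but the same xfs_cksum.h helper the hunk uses:

/* hypothetical CRC'd record; the real one is struct xfs_dqblk */
struct crc_rec {
	char		payload[120];	/* arbitrary illustrative size */
	__uint32_t	crc;
};

static int records_verify(struct crc_rec *r, int nrecs)
{
	int i;

	for (i = 0; i < nrecs; i++, r++) {
		/* each record is checksummed independently */
		if (!xfs_verify_cksum((char *)r, sizeof(*r),
				      offsetof(struct crc_rec, crc)))
			return 0;
	}
	return 1;
}

Note how the hunk handles the log recovery case: with no quotainfo structure available yet, the record count has to be computed from the buffer size instead of read from qi_dqperchunk.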
*/ - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { - int error; + for (i = 0; i < ndquots; i++) { + struct xfs_disk_dquot *ddq; + int error; ddq = &d[i].dd_diskdq; @@ -274,27 +374,37 @@ xfs_dquot_buf_verify( id = be32_to_cpu(ddq->d_id); error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_read_verify"); - if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); - xfs_buf_ioerror(bp, EFSCORRUPTED); - break; - } + "xfs_dquot_buf_verify"); + if (error) + return false; } + return true; } static void xfs_dquot_buf_read_verify( struct xfs_buf *bp) { - xfs_dquot_buf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } void xfs_dquot_buf_write_verify( struct xfs_buf *bp) { - xfs_dquot_buf_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify(mp, bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + xfs_dquot_buf_calc_crc(mp, bp); } const struct xfs_buf_ops xfs_dquot_buf_ops = { @@ -648,6 +758,9 @@ xfs_qm_dqread( dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + /* initialize the dquot speculative prealloc thresholds */ + xfs_dquot_set_prealloc_limits(dqp); + /* Mark the buf so that this will stay incore a little longer */ xfs_buf_set_ref(bp, XFS_DQUOT_REF); @@ -1035,6 +1148,17 @@ xfs_qm_dqflush( &dqp->q_logitem.qli_item.li_lsn); /* + * copy the lsn into the on-disk dquot now while we have the in memory + * dquot here. This can't be done later in the write verifier as we + * can't get access to the log item at that point in time. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; + + dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + } + + /* * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. 
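The read and write verifiers above are intentionally asymmetric: on read the CRC is checked before the contents, since a bad checksum means nothing else in the buffer can be trusted, while on write the contents are validated first and the CRC computed last, so the checksum covers exactly the bytes that reach disk. Schematically, with hypothetical helpers standing in for the real routines:

static void my_read_verify(struct xfs_buf *bp)
{
	if (!crc_ok(bp) || !contents_ok(bp))
		xfs_buf_ioerror(bp, EFSCORRUPTED);	/* reject the read */
}

static void my_write_verify(struct xfs_buf *bp)
{
	if (!contents_ok(bp)) {
		xfs_buf_ioerror(bp, EFSCORRUPTED);	/* never write garbage */
		return;
	}
	stamp_lsn(bp);		/* last metadata change before checksumming */
	update_crc(bp);		/* CRC now covers the final bytes */
}

The AGI write verifier later in this series stamps the LSN exactly this way; dquots are the exception and stamp it earlier, in xfs_qm_dqflush(), because the verifier has no access to the log item, as the comment above explains.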
*/ diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c694a8469c4a..4f0ebfc43cc9 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -32,6 +32,13 @@ struct xfs_mount; struct xfs_trans; +enum { + XFS_QLOWSP_1_PCNT = 0, + XFS_QLOWSP_3_PCNT, + XFS_QLOWSP_5_PCNT, + XFS_QLOWSP_MAX +}; + /* * The incore dquot structure */ @@ -51,6 +58,9 @@ typedef struct xfs_dquot { xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ + xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ + int64_t q_low_space[XFS_QLOWSP_MAX]; struct mutex q_qlock; /* quota lock */ struct completion q_flush; /* flush completion queue */ atomic_t q_pincount; /* dquot pin count */ @@ -145,14 +155,16 @@ extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); -extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, - xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, + struct xfs_dquot *); extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, xfs_dqid_t, uint, uint, xfs_dquot_t **); extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); + static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { xfs_dqlock(dqp); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 610456054dc2..35d3f5b041dd 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -66,7 +66,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, int i; int64_t fsid; - if (random32() % randfactor) + if (prandom_u32() % randfactor) return 0; memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); @@ -178,7 +178,7 @@ xfs_corruption_error( inst_t *ra) { if (level <= xfs_error_level) - xfs_hex_dump(p, 16); + xfs_hex_dump(p, 64); xfs_error_report(tag, level, mp, filename, linenum, ra); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index feb36d7551ae..c0f375087efc 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -50,9 +50,8 @@ xfs_efi_item_free( * Freeing the efi requires that we remove it from the AIL if it has already * been placed there. However, the EFI may not yet have been placed in the AIL * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the - * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees - * the EFI. + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. */ STATIC void __xfs_efi_release( @@ -60,7 +59,7 @@ __xfs_efi_release( { struct xfs_ail *ailp = efip->efi_item.li_ailp; - if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { + if (atomic_dec_and_test(&efip->efi_refcount)) { spin_lock(&ailp->xa_lock); /* xfs_trans_ail_delete() drops the AIL lock. */ xfs_trans_ail_delete(ailp, &efip->efi_item, @@ -126,8 +125,8 @@ xfs_efi_item_pin( * which the EFI is manipulated during a transaction. 
If we are being asked to * remove the EFI it's because the transaction has been cancelled and by * definition that means the EFI cannot be in the AIL so remove it from the - * transaction and free it. Otherwise coordinate with xfs_efi_release() (via - * XFS_EFI_COMMITTED) to determine who gets to free the EFI. + * transaction and free it. Otherwise coordinate with xfs_efi_release() + * to determine who gets to free the EFI. */ STATIC void xfs_efi_item_unpin( @@ -171,19 +170,13 @@ xfs_efi_item_unlock( /* * The EFI is logged only once and cannot be moved in the log, so simply return - * the lsn at which it's been logged. For bulk transaction committed - * processing, the EFI may be processed but not yet unpinned prior to the EFD - * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected - * when processing the EFD. + * the lsn at which it's been logged. */ STATIC xfs_lsn_t xfs_efi_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { - struct xfs_efi_log_item *efip = EFI_ITEM(lip); - - set_bit(XFS_EFI_COMMITTED, &efip->efi_flags); return lsn; } @@ -241,6 +234,7 @@ xfs_efi_init( efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; atomic_set(&efip->efi_next_extent, 0); + atomic_set(&efip->efi_refcount, 2); return efip; } @@ -310,8 +304,13 @@ xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); - if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { __xfs_efi_release(efip); + + /* recovery needs us to drop the EFI reference, too */ + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) + __xfs_efi_release(efip); + } } static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 375f68e42531..432222418c56 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -114,16 +114,20 @@ typedef struct xfs_efd_log_format_64 { * Define EFI flag bits. Manipulated by set/clear/test_bit operators. */ #define XFS_EFI_RECOVERED 1 -#define XFS_EFI_COMMITTED 2 /* - * This is the "extent free intention" log item. It is used - * to log the fact that some extents need to be free. It is - * used in conjunction with the "extent free done" log item - * described below. + * This is the "extent free intention" log item. It is used to log the fact + * that some extents need to be freed. It is used in conjunction with the + * "extent free done" log item described below. + * + * The EFI is reference counted so that it is not freed prior to both the EFI + * and EFD being committed and unpinned. This ensures that when the last + * reference goes away the EFI will always be in the AIL as it has been + * unpinned, regardless of whether the EFD is processed before or after the EFI.
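The replacement for the XFS_EFI_COMMITTED handshake is plain reference counting: the EFI is created holding two references, one dropped at unpin (transaction commit) and one when the matching EFD is processed, and whichever drop happens last frees the item. The pattern in isolation:

#include <linux/atomic.h>

struct efi_like {
	atomic_t	refcount;
	/* ... log item fields ... */
};

static void efi_like_init(struct efi_like *efi)
{
	/* one reference for commit/unpin, one for EFD processing */
	atomic_set(&efi->refcount, 2);
}

static void efi_like_release(struct efi_like *efi)
{
	if (atomic_dec_and_test(&efi->refcount)) {
		/* last reference: by now the item is unpinned and in the
		 * AIL, so it can be removed and freed without racing */
	}
}

atomic_dec_and_test() is correct whichever of the two drops runs first, which is exactly the ordering problem the old committed-bit scheme was trying to paper over.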
*/ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; + atomic_t efi_refcount; atomic_t efi_next_extent; unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f03bf1a456fb..054d60c0ac57 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -775,8 +775,6 @@ xfs_file_aio_write( if (ocount == 0) return 0; - sb_start_write(inode->i_sb); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ret = -EIO; goto out; @@ -800,7 +798,6 @@ xfs_file_aio_write( } out: - sb_end_write(inode->i_sb); return ret; } @@ -893,7 +890,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir2_data_readahead(NULL, ip, 0, -1); + xfs_dir3_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); return 0; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2866b8c78b7a..87595b211da1 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -247,6 +247,9 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -265,6 +268,11 @@ xfs_growfs_data_private( } agfl = XFS_BUF_TO_AGFL(bp); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(agno); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); + } for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); @@ -296,8 +304,11 @@ xfs_growfs_data_private( agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -316,7 +327,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -339,7 +356,13 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, + agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -363,7 +386,12 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, + agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 515bf71ce01c..c8f5ae1debf2 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -36,6 +36,8 @@ #include "xfs_rtalloc.h" #include 
"xfs_error.h" #include "xfs_bmap.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" /* @@ -165,6 +167,7 @@ xfs_ialloc_inode_init( int version; int i, j; xfs_daddr_t d; + xfs_ino_t ino = 0; /* * Loop over the new block(s), filling in the inodes. @@ -183,13 +186,29 @@ xfs_ialloc_inode_init( } /* - * Figure out what version number to use in the inodes we create. - * If the superblock version has caught up to the one that supports - * the new inode format, then use the new inode version. Otherwise - * use the old version so that old kernels will continue to be - * able to use the file system. + * Figure out what version number to use in the inodes we create. If + * the superblock version has caught up to the one that supports the new + * inode format, then use the new inode version. Otherwise use the old + * version so that old kernels will continue to be able to use the file + * system. + * + * For v3 inodes, we also need to write the inode number into the inode, + * so calculate the first inode number of the chunk here as + * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * across multiple filesystem blocks (such as a cluster) and so cannot + * be used in the cluster buffer loop below. + * + * Further, because we are writing the inode directly into the buffer + * and calculating a CRC on the entire inode, we have ot log the entire + * inode so that the entire range the CRC covers is present in the log. + * That means for v3 inode we log the entire buffer rather than just the + * inode cores. */ - if (xfs_sb_version_hasnlink(&mp->m_sb)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { + version = 3; + ino = XFS_AGINO_TO_INO(mp, agno, + XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + } else if (xfs_sb_version_hasnlink(&mp->m_sb)) version = 2; else version = 1; @@ -212,17 +231,32 @@ xfs_ialloc_inode_init( * individual transactions causing a lot of log traffic. */ fbuf->b_ops = &xfs_inode_buf_ops; - xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); + xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = sizeof(struct xfs_dinode); + uint isize = xfs_dinode_size(version); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_version = version; free->di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); - xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); + + if (version == 3) { + free->di_ino = cpu_to_be64(ino); + ino++; + uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + xfs_dinode_calc_crc(mp, free); + } else { + /* just log the inode core */ + xfs_trans_log_buf(tp, fbuf, ioffset, + ioffset + isize - 1); + } + } + if (version == 3) { + /* need to log the entire buffer */ + xfs_trans_log_buf(tp, fbuf, 0, + BBTOB(fbuf->b_length) - 1); } xfs_trans_inode_alloc_buf(tp, fbuf); } @@ -369,7 +403,7 @@ xfs_ialloc_ag_alloc( * number from being easily guessable. */ error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, - args.len, random32()); + args.len, prandom_u32()); if (error) return error; @@ -1453,6 +1487,7 @@ xfs_ialloc_log_agi( /* * Log the allocation group inode header buffer. 
*/ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, bp, first, last); } @@ -1470,19 +1505,23 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif -static void +static bool xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); - int agi_ok; + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + return false; /* * Validate the magic number of the agi block. */ - agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + return false; + if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) + return false; /* * during growfs operations, the perag is not fully initialised, @@ -1490,30 +1529,52 @@ xfs_agi_verify( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag) - agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == - bp->b_pag->pag_agno; + if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) + return false; - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } xfs_check_agi_unlinked(agi); + return true; } static void xfs_agi_read_verify( struct xfs_buf *bp) { - xfs_agi_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + int agi_ok = 1; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agi, agi_crc)); + agi_ok = agi_ok && xfs_agi_verify(bp); + + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_agi_write_verify( struct xfs_buf *bp) { - xfs_agi_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agi_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_agi, agi_crc)); } const struct xfs_buf_ops xfs_agi_buf_ops = { diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index bec344b36507..c82ac8867421 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -34,6 +34,7 @@ #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_cksum.h" STATIC int @@ -182,52 +183,88 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } -void +static int xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; unsigned int level; - int sblock_ok; /* block passes checks */ - /* magic number and level verification */ - level = be16_to_cpu(block->bb_level); - sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && - level < mp->m_in_maxlevels; + /* + * During growfs operations, we can't verify the exact owner as the + * perag is not fully initialised and hence not attached to the buffer. 
+ * + * Similarly, during log recovery we will have a perag structure + * attached, but the agi information will not yet have been initialised + * from the on disk AGI. We don't currently use any of this information, + * but beware of the landmine (i.e. need to check pag->pagi_init) if we + * ever do. + */ + switch (block->bb_magic) { + case cpu_to_be32(XFS_IBT_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_IBT_MAGIC): + break; + default: + return 0; + } - /* numrecs verification */ - sblock_ok = sblock_ok && - be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + /* numrecs and level verification */ + level = be16_to_cpu(block->bb_level); + if (level >= mp->m_in_maxlevels) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) + return false; /* sibling pointer verification */ - sblock_ok = sblock_ok && - (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_leftsib && - (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || - be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && - block->bb_u.s.bb_rightsib; - - if (!sblock_ok) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } static void xfs_inobt_read_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!(xfs_btree_sblock_verify_crc(bp) && + xfs_inobt_verify(bp))) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } } static void xfs_inobt_write_verify( struct xfs_buf *bp) { - xfs_inobt_verify(bp); + if (!xfs_inobt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + bp->b_target->bt_mount, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_btree_sblock_calc_crc(bp); + } const struct xfs_buf_ops xfs_inobt_buf_ops = { @@ -301,6 +338,8 @@ xfs_inobt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_inobt_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 25c0239a8eab..3ac36b7642e9 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -29,7 +29,8 @@ struct xfs_mount; /* * There is a btree for the inode map per allocation group. 
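The switch on bb_magic in this hunk is worth distilling: the CRC-enabled variant performs the extra self-describing checks (uuid, block number, owner) and then falls through to the checks shared with the original format. A compact sketch of the same control flow over a toy header; field names are illustrative, and the magics are compared in host order here where the kernel compares in disk order:

#include <stdbool.h>
#include <stdint.h>

#define BT_MAGIC	0x49414254u	/* 'IABT' */
#define BT_CRC_MAGIC	0x49414233u	/* 'IAB3' */

struct hdr {
	uint32_t magic;		/* converted to host order for the sketch */
	uint32_t owner;		/* AG number for short-form blocks */
	uint64_t blkno;		/* daddr the block believes it lives at */
	/* uuid etc. elided */
};

static bool block_verify(const struct hdr *h, bool has_crc,
			 uint64_t daddr, uint32_t agno)
{
	switch (h->magic) {
	case BT_CRC_MAGIC:
		if (!has_crc)
			return false;	/* CRC magic on a non-CRC fs */
		if (h->blkno != daddr)	/* misdirected write? */
			return false;
		if (h->owner != agno)
			return false;
		/* fall through to the format-independent checks */
	case BT_MAGIC:
		break;
	default:
		return false;
	}
	/* level, numrecs and sibling-pointer checks would follow here */
	return true;
}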
*/ -#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ typedef __uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) @@ -76,10 +77,10 @@ typedef __be32 xfs_inobt_ptr_t; /* * Btree block header size depends on a superblock flag. - * - * (not quite yet, but soon) */ -#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN +#define XFS_INOBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4f201656d2d9..558ef4947206 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -44,6 +44,7 @@ #include "xfs_quota.h" #include "xfs_filestream.h" #include "xfs_vnodeops.h" +#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -786,6 +787,7 @@ xfs_iformat_btree( xfs_dinode_t *dip, int whichfork) { + struct xfs_mount *mp = ip->i_mount; xfs_bmdr_block_t *dfp; xfs_ifork_t *ifp; /* REFERENCED */ @@ -794,7 +796,7 @@ xfs_iformat_btree( ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(dfp); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); /* @@ -805,14 +807,14 @@ xfs_iformat_btree( * blocks. */ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_IFORK_MAXEXT(ip, whichfork) || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || + XFS_DFORK_SIZE(dip, mp, whichfork) || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", - (unsigned long long) ip->i_ino); + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + mp, dip); return XFS_ERROR(EFSCORRUPTED); } @@ -823,8 +825,7 @@ xfs_iformat_btree( * Copy and convert from the on-disk structure * to the in-memory structure. 
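With XFS_INOBT_BLOCK_LEN() now genuinely superblock-dependent, every record, key and pointer address in a short-form btree block is computed relative to a header whose size differs between v4 and v5 filesystems. A sketch of that addressing scheme; the two header sizes are placeholders rather than the real XFS_BTREE_SBLOCK_LEN/XFS_BTREE_SBLOCK_CRC_LEN values:

#include <stdbool.h>
#include <stddef.h>

#define SBLOCK_LEN	16	/* short-form header, illustrative */
#define SBLOCK_CRC_LEN	56	/* + crc/blkno/lsn/uuid/owner, illustrative */

static inline size_t inobt_block_len(bool has_crc)
{
	return has_crc ? SBLOCK_CRC_LEN : SBLOCK_LEN;
}

/* Address of 1-based record idx; recsz is the on-disk record size. */
static inline void *inobt_rec_addr(void *block, int idx,
				   bool has_crc, size_t recsz)
{
	return (char *)block + inobt_block_len(has_crc) + (idx - 1) * recsz;
}

Because callers only ever address records through the macro, the larger CRC header shifts the whole record array transparently.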
*/ - xfs_bmdr_to_bmbt(ip->i_mount, dfp, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), ifp->if_broot, size); ifp->if_flags &= ~XFS_IFEXTENTS; ifp->if_flags |= XFS_IFBROOT; @@ -866,6 +867,17 @@ xfs_dinode_from_disk( to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); to->di_gen = be32_to_cpu(from->di_gen); + + if (to->di_version == 3) { + to->di_changecount = be64_to_cpu(from->di_changecount); + to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_ino = be64_to_cpu(from->di_ino); + to->di_lsn = be64_to_cpu(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } } void @@ -902,6 +914,17 @@ xfs_dinode_to_disk( to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } } STATIC uint @@ -962,6 +985,47 @@ xfs_dic2xflags( (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); } +static bool +xfs_dinode_verify( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) + return false; + + /* only version 3 or greater inodes are extensively verified here */ + if (dip->di_version < 3) + return true; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + offsetof(struct xfs_dinode, di_crc))) + return false; + if (be64_to_cpu(dip->di_ino) != ip->i_ino) + return false; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + return false; + return true; +} + +void +xfs_dinode_calc_crc( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + __uint32_t crc; + + if (dip->di_version < 3) + return; + + ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + offsetof(struct xfs_dinode, di_crc)); + dip->di_crc = xfs_end_cksum(crc); +} + /* * Read the disk inode attributes into the in-core inode structure. */ @@ -990,17 +1054,13 @@ xfs_iread( if (error) return error; - /* - * If we got something that isn't an inode it means someone - * (nfs or dmi) has a stale handle. - */ - if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) { -#ifdef DEBUG - xfs_alert(mp, - "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", - __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); -#endif /* DEBUG */ - error = XFS_ERROR(EINVAL); + /* even unallocated inodes are verified */ + if (!xfs_dinode_verify(mp, ip, dip)) { + xfs_alert(mp, "%s: validation failed for inode %lld", + __func__, ip->i_ino); + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + error = XFS_ERROR(EFSCORRUPTED); goto out_brelse; } @@ -1022,10 +1082,20 @@ xfs_iread( goto out_brelse; } } else { + /* + * Partial initialisation of the in-core inode. Just the bits + * that xfs_ialloc won't overwrite or relies on being correct.
+ */ ip->i_d.di_magic = be16_to_cpu(dip->di_magic); ip->i_d.di_version = dip->di_version; ip->i_d.di_gen = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + + if (dip->di_version == 3) { + ip->i_d.di_ino = be64_to_cpu(dip->di_ino); + uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); + } + /* * Make sure to pull in the mode here as well in * case the inode is released without being used. @@ -1161,6 +1231,7 @@ xfs_ialloc( xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { + struct xfs_mount *mp = tp->t_mountp; xfs_ino_t ino; xfs_inode_t *ip; uint flags; @@ -1187,7 +1258,7 @@ xfs_ialloc( * This is because we're setting fields here we need * to prevent others from looking at until we're done. */ - error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, + error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); if (error) return error; @@ -1208,7 +1279,7 @@ xfs_ialloc( * the inode version number now. This way we only do the conversion * here rather than here and in the flush/logging code. */ - if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && + if (xfs_sb_version_hasnlink(&mp->m_sb) && ip->i_d.di_version == 1) { ip->i_d.di_version = 2; /* @@ -1258,6 +1329,19 @@ xfs_ialloc( ip->i_d.di_dmevmask = 0; ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; + + if (ip->i_d.di_version == 3) { + ASSERT(ip->i_d.di_ino == ino); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ip->i_d.di_crc = 0; + ip->i_d.di_changecount = 1; + ip->i_d.di_lsn = 0; + ip->i_d.di_flags2 = 0; + memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); + ip->i_d.di_crtime = ip->i_d.di_mtime; + } + + flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -2037,7 +2121,7 @@ xfs_iroot_realloc( * allocate it now and get out. */ if (ifp->if_broot_bytes == 0) { - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; @@ -2051,9 +2135,9 @@ xfs_iroot_realloc( */ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ + XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); @@ -2061,7 +2145,7 @@ xfs_iroot_realloc( (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); return; } @@ -2076,7 +2160,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; ASSERT(new_max >= 0); if (new_max > 0) - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); else new_size = 0; if (new_size > 0) { @@ -2084,7 +2168,8 @@ xfs_iroot_realloc( /* * First copy over the btree block header. 
*/ - memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); + memcpy(new_broot, ifp->if_broot, + XFS_BMBT_BLOCK_LEN(ip->i_mount)); } else { new_broot = NULL; ifp->if_flags &= ~XFS_IFBROOT; @@ -2114,7 +2199,7 @@ xfs_iroot_realloc( ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); + XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); return; } @@ -2427,7 +2512,7 @@ xfs_iflush_fork( ASSERT(ifp->if_broot != NULL); ASSERT(ifp->if_broot_bytes <= (XFS_IFORK_SIZE(ip, whichfork) + - XFS_BROOT_SIZE_ADJ)); + XFS_BROOT_SIZE_ADJ(ip))); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); @@ -2715,20 +2800,18 @@ abort_out: STATIC int xfs_iflush_int( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_inode_log_item_t *iip; - xfs_dinode_t *dip; - xfs_mount_t *mp; + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - iip = ip->i_itemp; - mp = ip->i_mount; + ASSERT(iip != NULL && iip->ili_fields != 0); /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -2789,9 +2872,9 @@ xfs_iflush_int( } /* * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. + * postdate a log record during recovery. This is redundant as we now + * log every change and hence this can't happen. Still, it doesn't hurt. */ - ip->i_d.di_flushiter++; /* @@ -2867,41 +2950,30 @@ xfs_iflush_int( * need the AIL lock, because it is a 64 bit value that cannot be read * atomically. */ - if (iip != NULL && iip->ili_fields != 0) { - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_logged = 1; + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); - /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. - */ - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + /* + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. + */ + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); - ASSERT(bp->b_fspriv != NULL); - ASSERT(bp->b_iodone != NULL); - } else { - /* - * We're flushing an inode which is not in the AIL and has - * not been logged. For this case we can immediately drop - * the inode flush lock because we can avoid the whole - * AIL state thing. It's OK to drop the flush lock now, - * because we've already locked the buffer and to do anything - * you really need both. 
- */ - if (iip != NULL) { - ASSERT(iip->ili_logged == 0); - ASSERT(iip->ili_last_fields == 0); - ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); - } - xfs_ifunlock(ip); - } + /* update the lsn in the on disk inode if required */ + if (ip->i_d.di_version == 3) + dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); + + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); + ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_iodone != NULL); return 0; corrupt_out: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 237e7f6f2ab3..91129794aaec 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -150,13 +150,38 @@ typedef struct xfs_icdinode { __uint16_t di_dmstate; /* DMIG state info */ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ } xfs_icdinode_t; +static inline uint xfs_icdinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_icdinode); + return offsetof(struct xfs_icdinode, di_next_unlinked); +} + /* * Flags for xfs_ichgtime(). */ #define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ #define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ /* * Per-fork incore inode flags. @@ -180,10 +205,11 @@ typedef struct xfs_icdinode { #define XFS_IFORK_DSIZE(ip) \ (XFS_IFORK_Q(ip) ? \ XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount)) + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) #define XFS_IFORK_ASIZE(ip) \ (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ + XFS_IFORK_BOFF(ip) : \ 0) #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? 
\ @@ -555,6 +581,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f034bd1652f0..f76ff52e43c0 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -179,7 +179,7 @@ xfs_inode_item_format( nvecs = 1; vecp->i_addr = &ip->i_d; - vecp->i_len = sizeof(struct xfs_icdinode); + vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; nvecs++; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a30dd899d2b..8f8aaee7f379 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -42,6 +42,8 @@ #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -362,10 +364,65 @@ xfs_iomap_eof_prealloc_initial_size( if (imap[0].br_startblock == HOLESTARTBLOCK) return 0; if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) - return imap[0].br_blockcount; + return imap[0].br_blockcount << 1; return XFS_B_TO_FSB(mp, offset); } +STATIC bool +xfs_quota_need_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t alloc_blocks) +{ + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + if (!dq || !xfs_this_quota_on(ip->i_mount, type)) + return false; + + /* no hi watermark, no throttle */ + if (!dq->q_prealloc_hi_wmark) + return false; + + /* under the lo watermark, no throttle */ + if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark) + return false; + + return true; +} + +STATIC void +xfs_quota_calc_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t *qblocks, + int *qshift) +{ + int64_t freesp; + int shift = 0; + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + /* over hi wmark, squash the prealloc completely */ + if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + *qblocks = 0; + return; + } + + freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { + shift = 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) + shift += 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) + shift += 2; + } + + /* only overwrite the throttle values if we are more aggressive */ + if ((freesp >> shift) < (*qblocks >> *qshift)) { + *qblocks = freesp; + *qshift = shift; + } +} + /* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. 
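The xfs_icdinode_size() helper and the xfs_inode_item_format() hunk above show the trick used to keep one in-core structure serving both inode formats: the v3-only members are appended to the struct, and the logged length is truncated at the first such member for older inodes. A standalone sketch with a made-up structure:

#include <stddef.h>
#include <stdint.h>

struct icdinode {
	uint8_t  version;
	/* ... v1/v2 core fields ... */
	uint32_t next_unlinked;	/* first field not logged for v1/v2 */
	uint32_t crc;		/* v3-only members from here on */
	uint64_t changecount;
};

static inline size_t icdinode_size(int version)
{
	if (version == 3)
		return sizeof(struct icdinode);
	return offsetof(struct icdinode, next_unlinked);
}

/*
 * The log vector then covers exactly the format-appropriate prefix,
 * as in the hunk above:
 *	vecp->i_len = icdinode_size(ip->i_d.di_version);
 */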
Cap the maximum size @@ -381,45 +438,89 @@ xfs_iomap_prealloc_size( int nimaps) { xfs_fsblock_t alloc_blocks = 0; + int shift = 0; + int64_t freesp; + xfs_fsblock_t qblocks; + int qshift = 0; alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, imap, nimaps); - if (alloc_blocks > 0) { - int shift = 0; - int64_t freesp; - - alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, - rounddown_pow_of_two(alloc_blocks)); - - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - freesp = mp->m_sb.sb_fdblocks; - if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { - shift = 2; - if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) - shift++; - if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) - shift++; - } - if (shift) - alloc_blocks >>= shift; + if (!alloc_blocks) + goto check_writeio; + qblocks = alloc_blocks; - /* - * If we are still trying to allocate more space than is - * available, squash the prealloc hard. This can happen if we - * have a large file on a small filesystem and the above - * lowspace thresholds are smaller than MAXEXTLEN. - */ - while (alloc_blocks && alloc_blocks >= freesp) - alloc_blocks >>= 4; + /* + * MAXEXTLEN is not a power of two value but we round the prealloc down + * to the nearest power of two value after throttling. To prevent the + * round down from unconditionally reducing the maximum supported prealloc + * size, we round up first, apply appropriate throttling, round down and + * cap the value to MAXEXTLEN. + */ + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + freesp = mp->m_sb.sb_fdblocks; + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; } + /* + * Check each quota to cap the prealloc size and provide a shift + * value to throttle with. + */ + if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift); + + /* + * The final prealloc size is set to the minimum of free space available + * in each of the quotas and the overall filesystem. + * + * The shift throttle value is set to the maximum value as determined by + * the global low free space values and per-quota low free space values. + */ + alloc_blocks = MIN(alloc_blocks, qblocks); + shift = MAX(shift, qshift); + + if (shift) + alloc_blocks >>= shift; + /* + * rounddown_pow_of_two() returns an undefined result if we pass in + * alloc_blocks = 0. + */ + if (alloc_blocks) + alloc_blocks = rounddown_pow_of_two(alloc_blocks); + if (alloc_blocks > MAXEXTLEN) + alloc_blocks = MAXEXTLEN; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. 
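The reworked xfs_iomap_prealloc_size() is easier to follow as plain arithmetic: take the smaller of the filesystem and quota candidates, take the worst (largest) throttle shift of the global and per-quota low-space checks, shift, round down to a power of two, cap at MAXEXTLEN, squash hard against free space, and finally floor at the write I/O size. A hedged, self-contained rendering of that pipeline with MIN/MAX spelled out; the example numbers are invented:

#include <stdint.h>

/* Clear low set bits until one remains; returns 0 for 0, whereas the
 * kernel's rounddown_pow_of_two() is undefined for 0, hence the guard. */
static uint64_t rounddown_pow2(uint64_t x)
{
	while (x & (x - 1))
		x &= x - 1;
	return x;
}

static uint64_t prealloc_blocks(uint64_t want, uint64_t qwant,
				int fs_shift, int q_shift,
				uint64_t freesp, uint64_t maxext,
				uint64_t writeio)
{
	uint64_t alloc = want < qwant ? want : qwant;		/* MIN */
	int shift = fs_shift > q_shift ? fs_shift : q_shift;	/* MAX */

	if (shift)
		alloc >>= shift;
	if (alloc)
		alloc = rounddown_pow2(alloc);
	if (alloc > maxext)
		alloc = maxext;
	while (alloc && alloc >= freesp)	/* squash hard if still huge */
		alloc >>= 4;
	return alloc < writeio ? writeio : alloc;
}

/* e.g. want = 2097152 blocks with shift = 4 gives 131072, already a
 * power of two and under a MAXEXTLEN of 2097151, so 131072 blocks are
 * preallocated (assuming freesp and writeio do not intervene). */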
+ */ + while (alloc_blocks && alloc_blocks >= freesp) + alloc_blocks >>= 4; + +check_writeio: if (alloc_blocks < mp->m_writeio_blocks) alloc_blocks = mp->m_writeio_blocks; + trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, + mp->m_writeio_blocks); + return alloc_blocks; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index fe7e4df85a7b..14e59d953b7b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -72,6 +72,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/list_sort.h> +#include <linux/ratelimit.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index eec226f78a40..b345a7c85153 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3485,7 +3485,7 @@ xlog_ticket_alloc( tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = random32(); + tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ddc4529d07d3..e3d0b85d852b 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -668,10 +668,6 @@ xlog_cil_push_foreground( * transaction to the checkpoint context so we carry the busy extents through * to checkpoint completion, and then unlock all the items in the transaction. * - * For more specific information about the order of operations in - * xfs_log_commit_cil() please refer to the comments in - * xfs_trans_commit_iclog(). - * * Called with the context lock already held in read mode to lock out * background commit, returns without it held once background commits are * allowed again. diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 16d8d12ea3b4..b9ea262dd1c2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -468,7 +468,6 @@ struct xfs_cil { * threshold, yet give us plenty of space for aggregation on large logs. */ #define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) -#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) /* * ticket grant locks, queues and accounting have their own cachlines diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d1dba7ce75ae..93f03ec17eec 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -29,6 +29,7 @@ #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" +#include "xfs_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -45,6 +46,14 @@ #include "xfs_trace.h" #include "xfs_icache.h" +/* Need all the magic numbers and buffer ops structures from these headers */ +#include "xfs_symlink.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" + STATIC int xlog_find_zeroed( struct xlog *, @@ -1785,6 +1794,7 @@ xlog_recover_do_inode_buffer( xfs_agino_t *buffer_nextp; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { @@ -1857,6 +1867,201 @@ xlog_recover_do_inode_buffer( } /* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. 
+ */ +static void +xlog_recovery_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + /* + * we get here with inode allocation buffers, not buffers that + * track unlinked list changes. 
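xlog_recovery_validate_buf_type() keys off three differently sized magic numbers depending on where a buffer type stores its signature, and on a mismatch it warns and leaves b_ops unset rather than failing recovery outright. A sketch of just the extraction and dispatch, with invented type codes standing in for the blf types and ops pointers; the AGI and dinode magics are the real values:

#include <stdint.h>

static uint32_t get_be32(const void *p)
{
	const uint8_t *b = p;
	return (uint32_t)b[0] << 24 | (uint32_t)b[1] << 16 |
	       (uint32_t)b[2] << 8 | b[3];
}

static uint16_t get_be16(const void *p)
{
	const uint8_t *b = p;
	return (uint16_t)(b[0] << 8 | b[1]);
}

enum blft { BLFT_AGI, BLFT_DINO };		/* stand-ins */
enum ops  { OPS_NONE, OPS_AGI, OPS_INODE };

#define AGI_MAGIC	0x58414749u	/* 'XAGI' */
#define DINODE_MAGIC	0x494eu		/* 'IN' */

static enum ops classify(enum blft type, const char *buf)
{
	switch (type) {
	case BLFT_AGI:
		/* 32-bit magic at the start of the block */
		return get_be32(buf) == AGI_MAGIC ? OPS_AGI : OPS_NONE;
	case BLFT_DINO:
		/* inode buffers carry a 16-bit magic per inode */
		return get_be16(buf) == DINODE_MAGIC ? OPS_INODE : OPS_NONE;
	}
	return OPS_NONE;	/* unknown: warn and write without verifier */
}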
+ */ + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } +} + +/* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. 
The bitmap in the buf log format structure indicates @@ -1928,6 +2133,8 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); + + xlog_recovery_validate_buf_type(mp, bp, buf_f); } /* @@ -2213,6 +2420,7 @@ xlog_recover_inode_pass2( int attr_index; uint fields; xfs_icdinode_t *dicp; + uint isize; int need_free = 0; if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { @@ -2238,7 +2446,7 @@ xlog_recover_inode_pass2( trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, - NULL); + &xfs_inode_buf_ops); if (!bp) { error = ENOMEM; goto error; } @@ -2349,7 +2557,8 @@ xlog_recover_inode_pass2( error = EFSCORRUPTED; goto error; } - if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); @@ -2361,13 +2570,13 @@ } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { - memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; @@ -2451,6 +2660,9 @@ xlog_recover_inode_pass2( } write_inode_buffer: + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); @@ -2948,6 +3160,7 @@ xlog_recover_process_efi( * This will pull the EFI from the AIL and * free the memory associated with it. */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); return XFS_ERROR(EIO); } @@ -3751,6 +3964,25 @@ xlog_recover( return error; } + /* + * Version 5 superblock log feature mask validation. We know the + * log is dirty so check if there are any unknown log features + * in what we need to recover. If there are unknown features + * (e.g. unsupported transactions), then simply reject the + * attempt at recovery before touching anything. + */ + if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { + xfs_warn(log->l_mp, +"Superblock has unknown incompatible log features (0x%x) enabled.\n" +"The log can not be fully and/or safely recovered by this kernel.\n" +"Please recover the log on a kernel that supports the unknown features.", + (log->l_mp->m_sb.sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + return EINVAL; + } + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", log->l_mp->m_logname ? log->l_mp->m_logname : "internal"); diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 56dc0c17f16a..76c81982f964 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -30,6 +30,32 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) } #endif +#define xfs_printk_ratelimited(func, dev, fmt, ...)
\ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + func(dev, fmt, ##__VA_ARGS__); \ +} while (0) + +#define xfs_emerg_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) +#define xfs_alert_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__) +#define xfs_crit_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__) +#define xfs_err_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__) +#define xfs_notice_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__) +#define xfs_info_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__) +#define xfs_debug_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) + extern void assfail(char *expr, char *f, int l); extern void xfs_hex_dump(void *p, int length); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 3806088a8f77..f6bfbd734669 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,6 +43,8 @@ #include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" #ifdef HAVE_PERCPU_SB @@ -109,6 +111,14 @@ static const struct { { offsetof(xfs_sb_t, sb_logsunit), 0 }, { offsetof(xfs_sb_t, sb_features2), 0 }, { offsetof(xfs_sb_t, sb_bad_features2), 0 }, + { offsetof(xfs_sb_t, sb_features_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_ro_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_incompat), 0 }, + { offsetof(xfs_sb_t, sb_features_log_incompat), 0 }, + { offsetof(xfs_sb_t, sb_crc), 0 }, + { offsetof(xfs_sb_t, sb_pad), 0 }, + { offsetof(xfs_sb_t, sb_pquotino), 0 }, + { offsetof(xfs_sb_t, sb_lsn), 0 }, { sizeof(xfs_sb_t), 0 } }; @@ -319,11 +329,54 @@ xfs_mount_validate_sb( return XFS_ERROR(EWRONGFS); } + if (!xfs_sb_good_version(sbp)) { xfs_warn(mp, "bad version"); return XFS_ERROR(EWRONGFS); } + /* + * Version 5 superblock feature mask validation. Reject combinations the + * kernel cannot support up front before checking anything else. + */ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + xfs_alert(mp, +"Version 5 superblock detected. 
This kernel has EXPERIMENTAL support enabled!\n" +"Use of these features in this kernel is at your own risk!"); + + if (xfs_sb_has_compat_feature(sbp, + XFS_SB_FEAT_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown compatible features (0x%x) enabled.\n" +"Using a more recent kernel is recommended.", + (sbp->sb_features_compat & + XFS_SB_FEAT_COMPAT_UNKNOWN)); + } + + if (xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_alert(mp, +"Superblock has unknown read-only compatible features (0x%x) enabled.", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, +"Attempted to mount read-only compatible filesystem read-write.\n" +"Filesystem can only be safely mounted read only."); + return XFS_ERROR(EINVAL); + } + } + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown incompatible features (0x%x) enabled.\n" +"Filesystem can not be safely mounted by this kernel.", + (sbp->sb_features_incompat & + XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + return XFS_ERROR(EINVAL); + } + } + if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -557,6 +610,14 @@ xfs_sb_from_disk( to->sb_logsunit = be32_to_cpu(from->sb_logsunit); to->sb_features2 = be32_to_cpu(from->sb_features2); to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); + to->sb_features_compat = be32_to_cpu(from->sb_features_compat); + to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); + to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); + to->sb_features_log_incompat = + be32_to_cpu(from->sb_features_log_incompat); + to->sb_pad = 0; + to->sb_pquotino = be64_to_cpu(from->sb_pquotino); + to->sb_lsn = be64_to_cpu(from->sb_lsn); } /* @@ -612,13 +673,12 @@ xfs_sb_to_disk( } } -static void +static int xfs_sb_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_sb sb; - int error; xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); @@ -626,16 +686,46 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); - if (error) - xfs_buf_ioerror(bp, error); + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); } +/* + * If the superblock has the CRC feature bit set or the CRC field is non-null, + * check that the CRC is valid. We check the CRC field is non-null because a + * single bit error could clear the feature bit and unused parts of the + * superblock are supposed to be zero. Hence a non-null crc field indicates that + * we've potentially lost a feature bit and we should check it anyway. 
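The read-side gate described in the comment below reduces to a small predicate: checksum whenever the version number says v5, or whenever the CRC field is non-zero, since unused superblock space must be zero and a set CRC field therefore hints at a flipped version or feature bit. A sketch taking the already byte-swapped superblock fields as parameters; the magic and mask values are the real ones:

#include <stdbool.h>
#include <stdint.h>

#define SB_MAGIC		0x58465342u	/* 'XFSB' */
#define SB_VERSION_NUMBITS	0x000fu
#define SB_VERSION_5		5u

static bool sb_should_check_crc(uint32_t magicnum, uint16_t versionnum,
				uint32_t crc)
{
	if (magicnum != SB_MAGIC)
		return false;	/* not XFS; full validation will complain */
	if ((versionnum & SB_VERSION_NUMBITS) == SB_VERSION_5)
		return true;
	/* possibly a v5 superblock that lost its version bits */
	return crc != 0;
}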
+ */ static void xfs_sb_read_verify( struct xfs_buf *bp) { - xfs_sb_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + int error; + + /* + * open code the version check to avoid needing to convert the entire + * superblock from disk order just to check the version number + */ + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && + (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == + XFS_SB_VERSION_5) || + dsb->sb_crc != 0)) { + + if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize), + offsetof(struct xfs_sb, sb_crc))) { + error = EFSCORRUPTED; + goto out_error; + } + } + error = xfs_sb_verify(bp); + +out_error: + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, error); + } } /* @@ -648,11 +738,10 @@ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { - struct xfs_sb sb; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); - xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); - if (sb.sb_magicnum == XFS_SB_MAGIC) { + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ xfs_sb_read_verify(bp); return; @@ -663,9 +752,27 @@ xfs_sb_quiet_read_verify( static void xfs_sb_write_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - xfs_sb_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + int error; + + error = xfs_sb_verify(bp); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, error); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_sb, sb_crc)); } const struct xfs_buf_ops xfs_sb_buf_ops = { @@ -687,7 +794,8 @@ int xfs_readsb(xfs_mount_t *mp, int flags) { unsigned int sector_size; - xfs_buf_t *bp; + struct xfs_buf *bp; + struct xfs_sb *sbp = &mp->m_sb; int error; int loud = !(flags & XFS_MFSI_QUIET); @@ -714,7 +822,7 @@ reread: if (bp->b_error) { error = bp->b_error; if (loud) - xfs_warn(mp, "SB validate failed"); + xfs_warn(mp, "SB validate failed with error %d.", error); goto release_buf; } @@ -726,10 +834,10 @@ reread: /* * We must be able to do sector-sized and sector-aligned IO. */ - if (sector_size > mp->m_sb.sb_sectsize) { + if (sector_size > sbp->sb_sectsize) { if (loud) xfs_warn(mp, "device supports %u byte sectors (not %u)", - sector_size, mp->m_sb.sb_sectsize); + sector_size, sbp->sb_sectsize); error = ENOSYS; goto release_buf; } @@ -738,15 +846,18 @@ reread: * If device sector size is smaller than the superblock size, * re-read the superblock so the buffer is correctly sized. 
*/ - if (sector_size < mp->m_sb.sb_sectsize) { + if (sector_size < sbp->sb_sectsize) { xfs_buf_relse(bp); - sector_size = mp->m_sb.sb_sectsize; + sector_size = sbp->sb_sectsize; goto reread; } /* Initialize per-cpu counters */ xfs_icsb_reinit_counters(mp); + /* no need to be quiet anymore, so reset the buf ops */ + bp->b_ops = &xfs_sb_buf_ops; + mp->m_sb_bp = bp; xfs_buf_unlock(bp); return 0; @@ -1633,6 +1744,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) ASSERT((1LL << f) & XFS_SB_MOD_BITS); first = xfs_sb_info[f].offset; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, first, last); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bc907061d392..b004cecdfb04 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -207,7 +207,6 @@ typedef struct xfs_mount { trimming */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ - struct shrinker m_inode_shrink; /* inode reclaim shrinker */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ @@ -392,6 +391,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ +extern void xfs_sb_calc_crc(struct xfs_buf *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e5b5cf973781..f41702b43003 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -617,6 +617,20 @@ xfs_qm_dqdetach( } } +int +xfs_qm_calc_dquots_per_chunk( + struct xfs_mount *mp, + unsigned int nbblks) /* basic block units */ +{ + unsigned int ndquots; + + ASSERT(nbblks > 0); + ndquots = BBTOB(nbblks); + do_div(ndquots, sizeof(xfs_dqblk_t)); + + return ndquots; +} + /* * This initializes all the quota information that's kept in the * mount structure @@ -656,9 +670,8 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - ASSERT(qinf->qi_dqchunklen); - qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen); - do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t)); + qinf->qi_dqperchunk = xfs_qm_calc_dquots_per_chunk(mp, + qinf->qi_dqchunklen); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -897,6 +910,10 @@ xfs_qm_dqiter_bufs( if (error) break; + /* + * XXX(hch): need to figure out if it makes sense to validate + * the CRC here. + */ xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); @@ -1057,7 +1074,7 @@ xfs_qm_quotacheck_dqadjust( * There are no timers for the default values set in the root dquot. 
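The newly factored-out xfs_qm_calc_dquots_per_chunk() just converts a cluster length in basic (512-byte) blocks into a dquot count. A worked standalone version; the 136-byte xfs_dqblk_t size is taken from the structure as laid out in this series, so treat it as illustrative rather than authoritative:

#include <assert.h>
#include <stdio.h>

#define BBSHIFT		9			/* 512-byte basic blocks */
#define BBTOB(bbs)	((unsigned long)(bbs) << BBSHIFT)
#define DQBLK_SIZE	136			/* sizeof(xfs_dqblk_t) here */

static unsigned int dquots_per_chunk(unsigned int nbblks)
{
	assert(nbblks > 0);
	return BBTOB(nbblks) / DQBLK_SIZE;
}

int main(void)
{
	/* one 4096-byte filesystem block cluster = 8 basic blocks */
	printf("%u dquots per chunk\n", dquots_per_chunk(8));	/* -> 30 */
	return 0;
}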
*/ if (dqp->q_core.d_id) { - xfs_qm_adjust_dqlimits(mp, &dqp->q_core); + xfs_qm_adjust_dqlimits(mp, dqp); xfs_qm_adjust_dqtimers(mp, &dqp->q_core); } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 44b858b79d71..5d16a6e6900f 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -75,6 +75,8 @@ typedef struct xfs_quotainfo { &((qi)->qi_gquota_tree)) +extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp, + unsigned int nbblks); extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, xfs_dquot_t *, xfs_dquot_t *, long, long, uint); @@ -116,7 +118,7 @@ extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, fs_disk_quota_t *); -extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, +extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, fs_disk_quota_t *); extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index cf9a34051e07..c41190cad6e9 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -472,15 +472,15 @@ xfs_qm_scall_getqstat( */ int xfs_qm_scall_setqlim( - xfs_mount_t *mp, + struct xfs_mount *mp, xfs_dqid_t id, uint type, fs_disk_quota_t *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; - xfs_disk_dquot_t *ddq; - xfs_dquot_t *dqp; - xfs_trans_t *tp; + struct xfs_disk_dquot *ddq; + struct xfs_dquot *dqp; + struct xfs_trans *tp; int error; xfs_qcnt_t hard, soft; @@ -529,6 +529,7 @@ xfs_qm_scall_setqlim( if (hard == 0 || hard >= soft) { ddq->d_blk_hardlimit = cpu_to_be64(hard); ddq->d_blk_softlimit = cpu_to_be64(soft); + xfs_dquot_set_prealloc_limits(dqp); if (id == 0) { q->qi_bhardlimit = hard; q->qi_bsoftlimit = soft; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index b50ec5b95d5a..c61e31c7d997 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -77,7 +77,14 @@ typedef struct xfs_disk_dquot { */ typedef struct xfs_dqblk { xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ - char dd_fill[32]; /* filling for posterity */ + char dd_fill[4]; /* filling for posterity */ + + /* + * These two are only present on filesystems with the CRC bits set. 
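Note what the xfs_dqblk_t hunk below actually does: the 32 bytes of dd_fill shrink to 4, and the freed 28 bytes are reused for the CRC (4), LSN (8) and UUID (16), so the on-disk dquot record keeps its size and the dquots-per-chunk calculation is unchanged. A sketch encoding that invariant; the 104-byte core size mirrors xfs_disk_dquot_t here but is not independently authoritative:

#include <stdint.h>

typedef struct { uint8_t b[16]; } uuid_sk;	/* stand-in for uuid_t */

struct dqblk_v4 {
	char	core[104];	/* xfs_disk_dquot_t, the incore portion */
	char	dd_fill[32];
};

struct dqblk_v5 {
	char	 core[104];
	char	 dd_fill[4];
	uint32_t dd_crc;
	uint64_t dd_lsn;
	uuid_sk	 dd_uuid;
};

_Static_assert(sizeof(struct dqblk_v4) == sizeof(struct dqblk_v5),
	       "CRC fields must fit in the old padding");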
+ */ + __be32 dd_crc; /* checksum */ + __be64 dd_lsn; /* last modification in log */ + uuid_t dd_uuid; /* location information */ } xfs_dqblk_t; /* @@ -380,5 +387,7 @@ extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); extern int xfs_mount_reset_sbqflags(struct xfs_mount *); +extern const struct xfs_buf_ops xfs_dquot_buf_ops; + #endif /* __KERNEL__ */ #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index a05b45175fb0..2de58a85833c 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -32,6 +32,7 @@ struct xfs_mount; #define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ #define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ #define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ #define XFS_SB_VERSION_NUMBITS 0x000f #define XFS_SB_VERSION_ALLFBITS 0xfff0 #define XFS_SB_VERSION_SASHFBITS 0xf000 @@ -161,6 +162,20 @@ typedef struct xfs_sb { */ __uint32_t sb_bad_features2; + /* version 5 superblock fields start here */ + + /* feature masks */ + __uint32_t sb_features_compat; + __uint32_t sb_features_ro_compat; + __uint32_t sb_features_incompat; + __uint32_t sb_features_log_incompat; + + __uint32_t sb_crc; /* superblock crc */ + __uint32_t sb_pad; + + xfs_ino_t sb_pquotino; /* project quota inode */ + xfs_lsn_t sb_lsn; /* last write sequence */ + /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -229,7 +244,21 @@ typedef struct xfs_dsb { * for features2 bits. Easiest just to mark it bad and not use * it for anything else. */ - __be32 sb_bad_features2; + __be32 sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __be32 sb_features_compat; + __be32 sb_features_ro_compat; + __be32 sb_features_incompat; + __be32 sb_features_log_incompat; + + __le32 sb_crc; /* superblock crc */ + __be32 sb_pad; + + __be64 sb_pquotino; /* project quota inode */ + __be64 sb_lsn; /* last write sequence */ /* must be padded to 64 bit alignment */ } xfs_dsb_t; @@ -250,7 +279,10 @@ typedef enum { XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, + XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, + XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, + XFS_SBS_PQUOTINO, XFS_SBS_LSN, XFS_SBS_FIELDCOUNT } xfs_sb_field_t; @@ -276,6 +308,12 @@ typedef enum { #define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) #define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) #define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) +#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) +#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) +#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) +#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) +#define XFS_SB_CRC XFS_SB_MVAL(CRC) +#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) #define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) #define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) #define XFS_SB_MOD_BITS \ @@ -283,7 +321,9 @@ typedef enum { XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_BAD_FEATURES2) + XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ + 
XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \ + XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) /* @@ -325,6 +365,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp) return 1; } + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return 1; return 0; } @@ -365,7 +407,7 @@ static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) { return sbp->sb_versionnum == XFS_SB_VERSION_2 || sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); } @@ -373,7 +415,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) { if (sbp->sb_versionnum == XFS_SB_VERSION_1) sbp->sb_versionnum = XFS_SB_VERSION_2; - else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + else if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; else sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; @@ -382,7 +424,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) { return sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); } @@ -396,13 +438,13 @@ static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); } static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) { - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; else sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) | @@ -411,13 +453,14 @@ static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); } static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); } @@ -429,38 +472,42 @@ static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)); } static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)); } static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum 
& XFS_SB_VERSION_EXTFLGBIT)); } static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); } static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT)); } /* @@ -475,14 +522,16 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); } static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); } static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) @@ -500,14 +549,73 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) { - return xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); } static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) { - return (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + + +/* + * Extended v5 superblock feature masks. These are to be used for new v5 + * superblock features only. + * + * Compat features are new features that old kernels will not notice or affect + * and so can mount read-write without issues. + * + * RO-Compat (read only) are features that old kernels can read but will break + * if they write. Hence only read-only mounts of such filesystems are allowed on + * kernels that don't support the feature bit. + * + * InCompat features are features which old kernels will not understand and so + * must not mount. + * + * Log-InCompat features are for changes to log formats or new transactions that + * can't be replayed on older kernels. The fields are set when the filesystem is + * mounted, and a clean unmount clears the fields. 
+ */ +#define XFS_SB_FEAT_COMPAT_ALL 0 +#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL +static inline bool +xfs_sb_has_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_compat & feature) != 0; +} + +#define XFS_SB_FEAT_RO_COMPAT_ALL 0 +#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL +static inline bool +xfs_sb_has_ro_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_ro_compat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL +static inline bool +xfs_sb_has_incompat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_incompat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL +static inline bool +xfs_sb_has_incompat_log_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_log_incompat & feature) != 0; } /* diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c new file mode 100644 index 000000000000..5f234389327c --- /dev/null +++ b/fs/xfs/xfs_symlink.c @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_itable.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_utils.h" +#include "xfs_trans_space.h" +#include "xfs_log_priv.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" + + +/* + * Each contiguous block has a header, so it is not just a simple pathlen + * to FSB conversion. 
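The function that follows performs this conversion. As a worked check of the arithmetic, assuming the 56-byte xfs_dsymlink_hdr defined later in the patch and the minimum 512-byte block size:

/* Worked check of the pathlen-to-block conversion: on CRC filesystems
 * every symlink block loses sizeof(struct xfs_dsymlink_hdr) bytes (56)
 * to the header, so a 1024-byte MAXPATHLEN target needs 3 blocks at
 * the minimum 512-byte block size, hence XFS_SYMLINK_MAPS == 3.
 * Self-contained mirror of xfs_symlink_blocks(). */
#include <stdio.h>

#define HDR_SIZE	56	/* sizeof(struct xfs_dsymlink_hdr) */

static int symlink_blocks(int blocksize, int pathlen, int crc_enabled)
{
	int space = blocksize - (crc_enabled ? HDR_SIZE : 0);
	int fsblocks = 0;

	do {
		fsblocks++;
		pathlen -= space;
	} while (pathlen > 0);

	return fsblocks;
}

int main(void)
{
	/* 1024 bytes over (512 - 56) = 456 usable bytes rounds up to 3 */
	printf("crc, 512B blocks, 1024B path: %d blocks\n",
	       symlink_blocks(512, 1024, 1));
	/* without CRC headers two blocks suffice, as in the old code */
	printf("no crc, 512B blocks, 1024B path: %d blocks\n",
	       symlink_blocks(512, 1024, 0));
	return 0;
}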
+ */ +int +xfs_symlink_blocks( + struct xfs_mount *mp, + int pathlen) +{ + int fsblocks = 0; + int len = pathlen; + + do { + fsblocks++; + len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + } while (len > 0); + + ASSERT(fsblocks <= XFS_SYMLINK_MAPS); + return fsblocks; +} + +static int +xfs_symlink_hdr_set( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); + dsl->sl_offset = cpu_to_be32(offset); + dsl->sl_bytes = cpu_to_be32(size); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + dsl->sl_owner = cpu_to_be64(ino); + dsl->sl_blkno = cpu_to_be64(bp->b_bn); + bp->b_ops = &xfs_symlink_buf_ops; + + return sizeof(struct xfs_dsymlink_hdr); +} + +/* + * Checking of the symlink header is split into two parts: the verifier does + * CRC, location and bounds checking; the unpacking function checks the path + * parameters and owner. + */ +bool +xfs_symlink_hdr_ok( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (offset != be32_to_cpu(dsl->sl_offset)) + return false; + if (size != be32_to_cpu(dsl->sl_bytes)) + return false; + if (ino != be64_to_cpu(dsl->sl_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_symlink_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + return false; + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + return false; + if (be32_to_cpu(dsl->sl_offset) + + be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + return false; + if (dsl->sl_owner == 0) + return false; + + return true; +} + +static void +xfs_symlink_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_dsymlink_hdr, sl_crc)) || + !xfs_symlink_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +static void +xfs_symlink_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_symlink_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (bip) { + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + offsetof(struct xfs_dsymlink_hdr, sl_crc)); +} + +const struct xfs_buf_ops xfs_symlink_buf_ops = { + .verify_read = xfs_symlink_read_verify, + .verify_write = xfs_symlink_write_verify, +}; + +void +xfs_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + struct xfs_mount *mp = ip->i_mount; + char *buf; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + return;
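The read and write verifiers above checksum the whole block while skipping the CRC field itself, which is what allows the CRC to live inside the region it protects. A self-contained sketch of that pattern, with a plain bitwise CRC32c standing in for xfs_verify_cksum()/xfs_update_cksum(), whose exact seeding and finalisation are not reproduced here:

/* Sketch of the checksum-in-place pattern used by the verifiers: the
 * CRC field is treated as zero while checksumming, so the stored value
 * never feeds into its own computation. Illustrative only. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c(uint32_t crc, const uint8_t *p, size_t len)
{
	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
	}
	return ~crc;
}

/* Checksum the buffer with the 4-byte field at crc_off forced to zero. */
static uint32_t cksum_skip_field(const uint8_t *buf, size_t len, size_t crc_off)
{
	uint8_t zero[4] = { 0 };
	uint32_t crc = crc32c(0, buf, crc_off);		/* bytes before the field */
	crc = crc32c(crc, zero, sizeof(zero));		/* the field itself, as zero */
	return crc32c(crc, buf + crc_off + 4, len - crc_off - 4);
}

static void cksum_update(uint8_t *buf, size_t len, size_t crc_off)
{
	uint32_t crc = cksum_skip_field(buf, len, crc_off);
	memcpy(buf + crc_off, &crc, sizeof(crc));	/* host-endian in this sketch */
}

static bool cksum_verify(const uint8_t *buf, size_t len, size_t crc_off)
{
	uint32_t stored;
	memcpy(&stored, buf + crc_off, sizeof(stored));
	return stored == cksum_skip_field(buf, len, crc_off);
}

int main(void)
{
	uint8_t block[64] = "example symlink block payload";

	cksum_update(block, sizeof(block), 12);	/* 12 mimics offsetof(sl_crc) */
	printf("verify: %s\n", cksum_verify(block, sizeof(block), 12) ? "ok" : "bad");
	block[0] ^= 1;				/* corrupt one bit */
	printf("after corruption: %s\n",
	       cksum_verify(block, sizeof(block), 12) ? "ok" : "bad");
	return 0;
}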
+ } + + /* + * As this symlink fits in an inode literal area, it must also fit in + * the smallest buffer the filesystem supports. + */ + ASSERT(BBTOB(bp->b_length) >= + ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); + + bp->b_ops = &xfs_symlink_buf_ops; + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); + memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); +} + +/* ----- Kernel only functions below ----- */ +STATIC int +xfs_readlink_bmap( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_d.di_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &xfs_symlink_buf_ops); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + goto out; + } + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset, + byte_cnt, bp)) { + error = EFSCORRUPTED; + xfs_alert(mp, +"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)", + offset, byte_cnt, ip->i_ino); + xfs_buf_relse(bp); + goto out; + + } + + cur_chunk += sizeof(struct xfs_dsymlink_hdr); + } + + memcpy(link + offset, cur_chunk, byte_cnt); + + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_buf_relse(bp); + } + ASSERT(pathlen == 0); + + link[ip->i_d.di_size] = '\0'; + error = 0; + + out: + return error; +} + +int +xfs_readlink( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = 0; + + trace_xfs_readlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + pathlen = ip->i_d.di_size; + if (!pathlen) + goto out; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + error = XFS_ERROR(EFSCORRUPTED); + goto out; + } + + + if (ip->i_df.if_flags & XFS_IFINLINE) { + memcpy(link, ip->i_df.if_u1.if_data, pathlen); + link[pathlen] = '\0'; + } else { + error = xfs_readlink_bmap(ip, link); + } + + out: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +int +xfs_symlink( + struct xfs_inode *dp, + struct xfs_name *link_name, + const char *target_path, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = NULL; + struct xfs_inode *ip = NULL; + int error = 0; + int pathlen; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + xfs_fileoff_t first_fsb; + xfs_filblks_t fs_blocks; + int nmaps; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + xfs_daddr_t d; + const char *cur_chunk; + int byte_cnt; + int n; + xfs_buf_t *bp; + prid_t prid; + struct xfs_dquot *udqp, *gdqp; + uint resblks; + + *ipp = NULL; + + trace_xfs_symlink(dp,
link_name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Check component lengths of the target path name. + */ + pathlen = strlen(target_path); + if (pathlen >= MAXPATHLEN) /* total string too long */ + return XFS_ERROR(ENAMETOOLONG); + + udqp = gdqp = NULL; + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = xfs_get_projid(dp); + else + prid = XFS_PROJID_DEFAULT; + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * Will the symlink fit into the inode data fork? + * There can't be any attributes, so we get the whole variable part. + */ + if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + fs_blocks = 0; + else + fs_blocks = XFS_B_TO_FSB(mp, pathlen); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + if (error == ENOSPC && fs_blocks == 0) { + resblks = 0; + error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + /* + * Check whether the directory allows new symlinks or not. + */ + if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + error = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Reserve disk quota: blocks and inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); + if (error) + goto error_return; + + /* + * Check for the ability to enter the directory entry, if no space is reserved. + */ + error = xfs_dir_canenter(tp, dp, link_name, resblks); + if (error) + goto error_return; + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + xfs_bmap_init(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. + */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto error_return; + goto error1; + } + + /* + * An error after we've joined dp to the transaction will result in the + * transaction cancel unlocking dp, so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format.
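The reservation code above tries a full allocation first and, for an inline (zero data block) symlink, falls back to a reservation with no blocks when the first attempt returns ENOSPC. A toy model of that fallback, with a hypothetical reserve() standing in for xfs_trans_reserve():

/* Toy model of the ENOSPC fallback: the full reservation is attempted
 * first; if it fails and no data blocks are needed, retry with zero.
 * reserve() is a made-up stand-in, not a kernel function. */
#include <errno.h>
#include <stdio.h>

static int reserve(unsigned int blocks)
{
	/* Pretend the filesystem is almost full: only 0-block requests fit. */
	return blocks ? ENOSPC : 0;
}

static int reserve_symlink_space(unsigned int fs_blocks, unsigned int *resblks)
{
	int error;

	*resblks = fs_blocks + 4;	/* made-up metadata overhead */
	error = reserve(*resblks);
	if (error == ENOSPC && fs_blocks == 0) {
		*resblks = 0;		/* retry without a block reservation */
		error = reserve(0);
	}
	return error;
}

int main(void)
{
	unsigned int resblks;
	int error = reserve_symlink_space(0, &resblks);

	printf("inline symlink: error=%d resblks=%u\n", error, resblks);
	error = reserve_symlink_space(2, &resblks);
	printf("remote symlink: error=%d resblks=%u\n", error, resblks);
	return 0;
}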
+ */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + int offset; + + first_fsb = 0; + nmaps = XFS_SYMLINK_MAPS; + + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); + if (error) + goto error2; + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + if (!bp) { + error = ENOMEM; + goto error2; + } + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) { + byte_cnt = pathlen; + } + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, + byte_cnt, bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + } + + /* + * Create the directory entry for the symlink. + */ + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto error2; + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error2; + } + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + *ipp = ip; + return 0; + + error2: + IRELE(ip); + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + std_return: + return error; +} + +/* + * Free a symlink that has blocks associated with it. + */ +int +xfs_inactive_symlink_rmt( + xfs_inode_t *ip, + xfs_trans_t **tpp) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; + int nmaps; + xfs_trans_t *ntp; + int size; + xfs_trans_t *tp; + + tp = *tpp; + mp = ip->i_mount; + ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + + /* + * Lock the inode, fix the size, and join it to the transaction. + * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. 
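The function body that follows commits the unmap in the caller's transaction and hands a fresh transaction back, duplicating before committing so the inode stays attached and locked throughout. A toy model of that dup-then-commit ordering, using made-up types rather than the kernel's transaction API:

/* Toy model of the xfs_trans_dup()/xfs_trans_commit() dance: grab the
 * continuation transaction before committing the original, so the
 * caller is never left without a live transaction. Illustrative
 * stand-ins only, not the kernel API. */
#include <stdio.h>
#include <stdlib.h>

struct txn { int id; };

static int next_id = 1;

static struct txn *txn_dup(struct txn *old)
{
	struct txn *t = malloc(sizeof(*t));

	t->id = next_id++;
	printf("dup: txn %d continues as txn %d\n", old->id, t->id);
	return t;
}

static void txn_commit(struct txn *t)
{
	printf("commit: txn %d\n", t->id);
	free(t);
}

int main(void)
{
	struct txn *tp = malloc(sizeof(*tp));
	struct txn *ntp;

	tp->id = next_id++;
	ntp = txn_dup(tp);	/* take the continuation first... */
	txn_commit(tp);		/* ...then commit the original */
	tp = ntp;		/* the caller resumes in the new txn */
	txn_commit(tp);
	return 0;
}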
+ */ + done = 0; + xfs_bmap_init(&free_list, &first_block); + nmaps = ARRAY_SIZE(mval); + error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), + mval, &nmaps, 0); + if (error) + goto error0; + /* + * Invalidate the block(s). No validation is done. + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error1; + } + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done))) + goto error1; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) + goto error1; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. + */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Get a new, empty transaction to return to our caller. + */ + ntp = xfs_trans_dup(tp); + /* + * Commit the transaction containing extent freeing and EFDs. + * If we get an error on the commit here or on the reserve below, + * we need to unlock the inode since the new transaction doesn't + * have the inode attached. + */ + error = xfs_trans_commit(tp, 0); + tp = ntp; + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + + /* + * Remove the memory for extent descriptions (just bookkeeping). + */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + /* + * Put an itruncate log reservation in the new transaction + * for our caller. + */ + if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error0; + } + + xfs_trans_ijoin(tp, ip, 0); + *tpp = tp; + return 0; + + error1: + xfs_bmap_cancel(&free_list); + error0: + return error; +} diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h new file mode 100644 index 000000000000..b39398d2097c --- /dev/null +++ b/fs/xfs/xfs_symlink.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2012 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYMLINK_H +#define __XFS_SYMLINK_H 1 + +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; +struct xfs_buf; +struct xfs_ifork; +struct xfs_name; + +#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ + +struct xfs_dsymlink_hdr { + __be32 sl_magic; + __be32 sl_offset; + __be32 sl_bytes; + __be32 sl_crc; + uuid_t sl_uuid; + __be64 sl_owner; + __be64 sl_blkno; + __be64 sl_lsn; +}; + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 3 extents back from + * bmapi when crc headers are taken into account. + */ +#define XFS_SYMLINK_MAPS 3 + +#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_dsymlink_hdr) : 0)) + +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); + +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); + +extern const struct xfs_buf_ops xfs_symlink_buf_ops; + +#ifdef __KERNEL__ + +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp); + +#endif /* __KERNEL__ */ +#endif /* __XFS_SYMLINK_H */ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 624bedd81357..b6e3897c1d9f 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -22,7 +22,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" @@ -30,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_mount.h" +#include "xfs_da_btree.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_alloc.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 16a812977eab..aa4db3307d36 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -619,6 +619,30 @@ DECLARE_EVENT_CLASS(xfs_iref_class, (char *)__entry->caller_ip) ) +TRACE_EVENT(xfs_iomap_prealloc_size, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift, + unsigned int writeio_blocks), + TP_ARGS(ip, blocks, shift, writeio_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, blocks) + __field(int, shift) + __field(unsigned int, writeio_blocks) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->blocks = blocks; + __entry->shift = shift; + __entry->writeio_blocks = writeio_blocks; + ), + TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " + "m_writeio_blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, + __entry->blocks, __entry->shift, __entry->writeio_blocks) +) + #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 3edf5dbee001..73a5fa457e16 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -659,6 +659,7 @@ xfs_trans_binval( ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->__bli_format.blf_flags & 
XFS_BLF_INODE_BUF)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); @@ -671,6 +672,7 @@ xfs_trans_binval( bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; for (i = 0; i < bip->bli_format_count; i++) { memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); @@ -702,12 +704,13 @@ xfs_trans_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * This call is used to indicate that the buffer is going to * be staled and was an inode buffer. This means it gets - * special processing during unpin - where any inodes + * special processing during unpin - where any inodes * associated with the buffer should be removed from ail. * There is also special processing during recovery, * any replay of the inodes in the buffer needs to be @@ -726,6 +729,7 @@ xfs_trans_stale_inode_buf( bip->bli_flags |= XFS_BLI_STALE_INODE; bip->bli_item.li_cb = xfs_buf_iodone; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* @@ -749,8 +753,43 @@ xfs_trans_inode_alloc_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } +/* + * Set the type of the buffer for log recovery so that it can correctly identify + * and hence attach the correct buffer ops to the buffer after replay. + */ +void +xfs_trans_buf_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp, + enum xfs_blft type) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!tp) + return; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + xfs_blft_to_flags(&bip->__bli_format, type); +} + +void +xfs_trans_buf_copy_type( + struct xfs_buf *dst_bp, + struct xfs_buf *src_bp) +{ + struct xfs_buf_log_item *sbip = src_bp->b_fspriv; + struct xfs_buf_log_item *dbip = dst_bp->b_fspriv; + enum xfs_blft type; + + type = xfs_blft_from_flags(&sbip->__bli_format); + xfs_blft_to_flags(&dbip->__bli_format, type); +} /* * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of @@ -769,14 +808,28 @@ xfs_trans_dquot_buf( xfs_buf_t *bp, uint type) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; - ASSERT(bp->b_transp == tp); - ASSERT(bip != NULL); ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || type == XFS_BLF_GDQUOT_BUF); - ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->__bli_format.blf_flags |= type; + + switch (type) { + case XFS_BLF_UDQUOT_BUF: + type = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_BLF_PDQUOT_BUF: + type = XFS_BLFT_PDQUOT_BUF; + break; + case XFS_BLF_GDQUOT_BUF: + type = XFS_BLFT_GDQUOT_BUF; + break; + default: + type = XFS_BLFT_UNKNOWN_BUF; + break; + } + + xfs_trans_buf_set_type(tp, bp, type); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 642c2d6e1db1..fec75d023703 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -326,12 +326,12 @@ xfs_trans_dqlockedjoin( */ void xfs_trans_apply_dquot_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { int i, j; - xfs_dquot_t *dqp; - xfs_dqtrx_t *qtrx, *qa; - xfs_disk_dquot_t *d; + struct 
xfs_dquot *dqp; + struct xfs_dqtrx *qtrx, *qa; + struct xfs_disk_dquot *d; long totalbdelta; long totalrtbdelta; @@ -412,7 +412,7 @@ xfs_trans_apply_dquot_deltas( * Start/reset the timer(s) if needed. */ if (d->d_id) { - xfs_qm_adjust_dqlimits(tp->t_mountp, d); + xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); xfs_qm_adjust_dqtimers(tp->t_mountp, d); } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 77ad74834baa..1501f4fa51a6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -48,103 +49,8 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_symlink.h" -/* - * The maximum pathlen is 1024 bytes. Since the minimum file system - * blocksize is 512 bytes, we can get a max of 2 extents back from - * bmapi. - */ -#define SYMLINK_MAPS 2 - -STATIC int -xfs_readlink_bmap( - xfs_inode_t *ip, - char *link) -{ - xfs_mount_t *mp = ip->i_mount; - int pathlen = ip->i_d.di_size; - int nmaps = SYMLINK_MAPS; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - int byte_cnt; - int n; - xfs_buf_t *bp; - int error = 0; - - error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, - 0); - if (error) - goto out; - - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); - if (!bp) - return XFS_ERROR(ENOMEM); - error = bp->b_error; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - goto out; - } - if (pathlen < byte_cnt) - byte_cnt = pathlen; - pathlen -= byte_cnt; - - memcpy(link, bp->b_addr, byte_cnt); - xfs_buf_relse(bp); - } - - link[ip->i_d.di_size] = '\0'; - error = 0; - - out: - return error; -} - -int -xfs_readlink( - xfs_inode_t *ip, - char *link) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_fsize_t pathlen; - int error = 0; - - trace_xfs_readlink(ip); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - - pathlen = ip->i_d.di_size; - if (!pathlen) - goto out; - - if (pathlen < 0 || pathlen > MAXPATHLEN) { - xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", - __func__, (unsigned long long) ip->i_ino, - (long long) pathlen); - ASSERT(0); - error = XFS_ERROR(EFSCORRUPTED); - goto out; - } - - - if (ip->i_df.if_flags & XFS_IFINLINE) { - memcpy(link, ip->i_df.if_u1.if_data, pathlen); - link[pathlen] = '\0'; - } else { - error = xfs_readlink_bmap(ip, link); - } - - out: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} /* * This is called by xfs_inactive to free any blocks beyond eof @@ -249,145 +155,6 @@ xfs_free_eofblocks( return error; } -/* - * Free a symlink that has blocks associated with it. - */ -STATIC int -xfs_inactive_symlink_rmt( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - xfs_buf_t *bp; - int committed; - int done; - int error; - xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; - int i; - xfs_mount_t *mp; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - int nmaps; - xfs_trans_t *ntp; - int size; - xfs_trans_t *tp; - - tp = *tpp; - mp = ip->i_mount; - ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); - /* - * We're freeing a symlink that has some - * blocks allocated to it. Free the - * blocks here. We know that we've got - * either 1 or 2 extents and that we can - * free them all in one bunmapi call. 
- */ - ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - - /* - * Lock the inode, fix the size, and join it to the transaction. - * Hold it so in the normal path, we still have it locked for - * the second transaction. In the error paths we need it - * held so the cancel won't rele it, see below. - */ - size = (int)ip->i_d.di_size; - ip->i_d.di_size = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Find the block(s) so we can inval and unmap them. - */ - done = 0; - xfs_bmap_init(&free_list, &first_block); - nmaps = ARRAY_SIZE(mval); - error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), - mval, &nmaps, 0); - if (error) - goto error0; - /* - * Invalidate the block(s). - */ - for (i = 0; i < nmaps; i++) { - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); - if (!bp) { - error = ENOMEM; - goto error1; - } - xfs_trans_binval(tp, bp); - } - /* - * Unmap the dead block(s) to the free_list. - */ - if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, - &first_block, &free_list, &done))) - goto error1; - ASSERT(done); - /* - * Commit the first transaction. This logs the EFI and the inode. - */ - if ((error = xfs_bmap_finish(&tp, &free_list, &committed))) - goto error1; - /* - * The transaction must have been committed, since there were - * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. - * The new tp has the extent freeing and EFDs. - */ - ASSERT(committed); - /* - * The first xact was committed, so add the inode to the new one. - * Mark it dirty so it will be logged and moved forward in the log as - * part of every commit. - */ - xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Get a new, empty transaction to return to our caller. - */ - ntp = xfs_trans_dup(tp); - /* - * Commit the transaction containing extent freeing and EFDs. - * If we get an error on the commit here or on the reserve below, - * we need to unlock the inode since the new transaction doesn't - * have the inode attached. - */ - error = xfs_trans_commit(tp, 0); - tp = ntp; - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - - /* - * Remove the memory for extent descriptions (just bookkeeping). - */ - if (ip->i_df.if_bytes) - xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - /* - * Put an itruncate log reservation in the new transaction - * for our caller. 
- */ - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - - xfs_trans_ijoin(tp, ip, 0); - *tpp = tp; - return 0; - - error1: - xfs_bmap_cancel(&free_list); - error0: - return error; -} - int xfs_release( xfs_inode_t *ip) @@ -1353,247 +1120,6 @@ xfs_link( } int -xfs_symlink( - xfs_inode_t *dp, - struct xfs_name *link_name, - const char *target_path, - umode_t mode, - xfs_inode_t **ipp) -{ - xfs_mount_t *mp = dp->i_mount; - xfs_trans_t *tp; - xfs_inode_t *ip; - int error; - int pathlen; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - bool unlock_dp_on_error = false; - uint cancel_flags; - int committed; - xfs_fileoff_t first_fsb; - xfs_filblks_t fs_blocks; - int nmaps; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - const char *cur_chunk; - int byte_cnt; - int n; - xfs_buf_t *bp; - prid_t prid; - struct xfs_dquot *udqp, *gdqp; - uint resblks; - - *ipp = NULL; - error = 0; - ip = NULL; - tp = NULL; - - trace_xfs_symlink(dp, link_name); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* - * Check component lengths of the target path name. - */ - pathlen = strlen(target_path); - if (pathlen >= MAXPATHLEN) /* total string too long */ - return XFS_ERROR(ENAMETOOLONG); - - udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = xfs_get_projid(dp); - else - prid = XFS_PROJID_DEFAULT; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* - * The symlink will fit into the inode data fork? - * There can't be any attributes so we get the whole variable part. - */ - if (pathlen <= XFS_LITINO(mp)) - fs_blocks = 0; - else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); - resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - if (error == ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - goto error_return; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = true; - - /* - * Check whether the directory allows new symlinks or not. - */ - if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { - error = XFS_ERROR(EPERM); - goto error_return; - } - - /* - * Reserve disk quota : blocks and inode. - */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); - if (error) - goto error_return; - - /* - * Check for ability to enter directory entry, if no space reserved. - */ - error = xfs_dir_canenter(tp, dp, link_name, resblks); - if (error) - goto error_return; - /* - * Initialize the bmap freelist prior to calling either - * bmapi or the directory create code. - */ - xfs_bmap_init(&free_list, &first_block); - - /* - * Allocate an inode for the symlink. 
- */ - error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); - if (error) { - if (error == ENOSPC) - goto error_return; - goto error1; - } - - /* - * An error after we've joined dp to the transaction will result in the - * transaction cancel unlocking dp so don't do it explicitly in the - * error path. - */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = false; - - /* - * Also attach the dquot(s) to it, if applicable. - */ - xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); - - if (resblks) - resblks -= XFS_IALLOC_SPACE_RES(mp); - /* - * If the symlink will fit into the inode, write it inline. - */ - if (pathlen <= XFS_IFORK_DSIZE(ip)) { - xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); - memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); - ip->i_d.di_size = pathlen; - - /* - * The inode was initially created in extent format. - */ - ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ip->i_df.if_flags |= XFS_IFINLINE; - - ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; - xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - - } else { - first_fsb = 0; - nmaps = SYMLINK_MAPS; - - error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_METADATA, &first_block, resblks, - mval, &nmaps, &free_list); - if (error) - goto error2; - - if (resblks) - resblks -= fs_blocks; - ip->i_d.di_size = pathlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - cur_chunk = target_path; - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - if (!bp) { - error = ENOMEM; - goto error2; - } - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } - pathlen -= byte_cnt; - - memcpy(bp->b_addr, cur_chunk, byte_cnt); - cur_chunk += byte_cnt; - - xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); - } - } - - /* - * Create the directory entry for the symlink. - */ - error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, - &first_block, &free_list, resblks); - if (error) - goto error2; - xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - /* - * If this is a synchronous mount, make sure that the - * symlink transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error2; - } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - *ipp = ip; - return 0; - - error2: - IRELE(ip); - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); - xfs_qm_dqrele(udqp); - xfs_qm_dqrele(gdqp); - - if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); - std_return: - return error; -} - -int xfs_set_dmattrs( xfs_inode_t *ip, u_int evmask,
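As a closing cross-check, the on-disk header introduced in xfs_symlink.h above can be examined in isolation. This is a userspace mirror of struct xfs_dsymlink_hdr with uuid_t modelled as 16 opaque bytes; the expected offsets are an inference from the declared field order, not something the patch states:

/* Layout check for the on-disk symlink header: a userspace mirror
 * using fixed-width types. Expect sl_crc at offset 12 and a total
 * size of 56 bytes, matching the assumptions used earlier. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct dsymlink_hdr {
	uint32_t sl_magic;	/* XFS_SYMLINK_MAGIC, "XSLM" */
	uint32_t sl_offset;	/* byte offset of this chunk in the target */
	uint32_t sl_bytes;	/* payload bytes in this block */
	uint32_t sl_crc;	/* CRC of the block, field zeroed while summing */
	uint8_t  sl_uuid[16];	/* filesystem UUID */
	uint64_t sl_owner;	/* inode that owns the block */
	uint64_t sl_blkno;	/* daddr the block is supposed to live at */
	uint64_t sl_lsn;	/* LSN of the last write, set by the verifier */
};

int main(void)
{
	printf("sl_crc at offset %zu, header size %zu\n",
	       offsetof(struct dsymlink_hdr, sl_crc),
	       sizeof(struct dsymlink_hdr));	/* expect 12 and 56 */
	return 0;
}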