diff options
32 files changed, 366 insertions, 200 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 680fb566b928..8362860e21a7 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -144,8 +144,8 @@ prototypes: void (*kill_sb) (struct super_block *); locking rules: may block BKL -get_sb yes yes -kill_sb yes yes +get_sb yes no +kill_sb yes no ->get_sb() returns error or 0 with locked superblock attached to the vfsmount (exclusive on ->s_umount). @@ -409,12 +409,12 @@ ioctl: yes (see below) unlocked_ioctl: no (see below) compat_ioctl: no mmap: no -open: maybe (see below) +open: no flush: no release: no fsync: no (see below) aio_fsync: no -fasync: yes (see below) +fasync: no lock: yes readv: no writev: no @@ -431,13 +431,6 @@ For many filesystems, it is probably safe to acquire the inode semaphore. Note some filesystems (i.e. remote ones) provide no protection for i_size so you will need to use the BKL. -->open() locking is in-transit: big lock partially moved into the methods. -The only exception is ->open() in the instances of file_operations that never -end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices -(chrdev_open() takes lock before replacing ->f_op and calling the secondary -method. As soon as we fix the handling of module reference counters all -instances of ->open() will be called without the BKL. - Note: ext2_release() was *the* source of contention on fs-intensive loads and dropping BKL on ->release() helps to get rid of that (we still grab BKL for cases when we close a file that had been opened r/w, but that diff --git a/MAINTAINERS b/MAINTAINERS index b3e92fbe336c..186be3ba5069 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -750,11 +750,13 @@ P: Ville Syrjala M: syrjala@sci.fi S: Maintained -ATL1 ETHERNET DRIVER +ATLX ETHERNET DRIVERS P: Jay Cliburn M: jcliburn@gmail.com P: Chris Snook M: csnook@redhat.com +P: Jie Yang +M: jie.yang@atheros.com L: atl1-devel@lists.sourceforge.net W: http://sourceforge.net/projects/atl1 W: http://atl1.sourceforge.net diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h index 7286e4a9fe84..a7acad2bc2f0 100644 --- a/arch/ia64/include/asm/sections.h +++ b/arch/ia64/include/asm/sections.h @@ -21,5 +21,8 @@ extern char __start_gate_brl_fsys_bubble_down_patchlist[], __end_gate_brl_fsys_b extern char __start_unwind[], __end_unwind[]; extern char __start_ivt_text[], __end_ivt_text[]; +#undef dereference_function_descriptor +void *dereference_function_descriptor(void *); + #endif /* _ASM_IA64_SECTIONS_H */ diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 29aad349e0c4..545626f66a4c 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -31,9 +31,11 @@ #include <linux/elf.h> #include <linux/moduleloader.h> #include <linux/string.h> +#include <linux/uaccess.h> #include <linux/vmalloc.h> #include <asm/patch.h> +#include <asm/sections.h> #include <asm/unaligned.h> #define ARCH_MODULE_DEBUG 0 @@ -941,3 +943,13 @@ module_arch_cleanup (struct module *mod) if (mod->arch.core_unw_table) unw_remove_unwind_table(mod->arch.core_unw_table); } + +void *dereference_function_descriptor(void *ptr) +{ + struct fdesc *desc = ptr; + void *p; + + if (!probe_kernel_address(&desc->ip, p)) + ptr = p; + return ptr; +} diff --git a/arch/mips/sgi-ip22/ip22-platform.c b/arch/mips/sgi-ip22/ip22-platform.c index 60141235ec40..52486c4d2b01 100644 --- a/arch/mips/sgi-ip22/ip22-platform.c +++ b/arch/mips/sgi-ip22/ip22-platform.c @@ -150,7 +150,7 @@ static int __init sgiseeq_devinit(void) return res; /* Second HPC is missing? */ - if (!ip22_is_fullhouse() || + if (ip22_is_fullhouse() || get_dbe(tmp, (unsigned int *)&hpc3c1->pbdma[1])) return 0; diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index fdacdd4341c9..44138c3e6ea7 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -47,7 +47,9 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/bug.h> +#include <linux/uaccess.h> +#include <asm/sections.h> #include <asm/unwind.h> #if 0 @@ -860,3 +862,15 @@ void module_arch_cleanup(struct module *mod) deregister_unwind_table(mod); module_bug_cleanup(mod); } + +#ifdef CONFIG_64BIT +void *dereference_function_descriptor(void *ptr) +{ + Elf64_Fdesc *desc = ptr; + void *p; + + if (!probe_kernel_address(&desc->addr, p)) + ptr = p; + return ptr; +} +#endif diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 916018e425c4..7710e9e6660f 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -16,6 +16,9 @@ static inline int in_kernel_text(unsigned long addr) return 0; } +#undef dereference_function_descriptor +void *dereference_function_descriptor(void *); + #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index ee6a2982d567..ad79de272ff3 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -21,8 +21,9 @@ #include <linux/err.h> #include <linux/vmalloc.h> #include <linux/bug.h> +#include <linux/uaccess.h> #include <asm/module.h> -#include <asm/uaccess.h> +#include <asm/sections.h> #include <asm/firmware.h> #include <asm/code-patching.h> #include <linux/sort.h> @@ -451,3 +452,13 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, return 0; } + +void *dereference_function_descriptor(void *ptr) +{ + struct ppc64_opd_entry *desc = ptr; + void *p; + + if (!probe_kernel_address(&desc->funcaddr, p)) + ptr = p; + return ptr; +} diff --git a/arch/s390/kernel/compat_ptrace.h b/arch/s390/kernel/compat_ptrace.h index cde81fa64f89..a2be3a978d5c 100644 --- a/arch/s390/kernel/compat_ptrace.h +++ b/arch/s390/kernel/compat_ptrace.h @@ -42,6 +42,7 @@ struct user_regs_struct32 u32 gprs[NUM_GPRS]; u32 acrs[NUM_ACRS]; u32 orig_gpr2; + /* nb: there's a 4-byte hole here */ s390_fp_regs fp_regs; /* * These per registers are in here so that gdb can modify them diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 2815bfe348a6..c8b08289eb87 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -170,6 +170,13 @@ static unsigned long __peek_user(struct task_struct *child, addr_t addr) */ tmp = (addr_t) task_pt_regs(child)->orig_gpr2; + } else if (addr < (addr_t) &dummy->regs.fp_regs) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure @@ -270,6 +277,13 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) */ task_pt_regs(child)->orig_gpr2 = data; + } else if (addr < (addr_t) &dummy->regs.fp_regs) { + /* + * prevent writes of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure @@ -428,6 +442,13 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr) */ tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); + } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure @@ -514,6 +535,13 @@ static int __poke_user_compat(struct task_struct *child, */ *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; + } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + /* + * prevent writess of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index 9b6689d9d570..23963882bc18 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -792,6 +792,8 @@ void fixup_irqs(void) } spin_unlock_irqrestore(&irq_desc[irq].lock, flags); } + + tick_ops->disable_irq(); } #endif diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2c518fbc52ec..b225219c448c 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -382,14 +382,17 @@ config X86_OOSTORE # P6_NOPs are a relatively minor optimization that require a family >= # 6 processor, except that it is broken on certain VIA chips. # Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). As a result, disallow these if we're -# compiling X86_GENERIC but not X86_64 (these NOPs do work on all -# x86-64 capable chips); the list of processors in the right-hand clause -# are the cores that benefit from this optimization. +# (which work on all CPUs). In addition, it looks like Virtual PC +# does not understand them. +# +# As a result, disallow these if we're not compiling for X86_64 (these +# NOPs do work on all x86-64 capable chips); the list of processors in +# the right-hand clause are the cores that benefit from this optimization. # config X86_P6_NOP def_bool y - depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC) + depends on X86_64 + depends on (MCORE2 || MPENTIUM4 || MPSC) config X86_TSC def_bool y diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9af89078f7bb..66e48aa2dd1b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1203,7 +1203,7 @@ static int __init parse_memmap_opt(char *p) if (!p) return -EINVAL; - if (!strcmp(p, "exactmap")) { + if (!strncmp(p, "exactmap", 8)) { #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index db00b0591733..f1216cf6fa8f 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -423,7 +423,7 @@ int chp_new(struct chp_id chpid) ret = sysfs_create_group(&chp->dev.kobj, &chp_attr_group); if (ret) { device_unregister(&chp->dev); - goto out_free; + goto out; } mutex_lock(&channel_subsystems[chpid.cssid]->mutex); if (channel_subsystems[chpid.cssid]->cm_enabled) { @@ -432,14 +432,15 @@ int chp_new(struct chp_id chpid) sysfs_remove_group(&chp->dev.kobj, &chp_attr_group); device_unregister(&chp->dev); mutex_unlock(&channel_subsystems[chpid.cssid]->mutex); - goto out_free; + goto out; } } channel_subsystems[chpid.cssid]->chps[chpid.id] = chp; mutex_unlock(&channel_subsystems[chpid.cssid]->mutex); - return ret; + goto out; out_free: kfree(chp); +out: return ret; } diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 33bff8fec7d1..5954b905e3ca 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -208,8 +208,10 @@ cio_start_key (struct subchannel *sch, /* subchannel structure */ case 1: /* status pending */ case 2: /* busy */ return -EBUSY; - default: /* device/path not operational */ + case 3: /* device/path not operational */ return cio_start_handle_notoper(sch, lpm); + default: + return ccode; } } diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 51489eff6b0b..1261e1a9e8cd 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -633,6 +633,11 @@ channel_subsystem_release(struct device *dev) css = to_css(dev); mutex_destroy(&css->mutex); + if (css->pseudo_subchannel) { + /* Implies that it has been generated but never registered. */ + css_subchannel_release(&css->pseudo_subchannel->dev); + css->pseudo_subchannel = NULL; + } kfree(css); } @@ -785,11 +790,15 @@ init_channel_subsystem (void) } channel_subsystems[i] = css; ret = setup_css(i); - if (ret) - goto out_free; + if (ret) { + kfree(channel_subsystems[i]); + goto out_unregister; + } ret = device_register(&css->device); - if (ret) - goto out_free_all; + if (ret) { + put_device(&css->device); + goto out_unregister; + } if (css_chsc_characteristics.secm) { ret = device_create_file(&css->device, &dev_attr_cm_enable); @@ -802,7 +811,7 @@ init_channel_subsystem (void) } ret = register_reboot_notifier(&css_reboot_notifier); if (ret) - goto out_pseudo; + goto out_unregister; css_init_done = 1; /* Enable default isc for I/O subchannels. */ @@ -810,18 +819,12 @@ init_channel_subsystem (void) for_each_subchannel(__init_channel_subsystem, NULL); return 0; -out_pseudo: - device_unregister(&channel_subsystems[i]->pseudo_subchannel->dev); out_file: - device_remove_file(&channel_subsystems[i]->device, - &dev_attr_cm_enable); + if (css_chsc_characteristics.secm) + device_remove_file(&channel_subsystems[i]->device, + &dev_attr_cm_enable); out_device: device_unregister(&channel_subsystems[i]->device); -out_free_all: - kfree(channel_subsystems[i]->pseudo_subchannel->lock); - kfree(channel_subsystems[i]->pseudo_subchannel); -out_free: - kfree(channel_subsystems[i]); out_unregister: while (i > 0) { struct channel_subsystem *css; @@ -829,6 +832,7 @@ out_unregister: i--; css = channel_subsystems[i]; device_unregister(&css->pseudo_subchannel->dev); + css->pseudo_subchannel = NULL; if (css_chsc_characteristics.secm) device_remove_file(&css->device, &dev_attr_cm_enable); diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 550508df952b..84cc9ea346db 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -658,6 +658,13 @@ ccw_device_offline(struct ccw_device *cdev) { struct subchannel *sch; + /* Allow ccw_device_offline while disconnected. */ + if (cdev->private->state == DEV_STATE_DISCONNECTED || + cdev->private->state == DEV_STATE_NOT_OPER) { + cdev->private->flags.donotify = 0; + ccw_device_done(cdev, DEV_STATE_NOT_OPER); + return 0; + } if (ccw_device_is_orphan(cdev)) { ccw_device_done(cdev, DEV_STATE_OFFLINE); return 0; diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index d4427cb86979..2e15da5459cf 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -60,7 +60,7 @@ #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) -#define BALLOON_CLASS_NAME "memory" +#define BALLOON_CLASS_NAME "xen_memory" struct balloon_stats { /* We aim for 'current allocation' == 'target allocation'. */ diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 154098157473..73db464cd08b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) int subtract_lebs; long long available; - /* - * Force the amount available to the total size reported if the used - * space is zero. - */ - if (c->lst.total_used <= UBIFS_INO_NODE_SZ && - c->budg_data_growth + c->budg_dd_growth == 0) { - /* Do the same calculation as for c->block_cnt */ - available = c->main_lebs - 2; - available *= c->leb_size - c->dark_wm; - return available; - } - available = c->main_bytes - c->lst.total_used; /* @@ -714,34 +702,106 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, } /** - * ubifs_budg_get_free_space - return amount of free space. + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexind nodes as well. This introduces additional + * overhead, and UBIFS it has to report sligtly less free space to meet the + * above expectetion. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) +{ + int divisor, factor, f; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is twice + * as less than maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reseves thrice as more space + * for the index. + */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / (f - 1); + free *= factor; + do_div(free, divisor); + return free; +} + +/** + * ubifs_get_free_space - return amount of free space. * @c: UBIFS file-system description object * - * This function returns amount of free space on the file-system. + * This function calculates amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alighment, wastage at the end of eraseblocks, etc), it cannot report real + * amount of free flash space it has (well, because not all dirty space is + * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, + * it would bread user expectetion about what free space is. Users seem to + * accustomed to assume that if the file-system reports N bytes of free space, + * they would be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. */ -long long ubifs_budg_get_free_space(struct ubifs_info *c) +long long ubifs_get_free_space(struct ubifs_info *c) { - int min_idx_lebs, rsvd_idx_lebs; + int min_idx_lebs, rsvd_idx_lebs, lebs; long long available, outstanding, free; - /* Do exactly the same calculations as in 'do_budget_space()' */ spin_lock(&c->space_lock); min_idx_lebs = ubifs_calc_min_idx_lebs(c); + outstanding = c->budg_data_growth + c->budg_dd_growth; - if (min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; - else - rsvd_idx_lebs = 0; - - if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - - c->lst.taken_empty_lebs) { + /* + * Force the amount available to the total size reported if the used + * space is zero. + */ + if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) { spin_unlock(&c->space_lock); - return 0; + return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT; } available = ubifs_calc_available(c, min_idx_lebs); - outstanding = c->budg_data_growth + c->budg_dd_growth; - c->min_idx_lebs = min_idx_lebs; + + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. + */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); spin_unlock(&c->space_lock); if (available > outstanding) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 5c96f1fb7016..2b267c9a1806 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -587,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) if (err) { if (err != -ENOSPC) return err; - err = 0; budgeted = 0; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 4071d1cae29f..3d698e2022b1 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, int err; struct ubifs_budget_req req; loff_t old_size = inode->i_size, new_size = attr->ia_size; - int offset = new_size & (UBIFS_BLOCK_SIZE - 1); + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; struct ubifs_inode *ui = ubifs_inode(inode); dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); @@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, /* A funny way to budget for truncation node */ req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; err = ubifs_budget_space(c, &req); - if (err) - return err; + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'. + */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } err = vmtruncate(inode, new_size); if (err) @@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, err = ubifs_jnl_truncate(c, inode, old_size, new_size); mutex_unlock(&ui->ui_mutex); out_budg: - ubifs_release_budget(c, &req); + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } return err; } diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index adee7b5ddeab..e045c8b55423 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * dirty index heap, and it falls-back to LPT scanning if the heaps are empty * or do not have an LEB which satisfies the @min_space criteria. * - * Note: - * o LEBs which have less than dead watermark of dirty space are never picked - * by this function; - * - * Returns zero and the LEB properties of - * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a - * negative error code in case of other failures. The returned LEB is marked as - * "taken". + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. * * The additional @pick_free argument controls if this function has to return a * free or freeable LEB if one is present. For example, GC must to set it to %1, @@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * * In addition @pick_free is set to %2 by the recovery process in order to * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". */ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int min_space, int pick_free) @@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int lebs, rsvd_idx_lebs = 0; spin_lock(&c->space_lock); - lebs = c->lst.empty_lebs; + lebs = c->lst.empty_lebs + c->idx_gc_cnt; lebs += c->freeable_cnt - c->lst.taken_empty_lebs; /* @@ -317,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, lp = idx_lp; if (lp) { - ubifs_assert(lp->dirty >= c->dead_wm); + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); goto found; } diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index d0f3dac29081..13f1019c859f 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -344,6 +344,12 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) if (err) goto out; + /* Allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + if (c->gc_lnum == -1) { c->gc_lnum = lnum; err = LEB_RETAINED; diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 87dabf9fe742..4c12a9215d7f 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -284,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c, } /** - * ubifs_reported_space - calculate reported free space. - * @c: the UBIFS file-system description object - * @free: amount of free space - * - * This function calculates amount of free space which will be reported to - * user-space. User-space application tend to expect that if the file-system - * (e.g., via the 'statfs()' call) reports that it has N bytes available, they - * are able to write a file of size N. UBIFS attaches node headers to each data - * node and it has to write indexind nodes as well. This introduces additional - * overhead, and UBIFS it has to report sligtly less free space to meet the - * above expectetion. - * - * This function assumes free space is made up of uncompressed data nodes and - * full index nodes (one per data node, doubled because we always allow enough - * space to write the index twice). - * - * Note, the calculation is pessimistic, which means that most of the time - * UBIFS reports less space than it actually has. - */ -static inline long long ubifs_reported_space(const struct ubifs_info *c, - uint64_t free) -{ - int divisor, factor; - - divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3); - factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; - do_div(free, divisor); - - return free * factor; -} - -/** * ubifs_current_time - round current time to time granularity. * @inode: inode */ @@ -325,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode) current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; } +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. + */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) +{ + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} + #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f71e6b8822c4..7562464ac83f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -370,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ubifs_info *c = dentry->d_sb->s_fs_info; unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; - free = ubifs_budg_get_free_space(c); + free = ubifs_get_free_space(c); dbg_gen("free space %lld bytes (%lld blocks)", free, free >> UBIFS_BLOCK_SHIFT); @@ -386,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = UBIFS_MAX_NLEN; - + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); return 0; } @@ -530,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c) c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space. + */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; return 0; } @@ -647,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c) * internally because it does not make much sense for UBIFS, but it is * necessary to report something for the 'statfs()' call. * - * Subtract the LEB reserved for GC and the LEB which is reserved for - * deletions. - * - * Review 'ubifs_calc_available()' if changing this calculation. + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, and assume only one journal head is available. */ - tmp64 = c->main_lebs - 2; - tmp64 *= (uint64_t)c->leb_size - c->dark_wm; + tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; + tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; tmp64 = ubifs_reported_space(c, tmp64); c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index e909f4a96443..7da209ab9378 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, if (keys_cmp(c, key, &node_key) != 0) ret = 0; } - if (ret == 0) + if (ret == 0 && c->replaying) dbg_mnt("dangling branch LEB %d:%d len %d, key %s", zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); return ret; @@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, } /** - * ubifs_tnc_lookup - look up a file-system node. + * maybe_leb_gced - determine if a LEB may have been garbage collected. * @c: UBIFS file-system description object - * @key: node key to lookup - * @node: the node is returned here + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number * - * This function look up and reads node with key @key. The caller has to make - * sure the @node buffer is large enough to fit the node. Returns zero in case - * of success, %-ENOENT if the node was not found, and a negative error code in - * case of failure. + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. */ -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node) +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) { - int found, n, err; - struct ubifs_znode *znode; - struct ubifs_zbranch zbr, *zt; + int gc_seq2, gced_lnum; - mutex_lock(&c->tnc_mutex); - found = ubifs_lookup_level0(c, key, &znode, &n); - if (!found) { - err = -ENOENT; - goto out; - } else if (found < 0) { - err = found; - goto out; - } - zt = &znode->zbranch[n]; - if (is_hash_key(c, key)) { - /* - * In this case the leaf node cache gets used, so we pass the - * address of the zbranch and keep the mutex locked - */ - err = tnc_read_node_nm(c, zt, node); - goto out; - } - zbr = znode->zbranch[n]; - mutex_unlock(&c->tnc_mutex); - - err = ubifs_tnc_read_node(c, &zbr, node); - return err; - -out: - mutex_unlock(&c->tnc_mutex); - return err; + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same seq means no GC */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check lnum */ + if (gced_lnum == lnum) + return 1; + return 0; } /** @@ -1436,16 +1425,19 @@ out: * @lnum: LEB number is returned here * @offs: offset is returned here * - * This function is the same as 'ubifs_tnc_lookup()' but it returns the node - * location also. See 'ubifs_tnc_lookup()'. + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs. */ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, void *node, int *lnum, int *offs) { - int found, n, err; + int found, n, err, safely = 0, gc_seq1; struct ubifs_znode *znode; struct ubifs_zbranch zbr, *zt; +again: mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, goto out; } zt = &znode->zbranch[n]; + if (lnum) { + *lnum = zt->lnum; + *offs = zt->offs; + } if (is_hash_key(c, key)) { /* * In this case the leaf node cache gets used, so we pass the * address of the zbranch and keep the mutex locked */ - *lnum = zt->lnum; - *offs = zt->offs; err = tnc_read_node_nm(c, zt, node); goto out; } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* Drop the TNC mutex prematurely and race with garbage collection */ zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; mutex_unlock(&c->tnc_mutex); - *lnum = zbr.lnum; - *offs = zbr.offs; + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } - err = ubifs_tnc_read_node(c, &zbr, node); - return err; + err = fallible_read_node(c, key, &zbr, node); + if (maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. + */ + safely = 1; + goto again; + } + return 0; out: mutex_unlock(&c->tnc_mutex); @@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, { int found, n, err; struct ubifs_znode *znode; - struct ubifs_zbranch zbr; dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); mutex_lock(&c->tnc_mutex); @@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, goto out_unlock; } - zbr = znode->zbranch[n]; - mutex_unlock(&c->tnc_mutex); - - err = tnc_read_node_nm(c, &zbr, node); - return err; + err = tnc_read_node_nm(c, &znode->zbranch[n], node); out_unlock: mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index bd2121f3426e..a9ecbd9af20d 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -87,7 +87,7 @@ #define UBIFS_SK_LEN 8 /* Minimum index tree fanout */ -#define UBIFS_MIN_FANOUT 2 +#define UBIFS_MIN_FANOUT 3 /* Maximum number of levels in UBIFS indexing B-tree */ #define UBIFS_MAX_LEVELS 512 diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d7f706f7a302..17c620b93eec 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -995,6 +995,9 @@ struct ubifs_mount_opts { * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary * @max_inode_sz: maximum possible inode size in bytes * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting * @dead_wm: LEB dead space watermark * @dark_wm: LEB dark space watermark * @block_cnt: count of 4KiB blocks on the FS @@ -1028,6 +1031,8 @@ struct ubifs_mount_opts { * @sbuf: a buffer of LEB size used by GC and replay for scanning * @idx_gc: list of index LEBs that have been garbage collected * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected * * @infos_list: links all 'ubifs_info' objects * @umount_mutex: serializes shrinker and un-mount @@ -1224,6 +1229,8 @@ struct ubifs_info { int max_idx_node_sz; long long max_inode_sz; int max_znode_sz; + + int leb_overhead; int dead_wm; int dark_wm; int block_cnt; @@ -1257,6 +1264,8 @@ struct ubifs_info { void *sbuf; struct list_head idx_gc; int idx_gc_cnt; + volatile int gc_seq; + volatile int gced_lnum; struct list_head infos_list; struct mutex umount_mutex; @@ -1434,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); -long long ubifs_budg_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space(struct ubifs_info *c); int ubifs_calc_min_idx_lebs(struct ubifs_info *c); void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ @@ -1451,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); /* tnc.c */ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n); -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node); int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, void *node, const struct qstr *nm); int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 8feeae1f2369..79a7ff925bf8 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -14,4 +14,10 @@ extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __initdata_begin[], __initdata_end[]; extern char __start_rodata[], __end_rodata[]; +/* function descriptor handling (if any). Override + * in asm/sections.h */ +#ifndef dereference_function_descriptor +#define dereference_function_descriptor(p) (p) +#endif + #endif /* _ASM_GENERIC_SECTIONS_H_ */ diff --git a/include/asm-parisc/sections.h b/include/asm-parisc/sections.h index fdd43ec42ec5..9d13c3507ad6 100644 --- a/include/asm-parisc/sections.h +++ b/include/asm-parisc/sections.h @@ -4,4 +4,9 @@ /* nothing to see, move along */ #include <asm-generic/sections.h> +#ifdef CONFIG_64BIT +#undef dereference_function_descriptor +void *dereference_function_descriptor(void *); +#endif + #endif diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2e35501e61dd..2e8de678e767 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -43,19 +43,17 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, * and emit a warning. */ if (++i > 2) { - printk(KERN_WARNING "CE: __tick_program_event of %s is " - "stuck %llx %llx\n", dev->name ? dev->name : "?", - now.tv64, expires.tv64); - printk(KERN_WARNING - "CE: increasing min_delta_ns %ld to %ld nsec\n", - dev->min_delta_ns, dev->min_delta_ns << 1); - WARN_ON(1); - - /* Double the min. delta and try again */ + /* Increase the min. delta and try again */ if (!dev->min_delta_ns) dev->min_delta_ns = 5000; else - dev->min_delta_ns <<= 1; + dev->min_delta_ns += dev->min_delta_ns >> 1; + + printk(KERN_WARNING + "CE: %s increasing min_delta_ns to %lu nsec\n", + dev->name ? dev->name : "?", + dev->min_delta_ns << 1); + i = 0; } diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d8d1d1142248..c399bc1093cb 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -27,6 +27,7 @@ #include <asm/page.h> /* for PAGE_SIZE */ #include <asm/div64.h> +#include <asm/sections.h> /* for dereference_function_descriptor() */ /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) @@ -513,16 +514,6 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio return buf; } -static inline void *dereference_function_descriptor(void *ptr) -{ -#if defined(CONFIG_IA64) || defined(CONFIG_PPC64) - void *p; - if (!probe_kernel_address(ptr, p)) - ptr = p; -#endif - return ptr; -} - static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int precision, int flags) { unsigned long value = (unsigned long) ptr; |