Diffstat (limited to 'mm')
-rw-r--r--  mm/bounce.c         |   4
-rw-r--r--  mm/fadvise.c        |   2
-rw-r--r--  mm/filemap.c        |  11
-rw-r--r--  mm/filemap_xip.c    |   8
-rw-r--r--  mm/fremap.c         |   2
-rw-r--r--  mm/hugetlb.c        |   8
-rw-r--r--  mm/memory.c         |  59
-rw-r--r--  mm/memory_hotplug.c |   6
-rw-r--r--  mm/mempolicy.c      |   6
-rw-r--r--  mm/mincore.c        | 183
-rw-r--r--  mm/mmap.c           |  17
-rw-r--r--  mm/mremap.c         |   1
-rw-r--r--  mm/nommu.c          |  12
-rw-r--r--  mm/oom_kill.c       |  21
-rw-r--r--  mm/page-writeback.c | 147
-rw-r--r--  mm/page_alloc.c     | 130
-rw-r--r--  mm/readahead.c      |   4
-rw-r--r--  mm/rmap.c           |  36
-rw-r--r--  mm/shmem.c          |  27
-rw-r--r--  mm/slab.c           | 119
-rw-r--r--  mm/slob.c           |  27
-rw-r--r--  mm/swapfile.c       |  12
-rw-r--r--  mm/tiny-shmem.c     |   4
-rw-r--r--  mm/truncate.c       |  49
-rw-r--r--  mm/vmscan.c         |  47
25 files changed, 607 insertions, 335 deletions
diff --git a/mm/bounce.c b/mm/bounce.c
index e4b62d2a4024..643efbe82402 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -237,6 +237,8 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
 	if (!bio)
 		return;
 
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
 	 * pages
@@ -291,8 +293,6 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
 		pool = isa_page_pool;
 	}
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
-
 	/*
 	 * slow path
 	 */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 168c78a121bb..0df4c899e979 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 	if (!file)
 		return -EBADF;
 
-	if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
 		ret = -ESPIPE;
 		goto out;
 	}
diff --git a/mm/filemap.c b/mm/filemap.c
index af7e2f5caea9..8332c77b1bd1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1181,8 +1181,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (pos < size) {
 			retval = generic_file_direct_IO(READ, iocb,
 						iov, pos, nr_segs);
-			if (retval > 0 && !is_sync_kiocb(iocb))
-				retval = -EIOCBQUEUED;
 			if (retval > 0)
 				*ppos = pos + retval;
 		}
@@ -2047,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * Sync the fs metadata but not the minor inode changes and
 	 * of course not the data as we did direct DMA for the IO.
 	 * i_mutex is held, which protects generic_osync_inode() from
-	 * livelocking.
+	 * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
 	 */
-	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if ((written >= 0 || written == -EIOCBQUEUED) &&
+	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		if (err < 0)
 			written = err;
 	}
-	if (written == count && !is_sync_kiocb(iocb))
-		written = -EIOCBQUEUED;
 	return written;
 }
 EXPORT_SYMBOL(generic_file_direct_write);
@@ -2269,7 +2266,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (count == 0)
 		goto out;
 
-	err = remove_suid(file->f_dentry);
+	err = remove_suid(file->f_path.dentry);
 	if (err)
 		goto out;
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b4fd0d7c9bfb..9dd9fbb75139 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,13 +183,13 @@ __xip_unmap (struct address_space * mapping,
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		page = ZERO_PAGE(address);
+		page = ZERO_PAGE(0);
 		pte = page_check_address(page, mm, address, &ptl);
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush(vma, address, pte);
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
@@ -246,7 +246,7 @@ xip_file_nopage(struct vm_area_struct * area,
 		__xip_unmap(mapping, pgoff);
 	} else {
 		/* not shared and writable, use ZERO_PAGE() */
-		page = ZERO_PAGE(address);
+		page = ZERO_PAGE(0);
 	}
 
 out:
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	if (count == 0)
 		goto out_backing;
 
-	ret = remove_suid(filp->f_dentry);
+	ret = remove_suid(filp->f_path.dentry);
 	if (ret)
 		goto out_backing;
 
diff --git a/mm/fremap.c b/mm/fremap.c
index b77a002c3352..4e3f53dd5fd4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			page_cache_release(page);
 		}
 	} else {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0ccc7f230252..cb362f761f17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -44,14 +44,14 @@ static void clear_huge_page(struct page *page, unsigned long addr)
 }
 
 static void copy_huge_page(struct page *dst, struct page *src,
-			   unsigned long addr)
+			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 
 	might_sleep();
 	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
 		cond_resched();
-		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
 	}
 }
 
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 
 	for (z = zonelist->zones; *z; z++) {
 		nid = zone_to_nid(*z);
-		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+		if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
 	}
@@ -442,7 +442,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	copy_huge_page(new_page, old_page, address);
+	copy_huge_page(new_page, old_page, address, vma);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
diff --git a/mm/memory.c b/mm/memory.c
index 4198df0dff1c..ef09f0acb1d8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 					mark_page_accessed(page);
 				file_rss--;
 			}
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -1091,7 +1091,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 			if (pages) {
 				pages[i] = page;
-				flush_anon_page(page, start);
+				flush_anon_page(vma, page, start);
 				flush_dcache_page(page);
 			}
 			if (vmas)
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
 	pte_t *pte;
 	spinlock_t *ptl;
+	int err = 0;
 
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
-		return -ENOMEM;
+		return -EAGAIN;
 	arch_enter_lazy_mmu_mode();
 	do {
 		struct page *page = ZERO_PAGE(addr);
 		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+		if (unlikely(!pte_none(*pte))) {
+			err = -EEXIST;
+			pte++;
+			break;
+		}
 		page_cache_get(page);
 		page_add_file_rmap(page);
 		inc_mm_counter(mm, file_rss);
-		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
-	return 0;
+	return err;
 }
 
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int err;
 
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		return -ENOMEM;
+		return -EAGAIN;
 	do {
 		next = pmd_addr_end(addr, end);
-		if (zeromap_pte_range(mm, pmd, addr, next, prot))
-			return -ENOMEM;
+		err = zeromap_pte_range(mm, pmd, addr, next, prot);
+		if (err)
+			break;
 	} while (pmd++, addr = next, addr != end);
-	return 0;
+	return err;
 }
 
 static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
 {
 	pud_t *pud;
 	unsigned long next;
+	int err;
 
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		return -ENOMEM;
+		return -EAGAIN;
 	do {
 		next = pud_addr_end(addr, end);
-		if (zeromap_pmd_range(mm, pud, addr, next, prot))
-			return -ENOMEM;
+		err = zeromap_pmd_range(mm, pud, addr, next, prot);
+		if (err)
+			break;
 	} while (pud++, addr = next, addr != end);
-	return 0;
+	return err;
 }
 
 int zeromap_page_range(struct vm_area_struct *vma,
@@ -1431,7 +1441,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	/*
 	 * If the source page was a PFN mapping, we don't have
@@ -1454,9 +1464,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		kunmap_atomic(kaddr, KM_USER0);
 		flush_dcache_page(dst);
 		return;
-
+
 	}
-	copy_user_highpage(dst, src, va);
+	copy_user_highpage(dst, src, va, vma);
 }
 
 /*
@@ -1567,7 +1577,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		cow_user_page(new_page, old_page, address);
+		cow_user_page(new_page, old_page, address, vma);
 	}
 
 	/*
@@ -1576,7 +1586,7 @@ gotten:
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
-			page_remove_rmap(old_page);
+			page_remove_rmap(old_page, vma);
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
@@ -2190,7 +2200,7 @@ retry:
 			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 			if (!page)
 				goto oom;
-			copy_user_highpage(page, new_page, address);
+			copy_user_highpage(page, new_page, address, vma);
 			page_cache_release(new_page);
 			new_page = page;
 			anon = 1;
@@ -2596,8 +2606,15 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_mm = NULL;
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
-	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = 0;
+	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+	gate_vma.vm_page_prot = __P101;
+	/*
+	 * Make sure the vDSO gets into every core dump.
+	 * Dumping its contents makes post-mortem fully interpretable later
+	 * without matching up the same kernel and hardware config to see
+	 * what PC values meant.
+	 */
+	gate_vma.vm_flags |= VM_ALWAYSDUMP;
 	return 0;
 }
 __initcall(gate_vma_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0c055a090f4d..84279127fcd3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,11 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	zone_type = zone - pgdat->node_zones;
 	if (!populated_zone(zone)) {
 		int ret = 0;
-		ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
+		ret = init_currently_empty_zone(zone, phys_start_pfn,
+						nr_pages, MEMMAP_HOTPLUG);
 		if (ret < 0)
 			return ret;
 	}
-	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+	memmap_init_zone(nr_pages, nid, zone_type,
+			 phys_start_pfn, MEMMAP_HOTPLUG);
 	return 0;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b917d6fdc1bb..c2aec0e1090d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -884,6 +884,10 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
+#ifdef CONFIG_CPUSETS
+	/* Restrict the nodes to the allowed nodes in the cpuset */
+	nodes_and(nodes, nodes, current->mems_allowed);
+#endif
 	return do_mbind(start, len, mode, &nodes, flags);
 }
 
@@ -1857,7 +1861,7 @@ int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_printf(m, " file=");
-		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_printf(m, " heap");
 	} else if (vma->vm_start <= mm->start_stack &&
diff --git a/mm/mincore.c b/mm/mincore.c
index 72890780c1c9..8aca6f7167bb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -1,7 +1,7 @@
 /*
  *	linux/mm/mincore.c
  *
- * Copyright (C) 1994-1999  Linus Torvalds
+ * Copyright (C) 1994-2006  Linus Torvalds
 */
 
 /*
@@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma,
 	return present;
 }
 
-static long mincore_vma(struct vm_area_struct * vma,
-	unsigned long start, unsigned long end, unsigned char __user * vec)
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ */
+static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
 {
-	long error, i, remaining;
-	unsigned char * tmp;
-
-	error = -ENOMEM;
-	if (!vma->vm_file)
-		return error;
-
-	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	unsigned long i, nr, pgoff;
+	struct vm_area_struct *vma = find_vma(current->mm, addr);
 
-	error = -EAGAIN;
-	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
-	if (!tmp)
-		return error;
+	/*
+	 * find_vma() didn't find anything above us, or we're
+	 * in an unmapped hole in the address space: ENOMEM.
+	 */
+	if (!vma || addr < vma->vm_start)
+		return -ENOMEM;
 
-	/* (end - start) is # of pages, and also # of bytes in "vec */
-	remaining = (end - start),
+	/*
+	 * Ok, got it. But check whether it's a segment we support
+	 * mincore() on. Right now, we don't do any anonymous mappings.
+	 *
+	 * FIXME: This is just stupid. And returning ENOMEM is
+	 * stupid too. We should just look at the page tables. But
+	 * this is what we've traditionally done, so we'll just
+	 * continue doing it.
+	 */
+	if (!vma->vm_file)
+		return -ENOMEM;
 
-	error = 0;
-	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
-		int j = 0;
-		long thispiece = (remaining < PAGE_SIZE) ?
-						remaining : PAGE_SIZE;
+	/*
+	 * Calculate how many pages there are left in the vma, and
+	 * what the pgoff is for our address.
+	 */
+	nr = (vma->vm_end - addr) >> PAGE_SHIFT;
+	if (nr > pages)
+		nr = pages;
 
-		while (j < thispiece)
-			tmp[j++] = mincore_page(vma, start++);
+	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+	pgoff += vma->vm_pgoff;
 
-		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
-			error = -EFAULT;
-			break;
-		}
-	}
+	/* And then we just fill the sucker in.. */
+	for (i = 0 ; i < nr; i++, pgoff++)
+		vec[i] = mincore_page(vma, pgoff);
 
-	free_page((unsigned long) tmp);
-	return error;
+	return nr;
 }
 
 /*
@@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma,
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 	unsigned char __user * vec)
 {
-	int index = 0;
-	unsigned long end, limit;
-	struct vm_area_struct * vma;
-	size_t max;
-	int unmapped_error = 0;
-	long error;
-
-	/* check the arguments */
-	if (start & ~PAGE_CACHE_MASK)
-		goto einval;
-
-	limit = TASK_SIZE;
-	if (start >= limit)
-		goto enomem;
-
-	if (!len)
-		return 0;
-
-	max = limit - start;
-	len = PAGE_CACHE_ALIGN(len);
-	if (len > max || !len)
-		goto enomem;
+	long retval;
+	unsigned long pages;
+	unsigned char *tmp;
 
-	end = start + len;
+	/* Check the start address: needs to be page-aligned.. */
+	if (start & ~PAGE_CACHE_MASK)
+		return -EINVAL;
 
-	/* check the output buffer whilst holding the lock */
-	error = -EFAULT;
-	down_read(&current->mm->mmap_sem);
+	/* ..and we need to be passed a valid user-space range */
+	if (!access_ok(VERIFY_READ, (void __user *) start, len))
+		return -ENOMEM;
 
-	if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
-		goto out;
+	/* This also avoids any overflows on PAGE_CACHE_ALIGN */
+	pages = len >> PAGE_SHIFT;
+	pages += (len & ~PAGE_MASK) != 0;
 
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 */
-	error = 0;
-
-	vma = find_vma(current->mm, start);
-	while (vma) {
-		/* Here start < vma->vm_end. */
-		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
-			start = vma->vm_start;
-		}
+	if (!access_ok(VERIFY_WRITE, vec, pages))
+		return -EFAULT;
 
-		/* Here vma->vm_start <= start < vma->vm_end. */
-		if (end <= vma->vm_end) {
-			if (start < end) {
-				error = mincore_vma(vma, start, end,
-							&vec[index]);
-				if (error)
-					goto out;
-			}
-			error = unmapped_error;
-			goto out;
+	tmp = (void *) __get_free_page(GFP_USER);
+	if (!tmp)
+		return -EAGAIN;
+
+	retval = 0;
+	while (pages) {
+		/*
+		 * Do at most PAGE_SIZE entries per iteration, due to
+		 * the temporary buffer size.
+		 */
+		down_read(&current->mm->mmap_sem);
+		retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+		up_read(&current->mm->mmap_sem);
+
+		if (retval <= 0)
+			break;
+		if (copy_to_user(vec, tmp, retval)) {
+			retval = -EFAULT;
+			break;
 		}
-
-		/* Here vma->vm_start <= start < vma->vm_end < end. */
-		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
-		if (error)
-			goto out;
-		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
-		start = vma->vm_end;
-		vma = vma->vm_next;
+		pages -= retval;
+		vec += retval;
+		start += retval << PAGE_SHIFT;
+		retval = 0;
 	}
-
-	/* we found a hole in the area queried if we arrive here */
-	error = -ENOMEM;
-
-out:
-	up_read(&current->mm->mmap_sem);
-	return error;
-
-einval:
-	return -EINVAL;
-enomem:
-	return -ENOMEM;
+	free_page((unsigned long) tmp);
+	return retval;
 }
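The rewritten sys_mincore() above keeps the old userspace contract: a page-aligned start address and one status byte per page, with anonymous mappings still reported as -ENOMEM. A minimal userspace sketch of that contract follows; it is not part of this commit, and the mapped file and page count are arbitrary placeholders:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 4 * page;
	unsigned char vec[4];	/* one byte of status per page queried */
	void *map;
	int i, fd;

	fd = open("/etc/hostname", O_RDONLY);	/* placeholder file */
	if (fd < 0)
		return 1;
	map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;
	/* start must be page-aligned, or the kernel returns -EINVAL;
	 * an anonymous mapping would yield -ENOMEM, as the FIXME notes. */
	if (mincore(map, len, vec) == 0)
		for (i = 0; i < 4; i++)
			printf("page %d: %s\n", i,
			       (vec[i] & 1) ? "resident" : "not resident");
	munmap(map, len);
	close(fd);
	return 0;
}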
diff --git a/mm/mmap.c b/mm/mmap.c
index 7be110e98d4c..cc3a20819457 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
 {
 	if (vma->vm_flags & VM_DENYWRITE)
-		atomic_inc(&file->f_dentry->d_inode->i_writecount);
+		atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
 	if (vma->vm_flags & VM_SHARED)
 		mapping->i_mmap_writable--;
 
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma)
 		struct address_space *mapping = file->f_mapping;
 
 		if (vma->vm_flags & VM_DENYWRITE)
-			atomic_dec(&file->f_dentry->d_inode->i_writecount);
+			atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
 		if (vma->vm_flags & VM_SHARED)
 			mapping->i_mmap_writable++;
 
@@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	 *  mounted, in which case we dont add PROT_EXEC.)
 	 */
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
 			prot |= PROT_EXEC;
 
 	if (!len)
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 			return -EAGAIN;
 	}
 
-	inode = file ? file->f_dentry->d_inode : NULL;
+	inode = file ? file->f_path.dentry->d_inode : NULL;
 
 	if (file) {
 		switch (flags & MAP_TYPE) {
@@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
-			if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
 				if (vm_flags & VM_EXEC)
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
@@ -1477,6 +1477,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct rlimit *rlim = current->signal->rlim;
+	unsigned long new_start;
 
 	/* address space limit tests */
 	if (!may_expand_vm(mm, grow))
@@ -1496,6 +1497,12 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 			return -ENOMEM;
 	}
 
+	/* Check to ensure the stack will not grow into a hugetlb-only region */
+	new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+			vma->vm_end - size;
+	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+		return -EFAULT;
+
 	/*
 	 * Overcommit..  This must be the final test, as it will
 	 * update security statistics.
diff --git a/mm/mremap.c b/mm/mremap.c
index 9c769fa29f32..5d4bd4f95b8e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -105,7 +105,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		if (pte_none(*old_pte))
 			continue;
 		pte = ptep_clear_flush(vma, old_addr, old_pte);
-		/* ZERO_PAGE can be dependant on virtual addr */
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
diff --git a/mm/nommu.c b/mm/nommu.c
index af874569d0f1..23fb033e596d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -523,7 +523,7 @@ static int validate_mmap_request(struct file *file,
 	 */
 	mapping = file->f_mapping;
 	if (!mapping)
-		mapping = file->f_dentry->d_inode->i_mapping;
+		mapping = file->f_path.dentry->d_inode->i_mapping;
 
 	capabilities = 0;
 	if (mapping && mapping->backing_dev_info)
@@ -532,7 +532,7 @@ static int validate_mmap_request(struct file *file,
 	if (!capabilities) {
 		/* no explicit capabilities set, so assume some
 		 * defaults */
-		switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
+		switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
 		case S_IFREG:
 		case S_IFBLK:
 			capabilities = BDI_CAP_MAP_COPY;
@@ -563,11 +563,11 @@ static int validate_mmap_request(struct file *file,
 		    !(file->f_mode & FMODE_WRITE))
 			return -EACCES;
 
-		if (IS_APPEND(file->f_dentry->d_inode) &&
+		if (IS_APPEND(file->f_path.dentry->d_inode) &&
 		    (file->f_mode & FMODE_WRITE))
 			return -EACCES;
 
-		if (locks_verify_locked(file->f_dentry->d_inode))
+		if (locks_verify_locked(file->f_path.dentry->d_inode))
 			return -EAGAIN;
 
 		if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -598,7 +598,7 @@ static int validate_mmap_request(struct file *file,
 		/* handle executable mappings and implied executable
 		 * mappings */
-		if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+		if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
 			if (prot & PROT_EXEC)
 				return -EPERM;
 		}
@@ -833,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 			continue;
 
 		/* search for overlapping mappings on the same file */
-		if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
+		if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
 			continue;
 
 		if (vma->vm_pgoff >= pgoff + pglen)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 223d9ccb7d64..b278b8d60eee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -61,12 +61,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	}
 
 	/*
-	 * swapoff can easily use up all memory, so kill those first.
-	 */
-	if (p->flags & PF_SWAPOFF)
-		return ULONG_MAX;
-
-	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
 	points = mm->total_vm;
@@ -77,6 +71,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	task_unlock(p);
 
 	/*
+	 * swapoff can easily use up all memory, so kill those first.
+	 */
+	if (p->flags & PF_SWAPOFF)
+		return ULONG_MAX;
+
+	/*
 	 * Processes which fork a lot of child processes are likely
 	 * a good choice. We add half the vmsize of the children if they
 	 * have an own mm. This prevents forking servers to flood the
@@ -174,10 +174,15 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 {
 #ifdef CONFIG_NUMA
 	struct zone **z;
-	nodemask_t nodes = node_online_map;
+	nodemask_t nodes;
+	int node;
+	/* node has memory ? */
+	for_each_online_node(node)
+		if (NODE_DATA(node)->node_present_pages)
+			node_set(node, nodes);
 
 	for (z = zonelist->zones; *z; z++)
-		if (cpuset_zone_allowed(*z, gfp_mask))
+		if (cpuset_zone_allowed_softwall(*z, gfp_mask))
 			node_clear(zone_to_nid(*z), nodes);
 		else
 			return CONSTRAINT_CPUSET;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8d9b19f239c3..be0efbde4994 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>
 #include <linux/init.h>
 #include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/rmap.h>
@@ -132,11 +133,9 @@ get_dirty_limits(long *pbackground, long *pdirty,
 
 #ifdef CONFIG_HIGHMEM
 	/*
-	 * If this mapping can only allocate from low memory,
-	 * we exclude high memory from our count.
+	 * We always exclude high memory from our count.
 	 */
-	if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
-		available_memory -= totalhigh_pages;
+	available_memory -= totalhigh_pages;
 #endif
 
 
@@ -525,28 +524,25 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
 };
 
 /*
- * If the machine has a large highmem:lowmem ratio then scale back the default
- * dirty memory thresholds: allowing too much dirty highmem pins an excessive
- * number of buffer_heads.
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers (by
+ * comparing nr_free_buffer_pages() to vm_total_pages.
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory (by subtracting
+ * totalhigh_pages from vm_total_pages), and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
 */
 void __init page_writeback_init(void)
 {
-	long buffer_pages = nr_free_buffer_pages();
-	long correction;
-
-	correction = (100 * 4 * buffer_pages) / vm_total_pages;
-
-	if (correction < 100) {
-		dirty_background_ratio *= correction;
-		dirty_background_ratio /= 100;
-		vm_dirty_ratio *= correction;
-		vm_dirty_ratio /= 100;
-
-		if (dirty_background_ratio <= 0)
-			dirty_background_ratio = 1;
-		if (vm_dirty_ratio <= 0)
-			vm_dirty_ratio = 1;
-	}
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
@@ -761,23 +757,24 @@ int __set_page_dirty_nobuffers(struct page *page)
 	struct address_space *mapping = page_mapping(page);
 	struct address_space *mapping2;
 
-	if (mapping) {
-		write_lock_irq(&mapping->tree_lock);
-		mapping2 = page_mapping(page);
-		if (mapping2) { /* Race with truncate? */
-			BUG_ON(mapping2 != mapping);
-			if (mapping_cap_account_dirty(mapping))
-				__inc_zone_page_state(page,
-						NR_FILE_DIRTY);
-			radix_tree_tag_set(&mapping->page_tree,
-				page_index(page), PAGECACHE_TAG_DIRTY);
-		}
-		write_unlock_irq(&mapping->tree_lock);
-		if (mapping->host) {
-			/* !PageAnon && !swapper_space */
-			__mark_inode_dirty(mapping->host,
-						I_DIRTY_PAGES);
-		}
+	if (!mapping)
+		return 1;
+
+	write_lock_irq(&mapping->tree_lock);
+	mapping2 = page_mapping(page);
+	if (mapping2) { /* Race with truncate? */
+		BUG_ON(mapping2 != mapping);
+		if (mapping_cap_account_dirty(mapping)) {
+			__inc_zone_page_state(page, NR_FILE_DIRTY);
+			task_io_account_write(PAGE_CACHE_SIZE);
+		}
+		radix_tree_tag_set(&mapping->page_tree,
+			page_index(page), PAGECACHE_TAG_DIRTY);
+	}
+	write_unlock_irq(&mapping->tree_lock);
+	if (mapping->host) {
+		/* !PageAnon && !swapper_space */
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 	return 1;
 }
@@ -843,39 +840,6 @@ int set_page_dirty_lock(struct page *page)
 EXPORT_SYMBOL(set_page_dirty_lock);
 
 /*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-	unsigned long flags;
-
-	if (mapping) {
-		write_lock_irqsave(&mapping->tree_lock, flags);
-		if (TestClearPageDirty(page)) {
-			radix_tree_tag_clear(&mapping->page_tree,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
-			write_unlock_irqrestore(&mapping->tree_lock, flags);
-			/*
-			 * We can continue to use `mapping' here because the
-			 * page is locked, which pins the address_space
-			 */
-			if (mapping_cap_account_dirty(mapping)) {
-				page_mkclean(page);
-				dec_zone_page_state(page, NR_FILE_DIRTY);
-			}
-			return 1;
-		}
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
-		return 0;
-	}
-	return TestClearPageDirty(page);
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
 * Clear a page's dirty flag, while caring for dirty memory accounting.
 * Returns true if the page was previously dirty.
 *
@@ -893,12 +857,41 @@ int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
-	if (mapping) {
+	if (mapping && mapping_cap_account_dirty(mapping)) {
+		/*
+		 * Yes, Virginia, this is indeed insane.
+		 *
+		 * We use this sequence to make sure that
+		 *  (a) we account for dirty stats properly
+		 *  (b) we tell the low-level filesystem to
+		 *      mark the whole page dirty if it was
+		 *      dirty in a pagetable. Only to then
+		 *  (c) clean the page again and return 1 to
+		 *      cause the writeback.
+		 *
+		 * This way we avoid all nasty races with the
+		 * dirty bit in multiple places and clearing
+		 * them concurrently from different threads.
+		 *
+		 * Note! Normally the "set_page_dirty(page)"
+		 * has no effect on the actual dirty bit - since
+		 * that will already usually be set. But we
+		 * need the side effects, and it can help us
+		 * avoid races.
+		 *
+		 * We basically use the page "master dirty bit"
+		 * as a serialization point for all the different
+		 * threads doing their things.
+		 *
+		 * FIXME! We still have a race here: if somebody
+		 * adds the page back to the page tables in
+		 * between the "page_mkclean()" and the "TestClearPageDirty()",
+		 * we might have it mapped without the dirty bit set.
+		 */
+		if (page_mkclean(page))
+			set_page_dirty(page);
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping)) {
-				page_mkclean(page);
-				dec_zone_page_state(page, NR_FILE_DIRTY);
-			}
+			dec_zone_page_state(page, NR_FILE_DIRTY);
 			return 1;
 		}
 		return 0;
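To make the dirty-limit change above concrete: per the new page_writeback_init() comment, the ratios are now applied to non-HIGHMEM memory. The following standalone sketch reproduces that arithmetic with made-up machine sizes; it is illustrative userspace code, not kernel code, and the real rounding lives in get_dirty_limits():

#include <stdio.h>

int main(void)
{
	unsigned long vm_total_pages = 262144;	/* 1 GiB of 4 KiB pages */
	unsigned long totalhigh_pages = 32768;	/* 128 MiB of highmem */
	int dirty_background_ratio = 10;	/* example tunable values */
	int vm_dirty_ratio = 40;

	/* highmem is always excluded from the count now */
	unsigned long available = vm_total_pages - totalhigh_pages;
	unsigned long background = dirty_background_ratio * available / 100;
	unsigned long dirty = vm_dirty_ratio * available / 100;

	printf("background threshold: %lu pages\n", background);
	printf("dirty threshold:      %lu pages\n", dirty);
	return 0;
}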
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cace22b3ac25..2c606cc922a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -40,6 +40,7 @@
 #include <linux/sort.h>
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -710,6 +711,9 @@ static void __drain_pages(unsigned int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
+		if (!populated_zone(zone))
+			continue;
+
 		pset = zone_pcp(zone, cpu);
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
@@ -892,6 +896,91 @@ failed:
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
 
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct fail_page_alloc_attr {
+	struct fault_attr attr;
+
+	u32 ignore_gfp_highmem;
+	u32 ignore_gfp_wait;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+	struct dentry *ignore_gfp_highmem_file;
+	struct dentry *ignore_gfp_wait_file;
+
+#endif	/* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_gfp_wait = 1,
+	.ignore_gfp_highmem = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+	return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+	if (gfp_mask & __GFP_NOFAIL)
+		return 0;
+	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+		return 0;
+	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+		return 0;
+
+	return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *dir;
+	int err;
+
+	err = init_fault_attr_dentries(&fail_page_alloc.attr,
+				       "fail_page_alloc");
+	if (err)
+		return err;
+	dir = fail_page_alloc.attr.dentries.dir;
+
+	fail_page_alloc.ignore_gfp_wait_file =
+		debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				      &fail_page_alloc.ignore_gfp_wait);
+
+	fail_page_alloc.ignore_gfp_highmem_file =
+		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+				      &fail_page_alloc.ignore_gfp_highmem);
+
+	if (!fail_page_alloc.ignore_gfp_wait_file ||
+			!fail_page_alloc.ignore_gfp_highmem_file) {
+		err = -ENOMEM;
+		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+	}
+
+	return err;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+	return 0;
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
 */
@@ -900,8 +989,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
 	/* free_pages my go negative - that's OK */
-	unsigned long min = mark;
-	long free_pages = z->free_pages - (1 << order) + 1;
+	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
 	int o;
 
 	if (alloc_flags & ALLOC_HIGH)
@@ -1076,7 +1164,7 @@ zonelist_scan:
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed(zone, gfp_mask))
+			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1136,6 +1224,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 
 	might_sleep_if(wait);
 
+	if (should_fail_alloc_page(gfp_mask, order))
+		return NULL;
+
restart:
 	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
@@ -1864,17 +1955,24 @@ static inline unsigned long wait_table_bits(unsigned long size)
 * done. Non-atomic initialization, single-pass.
 */
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-		unsigned long start_pfn)
+		unsigned long start_pfn, enum memmap_context context)
 {
 	struct page *page;
 	unsigned long end_pfn = start_pfn + size;
 	unsigned long pfn;
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		if (!early_pfn_valid(pfn))
-			continue;
-		if (!early_pfn_in_nid(pfn, nid))
-			continue;
+		/*
+		 * There can be holes in boot-time mem_map[]s
+		 * handed to this function.  They do not
+		 * exist on hotplugged memory.
+		 */
+		if (context == MEMMAP_EARLY) {
+			if (!early_pfn_valid(pfn))
+				continue;
+			if (!early_pfn_in_nid(pfn, nid))
+				continue;
+		}
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
 		init_page_count(page);
@@ -1901,7 +1999,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
-	memmap_init_zone((size), (nid), (zone), (start_pfn))
+	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
 static int __cpuinit zone_batchsize(struct zone *zone)
@@ -2147,7 +2245,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
 
 __meminit int init_currently_empty_zone(struct zone *zone,
 					unsigned long zone_start_pfn,
-					unsigned long size)
+					unsigned long size,
+					enum memmap_context context)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int ret;
@@ -2591,7 +2690,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
-		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+		ret = init_currently_empty_zone(zone, zone_start_pfn,
+						size, MEMMAP_EARLY);
 		BUG_ON(ret);
 		zone_start_pfn += size;
 	}
@@ -3232,6 +3332,10 @@ void *__init alloc_large_system_hash(const char *tablename,
 			numentries >>= (scale - PAGE_SHIFT);
 		else
 			numentries <<= (PAGE_SHIFT - scale);
+
+		/* Make sure we've got at least a 0-order allocation.. */
+		if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+			numentries = PAGE_SIZE / bucketsize;
 	}
 	numentries = roundup_pow_of_two(numentries);
 
@@ -3244,7 +3348,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (numentries > max)
 		numentries = max;
 
-	log2qty = long_log2(numentries);
+	log2qty = ilog2(numentries);
 
 	do {
 		size = bucketsize << log2qty;
@@ -3266,7 +3370,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
 	       tablename,
 	       (1U << log2qty),
-	       long_log2(size) - PAGE_SHIFT,
+	       ilog2(size) - PAGE_SHIFT,
 	       size);
 
 	if (_hash_shift)
diff --git a/mm/readahead.c b/mm/readahead.c
index a386f2b6b335..0f539e8e827a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
 
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
@@ -151,6 +152,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 			put_pages_list(pages);
 			break;
 		}
+		task_io_account_read(PAGE_CACHE_SIZE);
 	}
 	pagevec_lru_add(&lru_pvec);
 	return ret;
@@ -450,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
 *
 * Note that @filp is purely used for passing on to the ->readpage[s]()
 * handler: it may refer to a different file from @mapping (so we may not use
- * @filp->f_mapping or @filp->f_dentry->d_inode here).
+ * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
 * Also, @ra may not be equal to &@filp->f_ra.
 *
 */
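The task_io_account_read() call added above feeds the new per-task I/O accounting. Assuming the kernel is built with that accounting and exports the counters through /proc/<pid>/io (the interface introduced alongside these hooks), a process can watch its own numbers; a minimal sketch:

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/self/io", "r");

	if (!f)
		return 1;	/* kernel built without task I/O accounting */
	/* dump the counters, e.g. read_bytes, write_bytes,
	 * cancelled_write_bytes (the last one is what
	 * task_io_account_cancelled_write() feeds) */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}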
(%d)\n", page_mapcount(page)); + printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); printk (KERN_EMERG " page->flags = %lx\n", page->flags); printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); + print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); + if (vma->vm_ops) + print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage); + if (vma->vm_file && vma->vm_file->f_op) + print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); BUG(); } @@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, dec_mm_counter(mm, file_rss); - page_remove_rmap(page); + page_remove_rmap(page, vma); page_cache_release(page); out_unmap: @@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor, if (pte_dirty(pteval)) set_page_dirty(page); - page_remove_rmap(page); + page_remove_rmap(page, vma); page_cache_release(page); dec_mm_counter(mm, file_rss); (*mapcount)--; diff --git a/mm/shmem.c b/mm/shmem.c index c820b4f77b8d..70da7a0981bf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) size = SHMEM_NR_DIRECT; nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); } - if (!topdir) + + /* + * If there are no indirect blocks or we are punching a hole + * below indirect blocks, nothing to be done. + */ + if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT))) goto done2; BUG_ON(limit <= SHMEM_NR_DIRECT); @@ -1225,7 +1230,7 @@ failed: struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct page *page = NULL; unsigned long idx; int error; @@ -1248,7 +1253,7 @@ static int shmem_populate(struct vm_area_struct *vma, unsigned long addr, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct mm_struct *mm = vma->vm_mm; enum sgp_type sgp = nonblock? 
diff --git a/mm/shmem.c b/mm/shmem.c
index c820b4f77b8d..70da7a0981bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 			size = SHMEM_NR_DIRECT;
 		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
 	}
-	if (!topdir)
+
+	/*
+	 * If there are no indirect blocks or we are punching a hole
+	 * below indirect blocks, nothing to be done.
+	 */
+	if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
 		goto done2;
 
 	BUG_ON(limit <= SHMEM_NR_DIRECT);
@@ -1225,7 +1230,7 @@ failed:
 
 struct page *shmem_nopage(struct vm_area_struct *vma,
 			  unsigned long address, int *type)
 {
-	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct page *page = NULL;
 	unsigned long idx;
 	int error;
@@ -1248,7 +1253,7 @@ static int shmem_populate(struct vm_area_struct *vma,
 	unsigned long addr, unsigned long len,
 	pgprot_t prot, unsigned long pgoff, int nonblock)
 {
-	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct mm_struct *mm = vma->vm_mm;
 	enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
 	unsigned long size;
@@ -1293,14 +1298,14 @@ static int shmem_populate(struct vm_area_struct *vma,
 #ifdef CONFIG_NUMA
 int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 {
-	struct inode *i = vma->vm_file->f_dentry->d_inode;
+	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
 	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
 }
 
 struct mempolicy *
 shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
 {
-	struct inode *i = vma->vm_file->f_dentry->d_inode;
+	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long idx;
 
 	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1310,7 +1315,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
 
 int shmem_lock(struct file *file, int lock, struct user_struct *user)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	int retval = -ENOMEM;
 
@@ -1422,7 +1427,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig
 static ssize_t
 shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 {
-	struct inode	*inode = file->f_dentry->d_inode;
+	struct inode	*inode = file->f_path.dentry->d_inode;
 	loff_t		pos;
 	unsigned long	written;
 	ssize_t		err;
@@ -1442,7 +1447,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
 	if (err || !count)
 		goto out;
 
-	err = remove_suid(file->f_dentry);
+	err = remove_suid(file->f_path.dentry);
 	if (err)
 		goto out;
 
@@ -1524,7 +1529,7 @@ out:
 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index, offset;
 
@@ -2493,8 +2498,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;	/* It is unlinked */
-	file->f_vfsmnt = mntget(shm_mnt);
-	file->f_dentry = dentry;
+	file->f_path.mnt = mntget(shm_mnt);
+	file->f_path.dentry = dentry;
 	file->f_mapping = inode->i_mapping;
 	file->f_op = &shmem_file_operations;
 	file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/slab.c b/mm/slab.c
index 068cb4503c15..c6100628a6ef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -107,7 +107,9 @@
 #include	<linux/nodemask.h>
 #include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
+#include	<linux/fault-inject.h>
 #include	<linux/rtmutex.h>
+#include	<linux/reciprocal_div.h>
 
 #include	<asm/cacheflush.h>
 #include	<asm/tlbflush.h>
@@ -385,6 +387,7 @@ struct kmem_cache {
 	unsigned int shared;
 
 	unsigned int buffer_size;
+	u32 reciprocal_buffer_size;
 /* 3) touched by every alloc & free from the backend */
 
 	struct kmem_list3 *nodelists[MAX_NUMNODES];
@@ -626,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
 	return slab->s_mem + cache->buffer_size * idx;
 }
 
-static inline unsigned int obj_to_index(struct kmem_cache *cache,
-					struct slab *slab, void *obj)
+/*
+ * We want to avoid an expensive divide : (offset / cache->buffer_size)
+ *   Using the fact that buffer_size is a constant for a particular cache,
+ *   we can replace (offset / cache->buffer_size) by
+ *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
+ */
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+					const struct slab *slab, void *obj)
 {
-	return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
+	u32 offset = (obj - slab->s_mem);
+	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 }
 
 /*
@@ -945,7 +955,8 @@ static void __devinit start_cpu_timer(int cpu)
 	if (keventd_up() && reap_work->work.func == NULL) {
 		init_reap_node(cpu);
 		INIT_DELAYED_WORK(reap_work, cache_reap);
-		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
+		schedule_delayed_work_on(cpu, reap_work,
+					__round_jiffies_relative(HZ, cpu));
 	}
 }
 
@@ -1425,6 +1436,8 @@ void __init kmem_cache_init(void)
 
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
 					cache_line_size());
+	cache_cache.reciprocal_buffer_size =
+		reciprocal_value(cache_cache.buffer_size);
 
 	for (order = 0; order < MAX_ORDER; order++) {
 		cache_estimate(order, cache_cache.buffer_size,
@@ -2311,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (flags & SLAB_CACHE_DMA)
 		cachep->gfpflags |= GFP_DMA;
 	cachep->buffer_size = size;
+	cachep->reciprocal_buffer_size = reciprocal_value(size);
 
 	if (flags & CFLGS_OFF_SLAB) {
 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -3088,12 +3102,89 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
 #endif
 
+#ifdef CONFIG_FAILSLAB
+
+static struct failslab_attr {
+
+	struct fault_attr attr;
+
+	u32 ignore_gfp_wait;
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+	struct dentry *ignore_gfp_wait_file;
+#endif
+
+} failslab = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_gfp_wait = 1,
+};
+
+static int __init setup_failslab(char *str)
+{
+	return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+	if (cachep == &cache_cache)
+		return 0;
+	if (flags & __GFP_NOFAIL)
+		return 0;
+	if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
+		return 0;
+
+	return should_fail(&failslab.attr, obj_size(cachep));
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init failslab_debugfs(void)
+{
+	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *dir;
+	int err;
+
+	err = init_fault_attr_dentries(&failslab.attr, "failslab");
+	if (err)
+		return err;
+	dir = failslab.attr.dentries.dir;
+
+	failslab.ignore_gfp_wait_file =
+		debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				      &failslab.ignore_gfp_wait);
+
+	if (!failslab.ignore_gfp_wait_file) {
+		err = -ENOMEM;
+		debugfs_remove(failslab.ignore_gfp_wait_file);
+		cleanup_fault_attr_dentries(&failslab.attr);
+	}
+
+	return err;
+}
+
+late_initcall(failslab_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAILSLAB */
+
+static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+	return 0;
+}
+
+#endif /* CONFIG_FAILSLAB */
+
 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *objp;
 	struct array_cache *ac;
 
 	check_irq_off();
+
+	if (should_failslab(cachep, flags))
+		return NULL;
+
 	ac = cpu_cache_get(cachep);
 	if (likely(ac->avail)) {
 		STATS_INC_ALLOCHIT(cachep);
@@ -3173,6 +3264,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 	struct zone **z;
 	void *obj = NULL;
 	int nid;
+	gfp_t local_flags = (flags & GFP_LEVEL_MASK);
 
retry:
 	/*
@@ -3182,21 +3274,26 @@ retry:
 	for (z = zonelist->zones; *z && !obj; z++) {
 		nid = zone_to_nid(*z);
 
-		if (cpuset_zone_allowed(*z, flags) &&
+		if (cpuset_zone_allowed_hardwall(*z, flags) &&
 			cache->nodelists[nid] &&
 			cache->nodelists[nid]->free_objects)
 				obj = ____cache_alloc_node(cache,
 					flags | GFP_THISNODE, nid);
 	}
 
-	if (!obj) {
+	if (!obj && !(flags & __GFP_NO_GROW)) {
 		/*
 		 * This allocation will be performed within the constraints
 		 * of the current cpuset / memory policy requirements.
 		 * We may trigger various forms of reclaim on the allowed
 		 * set and go into memory reserves if necessary.
 		 */
+		if (local_flags & __GFP_WAIT)
+			local_irq_enable();
+		kmem_flagcheck(cache, flags);
 		obj = kmem_getpages(cache, flags, -1);
+		if (local_flags & __GFP_WAIT)
+			local_irq_disable();
 		if (obj) {
 			/*
 			 * Insert into the appropriate per node queues
@@ -3213,7 +3310,7 @@ retry:
 			 */
 			goto retry;
 		} else {
-			kmem_freepages(cache, obj);
+			/* cache_grow already freed obj */
 			obj = NULL;
 		}
 	}
@@ -3456,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc);
 *
 * Currently only used for dentry validation.
 */
-int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
+int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
 {
 	unsigned long addr = (unsigned long)ptr;
 	unsigned long min_addr = PAGE_OFFSET;
@@ -3490,6 +3587,7 @@ out:
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 * @nodeid: node number of the target node.
+ * @caller: return address of caller, used for debug information
 *
 * Identical to kmem_cache_alloc but it will allocate memory on the given
 * node, which can improve the performance for cpu bound structures.
@@ -3928,7 +4026,7 @@ static void cache_reap(struct work_struct *unused)
 	if (!mutex_trylock(&cache_chain_mutex)) {
 		/* Give up. Setup the next iteration. */
 		schedule_delayed_work(&__get_cpu_var(reap_work),
-				      REAPTIMEOUT_CPUC);
+				      round_jiffies_relative(REAPTIMEOUT_CPUC));
 		return;
 	}
 
@@ -3974,7 +4072,8 @@ next:
 	next_reap_node();
 	refresh_cpu_vm_stats(smp_processor_id());
 	/* Set up the next iteration */
-	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
+	schedule_delayed_work(&__get_cpu_var(reap_work),
+		round_jiffies_relative(REAPTIMEOUT_CPUC));
 }
 
 #ifdef CONFIG_PROC_FS
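The reciprocal trick used by the new obj_to_index() above can be checked in isolation. Below is a userspace re-derivation, under the assumption that reciprocal_value() computes ceil(2^32 / size) and reciprocal_divide() is a 64-bit multiply plus a 32-bit shift (the helper names mirror linux/reciprocal_div.h; the 192-byte object size is an arbitrary example):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* precompute R = ceil(2^32 / b) once per cache */
static uint32_t reciprocal_value(uint32_t b)
{
	return (uint32_t)(((1ULL << 32) + b - 1) / b);
}

/* then a / b becomes a multiply and a shift */
static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t buffer_size = 192;	/* object size of an example cache */
	uint32_t r = reciprocal_value(buffer_size);
	uint32_t off;

	/* offsets within a slab are small, where the identity is exact */
	for (off = 0; off < 64 * buffer_size; off++)
		assert(reciprocal_divide(off, r) == off / buffer_size);
	printf("index of offset 960 = %u\n", reciprocal_divide(960, r));
	return 0;
}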
diff --git a/mm/slob.c b/mm/slob.c
index 542394184a58..5adc29cb58dd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock);
 static DEFINE_SPINLOCK(block_lock);
 
 static void slob_free(void *b, int size);
+static void slob_timer_cbk(void);
+
 
 static void *slob_alloc(size_t size, gfp_t gfp, int align)
 {
@@ -157,7 +159,7 @@ static int fastcall find_order(int size)
 	return order;
 }
 
-void *kmalloc(size_t size, gfp_t gfp)
+void *__kmalloc(size_t size, gfp_t gfp)
 {
 	slob_t *m;
 	bigblock_t *bb;
@@ -186,8 +188,7 @@ void *kmalloc(size_t size, gfp_t gfp)
 	slob_free(bb, sizeof(bigblock_t));
 	return 0;
 }
-
-EXPORT_SYMBOL(kmalloc);
+EXPORT_SYMBOL(__kmalloc);
 
 void kfree(const void *block)
 {
@@ -327,9 +328,25 @@ const char *kmem_cache_name(struct kmem_cache *c)
 EXPORT_SYMBOL(kmem_cache_name);
 
 static struct timer_list slob_timer = TIMER_INITIALIZER(
-	(void (*)(unsigned long))kmem_cache_init, 0, 0);
+	(void (*)(unsigned long))slob_timer_cbk, 0, 0);
+
+int kmem_cache_shrink(struct kmem_cache *d)
+{
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+int kmem_ptr_validate(struct kmem_cache *a, const void *b)
+{
+	return 0;
+}
+
+void __init kmem_cache_init(void)
+{
+	slob_timer_cbk();
+}
 
-void kmem_cache_init(void)
+static void slob_timer_cbk(void)
 {
 	void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c5431072f422..a2d9bb4e80df 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -434,7 +434,7 @@ void free_swap_and_cache(swp_entry_t entry)
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
-int swap_type_of(dev_t device, sector_t offset)
+int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 {
 	struct block_device *bdev = NULL;
 	int i;
@@ -450,6 +450,9 @@ int swap_type_of(dev_t device, sector_t offset)
 			continue;
 
 		if (!bdev) {
+			if (bdev_p)
+				*bdev_p = sis->bdev;
+
 			spin_unlock(&swap_lock);
 			return i;
 		}
@@ -459,6 +462,9 @@ int swap_type_of(dev_t device, sector_t offset)
 			se = list_entry(sis->extent_list.next,
 					struct swap_extent, list);
 			if (se->start_block == offset) {
+				if (bdev_p)
+					*bdev_p = sis->bdev;
+
 				spin_unlock(&swap_lock);
 				bdput(bdev);
 				return i;
@@ -1357,10 +1363,10 @@ static int swap_show(struct seq_file *swap, void *v)
 	}
 
 	file = ptr->swap_file;
-	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+	len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
 	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
 		       len < 40 ? 40 - len : 1, " ",
-		       S_ISBLK(file->f_dentry->d_inode->i_mode) ?
+		       S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
				"partition" : "file\t",
		       ptr->pages << (PAGE_SHIFT - 10),
		       ptr->inuse_pages << (PAGE_SHIFT - 10),
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 5f2cbf0f153c..c7f6e1914bc4 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 	d_instantiate(dentry, inode);
 	inode->i_nlink = 0;	/* It is unlinked */
 
-	file->f_vfsmnt = mntget(shm_mnt);
-	file->f_dentry = dentry;
+	file->f_path.mnt = mntget(shm_mnt);
+	file->f_path.dentry = dentry;
 	file->f_mapping = inode->i_mapping;
 	file->f_op = &ramfs_file_operations;
 	file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/truncate.c b/mm/truncate.c
index e07b1e682c38..5df947de7654 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
 
@@ -51,6 +52,33 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
 }
 
 /*
+ * This cancels just the dirty bit on the kernel page itself, it
+ * does NOT actually remove dirty bits on any mmap's that may be
+ * around. It also leaves the page tagged dirty, so any sync
+ * activity will still find it on the dirty lists, and in particular,
+ * clear_page_dirty_for_io() will still look at the dirty bits in
+ * the VM.
+ *
+ * Doing this should *normally* only ever be done when a page
+ * is truncated, and is not actually mapped anywhere at all. However,
+ * fs/buffer.c does this when it notices that somebody has cleaned
+ * out all the buffers on a page without actually doing it through
+ * the VM. Can you say "ext3 is horribly ugly"? Tought you could.
+ */
+void cancel_dirty_page(struct page *page, unsigned int account_size)
+{
+	if (TestClearPageDirty(page)) {
+		struct address_space *mapping = page->mapping;
+		if (mapping && mapping_cap_account_dirty(mapping)) {
+			dec_zone_page_state(page, NR_FILE_DIRTY);
+			if (account_size)
+				task_io_account_cancelled_write(account_size);
+		}
+	}
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
+/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes anonymous.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_nopage().
@@ -66,10 +94,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	if (page->mapping != mapping)
 		return;
 
+	cancel_dirty_page(page, PAGE_CACHE_SIZE);
+
 	if (PagePrivate(page))
 		do_invalidatepage(page, 0);
 
-	clear_page_dirty(page);
 	ClearPageUptodate(page);
 	ClearPageMappedToDisk(page);
 	remove_from_page_cache(page);
@@ -319,6 +348,15 @@ failed:
 	return 0;
 }
 
+static int do_launder_page(struct address_space *mapping, struct page *page)
+{
+	if (!PageDirty(page))
+		return 0;
+	if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
+		return 0;
+	return mapping->a_ops->launder_page(page);
+}
+
/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
@@ -348,7 +386,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index;
-			int was_dirty;
 
 			lock_page(page);
 			if (page->mapping != mapping) {
@@ -384,18 +421,14 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
						  PAGE_CACHE_SIZE, 0);
				}
			}
-			was_dirty = test_clear_page_dirty(page);
-			if (!invalidate_complete_page2(mapping, page)) {
-				if (was_dirty)
-					set_page_dirty(page);
+			ret = do_launder_page(mapping, page);
+			if (ret == 0 && !invalidate_complete_page2(mapping, page))
				ret = -EIO;
-			}
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}
-	WARN_ON_ONCE(ret);
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 093f5fe6dd77..7430df68cb64 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -692,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			__count_vm_events(KSWAPD_STEAL, nr_freed);
 		} else
 			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
-		__count_vm_events(PGACTIVATE, nr_freed);
+		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
 
 		if (nr_taken == 0)
 			goto done;
@@ -984,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 		if (!populated_zone(zone))
 			continue;
 
-		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
 
 		note_zone_scanning_priority(zone, priority);
@@ -1034,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
 
 		lru_pages += zone->nr_active + zone->nr_inactive;
@@ -1089,7 +1089,7 @@ out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
 
 		zone->prev_priority = priority;
@@ -1354,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
@@ -1369,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order)
 *
 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
 */
-static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
-				      int prio, struct scan_control *sc)
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+				      int pass, struct scan_control *sc)
 {
 	struct zone *zone;
 	unsigned long nr_to_scan, ret = 0;
@@ -1406,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
 	return ret;
 }
 
+static unsigned long count_lru_pages(void)
+{
+	struct zone *zone;
+	unsigned long ret = 0;
+
+	for_each_zone(zone)
+		ret += zone->nr_active + zone->nr_inactive;
+	return ret;
+}
+
 /*
 * Try to free `nr_pages' of memory, system-wide, and return the number of
 * freed pages.
@@ -1420,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	unsigned long ret = 0;
 	int pass;
 	struct reclaim_state reclaim_state;
-	struct zone *zone;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_swap = 0,
@@ -1431,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 	current->reclaim_state = &reclaim_state;
 
-	lru_pages = 0;
-	for_each_zone(zone)
-		lru_pages += zone->nr_active + zone->nr_inactive;
-
+	lru_pages = count_lru_pages();
 	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
@@ -1461,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	for (pass = 0; pass < 5; pass++) {
 		int prio;
 
-		/* Needed for shrinking slab caches later on */
-		if (!lru_pages)
-			for_each_zone(zone) {
-				lru_pages += zone->nr_active;
-				lru_pages += zone->nr_inactive;
-			}
-
 		/* Force reclaiming mapped pages in the passes #3 and #4 */
 		if (pass > 2) {
 			sc.may_swap = 1;
@@ -1483,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 				goto out;
 
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+			shrink_slab(sc.nr_scanned, sc.gfp_mask,
+					count_lru_pages());
 			ret += reclaim_state.reclaimed_slab;
 			if (ret >= nr_pages)
 				goto out;
@@ -1491,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
 				congestion_wait(WRITE, HZ / 10);
 		}
-
-		lru_pages = 0;
 	}
 
 	/*
 	 * If ret = 0, we could not shrink LRUs, but there may be something
 	 * in slab caches
 	 */
-	if (!ret)
+	if (!ret) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
 			ret += reclaim_state.reclaimed_slab;
 		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+	}
 
 out:
 	current->reclaim_state = NULL;